79 files changed, 2259 insertions, 1426 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 2168c902d5ca..d9f8afe6f0c4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -81,8 +81,6 @@ obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)           += coda/
 obj-$(CONFIG_MINIX_FS)          += minix/
 obj-$(CONFIG_FAT_FS)            += fat/
-obj-$(CONFIG_MSDOS_FS)          += msdos/
-obj-$(CONFIG_VFAT_FS)           += vfat/
 obj-$(CONFIG_BFS_FS)            += bfs/
 obj-$(CONFIG_ISO9660_FS)        += isofs/
 obj-$(CONFIG_HFSPLUS_FS)        += hfsplus/ # Before hfs to find wrapped HFS+
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 625abf5422e2..33bf8cbfd051 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -128,9 +128,10 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
-        int err = -EINVAL;
+        int err;
-        if (check_dev_ioctl_version(cmd, param)) {
+        err = check_dev_ioctl_version(cmd, param);
+        if (err) {
                AUTOFS_WARN("invalid device control module version "
                     "supplied for cmd(0x%08x)", cmd);
                goto out;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cde2f8e8935a..4b6fb3f628c0 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -56,12 +56,23 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
        mntget(mnt);
        dget(dentry);
-        if (!autofs4_follow_mount(&mnt, &dentry))
+        if (!follow_down(&mnt, &dentry))
                goto done;
-        /* This is an autofs submount, we can't expire it */
+        if (is_autofs4_dentry(dentry)) {
-        if (is_autofs4_dentry(dentry))
+                struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-                goto done;
+                /* This is an autofs submount, we can't expire it */
+                if (sbi->type == AUTOFS_TYPE_INDIRECT)
+                        goto done;
+                /*
+                 * Otherwise it's an offset mount and we need to check
+                 * if we can umount its mount, if there is one.
+                 */
+                if (!d_mountpoint(dentry))
+                        goto done;
+        }
        /* Update the expiry counter if fs is busy */
        if (!may_umount_tree(mnt)) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 88a776fa0ef6..db831efbdbbd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -986,7 +986,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
        struct gendisk *disk;
-        struct hd_struct *part = NULL;
        int ret;
        int partno;
        int perm = 0;
@@ -1004,24 +1003,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                return ret;
        }
-        ret = -ENXIO;
        lock_kernel();
+        ret = -ENXIO;
        disk = get_gendisk(bdev->bd_dev, &partno);
        if (!disk)
                goto out_unlock_kernel;
-        part = disk_get_part(disk, partno);
-        if (!part)
-                goto out_unlock_kernel;
        mutex_lock_nested(&bdev->bd_mutex, for_part);
        if (!bdev->bd_openers) {
                bdev->bd_disk = disk;
-                bdev->bd_part = part;
                bdev->bd_contains = bdev;
                if (!partno) {
                        struct backing_dev_info *bdi;
+                        ret = -ENXIO;
+                        bdev->bd_part = disk_get_part(disk, partno);
+                        if (!bdev->bd_part)
+                                goto out_clear;
                        if (disk->fops->open) {
                                ret = disk->fops->open(bdev, mode);
                                if (ret)
@@ -1049,18 +1049,17 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                        bdev->bd_contains = whole;
                        bdev->bd_inode->i_data.backing_dev_info =
                           whole->bd_inode->i_data.backing_dev_info;
+                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!(disk->flags & GENHD_FL_UP) ||
-                            !part || !part->nr_sects) {
+                            !bdev->bd_part || !bdev->bd_part->nr_sects) {
                                ret = -ENXIO;
                                goto out_clear;
                        }
-                        bd_set_size(bdev, (loff_t)part->nr_sects << 9);
+                        bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
                }
        } else {
-                disk_put_part(part);
                put_disk(disk);
                module_put(disk->fops->owner);
-                part = NULL;
                disk = NULL;
                if (bdev->bd_contains == bdev) {
                        if (bdev->bd_disk->fops->open) {
@@ -1080,6 +1079,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        return 0;
 out_clear:
+        disk_put_part(bdev->bd_part);
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
        bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
@@ -1091,7 +1091,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 out_unlock_kernel:
        unlock_kernel();
-        disk_put_part(part);
        if (disk)
                module_put(disk->fops->owner);
        put_disk(disk);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8f528ea24c48..8855331b2fba 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -4,7 +4,11 @@ Various fixes to make delete of open files behavior more predictable
 (when delete of an open file fails we mark the file as "delete-on-close"
 in a way that more servers accept, but only if we can first rename the
 file to a temporary name).  Add experimental support for more safely
-handling fcntl(F_SETLEASE).
+handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
+sends, and also let tcp autotune the socket send and receive buffers.
+This reduces the number of EAGAIN errors returned by TCP/IP in
+high stress workloads (and the number of retries on socket writes
+when sending large SMBWriteX requests).
 Version 1.54
 ------------
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 69a12aae91d3..490e34bbf27a 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -107,12 +107,13 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 #ifdef CONFIG_PROC_FS
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
-        struct list_head *tmp;
+        struct list_head *tmp1, *tmp2, *tmp3;
-        struct list_head *tmp1;
        struct mid_q_entry *mid_entry;
+        struct TCP_Server_Info *server;
        struct cifsSesInfo *ses;
        struct cifsTconInfo *tcon;
-        int i;
+        int i, j;
+        __u32 dev_type;
        seq_puts(m,
                    "Display Internal CIFS Data Structures for Debugging\n"
@@ -122,46 +123,78 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
        seq_printf(m, "Servers:");
        i = 0;
-        read_lock(&GlobalSMBSeslock);
+        read_lock(&cifs_tcp_ses_lock);
-        list_for_each(tmp, &GlobalSMBSessionList) {
+        list_for_each(tmp1, &cifs_tcp_ses_list) {
+                server = list_entry(tmp1, struct TCP_Server_Info,
+                                    tcp_ses_list);
                i++;
-                ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
+                list_for_each(tmp2, &server->smb_ses_list) {
-                if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) ||
+                        ses = list_entry(tmp2, struct cifsSesInfo,
-                   (ses->serverNOS == NULL)) {
+                                         smb_ses_list);
-                        seq_printf(m, "\nentry for %s not fully "
+                        if ((ses->serverDomain == NULL) ||
-                                        "displayed\n\t", ses->serverName);
+                                (ses->serverOS == NULL) ||
-                } else {
+                                (ses->serverNOS == NULL)) {
-                        seq_printf(m,
+                                seq_printf(m, "\n%d) entry for %s not fully "
-                                    "\n%d) Name: %s  Domain: %s Mounts: %d OS:"
+                                           "displayed\n\t", i, ses->serverName);
-                                    " %s  \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
+                        } else {
+                                seq_printf(m,
+                                    "\n%d) Name: %s  Domain: %s Uses: %d OS:"
+                                    " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
                                    " session status: %d\t",
                                i, ses->serverName, ses->serverDomain,
-                                atomic_read(&ses->inUse),
+                                ses->ses_count, ses->serverOS, ses->serverNOS,
-                                ses->serverOS, ses->serverNOS,
                                ses->capabilities, ses->status);
-                }
+                        }
-                if (ses->server) {
                        seq_printf(m, "TCP status: %d\n\tLocal Users To "
-                                    "Server: %d SecMode: 0x%x Req On Wire: %d",
+                                   "Server: %d SecMode: 0x%x Req On Wire: %d",
-                                ses->server->tcpStatus,
+                                   server->tcpStatus, server->srv_count,
-                                atomic_read(&ses->server->socketUseCount),
+                                   server->secMode,
-                                ses->server->secMode,
+                                   atomic_read(&server->inFlight));
-                                atomic_read(&ses->server->inFlight));
 #ifdef CONFIG_CIFS_STATS2
                        seq_printf(m, " In Send: %d In MaxReq Wait: %d",
-                                atomic_read(&ses->server->inSend),
+                                atomic_read(&server->inSend),
-                                atomic_read(&ses->server->num_waiters));
+                                atomic_read(&server->num_waiters));
 #endif
-                        seq_puts(m, "\nMIDs:\n");
+                        seq_puts(m, "\n\tShares:");
+                        j = 0;
+                        list_for_each(tmp3, &ses->tcon_list) {
+                                tcon = list_entry(tmp3, struct cifsTconInfo,
+                                                  tcon_list);
+                                ++j;
+                                dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+                                seq_printf(m, "\n\t%d) %s Mounts: %d ", j,
+                                           tcon->treeName, tcon->tc_count);
+                                if (tcon->nativeFileSystem) {
+                                        seq_printf(m, "Type: %s ",
+                                                   tcon->nativeFileSystem);
+                                }
+                                seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
+                                        "\nPathComponentMax: %d Status: 0x%d",
+                                        le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
+                                        le32_to_cpu(tcon->fsAttrInfo.Attributes),
+                                        le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
+                                        tcon->tidStatus);
+                                if (dev_type == FILE_DEVICE_DISK)
+                                        seq_puts(m, " type: DISK ");
+                                else if (dev_type == FILE_DEVICE_CD_ROM)
+                                        seq_puts(m, " type: CDROM ");
+                                else
+                                        seq_printf(m, " type: %d ", dev_type);
+                                if (tcon->need_reconnect)
+                                        seq_puts(m, "\tDISCONNECTED ");
+                                seq_putc(m, '\n');
+                        }
+                        seq_puts(m, "\n\tMIDs:\n");
                        spin_lock(&GlobalMid_Lock);
-                        list_for_each(tmp1, &ses->server->pending_mid_q) {
+                        list_for_each(tmp3, &server->pending_mid_q) {
-                                mid_entry = list_entry(tmp1, struct
+                                mid_entry = list_entry(tmp3, struct mid_q_entry,
-                                        mid_q_entry,
                                        qhead);
-                                seq_printf(m, "State: %d com: %d pid:"
+                                seq_printf(m, "\tState: %d com: %d pid:"
                                                " %d tsk: %p mid %d\n",
                                                mid_entry->midState,
                                                (int)mid_entry->command,
@@ -171,44 +204,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                        }
                        spin_unlock(&GlobalMid_Lock);
                }
-        }
-        read_unlock(&GlobalSMBSeslock);
-        seq_putc(m, '\n');
-        seq_puts(m, "Shares:");
-        i = 0;
-        read_lock(&GlobalSMBSeslock);
-        list_for_each(tmp, &GlobalTreeConnectionList) {
-                __u32 dev_type;
-                i++;
-                tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-                dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
-                seq_printf(m, "\n%d) %s Uses: %d ", i,
-                                 tcon->treeName, atomic_read(&tcon->useCount));
-                if (tcon->nativeFileSystem) {
-                        seq_printf(m, "Type: %s ",
-                                         tcon->nativeFileSystem);
-                }
-                seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
-                                 "\nPathComponentMax: %d Status: %d",
-                            le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
-                            le32_to_cpu(tcon->fsAttrInfo.Attributes),
-                            le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
-                            tcon->tidStatus);
-                if (dev_type == FILE_DEVICE_DISK)
-                        seq_puts(m, " type: DISK ");
-                else if (dev_type == FILE_DEVICE_CD_ROM)
-                        seq_puts(m, " type: CDROM ");
-                else
-                        seq_printf(m, " type: %d ", dev_type);
-                if (tcon->tidStatus == CifsNeedReconnect)
-                        seq_puts(m, "\tDISCONNECTED ");
        }
-        read_unlock(&GlobalSMBSeslock);
+        read_unlock(&cifs_tcp_ses_lock);
        seq_putc(m, '\n');
        /* BB add code to dump additional info such as TCP session info now */
@@ -234,7 +231,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 {
        char c;
        int rc;
-        struct list_head *tmp;
+        struct list_head *tmp1, *tmp2, *tmp3;
+        struct TCP_Server_Info *server;
+        struct cifsSesInfo *ses;
        struct cifsTconInfo *tcon;
        rc = get_user(c, buffer);
@@ -242,33 +241,42 @@ static ssize_t cifs_stats_proc_write(struct file *file,
                return rc;
        if (c == '1' || c == 'y' || c == 'Y' || c == '0') {
-                read_lock(&GlobalSMBSeslock);
 #ifdef CONFIG_CIFS_STATS2
                atomic_set(&totBufAllocCount, 0);
                atomic_set(&totSmBufAllocCount, 0);
 #endif /* CONFIG_CIFS_STATS2 */
-                list_for_each(tmp, &GlobalTreeConnectionList) {
+                read_lock(&cifs_tcp_ses_lock);
-                        tcon = list_entry(tmp, struct cifsTconInfo,
+                list_for_each(tmp1, &cifs_tcp_ses_list) {
-                                        cifsConnectionList);
+                        server = list_entry(tmp1, struct TCP_Server_Info,
-                        atomic_set(&tcon->num_smbs_sent, 0);
+                                            tcp_ses_list);
-                        atomic_set(&tcon->num_writes, 0);
+                        list_for_each(tmp2, &server->smb_ses_list) {
-                        atomic_set(&tcon->num_reads, 0);
+                                ses = list_entry(tmp2, struct cifsSesInfo,
-                        atomic_set(&tcon->num_oplock_brks, 0);
+                                                 smb_ses_list);
-                        atomic_set(&tcon->num_opens, 0);
+                                list_for_each(tmp3, &ses->tcon_list) {
-                        atomic_set(&tcon->num_closes, 0);
+                                        tcon = list_entry(tmp3,
-                        atomic_set(&tcon->num_deletes, 0);
+                                                          struct cifsTconInfo,
-                        atomic_set(&tcon->num_mkdirs, 0);
+                                                          tcon_list);
-                        atomic_set(&tcon->num_rmdirs, 0);
+                                        atomic_set(&tcon->num_smbs_sent, 0);
-                        atomic_set(&tcon->num_renames, 0);
+                                        atomic_set(&tcon->num_writes, 0);
-                        atomic_set(&tcon->num_t2renames, 0);
+                                        atomic_set(&tcon->num_reads, 0);
-                        atomic_set(&tcon->num_ffirst, 0);
+                                        atomic_set(&tcon->num_oplock_brks, 0);
-                        atomic_set(&tcon->num_fnext, 0);
+                                        atomic_set(&tcon->num_opens, 0);
-                        atomic_set(&tcon->num_fclose, 0);
+                                        atomic_set(&tcon->num_closes, 0);
-                        atomic_set(&tcon->num_hardlinks, 0);
+                                        atomic_set(&tcon->num_deletes, 0);
-                        atomic_set(&tcon->num_symlinks, 0);
+                                        atomic_set(&tcon->num_mkdirs, 0);
-                        atomic_set(&tcon->num_locks, 0);
+                                        atomic_set(&tcon->num_rmdirs, 0);
+                                        atomic_set(&tcon->num_renames, 0);
+                                        atomic_set(&tcon->num_t2renames, 0);
+                                        atomic_set(&tcon->num_ffirst, 0);
+                                        atomic_set(&tcon->num_fnext, 0);
+                                        atomic_set(&tcon->num_fclose, 0);
+                                        atomic_set(&tcon->num_hardlinks, 0);
+                                        atomic_set(&tcon->num_symlinks, 0);
+                                        atomic_set(&tcon->num_locks, 0);
+                                }
+                        }
                }
-                read_unlock(&GlobalSMBSeslock);
+                read_unlock(&cifs_tcp_ses_lock);
        }
        return count;
@@ -277,7 +285,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 static int cifs_stats_proc_show(struct seq_file *m, void *v)
 {
        int i;
-        struct list_head *tmp;
+        struct list_head *tmp1, *tmp2, *tmp3;
+        struct TCP_Server_Info *server;
+        struct cifsSesInfo *ses;
        struct cifsTconInfo *tcon;
        seq_printf(m,
@@ -306,44 +316,55 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
                GlobalCurrentXid, GlobalMaxActiveXid);
        i = 0;
-        read_lock(&GlobalSMBSeslock);
+        read_lock(&cifs_tcp_ses_lock);
-        list_for_each(tmp, &GlobalTreeConnectionList) {
+        list_for_each(tmp1, &cifs_tcp_ses_list) {
-                i++;
+                server = list_entry(tmp1, struct TCP_Server_Info,
-                tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
+                                    tcp_ses_list);
-                seq_printf(m, "\n%d) %s", i, tcon->treeName);
+                list_for_each(tmp2, &server->smb_ses_list) {
-                if (tcon->tidStatus == CifsNeedReconnect)
+                        ses = list_entry(tmp2, struct cifsSesInfo,
-                        seq_puts(m, "\tDISCONNECTED ");
+                                         smb_ses_list);
-                seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
+                        list_for_each(tmp3, &ses->tcon_list) {
-                        atomic_read(&tcon->num_smbs_sent),
+                                tcon = list_entry(tmp3,
-                        atomic_read(&tcon->num_oplock_brks));
+                                                  struct cifsTconInfo,
-                seq_printf(m, "\nReads:  %d Bytes: %lld",
+                                                  tcon_list);
-                        atomic_read(&tcon->num_reads),
+                                i++;
-                        (long long)(tcon->bytes_read));
+                                seq_printf(m, "\n%d) %s", i, tcon->treeName);
-                seq_printf(m, "\nWrites: %d Bytes: %lld",
+                                if (tcon->need_reconnect)
-                        atomic_read(&tcon->num_writes),
+                                        seq_puts(m, "\tDISCONNECTED ");
-                        (long long)(tcon->bytes_written));
+                                seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
-                seq_printf(m,
+                                        atomic_read(&tcon->num_smbs_sent),
-                        "\nLocks: %d HardLinks: %d Symlinks: %d",
+                                        atomic_read(&tcon->num_oplock_brks));
-                        atomic_read(&tcon->num_locks),
+                                seq_printf(m, "\nReads:  %d Bytes: %lld",
-                        atomic_read(&tcon->num_hardlinks),
+                                        atomic_read(&tcon->num_reads),
-                        atomic_read(&tcon->num_symlinks));
+                                        (long long)(tcon->bytes_read));
+                                seq_printf(m, "\nWrites: %d Bytes: %lld",
-                seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
+                                        atomic_read(&tcon->num_writes),
-                        atomic_read(&tcon->num_opens),
+                                        (long long)(tcon->bytes_written));
-                        atomic_read(&tcon->num_closes),
+                                seq_printf(m, "\nLocks: %d HardLinks: %d "
-                        atomic_read(&tcon->num_deletes));
+                                              "Symlinks: %d",
-                seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
+                                        atomic_read(&tcon->num_locks),
-                        atomic_read(&tcon->num_mkdirs),
+                                        atomic_read(&tcon->num_hardlinks),
-                        atomic_read(&tcon->num_rmdirs));
+                                        atomic_read(&tcon->num_symlinks));
-                seq_printf(m, "\nRenames: %d T2 Renames %d",
+                                seq_printf(m, "\nOpens: %d Closes: %d"
-                        atomic_read(&tcon->num_renames),
+                                              "Deletes: %d",
-                        atomic_read(&tcon->num_t2renames));
+                                        atomic_read(&tcon->num_opens),
-                seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
+                                        atomic_read(&tcon->num_closes),
-                        atomic_read(&tcon->num_ffirst),
+                                        atomic_read(&tcon->num_deletes));
-                        atomic_read(&tcon->num_fnext),
+                                seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
-                        atomic_read(&tcon->num_fclose));
+                                        atomic_read(&tcon->num_mkdirs),
+                                        atomic_read(&tcon->num_rmdirs));
+                                seq_printf(m, "\nRenames: %d T2 Renames %d",
+                                        atomic_read(&tcon->num_renames),
+                                        atomic_read(&tcon->num_t2renames));
+                                seq_printf(m, "\nFindFirst: %d FNext %d "
+                                              "FClose %d",
+                                        atomic_read(&tcon->num_ffirst),
+                                        atomic_read(&tcon->num_fnext),
+                                        atomic_read(&tcon->num_fclose));
+                        }
+                }
        }
-        read_unlock(&GlobalSMBSeslock);
+        read_unlock(&cifs_tcp_ses_lock);
        seq_putc(m, '\n');
        return 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d2c8eef84f3c..e1c18362ba46 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -106,7 +106,8 @@ static char *cifs_get_share_name(const char *node_name)
 /**
 * compose_mount_options        -       creates mount options for refferral
 * @sb_mountdata:       parent/root DFS mount options (template)
- * @ref_unc:            refferral server UNC
+ * @dentry:             point where we are going to mount
+ * @ref:                server's referral
 * @devname:            pointer for saving device name
 *
 * creates mount options for submount based on template options sb_mountdata
@@ -116,7 +117,8 @@ static char *cifs_get_share_name(const char *node_name)
 * Caller is responcible for freeing retunrned value if it is not error.
 */
 static char *compose_mount_options(const char *sb_mountdata,
-                                   const char *ref_unc,
+                                   struct dentry *dentry,
+                                   const struct dfs_info3_param *ref,
                                   char **devname)
 {
        int rc;
@@ -126,11 +128,12 @@ static char *compose_mount_options(const char *sb_mountdata,
        char *srvIP = NULL;
        char sep = ',';
        int off, noff;
+        char *fullpath;
        if (sb_mountdata == NULL)
                return ERR_PTR(-EINVAL);
-        *devname = cifs_get_share_name(ref_unc);
+        *devname = cifs_get_share_name(ref->node_name);
        rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
        if (rc != 0) {
                cERROR(1, ("%s: Failed to resolve server part of %s to IP",
@@ -138,7 +141,12 @@ static char *compose_mount_options(const char *sb_mountdata,
                mountdata = ERR_PTR(rc);
                goto compose_mount_options_out;
        }
-        md_len = strlen(sb_mountdata) + strlen(srvIP) + strlen(ref_unc) + 3;
+        /* md_len = strlen(...) + 12: the extra 12 bytes cover
+         * sep + "prefixpath="; the "ip=" and "unc=" keys added below
+         * are assumed to be present in the original sb_mountdata
+         */
+        md_len = strlen(sb_mountdata) + strlen(srvIP) +
+                strlen(ref->node_name) + 12;
        mountdata = kzalloc(md_len+1, GFP_KERNEL);
        if (mountdata == NULL) {
                mountdata = ERR_PTR(-ENOMEM);
@@ -152,41 +160,56 @@ static char *compose_mount_options(const char *sb_mountdata,
                        strncpy(mountdata, sb_mountdata, 5);
                        off += 5;
        }
-        while ((tkn_e = strchr(sb_mountdata+off, sep))) {
-                noff = (tkn_e - (sb_mountdata+off)) + 1;
+        do {
-                if (strnicmp(sb_mountdata+off, "unc=", 4) == 0) {
+                tkn_e = strchr(sb_mountdata + off, sep);
+                if (tkn_e == NULL)
+                        noff = strlen(sb_mountdata + off);
+                else
+                        noff = tkn_e - (sb_mountdata + off) + 1;
+                if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) {
                        off += noff;
                        continue;
                }
-                if (strnicmp(sb_mountdata+off, "ip=", 3) == 0) {
+                if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) {
                        off += noff;
                        continue;
                }
-                if (strnicmp(sb_mountdata+off, "prefixpath=", 3) == 0) {
+                if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
                        off += noff;
                        continue;
                }
-                strncat(mountdata, sb_mountdata+off, noff);
+                strncat(mountdata, sb_mountdata + off, noff);
                off += noff;
-        }
+        } while (tkn_e);
-        strcat(mountdata, sb_mountdata+off);
+        strcat(mountdata, sb_mountdata + off);
        mountdata[md_len] = '\0';
        /* copy new IP and ref share name */
-        strcat(mountdata, ",ip=");
+        if (mountdata[strlen(mountdata) - 1] != sep)
+                strncat(mountdata, &sep, 1);
+        strcat(mountdata, "ip=");
        strcat(mountdata, srvIP);
-        strcat(mountdata, ",unc=");
+        strncat(mountdata, &sep, 1);
+        strcat(mountdata, "unc=");
        strcat(mountdata, *devname);
        /* find & copy prefixpath */
-        tkn_e = strchr(ref_unc+2, '\\');
+        tkn_e = strchr(ref->node_name + 2, '\\');
-        if (tkn_e) {
+        if (tkn_e == NULL) /* invalid UNC, missing share name */
-                tkn_e = strchr(tkn_e+1, '\\');
+                goto compose_mount_options_out;
-                if (tkn_e) {
-                        strcat(mountdata, ",prefixpath=");
+        fullpath = build_path_from_dentry(dentry);
-                        strcat(mountdata, tkn_e+1);
+        tkn_e = strchr(tkn_e + 1, '\\');
-                }
+        if (tkn_e || strlen(fullpath) - (ref->path_consumed)) {
+                strncat(mountdata, &sep, 1);
+                strcat(mountdata, "prefixpath=");
+                if (tkn_e)
+                        strcat(mountdata, tkn_e + 1);
+                strcat(mountdata, fullpath + (ref->path_consumed));
        }
+        kfree(fullpath);
        /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
        /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -198,7 +221,7 @@ compose_mount_options_out:
 static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
-                struct dentry *dentry, char *ref_unc)
+                struct dentry *dentry, const struct dfs_info3_param *ref)
 {
        struct cifs_sb_info *cifs_sb;
        struct vfsmount *mnt;
@@ -207,7 +230,7 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
        mountdata = compose_mount_options(cifs_sb->mountdata,
-                                                ref_unc, &devname);
+                                                dentry, ref, &devname);
        if (IS_ERR(mountdata))
                return (struct vfsmount *)mountdata;
@@ -310,7 +333,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
                        }
                        mnt = cifs_dfs_do_refmount(nd->path.mnt,
                                                nd->path.dentry,
-                                                referrals[i].node_name);
+                                                referrals + i);
                        cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
                                         __func__,
                                        referrals[i].node_name, mnt));
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index fcee9298b620..0ab2fb5afef1 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -73,8 +73,8 @@ struct key_type cifs_spnego_key_type = {
 * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN        13
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
-#define MAX_IPV6_ADDR_LEN       42
+#define MAX_IPV6_ADDR_LEN       43
 /* strlen of "host=" */
 #define HOST_KEY_LEN            5
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ac5915d61dca..d9cf467309e8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -514,10 +514,11 @@ static void cifs_umount_begin(struct super_block *sb)
        tcon = cifs_sb->tcon;
        if (tcon == NULL)
                return;
-        down(&tcon->tconSem);
-        if (atomic_read(&tcon->useCount) == 1)
+        read_lock(&cifs_tcp_ses_lock);
+        if (tcon->tc_count == 1)
                tcon->tidStatus = CifsExiting;
-        up(&tcon->tconSem);
+        read_unlock(&cifs_tcp_ses_lock);
        /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
        /* cancel_notify_requests(tcon); */
@@ -1013,7 +1014,7 @@ static int cifs_oplock_thread(void *dummyarg)
                                not bother sending an oplock release if session
                                to server still is disconnected since oplock
                                already released by the server in that case */
-                        if (pTcon->tidStatus != CifsNeedReconnect) {
+                        if (!pTcon->need_reconnect) {
                                rc = CIFSSMBLock(0, pTcon, netfid,
                                                0 /* len */ , 0 /* offset */, 0,
                                                0, LOCKING_ANDX_OPLOCK_RELEASE,
@@ -1031,24 +1032,24 @@ static int cifs_oplock_thread(void *dummyarg)
 static int cifs_dnotify_thread(void *dummyarg)
 {
        struct list_head *tmp;
-        struct cifsSesInfo *ses;
+        struct TCP_Server_Info *server;
        do {
                if (try_to_freeze())
                        continue;
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(15*HZ);
-                read_lock(&GlobalSMBSeslock);
                /* check if any stuck requests that need
                   to be woken up and wakeq so the
                   thread can wake up and error out */
-                list_for_each(tmp, &GlobalSMBSessionList) {
+                read_lock(&cifs_tcp_ses_lock);
-                        ses = list_entry(tmp, struct cifsSesInfo,
+                list_for_each(tmp, &cifs_tcp_ses_list) {
-                                cifsSessionList);
+                        server = list_entry(tmp, struct TCP_Server_Info,
-                        if (ses->server && atomic_read(&ses->server->inFlight))
+                                         tcp_ses_list);
-                                wake_up_all(&ses->server->response_q);
+                        if (atomic_read(&server->inFlight))
+                                wake_up_all(&server->response_q);
                }
-                read_unlock(&GlobalSMBSeslock);
+                read_unlock(&cifs_tcp_ses_lock);
        } while (!kthread_should_stop());
        return 0;
@@ -1059,9 +1060,7 @@ init_cifs(void)
 {
        int rc = 0;
        cifs_proc_init();
-/*      INIT_LIST_HEAD(&GlobalServerList);*/    /* BB not implemented yet */
+        INIT_LIST_HEAD(&cifs_tcp_ses_list);
-        INIT_LIST_HEAD(&GlobalSMBSessionList);
-        INIT_LIST_HEAD(&GlobalTreeConnectionList);
        INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
        INIT_LIST_HEAD(&GlobalDnotifyReqList);
@@ -1089,6 +1088,7 @@ init_cifs(void)
        GlobalMaxActiveXid = 0;
        memset(Local_System_Name, 0, 15);
        rwlock_init(&GlobalSMBSeslock);
+        rwlock_init(&cifs_tcp_ses_lock);
        spin_lock_init(&GlobalMid_Lock);
        if (cifs_max_pending < 2) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c791e5b5a914..f1ae1f57c30d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -85,8 +85,7 @@ enum securityEnum {
 };
 enum protocolEnum {
-        IPV4 = 0,
+        TCP = 0,
-        IPV6,
        SCTP
        /* Netbios frames protocol not supported at this time */
 };
@@ -122,6 +121,9 @@ struct cifs_cred {
 */
 struct TCP_Server_Info {
+        struct list_head tcp_ses_list;
+        struct list_head smb_ses_list;
+        int srv_count; /* reference counter */
        /* 15 character server name + 0x20 16th byte indicating type = srv */
        char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
        char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
@@ -141,7 +143,8 @@ struct TCP_Server_Info {
        char versionMajor;
        char versionMinor;
        bool svlocal:1;                 /* local server or remote */
-        atomic_t socketUseCount; /* number of open cifs sessions on socket */
+        bool noblocksnd;                /* use blocking sendmsg */
+        bool noautotune;                /* do not autotune send buf sizes */
        atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
        atomic_t inSend; /* requests trying to send */
@@ -192,13 +195,14 @@ struct cifsUidInfo {
 * Session structure.  One of these for each uid session with a particular host
 */
 struct cifsSesInfo {
-        struct list_head cifsSessionList;
+        struct list_head smb_ses_list;
+        struct list_head tcon_list;
        struct semaphore sesSem;
 #if 0
        struct cifsUidInfo *uidInfo;    /* pointer to user info */
 #endif
        struct TCP_Server_Info *server; /* pointer to server info */
-        atomic_t inUse; /* # of mounts (tree connections) on this ses */
+        int ses_count;          /* reference counter */
        enum statusEnum status;
        unsigned overrideSecFlg;  /* if non-zero override global sec flags */
        __u16 ipc_tid;          /* special tid for connection to IPC share */
@@ -214,6 +218,7 @@ struct cifsSesInfo {
        char userName[MAX_USERNAME_SIZE + 1];
        char *domainName;
        char *password;
+        bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
 #define CIFS_SES_NT4 1
@@ -228,16 +233,15 @@ struct cifsSesInfo {
 * session
 */
 struct cifsTconInfo {
-        struct list_head cifsConnectionList;
+        struct list_head tcon_list;
+        int tc_count;
        struct list_head openFileList;
-        struct semaphore tconSem;
        struct cifsSesInfo *ses;        /* pointer to session associated with */
        char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
        char *nativeFileSystem;
        __u16 tid;              /* The 2 byte tree id */
        __u16 Flags;            /* optional support bits */
        enum statusEnum tidStatus;
-        atomic_t useCount;      /* how many explicit/implicit mounts to share */
 #ifdef CONFIG_CIFS_STATS
        atomic_t num_smbs_sent;
        atomic_t num_writes;
@@ -286,6 +290,7 @@ struct cifsTconInfo {
        bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
                                for this mount even if server would support */
        bool local_lease:1; /* check leases (only) on local system not remote */
+        bool need_reconnect:1; /* connection reset, tid now invalid */
        /* BB add field for back pointer to sb struct(s)? */
 };
@@ -586,21 +591,21 @@ require use of the stronger protocol */
 #endif
 /*
- * The list of servers that did not respond with NT LM 0.12.
+ * the list of TCP_Server_Info structures, ie each of the sockets
- * This list helps improve performance and eliminate the messages indicating
+ * connecting our client to a distinct server (ip address), is
- * that we had a communications error talking to the server in this list.
+ * chained together by cifs_tcp_ses_list. The list of all our SMB
+ * sessions (and from that the tree connections) can be found
+ * by iterating over cifs_tcp_ses_list
 */
-/* Feature not supported */
+GLOBAL_EXTERN struct list_head          cifs_tcp_ses_list;
-/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
 /*
- * The following is a hash table of all the users we know about.
+ * This lock protects the cifs_tcp_ses_list, the list of smb sessions per
+ * tcp session, and the list of tcon's per smb session. It also protects
+ * the reference counters for the server, smb session, and tcon. Finally,
+ * changes to the tcon->tidStatus should be done while holding this lock.
 */
-GLOBAL_EXTERN struct smbUidInfo *GlobalUidList[UID_HASH];
+GLOBAL_EXTERN rwlock_t          cifs_tcp_ses_lock;
-/* GLOBAL_EXTERN struct list_head GlobalServerList; BB not implemented yet */
-GLOBAL_EXTERN struct list_head GlobalSMBSessionList;
-GLOBAL_EXTERN struct list_head GlobalTreeConnectionList;
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0cff7fe986e8..6f21ecb85ce5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -36,7 +36,7 @@ extern void cifs_buf_release(void *);
 extern struct smb_hdr *cifs_small_buf_get(void);
 extern void cifs_small_buf_release(void *);
 extern int smb_send(struct socket *, struct smb_hdr *,
-                        unsigned int /* length */ , struct sockaddr *);
+                        unsigned int /* length */ , struct sockaddr *, bool);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid));
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 843a85fb8b9a..bdda46dd435a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -190,10 +190,10 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
                /* need to prevent multiple threads trying to
                simultaneously reconnect the same SMB session */
                        down(&tcon->ses->sesSem);
-                        if (tcon->ses->status == CifsNeedReconnect)
+                        if (tcon->ses->need_reconnect)
                                rc = cifs_setup_session(0, tcon->ses,
                                                        nls_codepage);
-                        if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+                        if (!rc && (tcon->need_reconnect)) {
                                mark_open_files_invalid(tcon);
                                rc = CIFSTCon(0, tcon->ses, tcon->treeName,
                                              tcon, nls_codepage);
@@ -295,7 +295,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
           check for tcp and smb session status done differently
           for those three - in the calling routine */
        if (tcon) {
-                if (tcon->tidStatus == CifsExiting) {
+                if (tcon->need_reconnect) {
                        /* only tree disconnect, open, and write,
                          (and ulogoff which does not have tcon)
                          are allowed as we start force umount */
@@ -337,10 +337,10 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
                /* need to prevent multiple threads trying to
                simultaneously reconnect the same SMB session */
                        down(&tcon->ses->sesSem);
-                        if (tcon->ses->status == CifsNeedReconnect)
+                        if (tcon->ses->need_reconnect)
                                rc = cifs_setup_session(0, tcon->ses,
                                                        nls_codepage);
-                        if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+                        if (!rc && (tcon->need_reconnect)) {
                                mark_open_files_invalid(tcon);
                                rc = CIFSTCon(0, tcon->ses, tcon->treeName,
                                              tcon, nls_codepage);
@@ -664,8 +664,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        rc = -EIO;
                        goto neg_err_exit;
                }
+                read_lock(&cifs_tcp_ses_lock);
-                if (server->socketUseCount.counter > 1) {
+                if (server->srv_count > 1) {
+                        read_unlock(&cifs_tcp_ses_lock);
                        if (memcmp(server->server_GUID,
                                   pSMBr->u.extended_response.
                                   GUID, 16) != 0) {
@@ -674,9 +675,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                                        pSMBr->u.extended_response.GUID,
                                        16);
                        }
-                } else
+                } else {
+                        read_unlock(&cifs_tcp_ses_lock);
                        memcpy(server->server_GUID,
                               pSMBr->u.extended_response.GUID, 16);
+                }
                if (count == 16) {
                        server->secType = RawNTLMSSP;
@@ -739,50 +742,31 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
        int rc = 0;
        cFYI(1, ("In tree disconnect"));
-        /*
-         *  If last user of the connection and
-         *  connection alive - disconnect it
-         *  If this is the last connection on the server session disconnect it
-         *  (and inside session disconnect we should check if tcp socket needs
-         *  to be freed and kernel thread woken up).
-         */
-        if (tcon)
-                down(&tcon->tconSem);
-        else
-                return -EIO;
-        atomic_dec(&tcon->useCount);
+        /* BB: do we need to check this? These should never be NULL. */
-        if (atomic_read(&tcon->useCount) > 0) {
+        if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
-                up(&tcon->tconSem);
+                return -EIO;
-                return -EBUSY;
-        }
-        /* No need to return error on this operation if tid invalidated and
+        /*
-        closed on server already e.g. due to tcp session crashing */
+         * No need to return error on this operation if tid invalidated and
-        if (tcon->tidStatus == CifsNeedReconnect) {
+         * closed on server already e.g. due to tcp session crashing. Also,
-                up(&tcon->tconSem);
+         * the tcon is no longer on the list, so no need to take lock before
+         * checking this.
+         */
+        if (tcon->need_reconnect)
                return 0;
-        }
-        if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) {
-                up(&tcon->tconSem);
-                return -EIO;
-        }
        rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
                            (void **)&smb_buffer);
-        if (rc) {
+        if (rc)
-                up(&tcon->tconSem);
                return rc;
-        }
        rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
        if (rc)
                cFYI(1, ("Tree disconnect failed %d", rc));
-        up(&tcon->tconSem);
        /* No need to return error on this operation if tid invalidated and
-        closed on server already e.g. due to tcp session crashing */
+           closed on server already e.g. due to tcp session crashing */
        if (rc == -EAGAIN)
                rc = 0;
@@ -796,43 +780,36 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
        int rc = 0;
        cFYI(1, ("In SMBLogoff for session disconnect"));
-        if (ses)
-                down(&ses->sesSem);
+        /*
-        else
+         * BB: do we need to check validity of ses and server? They should
+         * always be valid since we have an active reference. If not, that
+         * should probably be a BUG()
+         */
+        if (!ses || !ses->server)
                return -EIO;
-        atomic_dec(&ses->inUse);
+        down(&ses->sesSem);
-        if (atomic_read(&ses->inUse) > 0) {
+        if (ses->need_reconnect)
-                up(&ses->sesSem);
+                goto session_already_dead; /* no need to send SMBlogoff if uid
-                return -EBUSY;
+                                              already closed due to reconnect */
-        }
        rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
        if (rc) {
                up(&ses->sesSem);
                return rc;
        }
-        if (ses->server) {
+        pSMB->hdr.Mid = GetNextMid(ses->server);
-                pSMB->hdr.Mid = GetNextMid(ses->server);
-                if (ses->server->secMode &
+        if (ses->server->secMode &
                   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
                        pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-        }
        pSMB->hdr.Uid = ses->Suid;
        pSMB->AndXCommand = 0xFF;
        rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
-        if (ses->server) {
+session_already_dead:
-                atomic_dec(&ses->server->socketUseCount);
-                if (atomic_read(&ses->server->socketUseCount) == 0) {
-                        spin_lock(&GlobalMid_Lock);
-                        ses->server->tcpStatus = CifsExiting;
-                        spin_unlock(&GlobalMid_Lock);
-                        rc = -ESHUTDOWN;
-                }
-        }
        up(&ses->sesSem);
        /* if session dead then we do not need to do ulogoff,
@@ -1536,7 +1513,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
        __u32 bytes_sent;
        __u16 byte_count;
-        /* cFYI(1,("write at %lld %d bytes",offset,count));*/
+        /* cFYI(1, ("write at %lld %d bytes", offset, count));*/
        if (tcon->ses == NULL)
                return -ECONNABORTED;
@@ -3922,6 +3899,27 @@ GetInodeNumOut:
        return rc;
 }
+/* computes length of a UCS string once converted to the host codepage
+ * @src:        UCS string (little-endian 16-bit units); scanning stops
+ *              at the first zero unit or after @maxlen units
+ * @maxlen:     length of the input string in UCS characters (not bytes)
+ * @nls_codepage: table whose uni2char() performs the conversion
+ * return:      bytes needed in host codepage; a char with result <= 0 counts 1
+ */
+static int hostlen_fromUCS(const __le16 *src, const int maxlen,
+                const struct nls_table *nls_codepage) {
+        int i, charlen;
+        int hostlen = 0;
+        /* scratch must be as large as the bound handed to uni2char() */
+        char to[NLS_MAX_CHARSET_SIZE];
+        for (i = 0; (i < maxlen) && src[i]; ++i) {
+                charlen = nls_codepage->uni2char(le16_to_cpu(src[i]),
+                                to, sizeof(to));
+                hostlen += charlen > 0 ? charlen : 1;
+        }
+        return hostlen;
+}
 /* parses DFS refferal V3 structure
 * caller is responsible for freeing target_nodes
 * returns:
@@ -3932,7 +3930,8 @@ static int
 parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
                unsigned int *num_of_nodes,
                struct dfs_info3_param **target_nodes,
-                const struct nls_table *nls_codepage)
+                const struct nls_table *nls_codepage, int remap,
+                const char *searchName)
 {
        int i, rc = 0;
        char *data_end;
@@ -3983,7 +3982,17 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
                struct dfs_info3_param *node = (*target_nodes)+i;
                node->flags = le16_to_cpu(pSMBr->DFSFlags);
-                node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+                if (is_unicode) {
+                        __le16 *tmp = kmalloc(strlen(searchName)*2, GFP_KERNEL);
+                        cifsConvertToUCS((__le16 *) tmp, searchName,
+                                        PATH_MAX, nls_codepage, remap);
+                        node->path_consumed = hostlen_fromUCS(tmp,
+                                        le16_to_cpu(pSMBr->PathConsumed)/2,
+                                        nls_codepage);
+                        kfree(tmp);
+                } else
+                        node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
                node->server_type = le16_to_cpu(ref->ServerType);
                node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags);
@@ -4116,7 +4125,8 @@ getDFSRetry:
        /* parse returned result into more usable form */
        rc = parse_DFS_referrals(pSMBr, num_of_nodes,
-                                 target_nodes, nls_codepage);
+                                 target_nodes, nls_codepage, remap,
+                                 searchName);
 GetDFSRefExit:
        cifs_buf_release(pSMB);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 71b7661e2260..c7d341714586 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -92,6 +92,8 @@ struct smb_vol {
        bool seal:1;       /* request transport encryption on share */
        bool nodfs:1;      /* Do not request DFS, even if available */
        bool local_lease:1; /* check leases only on local system, not remote */
+        bool noblocksnd:1;
+        bool noautotune:1;
        unsigned int rsize;
        unsigned int wsize;
        unsigned int sockopt;
@@ -102,9 +104,11 @@ struct smb_vol {
 static int ipv4_connect(struct sockaddr_in *psin_server,
                        struct socket **csocket,
                        char *netb_name,
-                        char *server_netb_name);
+                        char *server_netb_name,
+                        bool noblocksnd,
+                        bool nosndbuf); /* ipv6 never set sndbuf size */
 static int ipv6_connect(struct sockaddr_in6 *psin_server,
-                        struct socket **csocket);
+                        struct socket **csocket, bool noblocksnd);
        /*
@@ -120,7 +124,7 @@ static int
 cifs_reconnect(struct TCP_Server_Info *server)
 {
        int rc = 0;
-        struct list_head *tmp;
+        struct list_head *tmp, *tmp2;
        struct cifsSesInfo *ses;
        struct cifsTconInfo *tcon;
        struct mid_q_entry *mid_entry;
@@ -140,23 +144,17 @@ cifs_reconnect(struct TCP_Server_Info *server)
        /* before reconnecting the tcp session, mark the smb session (uid)
                and the tid bad so they are not used until reconnected */
-        read_lock(&GlobalSMBSeslock);
+        read_lock(&cifs_tcp_ses_lock);
-        list_for_each(tmp, &GlobalSMBSessionList) {
+        list_for_each(tmp, &server->smb_ses_list) {
-                ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
+                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
-                if (ses->server) {
+                ses->need_reconnect = true;
-                        if (ses->server == server) {
+                ses->ipc_tid = 0;
-                                ses->status = CifsNeedReconnect;
+                list_for_each(tmp2, &ses->tcon_list) {
-                                ses->ipc_tid = 0;
+                        tcon = list_entry(tmp2, struct cifsTconInfo, tcon_list);
-                        }
+                        tcon->need_reconnect = true;
                }
-                /* else tcp and smb sessions need reconnection */
-        }
-        list_for_each(tmp, &GlobalTreeConnectionList) {
-                tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-                if ((tcon->ses) && (tcon->ses->server == server))
-                        tcon->tidStatus = CifsNeedReconnect;
        }
-        read_unlock(&GlobalSMBSeslock);
+        read_unlock(&cifs_tcp_ses_lock);
        /* do not want to be sending data on a socket we are freeing */
        down(&server->tcpSem);
        if (server->ssocket) {
@@ -189,14 +187,15 @@ cifs_reconnect(struct TCP_Server_Info *server)
        while ((server->tcpStatus != CifsExiting) &&
               (server->tcpStatus != CifsGood)) {
                try_to_freeze();
-                if (server->protocolType == IPV6) {
+                if (server->addr.sockAddr6.sin6_family == AF_INET6) {
                        rc = ipv6_connect(&server->addr.sockAddr6,
-                                          &server->ssocket);
+                                          &server->ssocket, server->noblocksnd);
                } else {
                        rc = ipv4_connect(&server->addr.sockAddr,
                                        &server->ssocket,
                                        server->workstation_RFC1001_name,
-                                        server->server_RFC1001_name);
+                                        server->server_RFC1001_name,
+                                        server->noblocksnd, server->noautotune);
                }
                if (rc) {
                        cFYI(1, ("reconnect error %d", rc));
@@ -412,9 +411,14 @@ incomplete_rcv:
                        msleep(1); /* minimum sleep to prevent looping
                                allowing socket to clear and app threads to set
                                tcpStatus CifsNeedReconnect if server hung */
-                        if (pdu_length < 4)
+                        if (pdu_length < 4) {
+                                iov.iov_base = (4 - pdu_length) +
+                                                        (char *)smb_buffer;
+                                iov.iov_len = pdu_length;
+                                smb_msg.msg_control = NULL;
+                                smb_msg.msg_controllen = 0;
                                goto incomplete_rcv;
-                        else
+                        } else
                                continue;
                } else if (length <= 0) {
                        if (server->tcpStatus == CifsNew) {
@@ -649,6 +653,11 @@ multi_t2_fnd:
                }
        } /* end while !EXITING */
+        /* take it off the list, if it's not already */
+        write_lock(&cifs_tcp_ses_lock);
+        list_del_init(&server->tcp_ses_list);
+        write_unlock(&cifs_tcp_ses_lock);
        spin_lock(&GlobalMid_Lock);
        server->tcpStatus = CifsExiting;
        spin_unlock(&GlobalMid_Lock);
@@ -681,29 +690,29 @@ multi_t2_fnd:
        if (smallbuf) /* no sense logging a debug message if NULL */
                cifs_small_buf_release(smallbuf);
-        read_lock(&GlobalSMBSeslock);
+        /*
+         * BB: we shouldn't have to do any of this. It shouldn't be
+         * possible to exit from the thread with active SMB sessions
+         */
+        read_lock(&cifs_tcp_ses_lock);
        if (list_empty(&server->pending_mid_q)) {
                /* loop through server session structures attached to this and
                    mark them dead */
-                list_for_each(tmp, &GlobalSMBSessionList) {
+                list_for_each(tmp, &server->smb_ses_list) {
-                        ses =
+                        ses = list_entry(tmp, struct cifsSesInfo,
-                            list_entry(tmp, struct cifsSesInfo,
+                                         smb_ses_list);
-                                       cifsSessionList);
+                        ses->status = CifsExiting;
-                        if (ses->server == server) {
+                        ses->server = NULL;
-                                ses->status = CifsExiting;
-                                ses->server = NULL;
-                        }
                }
-                read_unlock(&GlobalSMBSeslock);
+                read_unlock(&cifs_tcp_ses_lock);
        } else {
                /* although we can not zero the server struct pointer yet,
                since there are active requests which may depnd on them,
                mark the corresponding SMB sessions as exiting too */
-                list_for_each(tmp, &GlobalSMBSessionList) {
+                list_for_each(tmp, &server->smb_ses_list) {
                        ses = list_entry(tmp, struct cifsSesInfo,
-                                         cifsSessionList);
+                                         smb_ses_list);
-                        if (ses->server == server)
+                        ses->status = CifsExiting;
-                                ses->status = CifsExiting;
                }
                spin_lock(&GlobalMid_Lock);
@@ -718,7 +727,7 @@ multi_t2_fnd:
                        }
                }
                spin_unlock(&GlobalMid_Lock);
-                read_unlock(&GlobalSMBSeslock);
+                read_unlock(&cifs_tcp_ses_lock);
                /* 1/8th of sec is more than enough time for them to exit */
                msleep(125);
        }
@@ -740,14 +749,13 @@ multi_t2_fnd:
        if there are any pointing to this (e.g
        if a crazy root user tried to kill cifsd
        kernel thread explicitly this might happen) */
-        write_lock(&GlobalSMBSeslock);
+        /* BB: This shouldn't be necessary, see above */
-        list_for_each(tmp, &GlobalSMBSessionList) {
+        read_lock(&cifs_tcp_ses_lock);
-                ses = list_entry(tmp, struct cifsSesInfo,
+        list_for_each(tmp, &server->smb_ses_list) {
-                                cifsSessionList);
+                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
-                if (ses->server == server)
+                ses->server = NULL;
-                        ses->server = NULL;
        }
-        write_unlock(&GlobalSMBSeslock);
+        read_unlock(&cifs_tcp_ses_lock);
        kfree(server->hostname);
        task_to_wake = xchg(&server->tsk, NULL);
@@ -1192,6 +1200,10 @@ cifs_parse_mount_options(char *options, const char *devname,
                        /* ignore */
                } else if (strnicmp(data, "rw", 2) == 0) {
                        vol->rw = true;
+                } else if (strnicmp(data, "noblocksend", 11) == 0) {
+                        vol->noblocksnd = 1;
+                } else if (strnicmp(data, "noautotune", 10) == 0) {
+                        vol->noautotune = 1;
                } else if ((strnicmp(data, "suid", 4) == 0) ||
                                   (strnicmp(data, "nosuid", 6) == 0) ||
                                   (strnicmp(data, "exec", 4) == 0) ||
@@ -1343,94 +1355,158 @@ cifs_parse_mount_options(char *options, const char *devname,
        return 0;
 }
-static struct cifsSesInfo *
+static struct TCP_Server_Info *
-cifs_find_tcp_session(struct in_addr *target_ip_addr,
+cifs_find_tcp_session(struct sockaddr *addr)
-                      struct in6_addr *target_ip6_addr,
-                      char *userName, struct TCP_Server_Info **psrvTcp)
 {
        struct list_head *tmp;
-        struct cifsSesInfo *ses;
+        struct TCP_Server_Info *server;
+        struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
-        *psrvTcp = NULL;
+        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
+        write_lock(&cifs_tcp_ses_lock);
+        list_for_each(tmp, &cifs_tcp_ses_list) {
+                server = list_entry(tmp, struct TCP_Server_Info,
+                                    tcp_ses_list);
+                /*
+                 * the demux thread can exit on its own while still in CifsNew
+                 * so don't accept any sockets in that state. Since the
+                 * tcpStatus never changes back to CifsNew it's safe to check
+                 * for this without a lock.
+                 */
+                if (server->tcpStatus == CifsNew)
+                        continue;
-        read_lock(&GlobalSMBSeslock);
+                if (addr->sa_family == AF_INET &&
-        list_for_each(tmp, &GlobalSMBSessionList) {
+                    (addr4->sin_addr.s_addr !=
-                ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
+                     server->addr.sockAddr.sin_addr.s_addr))
-                if (!ses->server)
+                        continue;
+                else if (addr->sa_family == AF_INET6 &&
+                         memcmp(&server->addr.sockAddr6.sin6_addr,
+                                &addr6->sin6_addr, sizeof(addr6->sin6_addr)))
                        continue;
-                if (target_ip_addr &&
+                ++server->srv_count;
-                    ses->server->addr.sockAddr.sin_addr.s_addr != target_ip_addr->s_addr)
+                write_unlock(&cifs_tcp_ses_lock);
-                                continue;
+                cFYI(1, ("Existing tcp session with server found"));
-                else if (target_ip6_addr &&
+                return server;
-                         memcmp(&ses->server->addr.sockAddr6.sin6_addr,
+        }
-                                target_ip6_addr, sizeof(*target_ip6_addr)))
+        write_unlock(&cifs_tcp_ses_lock);
-                                continue;
+        return NULL;
-                /* BB lock server and tcp session; increment use count here?? */
+}
-                /* found a match on the TCP session */
+static void
-                *psrvTcp = ses->server;
+cifs_put_tcp_session(struct TCP_Server_Info *server)
+{
+        struct task_struct *task;
-                /* BB check if reconnection needed */
+        write_lock(&cifs_tcp_ses_lock);
-                if (strncmp(ses->userName, userName, MAX_USERNAME_SIZE) == 0) {
+        if (--server->srv_count > 0) {
-                        read_unlock(&GlobalSMBSeslock);
+                write_unlock(&cifs_tcp_ses_lock);
-                        /* Found exact match on both TCP and
+                return;
-                           SMB sessions */
-                        return ses;
-                }
-                /* else tcp and smb sessions need reconnection */
        }
-        read_unlock(&GlobalSMBSeslock);
-        return NULL;
+        list_del_init(&server->tcp_ses_list);
+        write_unlock(&cifs_tcp_ses_lock);
+        spin_lock(&GlobalMid_Lock);
+        server->tcpStatus = CifsExiting;
+        spin_unlock(&GlobalMid_Lock);
+        task = xchg(&server->tsk, NULL);
+        if (task)
+                force_sig(SIGKILL, task);
 }
-static struct cifsTconInfo *
+static struct cifsSesInfo *
-find_unc(__be32 new_target_ip_addr, char *uncName, char *userName)
+cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
 {
        struct list_head *tmp;
-        struct cifsTconInfo *tcon;
+        struct cifsSesInfo *ses;
-        __be32 old_ip;
-        read_lock(&GlobalSMBSeslock);
-        list_for_each(tmp, &GlobalTreeConnectionList) {
+        write_lock(&cifs_tcp_ses_lock);
-                cFYI(1, ("Next tcon"));
+        list_for_each(tmp, &server->smb_ses_list) {
-                tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
+                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
-                if (!tcon->ses || !tcon->ses->server)
+                if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
                        continue;
-                old_ip = tcon->ses->server->addr.sockAddr.sin_addr.s_addr;
+                ++ses->ses_count;
-                cFYI(1, ("old ip addr: %x == new ip %x ?",
+                write_unlock(&cifs_tcp_ses_lock);
-                        old_ip, new_target_ip_addr));
+                return ses;
+        }
+        write_unlock(&cifs_tcp_ses_lock);
+        return NULL;
+}
+static void
+cifs_put_smb_ses(struct cifsSesInfo *ses)
+{
+        int xid;
+        struct TCP_Server_Info *server = ses->server;
-                if (old_ip != new_target_ip_addr)
+        write_lock(&cifs_tcp_ses_lock);
-                        continue;
+        if (--ses->ses_count > 0) {
+                write_unlock(&cifs_tcp_ses_lock);
+                return;
+        }
-                /* BB lock tcon, server, tcp session and increment use count? */
+        list_del_init(&ses->smb_ses_list);
-                /* found a match on the TCP session */
+        write_unlock(&cifs_tcp_ses_lock);
-                /* BB check if reconnection needed */
-                cFYI(1, ("IP match, old UNC: %s new: %s",
-                        tcon->treeName, uncName));
-                if (strncmp(tcon->treeName, uncName, MAX_TREE_SIZE))
+        if (ses->status == CifsGood) {
-                        continue;
+                xid = GetXid();
+                CIFSSMBLogoff(xid, ses);
+                _FreeXid(xid);
+        }
+        sesInfoFree(ses);
+        cifs_put_tcp_session(server);
+}
-                cFYI(1, ("and old usr: %s new: %s",
+static struct cifsTconInfo *
-                        tcon->treeName, uncName));
+cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
+{
+        struct list_head *tmp;
+        struct cifsTconInfo *tcon;
-                if (strncmp(tcon->ses->userName, userName, MAX_USERNAME_SIZE))
+        write_lock(&cifs_tcp_ses_lock);
+        list_for_each(tmp, &ses->tcon_list) {
+                tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
+                if (tcon->tidStatus == CifsExiting)
+                        continue;
+                if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
                        continue;
-                /* matched smb session (user name) */
+                ++tcon->tc_count;
-                read_unlock(&GlobalSMBSeslock);
+                write_unlock(&cifs_tcp_ses_lock);
                return tcon;
        }
+        write_unlock(&cifs_tcp_ses_lock);
-        read_unlock(&GlobalSMBSeslock);
        return NULL;
 }
+static void
+cifs_put_tcon(struct cifsTconInfo *tcon)
+{
+        int xid;
+        struct cifsSesInfo *ses = tcon->ses;
+        write_lock(&cifs_tcp_ses_lock);
+        if (--tcon->tc_count > 0) {
+                write_unlock(&cifs_tcp_ses_lock);
+                return;
+        }
+        list_del_init(&tcon->tcon_list);
+        write_unlock(&cifs_tcp_ses_lock);
+        xid = GetXid();
+        CIFSSMBTDis(xid, tcon);
+        _FreeXid(xid);
+        DeleteTconOplockQEntries(tcon);
+        tconInfoFree(tcon);
+        cifs_put_smb_ses(ses);
+}
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
             const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1518,7 +1594,8 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
 static int
 ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
-             char *netbios_name, char *target_name)
+             char *netbios_name, char *target_name,
+             bool noblocksnd, bool noautotune)
 {
        int rc = 0;
        int connected = 0;
@@ -1590,11 +1667,16 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
                 (*csocket)->sk->sk_sndbuf,
                 (*csocket)->sk->sk_rcvbuf, (*csocket)->sk->sk_rcvtimeo));
        (*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+        if (!noblocksnd)
+                (*csocket)->sk->sk_sndtimeo = 3 * HZ;
        /* make the bufsizes depend on wsize/rsize and max requests */
-        if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
+        if (noautotune) {
-                (*csocket)->sk->sk_sndbuf = 200 * 1024;
+                if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
-        if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
+                        (*csocket)->sk->sk_sndbuf = 200 * 1024;
-                (*csocket)->sk->sk_rcvbuf = 140 * 1024;
+                if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
+                        (*csocket)->sk->sk_rcvbuf = 140 * 1024;
+        }
        /* send RFC1001 sessinit */
        if (psin_server->sin_port == htons(RFC1001_PORT)) {
@@ -1631,7 +1713,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
                        /* sizeof RFC1002_SESSION_REQUEST with no scope */
                        smb_buf->smb_buf_length = 0x81000044;
                        rc = smb_send(*csocket, smb_buf, 0x44,
-                                (struct sockaddr *)psin_server);
+                                (struct sockaddr *)psin_server, noblocksnd);
                        kfree(ses_init_buf);
                        msleep(1); /* RFC1001 layer in at least one server
                                      requires very short break before negprot
@@ -1651,7 +1733,8 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 }
 static int
-ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
+ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket,
+             bool noblocksnd)
 {
        int rc = 0;
        int connected = 0;
@@ -1720,6 +1803,9 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
                the default. sock_setsockopt not used because it expects
                user space buffer */
        (*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+        if (!noblocksnd)
+                (*csocket)->sk->sk_sndtimeo = 3 * HZ;
        return rc;
 }
@@ -1857,14 +1943,90 @@ convert_delimiter(char *path, char delim)
        }
 }
-static void
+static void setup_cifs_sb(struct smb_vol *pvolume_info,
-kill_cifsd(struct TCP_Server_Info *server)
+                          struct cifs_sb_info *cifs_sb)
 {
-        struct task_struct *task;
+        if (pvolume_info->rsize > CIFSMaxBufSize) {
+                cERROR(1, ("rsize %d too large, using MaxBufSize",
-        task = xchg(&server->tsk, NULL);
+                        pvolume_info->rsize));
-        if (task)
+                cifs_sb->rsize = CIFSMaxBufSize;
-                force_sig(SIGKILL, task);
+        } else if ((pvolume_info->rsize) &&
+                        (pvolume_info->rsize <= CIFSMaxBufSize))
+                cifs_sb->rsize = pvolume_info->rsize;
+        else /* default */
+                cifs_sb->rsize = CIFSMaxBufSize;
+        if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
+                cERROR(1, ("wsize %d too large, using 4096 instead",
+                          pvolume_info->wsize));
+                cifs_sb->wsize = 4096;
+        } else if (pvolume_info->wsize)
+                cifs_sb->wsize = pvolume_info->wsize;
+        else
+                cifs_sb->wsize = min_t(const int,
+                                        PAGEVEC_SIZE * PAGE_CACHE_SIZE,
+                                        127*1024);
+                /* the old default of CIFSMaxBufSize was too small now
+                   that SMB Write2 can send multiple pages in a kvec.
+                   RFC1001 does not describe what happens when a frame
+                   bigger than 128K is sent, so use that as the max in
+                   conjunction with the 52K kvec constraint on arches
+                   with a 4K page size  */
+        if (cifs_sb->rsize < 2048) {
+                cifs_sb->rsize = 2048;
+                /* Windows ME may prefer this */
+                cFYI(1, ("readsize set to minimum: 2048"));
+        }
+        /* calculate prepath */
+        cifs_sb->prepath = pvolume_info->prepath;
+        if (cifs_sb->prepath) {
+                cifs_sb->prepathlen = strlen(cifs_sb->prepath);
+                /* we can not convert the / to \ in the path
+                separators in the prefixpath yet because we do not
+                know (until reset_cifs_unix_caps is called later)
+                whether POSIX PATH CAP is available. We normalize
+                the / to \ after reset_cifs_unix_caps is called */
+                pvolume_info->prepath = NULL;
+        } else
+                cifs_sb->prepathlen = 0;
+        cifs_sb->mnt_uid = pvolume_info->linux_uid;
+        cifs_sb->mnt_gid = pvolume_info->linux_gid;
+        cifs_sb->mnt_file_mode = pvolume_info->file_mode;
+        cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
+        cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
+                cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+        if (pvolume_info->noperm)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
+        if (pvolume_info->setuids)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
+        if (pvolume_info->server_ino)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
+        if (pvolume_info->remap)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
+        if (pvolume_info->no_xattr)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
+        if (pvolume_info->sfu_emul)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
+        if (pvolume_info->nobrl)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+        if (pvolume_info->cifs_acl)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+        if (pvolume_info->override_uid)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
+        if (pvolume_info->override_gid)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+        if (pvolume_info->dynperm)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+        if (pvolume_info->direct_io) {
+                cFYI(1, ("mounting share using direct i/o"));
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
+        }
+        if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
+                cERROR(1, ("mount option dynperm ignored if cifsacl "
+                           "mount option supported"));
 }
 int
@@ -1873,13 +2035,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 {
        int rc = 0;
        int xid;
-        int address_type = AF_INET;
        struct socket *csocket = NULL;
-        struct sockaddr_in sin_server;
+        struct sockaddr addr;
-        struct sockaddr_in6 sin_server6;
+        struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
+        struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
        struct smb_vol volume_info;
        struct cifsSesInfo *pSesInfo = NULL;
-        struct cifsSesInfo *existingCifsSes = NULL;
        struct cifsTconInfo *tcon = NULL;
        struct TCP_Server_Info *srvTcp = NULL;
@@ -1887,6 +2048,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 /* cFYI(1, ("Entering cifs_mount. Xid: %d with: %s", xid, mount_data)); */
+        memset(&addr, 0, sizeof(struct sockaddr));
        memset(&volume_info, 0, sizeof(struct smb_vol));
        if (cifs_parse_mount_options(mount_data, devname, &volume_info)) {
                rc = -EINVAL;
@@ -1909,16 +2071,16 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        if (volume_info.UNCip && volume_info.UNC) {
                rc = cifs_inet_pton(AF_INET, volume_info.UNCip,
-                                    &sin_server.sin_addr.s_addr);
+                                    &sin_server->sin_addr.s_addr);
                if (rc <= 0) {
                        /* not ipv4 address, try ipv6 */
                        rc = cifs_inet_pton(AF_INET6, volume_info.UNCip,
-                                            &sin_server6.sin6_addr.in6_u);
+                                            &sin_server6->sin6_addr.in6_u);
                        if (rc > 0)
-                                address_type = AF_INET6;
+                                addr.sa_family = AF_INET6;
                } else {
-                        address_type = AF_INET;
+                        addr.sa_family = AF_INET;
                }
                if (rc <= 0) {
@@ -1958,38 +2120,25 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                }
        }
-        if (address_type == AF_INET)
+        srvTcp = cifs_find_tcp_session(&addr);
-                existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
+        if (!srvTcp) { /* create socket */
-                        NULL /* no ipv6 addr */,
+                if (addr.sa_family == AF_INET6) {
-                        volume_info.username, &srvTcp);
-        else if (address_type == AF_INET6) {
-                cFYI(1, ("looking for ipv6 address"));
-                existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
-                        &sin_server6.sin6_addr,
-                        volume_info.username, &srvTcp);
-        } else {
-                rc = -EINVAL;
-                goto out;
-        }
-        if (srvTcp) {
-                cFYI(1, ("Existing tcp session with server found"));
-        } else {        /* create socket */
-                if (volume_info.port)
-                        sin_server.sin_port = htons(volume_info.port);
-                else
-                        sin_server.sin_port = 0;
-                if (address_type == AF_INET6) {
                        cFYI(1, ("attempting ipv6 connect"));
                        /* BB should we allow ipv6 on port 139? */
                        /* other OS never observed in Wild doing 139 with v6 */
-                        rc = ipv6_connect(&sin_server6, &csocket);
+                        sin_server6->sin6_port = htons(volume_info.port);
-                } else
+                        rc = ipv6_connect(sin_server6, &csocket,
-                        rc = ipv4_connect(&sin_server, &csocket,
+                                        volume_info.noblocksnd);
+                } else {
+                        sin_server->sin_port = htons(volume_info.port);
+                        rc = ipv4_connect(sin_server, &csocket,
                                  volume_info.source_rfc1001_name,
-                                  volume_info.target_rfc1001_name);
+                                  volume_info.target_rfc1001_name,
+                                  volume_info.noblocksnd,
+                                  volume_info.noautotune);
+                }
                if (rc < 0) {
-                        cERROR(1, ("Error connecting to IPv4 socket. "
+                        cERROR(1, ("Error connecting to socket. "
                                   "Aborting operation"));
                        if (csocket != NULL)
                                sock_release(csocket);
@@ -2002,12 +2151,17 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                        sock_release(csocket);
                        goto out;
                } else {
-                        memcpy(&srvTcp->addr.sockAddr, &sin_server,
+                        srvTcp->noblocksnd = volume_info.noblocksnd;
-                                sizeof(struct sockaddr_in));
+                        srvTcp->noautotune = volume_info.noautotune;
+                        if (addr.sa_family == AF_INET6)
+                                memcpy(&srvTcp->addr.sockAddr6, sin_server6,
+                                        sizeof(struct sockaddr_in6));
+                        else
+                                memcpy(&srvTcp->addr.sockAddr, sin_server,
+                                        sizeof(struct sockaddr_in));
                        atomic_set(&srvTcp->inFlight, 0);
                        /* BB Add code for ipv6 case too */
                        srvTcp->ssocket = csocket;
-                        srvTcp->protocolType = IPV4;
                        srvTcp->hostname = extract_hostname(volume_info.UNC);
                        if (IS_ERR(srvTcp->hostname)) {
                                rc = PTR_ERR(srvTcp->hostname);
@@ -2037,15 +2191,28 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                        memcpy(srvTcp->server_RFC1001_name,
                                volume_info.target_rfc1001_name, 16);
                        srvTcp->sequence_number = 0;
+                        INIT_LIST_HEAD(&srvTcp->tcp_ses_list);
+                        INIT_LIST_HEAD(&srvTcp->smb_ses_list);
+                        ++srvTcp->srv_count;
+                        write_lock(&cifs_tcp_ses_lock);
+                        list_add(&srvTcp->tcp_ses_list,
+                                 &cifs_tcp_ses_list);
+                        write_unlock(&cifs_tcp_ses_lock);
                }
        }
-        if (existingCifsSes) {
+        pSesInfo = cifs_find_smb_ses(srvTcp, volume_info.username);
-                pSesInfo = existingCifsSes;
+        if (pSesInfo) {
                cFYI(1, ("Existing smb sess found (status=%d)",
                        pSesInfo->status));
+                /*
+                 * The existing SMB session already has a reference to srvTcp,
+                 * so we can put back the extra one we got before
+                 */
+                cifs_put_tcp_session(srvTcp);
                down(&pSesInfo->sesSem);
-                if (pSesInfo->status == CifsNeedReconnect) {
+                if (pSesInfo->need_reconnect) {
                        cFYI(1, ("Session needs reconnect"));
                        rc = cifs_setup_session(xid, pSesInfo,
                                                cifs_sb->local_nls);
@@ -2054,187 +2221,101 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        } else if (!rc) {
                cFYI(1, ("Existing smb sess not found"));
                pSesInfo = sesInfoAlloc();
-                if (pSesInfo == NULL)
+                if (pSesInfo == NULL) {
                        rc = -ENOMEM;
-                else {
+                        goto mount_fail_check;
-                        pSesInfo->server = srvTcp;
+                }
-                        sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
-                                NIPQUAD(sin_server.sin_addr.s_addr));
+                /* new SMB session uses our srvTcp ref */
-                }
+                pSesInfo->server = srvTcp;
+                sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
+                        NIPQUAD(sin_server->sin_addr.s_addr));
+                write_lock(&cifs_tcp_ses_lock);
+                list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
+                write_unlock(&cifs_tcp_ses_lock);
+                /* volume_info.password freed at unmount */
+                if (volume_info.password) {
+                        pSesInfo->password = volume_info.password;
+                        /* set to NULL to prevent freeing on exit */
+                        volume_info.password = NULL;
+                }
+                if (volume_info.username)
+                        strncpy(pSesInfo->userName, volume_info.username,
+                                MAX_USERNAME_SIZE);
+                if (volume_info.domainname) {
+                        int len = strlen(volume_info.domainname);
+                        pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
+                        if (pSesInfo->domainName)
+                                strcpy(pSesInfo->domainName,
+                                        volume_info.domainname);
+                }
+                pSesInfo->linux_uid = volume_info.linux_uid;
+                pSesInfo->overrideSecFlg = volume_info.secFlg;
+                down(&pSesInfo->sesSem);
-                if (!rc) {
+                /* BB FIXME need to pass vol->secFlgs BB */
-                        /* volume_info.password freed at unmount */
+                rc = cifs_setup_session(xid, pSesInfo,
-                        if (volume_info.password) {
+                                        cifs_sb->local_nls);
-                                pSesInfo->password = volume_info.password;
+                up(&pSesInfo->sesSem);
-                                /* set to NULL to prevent freeing on exit */
-                                volume_info.password = NULL;
-                        }
-                        if (volume_info.username)
-                                strncpy(pSesInfo->userName,
-                                        volume_info.username,
-                                        MAX_USERNAME_SIZE);
-                        if (volume_info.domainname) {
-                                int len = strlen(volume_info.domainname);
-                                pSesInfo->domainName =
-                                        kmalloc(len + 1, GFP_KERNEL);
-                                if (pSesInfo->domainName)
-                                        strcpy(pSesInfo->domainName,
-                                                volume_info.domainname);
-                        }
-                        pSesInfo->linux_uid = volume_info.linux_uid;
-                        pSesInfo->overrideSecFlg = volume_info.secFlg;
-                        down(&pSesInfo->sesSem);
-                        /* BB FIXME need to pass vol->secFlgs BB */
-                        rc = cifs_setup_session(xid, pSesInfo,
-                                                cifs_sb->local_nls);
-                        up(&pSesInfo->sesSem);
-                        if (!rc)
-                                atomic_inc(&srvTcp->socketUseCount);
-                }
        }
        /* search for existing tcon to this server share */
        if (!rc) {
-                if (volume_info.rsize > CIFSMaxBufSize) {
+                setup_cifs_sb(&volume_info, cifs_sb);
-                        cERROR(1, ("rsize %d too large, using MaxBufSize",
-                                volume_info.rsize));
-                        cifs_sb->rsize = CIFSMaxBufSize;
-                } else if ((volume_info.rsize) &&
-                                (volume_info.rsize <= CIFSMaxBufSize))
-                        cifs_sb->rsize = volume_info.rsize;
-                else /* default */
-                        cifs_sb->rsize = CIFSMaxBufSize;
-                if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
-                        cERROR(1, ("wsize %d too large, using 4096 instead",
-                                  volume_info.wsize));
-                        cifs_sb->wsize = 4096;
-                } else if (volume_info.wsize)
-                        cifs_sb->wsize = volume_info.wsize;
-                else
-                        cifs_sb->wsize =
-                                min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE,
-                                        127*1024);
-                        /* old default of CIFSMaxBufSize was too small now
-                           that SMB Write2 can send multiple pages in kvec.
-                           RFC1001 does not describe what happens when frame
-                           bigger than 128K is sent so use that as max in
-                           conjunction with 52K kvec constraint on arch with 4K
-                           page size  */
-                if (cifs_sb->rsize < 2048) {
-                        cifs_sb->rsize = 2048;
-                        /* Windows ME may prefer this */
-                        cFYI(1, ("readsize set to minimum: 2048"));
-                }
-                /* calculate prepath */
-                cifs_sb->prepath = volume_info.prepath;
-                if (cifs_sb->prepath) {
-                        cifs_sb->prepathlen = strlen(cifs_sb->prepath);
-                        /* we can not convert the / to \ in the path
-                        separators in the prefixpath yet because we do not
-                        know (until reset_cifs_unix_caps is called later)
-                        whether POSIX PATH CAP is available. We normalize
-                        the / to \ after reset_cifs_unix_caps is called */
-                        volume_info.prepath = NULL;
-                } else
-                        cifs_sb->prepathlen = 0;
-                cifs_sb->mnt_uid = volume_info.linux_uid;
-                cifs_sb->mnt_gid = volume_info.linux_gid;
-                cifs_sb->mnt_file_mode = volume_info.file_mode;
-                cifs_sb->mnt_dir_mode = volume_info.dir_mode;
-                cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
-                        cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
-                if (volume_info.noperm)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
-                if (volume_info.setuids)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
-                if (volume_info.server_ino)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
-                if (volume_info.remap)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
-                if (volume_info.no_xattr)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
-                if (volume_info.sfu_emul)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
-                if (volume_info.nobrl)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
-                if (volume_info.cifs_acl)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
-                if (volume_info.override_uid)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
-                if (volume_info.override_gid)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
-                if (volume_info.dynperm)
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
-                if (volume_info.direct_io) {
-                        cFYI(1, ("mounting share using direct i/o"));
-                        cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
-                }
-                if ((volume_info.cifs_acl) && (volume_info.dynperm))
-                        cERROR(1, ("mount option dynperm ignored if cifsacl "
-                                   "mount option supported"));
-                tcon =
+                tcon = cifs_find_tcon(pSesInfo, volume_info.UNC);
-                    find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
-                             volume_info.username);
                if (tcon) {
                        cFYI(1, ("Found match on UNC path"));
-                        /* we can have only one retry value for a connection
+                        /* existing tcon already has a reference */
-                           to a share so for resources mounted more than once
+                        cifs_put_smb_ses(pSesInfo);
-                           to the same server share the last value passed in
-                           for the retry flag is used */
-                        tcon->retry = volume_info.retry;
-                        tcon->nocase = volume_info.nocase;
-                        tcon->local_lease = volume_info.local_lease;
                        if (tcon->seal != volume_info.seal)
                                cERROR(1, ("transport encryption setting "
                                           "conflicts with existing tid"));
                } else {
                        tcon = tconInfoAlloc();
-                        if (tcon == NULL)
+                        if (tcon == NULL) {
                                rc = -ENOMEM;
-                        else {
+                                goto mount_fail_check;
-                                /* check for null share name ie connecting to
+                        }
-                                 * dfs root */
+                        tcon->ses = pSesInfo;
-                                /* BB check if this works for exactly length
+                        /* check for null share name ie connect to dfs root */
-                                 * three strings */
+                        if ((strchr(volume_info.UNC + 3, '\\') == NULL)
-                                if ((strchr(volume_info.UNC + 3, '\\') == NULL)
+                            && (strchr(volume_info.UNC + 3, '/') == NULL)) {
-                                    && (strchr(volume_info.UNC + 3, '/') ==
+                                /* rc = connect_to_dfs_path(...) */
-                                        NULL)) {
+                                cFYI(1, ("DFS root not supported"));
-/*                                      rc = connect_to_dfs_path(xid, pSesInfo,
+                                rc = -ENODEV;
-                                                "", cifs_sb->local_nls,
+                                goto mount_fail_check;
-                                                cifs_sb->mnt_cifs_flags &
+                        } else {
-                                                  CIFS_MOUNT_MAP_SPECIAL_CHR);*/
+                                /* BB Do we need to wrap sesSem around
-                                        cFYI(1, ("DFS root not supported"));
+                                 * this TCon call and Unix SetFS as
-                                        rc = -ENODEV;
+                                 * we do on SessSetup and reconnect? */
-                                        goto out;
+                                rc = CIFSTCon(xid, pSesInfo, volume_info.UNC,
-                                } else {
+                                              tcon, cifs_sb->local_nls);
-                                        /* BB Do we need to wrap sesSem around
+                                cFYI(1, ("CIFS Tcon rc = %d", rc));
-                                         * this TCon call and Unix SetFS as
+                                if (volume_info.nodfs) {
-                                         * we do on SessSetup and reconnect? */
+                                        tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
-                                        rc = CIFSTCon(xid, pSesInfo,
+                                        cFYI(1, ("DFS disabled (%d)",
-                                                volume_info.UNC,
+                                                tcon->Flags));
-                                                tcon, cifs_sb->local_nls);
-                                        cFYI(1, ("CIFS Tcon rc = %d", rc));
-                                        if (volume_info.nodfs) {
-                                                tcon->Flags &=
-                                                        ~SMB_SHARE_IS_IN_DFS;
-                                                cFYI(1, ("DFS disabled (%d)",
-                                                        tcon->Flags));
-                                        }
-                                }
-                                if (!rc) {
-                                        atomic_inc(&pSesInfo->inUse);
-                                        tcon->retry = volume_info.retry;
-                                        tcon->nocase = volume_info.nocase;
-                                        tcon->seal = volume_info.seal;
                                }
                        }
-                }
+                        if (rc)
+                                goto mount_fail_check;
+                        tcon->seal = volume_info.seal;
+                        write_lock(&cifs_tcp_ses_lock);
+                        list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
+                        write_unlock(&cifs_tcp_ses_lock);
+                }
+                /* we can have only one retry value for a connection
+                   to a share so for resources mounted more than once
+                   to the same server share the last value passed in
+                   for the retry flag is used */
+                tcon->retry = volume_info.retry;
+                tcon->nocase = volume_info.nocase;
+                tcon->local_lease = volume_info.local_lease;
        }
        if (pSesInfo) {
                if (pSesInfo->capabilities & CAP_LARGE_FILES) {
@@ -2246,80 +2327,49 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        /* BB FIXME fix time_gran to be larger for LANMAN sessions */
        sb->s_time_gran = 100;
-/* on error free sesinfo and tcon struct if needed */
+mount_fail_check:
+        /* on error free sesinfo and tcon struct if needed */
        if (rc) {
-                /* if session setup failed, use count is zero but
+                /* If cifs_find_tcon succeeded then rc == 0 so we can not end */
-                we still need to free cifsd thread */
+                /* up accidentally freeing someone else's tcon struct */
-                if (atomic_read(&srvTcp->socketUseCount) == 0) {
+                if (tcon)
-                        spin_lock(&GlobalMid_Lock);
+                        cifs_put_tcon(tcon);
-                        srvTcp->tcpStatus = CifsExiting;
+                else if (pSesInfo)
-                        spin_unlock(&GlobalMid_Lock);
+                        cifs_put_smb_ses(pSesInfo);
-                        kill_cifsd(srvTcp);
-                }
-                 /* If find_unc succeeded then rc == 0 so we can not end */
-                if (tcon)  /* up accidently freeing someone elses tcon struct */
-                        tconInfoFree(tcon);
-                if (existingCifsSes == NULL) {
-                        if (pSesInfo) {
-                                if ((pSesInfo->server) &&
-                                    (pSesInfo->status == CifsGood)) {
-                                        int temp_rc;
-                                        temp_rc = CIFSSMBLogoff(xid, pSesInfo);
-                                        /* if the socketUseCount is now zero */
-                                        if ((temp_rc == -ESHUTDOWN) &&
-                                            (pSesInfo->server))
-                                                kill_cifsd(pSesInfo->server);
-                                } else {
-                                        cFYI(1, ("No session or bad tcon"));
-                                        if (pSesInfo->server) {
-                                                spin_lock(&GlobalMid_Lock);
-                                                srvTcp->tcpStatus = CifsExiting;
-                                                spin_unlock(&GlobalMid_Lock);
-                                                kill_cifsd(pSesInfo->server);
-                                        }
-                                }
-                                sesInfoFree(pSesInfo);
-                                /* pSesInfo = NULL; */
-                        }
-                }
-        } else {
-                atomic_inc(&tcon->useCount);
-                cifs_sb->tcon = tcon;
-                tcon->ses = pSesInfo;
-                /* do not care if following two calls succeed - informational */
-                if (!tcon->ipc) {
-                        CIFSSMBQFSDeviceInfo(xid, tcon);
-                        CIFSSMBQFSAttributeInfo(xid, tcon);
-                }
-                /* tell server which Unix caps we support */
-                if (tcon->ses->capabilities & CAP_UNIX)
-                        /* reset of caps checks mount to see if unix extensions
-                           disabled for just this mount */
-                        reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
                else
-                        tcon->unix_ext = 0; /* server does not support them */
+                        cifs_put_tcp_session(srvTcp);
+                goto out;
+        }
+        cifs_sb->tcon = tcon;
-                /* convert forward to back slashes in prepath here if needed */
+        /* do not care if following two calls succeed - informational */
-                if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+        if (!tcon->ipc) {
-                        convert_delimiter(cifs_sb->prepath,
+                CIFSSMBQFSDeviceInfo(xid, tcon);
-                                          CIFS_DIR_SEP(cifs_sb));
+                CIFSSMBQFSAttributeInfo(xid, tcon);
+        }
-                if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
+        /* tell server which Unix caps we support */
-                        cifs_sb->rsize = 1024 * 127;
+        if (tcon->ses->capabilities & CAP_UNIX)
-                        cFYI(DBG2,
+                /* reset of caps checks mount to see if unix extensions
-                                ("no very large read support, rsize now 127K"));
+                   disabled for just this mount */
-                }
+                reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
-                if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
+        else
-                        cifs_sb->wsize = min(cifs_sb->wsize,
+                tcon->unix_ext = 0; /* server does not support them */
-                                             (tcon->ses->server->maxBuf -
-                                              MAX_CIFS_HDR_SIZE));
+        /* convert forward to back slashes in prepath here if needed */
-                if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
+        if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
-                        cifs_sb->rsize = min(cifs_sb->rsize,
+                convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
-                                             (tcon->ses->server->maxBuf -
-                                              MAX_CIFS_HDR_SIZE));
+        if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
+                cifs_sb->rsize = 1024 * 127;
+                cFYI(DBG2, ("no very large read support, rsize now 127K"));
        }
+        if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
+                cifs_sb->wsize = min(cifs_sb->wsize,
+                               (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+        if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
+                cifs_sb->rsize = min(cifs_sb->rsize,
+                               (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
        /* volume_info.password is freed above when existing session found
        (in which case it is not needed anymore) but when new sesion is created
@@ -3489,6 +3539,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
        /* above now done in SendReceive */
        if ((rc == 0) && (tcon != NULL)) {
                tcon->tidStatus = CifsGood;
+                tcon->need_reconnect = false;
                tcon->tid = smb_buffer_response->Tid;
                bcc_ptr = pByteArea(smb_buffer_response);
                length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2);
@@ -3560,48 +3611,17 @@ int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
        int rc = 0;
-        int xid;
-        struct cifsSesInfo *ses = NULL;
        char *tmp;
-        xid = GetXid();
+        if (cifs_sb->tcon)
+                cifs_put_tcon(cifs_sb->tcon);
-        if (cifs_sb->tcon) {
-                ses = cifs_sb->tcon->ses; /* save ptr to ses before delete tcon!*/
-                rc = CIFSSMBTDis(xid, cifs_sb->tcon);
-                if (rc == -EBUSY) {
-                        FreeXid(xid);
-                        return 0;
-                }
-                DeleteTconOplockQEntries(cifs_sb->tcon);
-                tconInfoFree(cifs_sb->tcon);
-                if ((ses) && (ses->server)) {
-                        /* save off task so we do not refer to ses later */
-                        cFYI(1, ("About to do SMBLogoff "));
-                        rc = CIFSSMBLogoff(xid, ses);
-                        if (rc == -EBUSY) {
-                                FreeXid(xid);
-                                return 0;
-                        } else if (rc == -ESHUTDOWN) {
-                                cFYI(1, ("Waking up socket by sending signal"));
-                                if (ses->server)
-                                        kill_cifsd(ses->server);
-                                rc = 0;
-                        } /* else - we have an smb session
-                                left on this socket do not kill cifsd */
-                } else
-                        cFYI(1, ("No session or bad tcon"));
-        }
        cifs_sb->tcon = NULL;
        tmp = cifs_sb->prepath;
        cifs_sb->prepathlen = 0;
        cifs_sb->prepath = NULL;
        kfree(tmp);
-        if (ses)
-                sesInfoFree(ses);
-        FreeXid(xid);
        return rc;
 }
@@ -3717,6 +3737,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
                cFYI(1, ("CIFS Session Established successfully"));
                        spin_lock(&GlobalMid_Lock);
                        pSesInfo->status = CifsGood;
+                        pSesInfo->need_reconnect = false;
                        spin_unlock(&GlobalMid_Lock);
        }
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 62d8bd8f14c0..6449e1aae621 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -493,7 +493,7 @@ int cifs_close(struct inode *inode, struct file *file)
                if (pTcon) {
                        /* no sense reconnecting to close a file that is
                           already closed */
-                        if (pTcon->tidStatus != CifsNeedReconnect) {
+                        if (!pTcon->need_reconnect) {
                                timeout = 2;
                                while ((atomic_read(&pSMBFile->wrtPending) != 0)
                                        && (timeout <= 2048)) {
@@ -1404,7 +1404,10 @@ retry:
                        if ((wbc->nr_to_write -= n_iov) <= 0)
                                done = 1;
                        index = next;
-                }
+                } else
+                        /* Need to re-find the pages we skipped */
+                        index = pvec.pages[0]->index + 1;
                pagevec_release(&pvec);
        }
        if (!scanned && !done) {
@@ -1824,7 +1827,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
        pTcon = cifs_sb->tcon;
        pagevec_init(&lru_pvec, 0);
-                cFYI(DBG2, ("rpages: num pages %d", num_pages));
+        cFYI(DBG2, ("rpages: num pages %d", num_pages));
        for (i = 0; i < num_pages; ) {
                unsigned contig_pages;
                struct page *tmp_page;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index d54fa8aeaea9..ff8c68de4a92 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1361,9 +1361,11 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                if (tmprc == 0 && (info_buf_source->UniqueId ==
-                                   info_buf_target->UniqueId))
+                                   info_buf_target->UniqueId)) {
                        /* same file, POSIX says that this is a noop */
+                        rc = 0;
                        goto cifs_rename_exit;
+                }
        } /* else ... BB we could add the same check for Windows by
                     checking the UniqueId via FILE_INTERNAL_INFO */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 88786ba02d27..addd1dcc2d79 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -75,12 +75,12 @@ sesInfoAlloc(void)
        ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
        if (ret_buf) {
-                write_lock(&GlobalSMBSeslock);
                atomic_inc(&sesInfoAllocCount);
                ret_buf->status = CifsNew;
-                list_add(&ret_buf->cifsSessionList, &GlobalSMBSessionList);
+                ++ret_buf->ses_count;
+                INIT_LIST_HEAD(&ret_buf->smb_ses_list);
+                INIT_LIST_HEAD(&ret_buf->tcon_list);
                init_MUTEX(&ret_buf->sesSem);
-                write_unlock(&GlobalSMBSeslock);
        }
        return ret_buf;
 }
@@ -93,10 +93,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
                return;
        }
-        write_lock(&GlobalSMBSeslock);
        atomic_dec(&sesInfoAllocCount);
-        list_del(&buf_to_free->cifsSessionList);
-        write_unlock(&GlobalSMBSeslock);
        kfree(buf_to_free->serverOS);
        kfree(buf_to_free->serverDomain);
        kfree(buf_to_free->serverNOS);
@@ -111,17 +108,14 @@ tconInfoAlloc(void)
        struct cifsTconInfo *ret_buf;
        ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
        if (ret_buf) {
-                write_lock(&GlobalSMBSeslock);
                atomic_inc(&tconInfoAllocCount);
-                list_add(&ret_buf->cifsConnectionList,
-                         &GlobalTreeConnectionList);
                ret_buf->tidStatus = CifsNew;
+                ++ret_buf->tc_count;
                INIT_LIST_HEAD(&ret_buf->openFileList);
-                init_MUTEX(&ret_buf->tconSem);
+                INIT_LIST_HEAD(&ret_buf->tcon_list);
 #ifdef CONFIG_CIFS_STATS
                spin_lock_init(&ret_buf->stat_lock);
 #endif
-                write_unlock(&GlobalSMBSeslock);
        }
        return ret_buf;
 }
@@ -133,10 +127,7 @@ tconInfoFree(struct cifsTconInfo *buf_to_free)
                cFYI(1, ("Null buffer passed to tconInfoFree"));
                return;
        }
-        write_lock(&GlobalSMBSeslock);
        atomic_dec(&tconInfoAllocCount);
-        list_del(&buf_to_free->cifsConnectionList);
-        write_unlock(&GlobalSMBSeslock);
        kfree(buf_to_free->nativeFileSystem);
        kfree(buf_to_free);
 }
@@ -350,9 +341,9 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
                                if (current->fsuid != treeCon->ses->linux_uid) {
                                        cFYI(1, ("Multiuser mode and UID "
                                                 "did not match tcon uid"));
-                                        read_lock(&GlobalSMBSeslock);
+                                        read_lock(&cifs_tcp_ses_lock);
-                                        list_for_each(temp_item, &GlobalSMBSessionList) {
+                                        list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
-                                                ses = list_entry(temp_item, struct cifsSesInfo, cifsSessionList);
+                                                ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
                                                if (ses->linux_uid == current->fsuid) {
                                                        if (ses->server == treeCon->ses->server) {
                                                                cFYI(1, ("found matching uid substitute right smb_uid"));
@@ -364,7 +355,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
                                                        }
                                                }
                                        }
-                                        read_unlock(&GlobalSMBSeslock);
+                                        read_unlock(&cifs_tcp_ses_lock);
                                }
                        }
                }
@@ -497,9 +488,10 @@ bool
 is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 {
        struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
-        struct list_head *tmp;
+        struct list_head *tmp, *tmp1, *tmp2;
-        struct list_head *tmp1;
+        struct cifsSesInfo *ses;
        struct cifsTconInfo *tcon;
+        struct cifsInodeInfo *pCifsInode;
        struct cifsFileInfo *netfile;
        cFYI(1, ("Checking for oplock break or dnotify response"));
@@ -554,42 +546,42 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
                return false;
        /* look up tcon based on tid & uid */
-        read_lock(&GlobalSMBSeslock);
+        read_lock(&cifs_tcp_ses_lock);
-        list_for_each(tmp, &GlobalTreeConnectionList) {
+        list_for_each(tmp, &srv->smb_ses_list) {
-                tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
+                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
-                if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) {
+                list_for_each(tmp1, &ses->tcon_list) {
+                        tcon = list_entry(tmp1, struct cifsTconInfo, tcon_list);
+                        if (tcon->tid != buf->Tid)
+                                continue;
                        cifs_stats_inc(&tcon->num_oplock_brks);
-                        list_for_each(tmp1, &tcon->openFileList) {
+                        list_for_each(tmp2, &tcon->openFileList) {
-                                netfile = list_entry(tmp1, struct cifsFileInfo,
+                                netfile = list_entry(tmp2, struct cifsFileInfo,
                                                     tlist);
-                                if (pSMB->Fid == netfile->netfid) {
+                                if (pSMB->Fid != netfile->netfid)
-                                        struct cifsInodeInfo *pCifsInode;
+                                        continue;
-                                        read_unlock(&GlobalSMBSeslock);
-                                        cFYI(1,
+                                read_unlock(&cifs_tcp_ses_lock);
-                                            ("file id match, oplock break"));
+                                cFYI(1, ("file id match, oplock break"));
-                                        pCifsInode =
+                                pCifsInode = CIFS_I(netfile->pInode);
-                                                CIFS_I(netfile->pInode);
+                                pCifsInode->clientCanCacheAll = false;
-                                        pCifsInode->clientCanCacheAll = false;
+                                if (pSMB->OplockLevel == 0)
-                                        if (pSMB->OplockLevel == 0)
+                                        pCifsInode->clientCanCacheRead = false;
-                                                pCifsInode->clientCanCacheRead
+                                pCifsInode->oplockPending = true;
-                                                        = false;
+                                AllocOplockQEntry(netfile->pInode,
-                                        pCifsInode->oplockPending = true;
+                                                  netfile->netfid, tcon);
-                                        AllocOplockQEntry(netfile->pInode,
+                                cFYI(1, ("about to wake up oplock thread"));
-                                                          netfile->netfid,
+                                if (oplockThread)
-                                                          tcon);
+                                        wake_up_process(oplockThread);
-                                        cFYI(1,
-                                            ("about to wake up oplock thread"));
+                                return true;
-                                        if (oplockThread)
-                                            wake_up_process(oplockThread);
-                                        return true;
-                                }
                        }
-                        read_unlock(&GlobalSMBSeslock);
+                        read_unlock(&cifs_tcp_ses_lock);
                        cFYI(1, ("No matching file for oplock break"));
                        return true;
                }
        }
-        read_unlock(&GlobalSMBSeslock);
+        read_unlock(&cifs_tcp_ses_lock);
        cFYI(1, ("Can not process oplock break for non-existent connection"));
        return true;
 }
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bf0e6d8e382a..ff8243a8fe3e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -161,7 +161,7 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 int
 smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
-         unsigned int smb_buf_length, struct sockaddr *sin)
+         unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
 {
        int rc = 0;
        int i = 0;
@@ -178,7 +178,10 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
        smb_msg.msg_namelen = sizeof(struct sockaddr);
        smb_msg.msg_control = NULL;
        smb_msg.msg_controllen = 0;
-        smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+        if (noblocksnd)
+                smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+        else
+                smb_msg.msg_flags = MSG_NOSIGNAL;
        /* smb header is converted in header_assemble. bcc and rest of SMB word
           area, and byte area if necessary, is converted to littleendian in
@@ -229,8 +232,8 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
 }
 static int
-smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
+smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
-          struct sockaddr *sin)
+          struct sockaddr *sin, bool noblocksnd)
 {
        int rc = 0;
        int i = 0;
@@ -240,6 +243,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
        unsigned int total_len;
        int first_vec = 0;
        unsigned int smb_buf_length = smb_buffer->smb_buf_length;
+        struct socket *ssocket = server->ssocket;
        if (ssocket == NULL)
                return -ENOTSOCK; /* BB eventually add reconnect code here */
@@ -248,7 +252,10 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
        smb_msg.msg_namelen = sizeof(struct sockaddr);
        smb_msg.msg_control = NULL;
        smb_msg.msg_controllen = 0;
-        smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+        if (noblocksnd)
+                smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+        else
+                smb_msg.msg_flags = MSG_NOSIGNAL;
        /* smb header is converted in header_assemble. bcc and rest of SMB word
           area, and byte area if necessary, is converted to littleendian in
@@ -283,8 +290,11 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
                if (rc < 0)
                        break;
-                if (rc >= total_len) {
+                if (rc == total_len) {
-                        WARN_ON(rc > total_len);
+                        total_len = 0;
+                        break;
+                } else if (rc > total_len) {
+                        cERROR(1, ("sent %d requested %d", rc, total_len));
                        break;
                }
                if (rc == 0) {
@@ -312,6 +322,16 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
                i = 0; /* in case we get ENOSPC on the next send */
        }
+        if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
+                cFYI(1, ("partial send (%d remaining), terminating session",
+                        total_len));
+                /* If we have only sent part of an SMB then the next SMB
+                   could be taken as the remainder of this one.  We need
+                   to kill the socket so the server throws away the partial
+                   SMB */
+                server->tcpStatus = CifsNeedReconnect;
+        }
        if (rc < 0) {
                cERROR(1, ("Error %d sending data on socket to server", rc));
        } else
@@ -518,8 +538,9 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
        atomic_inc(&ses->server->inSend);
 #endif
-        rc = smb_send2(ses->server->ssocket, iov, n_vec,
+        rc = smb_send2(ses->server, iov, n_vec,
-                      (struct sockaddr *) &(ses->server->addr.sockAddr));
+                      (struct sockaddr *) &(ses->server->addr.sockAddr),
+                       ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
        atomic_dec(&ses->server->inSend);
        midQ->when_sent = jiffies;
@@ -711,7 +732,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
        atomic_inc(&ses->server->inSend);
 #endif
        rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-                      (struct sockaddr *) &(ses->server->addr.sockAddr));
+                      (struct sockaddr *) &(ses->server->addr.sockAddr),
+                      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
        atomic_dec(&ses->server->inSend);
        midQ->when_sent = jiffies;
@@ -851,7 +873,8 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
                return rc;
        }
        rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-              (struct sockaddr *) &(ses->server->addr.sockAddr));
+              (struct sockaddr *) &(ses->server->addr.sockAddr),
+              ses->server->noblocksnd);
        up(&ses->server->tcpSem);
        return rc;
 }
@@ -941,7 +964,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
        atomic_inc(&ses->server->inSend);
 #endif
        rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-                      (struct sockaddr *) &(ses->server->addr.sockAddr));
+                      (struct sockaddr *) &(ses->server->addr.sockAddr),
+                      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
        atomic_dec(&ses->server->inSend);
        midQ->when_sent = jiffies;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index cfd29da714d1..0376ac66c44a 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -2,7 +2,7 @@
 *              An implementation of a loadable kernel mode driver providing
 *              multiple kernel/user space bidirectional communications links.
 *
- *              Author:         Alan Cox <alan@redhat.com>
+ *              Author:         Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d910501de6d2..8d86b7960f0d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -812,7 +812,7 @@ int dlm_release_lockspace(void *lockspace, int force)
        error = release_lockspace(ls, force);
        if (!error)
                ls_count--;
-        else if (!ls_count)
+        if (!ls_count)
                threads_stop();
        mutex_unlock(&ls_lock);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 06db79d05c12..6046239465a1 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1251,6 +1251,7 @@ struct kmem_cache *ecryptfs_header_cache_2;
 /**
 * ecryptfs_write_headers_virt
 * @page_virt: The virtual address to write the headers to
+ * @max: The size of memory allocated at page_virt
 * @size: Set to the number of bytes written by this function
 * @crypt_stat: The cryptographic context
 * @ecryptfs_dentry: The eCryptfs dentry
@@ -1278,7 +1279,8 @@ struct kmem_cache *ecryptfs_header_cache_2;
 *
 * Returns zero on success
 */
-static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
+static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
+                                       size_t *size,
                                       struct ecryptfs_crypt_stat *crypt_stat,
                                       struct dentry *ecryptfs_dentry)
 {
@@ -1296,7 +1298,7 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
        offset += written;
        rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
                                              ecryptfs_dentry, &written,
-                                              PAGE_CACHE_SIZE - offset);
+                                              max - offset);
        if (rc)
                ecryptfs_printk(KERN_WARNING, "Error generating key packet "
                                "set; rc = [%d]\n", rc);
@@ -1368,14 +1370,14 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
                goto out;
        }
        /* Released in this function */
-        virt = kzalloc(crypt_stat->num_header_bytes_at_front, GFP_KERNEL);
+        virt = (char *)get_zeroed_page(GFP_KERNEL);
        if (!virt) {
                printk(KERN_ERR "%s: Out of memory\n", __func__);
                rc = -ENOMEM;
                goto out;
        }
-        rc = ecryptfs_write_headers_virt(virt, &size, crypt_stat,
+        rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size,
-                                         ecryptfs_dentry);
+                                         crypt_stat, ecryptfs_dentry);
        if (unlikely(rc)) {
                printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
                       __func__, rc);
@@ -1393,8 +1395,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
                goto out_free;
        }
 out_free:
-        memset(virt, 0, crypt_stat->num_header_bytes_at_front);
+        free_page((unsigned long)virt);
-        kfree(virt);
 out:
        return rc;
 }
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 18eaa78ecb4e..f6c94f232ec1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -281,7 +281,8 @@ void ext3_abort (struct super_block * sb, const char * function,
        EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
        sb->s_flags |= MS_RDONLY;
        EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
-        journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+        if (EXT3_SB(sb)->s_journal)
+                journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
 void ext3_warning (struct super_block * sb, const char * function,
@@ -390,11 +391,14 @@ static void ext3_put_super (struct super_block * sb)
 {
        struct ext3_sb_info *sbi = EXT3_SB(sb);
        struct ext3_super_block *es = sbi->s_es;
-        int i;
+        int i, err;
        ext3_xattr_put_super(sb);
-        if (journal_destroy(sbi->s_journal) < 0)
+        err = journal_destroy(sbi->s_journal);
+        sbi->s_journal = NULL;
+        if (err < 0)
                ext3_abort(sb, __func__, "Couldn't clean up the journal");
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -2371,12 +2375,9 @@ int ext3_force_commit(struct super_block *sb)
 /*
 * Ext3 always journals updates to the superblock itself, so we don't
 * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
+ * point.  (We can probably nuke this function altogether, and remove
- * to the disk.
+ * any mention of sb->s_dirt in all of fs/ext3; eventual cleanup...)
- *
- * This implicitly triggers the writebehind on sync().
 */
 static void ext3_write_super (struct super_block * sb)
 {
        if (mutex_trylock(&sb->s_lock) != 0)
@@ -2386,13 +2387,12 @@ static void ext3_write_super (struct super_block * sb)
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
-        tid_t target;
        sb->s_dirt = 0;
-        if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
+        if (wait)
-                if (wait)
+                ext3_force_commit(sb);
-                        log_wait_commit(EXT3_SB(sb)->s_journal, target);
+        else
-        }
+                journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
        return 0;
 }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b9821be709bd..d2003cdc36aa 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -589,21 +589,23 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        return;
 }
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+/**
-                                                s64 nblocks)
+ * ext4_has_free_blocks()
+ * @sbi:        in-core super block structure.
+ * @nblocks:    number of needed blocks
+ *
+ * Check if filesystem has nblocks free & available for allocation.
+ * On success return 1, return 0 on failure.
+ */
+int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
-        s64 free_blocks, dirty_blocks;
+        s64 free_blocks, dirty_blocks, root_blocks;
-        s64 root_blocks = 0;
        struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
        struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
        free_blocks  = percpu_counter_read_positive(fbc);
        dirty_blocks = percpu_counter_read_positive(dbc);
+        root_blocks = ext4_r_blocks_count(sbi->s_es);
-        if (!capable(CAP_SYS_RESOURCE) &&
-                sbi->s_resuid != current->fsuid &&
-                (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-                root_blocks = ext4_r_blocks_count(sbi->s_es);
        if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
                                                EXT4_FREEBLOCKS_WATERMARK) {
@@ -616,57 +618,32 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
                }
        }
        /* Check whether we have space after
-         * accounting for current dirty blocks
+         * accounting for current dirty blocks & root reserved blocks.
         */
-        if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
+        if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
-                /* we don't have free space */
+                return 1;
-                return -ENOSPC;
+        /* Hm, nope.  Are (enough) root reserved blocks available? */
+        if (sbi->s_resuid == current->fsuid ||
+            ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+            capable(CAP_SYS_RESOURCE)) {
+                if (free_blocks >= (nblocks + dirty_blocks))
+                        return 1;
+        }
-        /* Add the blocks to nblocks */
-        percpu_counter_add(dbc, nblocks);
        return 0;
 }
-/**
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- * ext4_has_free_blocks()
- * @sbi:        in-core super block structure.
- * @nblocks:    number of neeed blocks
- *
- * Check if filesystem has free blocks available for allocation.
- * Return the number of blocks avaible for allocation for this request
- * On success, return nblocks
- */
-ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
                                                s64 nblocks)
 {
-        s64 free_blocks, dirty_blocks;
+        if (ext4_has_free_blocks(sbi, nblocks)) {
-        s64 root_blocks = 0;
+                percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
-        struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
-        struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-        free_blocks  = percpu_counter_read_positive(fbc);
-        dirty_blocks = percpu_counter_read_positive(dbc);
-        if (!capable(CAP_SYS_RESOURCE) &&
-                sbi->s_resuid != current->fsuid &&
-                (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-                root_blocks = ext4_r_blocks_count(sbi->s_es);
-        if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
-                                                EXT4_FREEBLOCKS_WATERMARK) {
-                free_blocks  = percpu_counter_sum(fbc);
-                dirty_blocks = percpu_counter_sum(dbc);
-        }
-        if (free_blocks <= (root_blocks + dirty_blocks))
-                /* we don't have free space */
                return 0;
+        } else
-        if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
+                return -ENOSPC;
-                return free_blocks - (root_blocks + dirty_blocks);
-        return nblocks;
 }
 /**
 * ext4_should_retry_alloc()
 * @sb:                 super block
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4880cc3e6727..b0537c827024 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1003,8 +1003,7 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
                                        ext4_lblk_t iblock, ext4_fsblk_t goal,
                                        unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-                                         s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fe34d74cfb19..2a117e286e54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -718,6 +718,8 @@ got:
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        free = ext4_free_blocks_after_init(sb, group, gdp);
                        gdp->bg_free_blocks_count = cpu_to_le16(free);
+                        gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
+                                                                gdp);
                }
                spin_unlock(sb_bgl_lock(sbi, group));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dbf6953845b..be21a5ae33cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2329,6 +2329,8 @@ static int ext4_da_writepage(struct page *page,
                        unlock_page(page);
                        return 0;
                }
+                /* now mark the buffer_heads as dirty and uptodate */
+                block_commit_write(page, 0, PAGE_CACHE_SIZE);
        }
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -4580,9 +4582,10 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
-                return ext4_indirect_trans_blocks(inode, nrblocks, 0);
+                return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
-        return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
+        return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
 /*
 * Account for index blocks, block groups bitmaps and block group
 * descriptor blocks if modify datablocks and index blocks
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dfe17a134052..444ad998f72e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4441,6 +4441,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                else if (block >= (entry->start_blk + entry->count))
                        n = &(*n)->rb_right;
                else {
+                        ext4_unlock_group(sb, group);
                        ext4_error(sb, __func__,
                            "Double free of blocks %d (%d %d)\n",
                            block, entry->start_blk, entry->count);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bdddea14e782..e4a241c65dbe 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,8 @@ void ext4_abort(struct super_block *sb, const char *function,
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        sb->s_flags |= MS_RDONLY;
        EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
-        jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+        if (EXT4_SB(sb)->s_journal)
+                jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 void ext4_warning(struct super_block *sb, const char *function,
@@ -442,14 +443,16 @@ static void ext4_put_super(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
-        int i;
+        int i, err;
        ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
-        if (jbd2_journal_destroy(sbi->s_journal) < 0)
+        err = jbd2_journal_destroy(sbi->s_journal);
-                ext4_abort(sb, __func__, "Couldn't clean up the journal");
        sbi->s_journal = NULL;
+        if (err < 0)
+                ext4_abort(sb, __func__, "Couldn't clean up the journal");
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -1455,9 +1458,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
        /* We allocate both existing and potentially added groups */
        flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
-                            ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
+                        ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
-                              EXT4_DESC_PER_BLOCK_BITS(sb))) /
+                              EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-                           groups_per_flex;
        sbi->s_flex_groups = kzalloc(flex_group_count *
                                     sizeof(struct flex_groups), GFP_KERNEL);
        if (sbi->s_flex_groups == NULL) {
@@ -2882,12 +2884,9 @@ int ext4_force_commit(struct super_block *sb)
 /*
 * Ext4 always journals updates to the superblock itself, so we don't
 * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
+ * point.  (We can probably nuke this function altogether, and remove
- * to the disk.
+ * any mention of sb->s_dirt in all of fs/ext4; eventual cleanup...)
- *
- * This implicitly triggers the writebehind on sync().
 */
 static void ext4_write_super(struct super_block *sb)
 {
        if (mutex_trylock(&sb->s_lock) != 0)
@@ -2897,15 +2896,15 @@ static void ext4_write_super(struct super_block *sb)
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
-        tid_t target;
+        int ret = 0;
        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
        sb->s_dirt = 0;
-        if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+        if (wait)
-                if (wait)
+                ret = ext4_force_commit(sb);
-                        jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+        else
-        }
+                jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
-        return 0;
+        return ret;
 }
 /*
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index bfb5f06cf2c8..e06190322c1c 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -3,5 +3,9 @@
 #
 obj-$(CONFIG_FAT_FS) += fat.o
+obj-$(CONFIG_VFAT_FS) += vfat.o
+obj-$(CONFIG_MSDOS_FS) += msdos.o
-fat-objs := cache.o dir.o fatent.o file.o inode.o misc.o
+fat-y := cache.o dir.o fatent.o file.o inode.o misc.o
+vfat-y := namei_vfat.o
+msdos-y := namei_msdos.o
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3222f51c41cf..b42602298087 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,8 +9,8 @@
 */
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 /* this must be > 0. */
 #define FAT_MAX_CACHE   8
@@ -293,10 +293,12 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 }
 int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-             unsigned long *mapped_blocks)
+             unsigned long *mapped_blocks, int create)
 {
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
+        const unsigned long blocksize = sb->s_blocksize;
+        const unsigned char blocksize_bits = sb->s_blocksize_bits;
        sector_t last_block;
        int cluster, offset;
@@ -309,10 +311,21 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
                }
                return 0;
        }
-        last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1))
-                >> sb->s_blocksize_bits;
+        last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
-        if (sector >= last_block)
+        if (sector >= last_block) {
-                return 0;
+                if (!create)
+                        return 0;
+                /*
+                 * ->mmu_private may be accessed only on the allocation path.
+                 * (caller must hold ->i_mutex)
+                 */
+                last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+                        >> blocksize_bits;
+                if (sector >= last_block)
+                        return 0;
+        }
        cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
        offset  = sector & (sbi->sec_per_clus - 1);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index bae1c3292522..67e058357098 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,11 +16,11 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include "fat.h"
 static inline loff_t fat_make_i_pos(struct super_block *sb,
                                    struct buffer_head *bh,
@@ -77,7 +77,7 @@ next:
        *bh = NULL;
        iblock = *pos >> sb->s_blocksize_bits;
-        err = fat_bmap(dir, iblock, &phys, &mapped_blocks);
+        err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
        if (err || !phys)
                return -1;      /* beyond EOF or error */
@@ -86,7 +86,7 @@ next:
        *bh = sb_bread(sb, phys);
        if (*bh == NULL) {
                printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
-                       (unsigned long long)phys);
+                       (llu)phys);
                /* skip this block */
                *pos = (iblock + 1) << sb->s_blocksize_bits;
                goto next;
@@ -373,9 +373,10 @@ parse_record:
                if (de->attr == ATTR_EXT) {
                        int status = fat_parse_long(inode, &cpos, &bh, &de,
                                                    &unicode, &nr_slots);
-                        if (status < 0)
+                        if (status < 0) {
-                                return status;
+                                err = status;
-                        else if (status == PARSE_INVALID)
+                                goto end_of_dir;
+                        } else if (status == PARSE_INVALID)
                                continue;
                        else if (status == PARSE_NOT_LONGNAME)
                                goto parse_record;
@@ -832,6 +833,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 #endif /* CONFIG_COMPAT */
 const struct file_operations fat_dir_operations = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = fat_readdir,
        .ioctl          = fat_dir_ioctl,
@@ -1089,6 +1091,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
        struct msdos_dir_entry *de;
        sector_t blknr;
        __le16 date, time;
+        u8 time_cs;
        int err, cluster;
        err = fat_alloc_clusters(dir, &cluster, 1);
@@ -1102,7 +1105,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
                goto error_free;
        }
-        fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+        fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
        de = (struct msdos_dir_entry *)bhs[0]->b_data;
        /* filling the new directory slots ("." and ".." entries) */
@@ -1112,13 +1115,14 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
        de[0].lcase = de[1].lcase = 0;
        de[0].time = de[1].time = time;
        de[0].date = de[1].date = date;
-        de[0].ctime_cs = de[1].ctime_cs = 0;
        if (sbi->options.isvfat) {
                /* extra timestamps */
                de[0].ctime = de[1].ctime = time;
+                de[0].ctime_cs = de[1].ctime_cs = time_cs;
                de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date;
        } else {
                de[0].ctime = de[1].ctime = 0;
+                de[0].ctime_cs = de[1].ctime_cs = 0;
                de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
        }
        de[0].start = cpu_to_le16(cluster);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
new file mode 100644
index 000000000000..ea440d65819c
--- /dev/null
+++ b/fs/fat/fat.h
@@ -0,0 +1,329 @@
+#ifndef _FAT_H
+#define _FAT_H
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/msdos_fs.h>
+/*
+ * vfat shortname flags
+ */
+#define VFAT_SFN_DISPLAY_LOWER  0x0001 /* convert to lowercase for display */
+#define VFAT_SFN_DISPLAY_WIN95  0x0002 /* emulate win95 rule for display */
+#define VFAT_SFN_DISPLAY_WINNT  0x0004 /* emulate winnt rule for display */
+#define VFAT_SFN_CREATE_WIN95   0x0100 /* emulate win95 rule for create */
+#define VFAT_SFN_CREATE_WINNT   0x0200 /* emulate winnt rule for create */
+struct fat_mount_options {       /* embedded in struct msdos_sb_info as ->options */
+        uid_t fs_uid;             /* NOTE(review): presumably owner uid given to inodes -- confirm in inode.c */
+        gid_t fs_gid;             /* NOTE(review): presumably owner gid given to inodes -- confirm in inode.c */
+        unsigned short fs_fmask;  /* bits cleared from non-directory modes (see fat_make_mode) */
+        unsigned short fs_dmask;  /* bits cleared from directory modes (see fat_make_mode) */
+        unsigned short codepage;  /* Codepage for shortname conversions */
+        char *iocharset;          /* Charset used for filename input/display */
+        unsigned short shortname; /* VFAT_SFN_* flags for shortname display/create rule */
+        unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+        unsigned short allow_utime;/* permission for setting the [am]time */
+        unsigned quiet:1,         /* set = fake successful chmods and chowns */
+                 showexec:1,      /* set = only set x bit for com/exe/bat */
+                 sys_immutable:1, /* set = system files are immutable */
+                 dotsOK:1,        /* set = hidden and system files are named '.filename' */
+                 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
+                 utf8:1,          /* Use of UTF-8 character set (Default) */
+                 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
+                 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
+                 flush:1,         /* write things quickly */
+                 nocase:1,        /* Does this need case conversion? 0=need case conversion*/
+                 usefree:1,       /* Use free_clusters for FAT32 */
+                 tz_utc:1,        /* Filesystem timestamps are in UTC */
+                 rodir:1;         /* allow ATTR_RO for directory (see fat_mode_can_hold_ro) */
+};
+#define FAT_HASH_BITS   8
+#define FAT_HASH_SIZE   (1UL << FAT_HASH_BITS)
+/*
+ * MS-DOS file system in-core superblock data
+ *
+ * Reached from a struct super_block through MSDOS_SB(), which returns
+ * sb->s_fs_info.
+ */
+struct msdos_sb_info {
+        unsigned short sec_per_clus; /* sectors/cluster */
+        unsigned short cluster_bits; /* log2(cluster_size) */
+        unsigned int cluster_size;   /* cluster size */
+        unsigned char fats,fat_bits; /* number of FATs, FAT bits (12, 16 or 32) */
+        unsigned short fat_start;
+        unsigned long fat_length;    /* FAT start & length (sec.) */
+        unsigned long dir_start;
+        unsigned short dir_entries;  /* root dir start & entries */
+        unsigned long data_start;    /* first data sector */
+        unsigned long max_cluster;   /* maximum cluster number */
+        unsigned long root_cluster;  /* first cluster of the root directory */
+        unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
+        struct mutex fat_lock;       /* NOTE(review): presumably serializes FAT allocation state -- confirm in fatent.c */
+        unsigned int prev_free;      /* previously allocated cluster number */
+        unsigned int free_clusters;  /* -1 if undefined */
+        unsigned int free_clus_valid; /* is free_clusters valid? */
+        struct fat_mount_options options;
+        struct nls_table *nls_disk;  /* Codepage used on disk */
+        struct nls_table *nls_io;    /* Charset used for input and display */
+        const void *dir_ops;                 /* Opaque; default directory operations */
+        int dir_per_block;           /* dir entries per block */
+        int dir_per_block_bits;      /* log2(dir_per_block) */
+        int fatent_shift;
+        struct fatent_operations *fatent_ops;
+        spinlock_t inode_hash_lock;  /* NOTE(review): presumably guards inode_hashtable -- confirm in inode.c */
+        struct hlist_head inode_hashtable[FAT_HASH_SIZE]; /* FAT_HASH_SIZE (1 << FAT_HASH_BITS) buckets */
+};
+#define FAT_CACHE_VALID 0       /* special case for valid cache */
+/*
+ * MS-DOS file system inode data in memory
+ *
+ * The VFS inode is embedded as ->vfs_inode; MSDOS_I() maps a struct inode
+ * pointer back to its containing msdos_inode_info.
+ */
+struct msdos_inode_info {
+        spinlock_t cache_lru_lock;
+        struct list_head cache_lru;
+        int nr_caches;
+        /* for avoiding the race between fat_free() and fat_get_cluster() */
+        unsigned int cache_valid_id;
+        /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
+        loff_t mmu_private;     /* physically allocated size */
+        int i_start;            /* first cluster or 0 */
+        int i_logstart;         /* logical first cluster */
+        int i_attrs;            /* unused attribute bits */
+        loff_t i_pos;           /* on-disk position of directory entry or 0 */
+        struct hlist_node i_fat_hash;   /* hash by i_location */
+        struct inode vfs_inode; /* embedded VFS inode; container_of() anchor for MSDOS_I() */
+};
+struct fat_slot_info {  /* output argument of fat_search_long(), fat_scan() and fat_add_entries() */
+        loff_t i_pos;           /* on-disk position of directory entry */
+        loff_t slot_off;        /* offset for slot or de start */
+        int nr_slots;           /* number of slots + 1(de) in filename */
+        struct msdos_dir_entry *de; /* NOTE(review): presumably points into bh's data -- confirm in dir.c */
+        struct buffer_head *bh; /* NOTE(review): presumably the buffer holding de; caller releases -- confirm */
+};
+static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
+{
+        return sb->s_fs_info;   /* FAT-private superblock data hangs off the VFS super_block */
+}
+static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
+{       /* inode must be the ->vfs_inode member embedded in a struct msdos_inode_info */
+        return container_of(inode, struct msdos_inode_info, vfs_inode);
+}
+/*
+ * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
+ * bit, it's just used as flag for app.
+ *
+ * Returns 1 when the inode's write permission bits can represent ATTR_RO,
+ * i.e. the applicable umask (fs_dmask for directories, fs_fmask otherwise)
+ * leaves at least one S_IWUGO bit available; returns 0 otherwise, and
+ * always 0 for a directory when the rodir option is off.
+ */
+static inline int fat_mode_can_hold_ro(struct inode *inode)
+{
+        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+        mode_t mask;
+        if (S_ISDIR(inode->i_mode)) {
+                if (!sbi->options.rodir)
+                        return 0;
+                mask = ~sbi->options.fs_dmask;
+        } else
+                mask = ~sbi->options.fs_fmask;
+        if (!(mask & S_IWUGO))  /* every write bit is masked off: mode cannot encode RO */
+                return 0;
+        return 1;
+}
+/*
+ * Convert attribute bits and a mask to the UNIX mode.
+ *
+ * @sbi:   superblock info supplying the rodir, fs_dmask and fs_fmask options
+ * @attrs: FAT attribute byte (ATTR_RO and ATTR_DIR are examined)
+ * @mode:  candidate permission bits before masking
+ *
+ * Write bits are dropped when ATTR_RO is set, except for a directory while
+ * the rodir option is off.  The result has the matching umask applied and
+ * S_IFDIR or S_IFREG or'ed in according to ATTR_DIR.
+ */
+static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
+                                   u8 attrs, mode_t mode)
+{
+        if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir))
+                mode &= ~S_IWUGO;
+        if (attrs & ATTR_DIR)
+                return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+        else
+                return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+/*
+ * Return the FAT attribute byte for this inode
+ *
+ * Starts from the bits saved in ->i_attrs, adds ATTR_DIR for directories,
+ * and adds ATTR_RO when the mode is able to express read-only (see
+ * fat_mode_can_hold_ro()) and no write permission bit is set.
+ */
+static inline u8 fat_make_attrs(struct inode *inode)
+{
+        u8 attrs = MSDOS_I(inode)->i_attrs;
+        if (S_ISDIR(inode->i_mode))
+                attrs |= ATTR_DIR;
+        if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO))
+                attrs |= ATTR_RO;
+        return attrs;
+}
+static inline void fat_save_attrs(struct inode *inode, u8 attrs)
+{       /* Save in ->i_attrs the attribute bits that ->i_mode does not carry. */
+        if (fat_mode_can_hold_ro(inode))
+                MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;  /* ATTR_RO is represented by i_mode */
+        else
+                MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO);  /* i_mode can't hold RO: keep it here */
+}
+static inline unsigned char fat_checksum(const __u8 *name)
+{       /* 8-bit checksum over the 11 bytes name[0]..name[10]; name must be at least 11 bytes long. */
+        unsigned char s = name[0];
+        s = (s<<7) + (s>>1) + name[1];  s = (s<<7) + (s>>1) + name[2];  /* (s<<7)+(s>>1), truncated to 8 bits, rotates s right by one */
+        s = (s<<7) + (s>>1) + name[3];  s = (s<<7) + (s>>1) + name[4];
+        s = (s<<7) + (s>>1) + name[5];  s = (s<<7) + (s>>1) + name[6];
+        s = (s<<7) + (s>>1) + name[7];  s = (s<<7) + (s>>1) + name[8];
+        s = (s<<7) + (s>>1) + name[9];  s = (s<<7) + (s>>1) + name[10];
+        return s;
+}
+static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
+{       /* First block of cluster clus: clusters count from FAT_START_ENT, located at sbi->data_start. */
+        return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus  /* cast first so the multiply is done in sector_t */
+                + sbi->data_start;
+}
+static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
+{       /* Copy len 16-bit units from the byte stream src (low byte first) into dst. */
+#ifdef __BIG_ENDIAN
+        while (len--) {
+                *dst++ = src[0] | (src[1] << 8);        /* assemble from low byte, then high byte */
+                src += 2;
+        }
+#else
+        memcpy(dst, src, len * 2);      /* NOTE(review): relies on wchar_t being 2 bytes wide -- confirm */
+#endif
+}
+static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
+{       /* Store len units from src into the byte stream dst, low byte first (inverse of fat16_towchar). */
+#ifdef __BIG_ENDIAN
+        while (len--) {
+                dst[0] = *src & 0x00FF;                 /* low byte */
+                dst[1] = (*src & 0xFF00) >> 8;          /* high byte */
+                dst += 2;
+                src++;
+        }
+#else
+        memcpy(dst, src, len * 2);      /* NOTE(review): relies on wchar_t being 2 bytes wide -- confirm */
+#endif
+}
+/* fat/cache.c */
+extern void fat_cache_inval_inode(struct inode *inode);
+extern int fat_get_cluster(struct inode *inode, int cluster,
+                           int *fclus, int *dclus);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+                    unsigned long *mapped_blocks, int create);
+/* fat/dir.c */
+extern const struct file_operations fat_dir_operations;
+extern int fat_search_long(struct inode *inode, const unsigned char *name,
+                           int name_len, struct fat_slot_info *sinfo);
+extern int fat_dir_empty(struct inode *dir);
+extern int fat_subdirs(struct inode *dir);
+extern int fat_scan(struct inode *dir, const unsigned char *name,
+                    struct fat_slot_info *sinfo);
+extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
+                                struct msdos_dir_entry **de, loff_t *i_pos);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
+                           struct fat_slot_info *sinfo);
+extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
+/* fat/fatent.c */
+struct fat_entry {      /* cursor passed to fat_ent_read()/fat_ent_write() */
+        int entry;      /* entry number set by fatent_set_entry(); 0 after fatent_init() */
+        union {         /* NOTE(review): presumably one member per FAT width (12/16/32) -- confirm in fatent.c */
+                u8 *ent12_p[2];
+                __le16 *ent16_p;
+                __le32 *ent32_p;
+        } u;
+        int nr_bhs;     /* number of bhs[] slots that fatent_brelse() will release */
+        struct buffer_head *bhs[2];
+};
+static inline void fatent_init(struct fat_entry *fatent)
+{       /* Put fatent into its empty state: entry 0, no pointer, no buffer heads held. */
+        fatent->nr_bhs = 0;
+        fatent->entry = 0;
+        fatent->u.ent32_p = NULL;
+        fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
+{       /* Retarget fatent at entry; nr_bhs and bhs[] are left as they are. */
+        fatent->entry = entry;
+        fatent->u.ent32_p = NULL;       /* drop the pointer that belonged to the previous entry */
+}
+static inline void fatent_brelse(struct fat_entry *fatent)
+{       /* brelse() the first nr_bhs buffer heads and clear the bookkeeping; ->entry is kept. */
+        int i;
+        fatent->u.ent32_p = NULL;
+        for (i = 0; i < fatent->nr_bhs; i++)
+                brelse(fatent->bhs[i]);
+        fatent->nr_bhs = 0;
+        fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+extern void fat_ent_access_init(struct super_block *sb);
+extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
+                        int entry);
+extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
+                         int new, int wait);
+extern int fat_alloc_clusters(struct inode *inode, int *cluster,
+                              int nr_cluster);
+extern int fat_free_clusters(struct inode *inode, int cluster);
+extern int fat_count_free_clusters(struct super_block *sb);
+/* fat/file.c */
+extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
+                             unsigned int cmd, unsigned long arg);
+extern const struct file_operations fat_file_operations;
+extern const struct inode_operations fat_file_inode_operations;
+extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
+extern void fat_truncate(struct inode *inode);
+extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                       struct kstat *stat);
+/* fat/inode.c */
+extern void fat_attach(struct inode *inode, loff_t i_pos);
+extern void fat_detach(struct inode *inode);
+extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
+extern struct inode *fat_build_inode(struct super_block *sb,
+                        struct msdos_dir_entry *de, loff_t i_pos);
+extern int fat_sync_inode(struct inode *inode);
+extern int fat_fill_super(struct super_block *sb, void *data, int silent,
+                        const struct inode_operations *fs_dir_inode_ops, int isvfat);
+extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
+                            struct inode *i2);
+/* fat/misc.c */
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3))) __cold;
+extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
+extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+                              __le16 __time, __le16 __date, u8 time_cs);
+extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+                              __le16 *time, __le16 *date, u8 *time_cs);
+extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
+int fat_cache_init(void);
+void fat_cache_destroy(void);
+/* helper for printk */
+typedef unsigned long long      llu;
+#endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index fb98b3d847ed..da6eea47872f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
 #include <linux/blkdev.h>
+#include "fat.h"
 struct fatent_operations {
        void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -92,8 +93,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 err_brelse:
        brelse(bhs[0]);
 err:
-        printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
+        printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr);
-               (unsigned long long)blocknr);
        return -EIO;
 }
@@ -106,7 +106,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
        fatent->bhs[0] = sb_bread(sb, blocknr);
        if (!fatent->bhs[0]) {
                printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
-                       (unsigned long long)blocknr);
+                       (llu)blocknr);
                return -EIO;
        }
        fatent->nr_bhs = 1;
@@ -316,10 +316,20 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
        /* Is this fatent's blocks including this entry? */
        if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
                return 0;
-        /* Does this entry need the next block? */
+        if (sbi->fat_bits == 12) {
-        if (sbi->fat_bits == 12 && (offset + 1) >= sb->s_blocksize) {
+                if ((offset + 1) < sb->s_blocksize) {
-                if (fatent->nr_bhs != 2 || bhs[1]->b_blocknr != (blocknr + 1))
+                        /* This entry is on bhs[0]. */
-                        return 0;
+                        if (fatent->nr_bhs == 2) {
+                                brelse(bhs[1]);
+                                fatent->nr_bhs = 1;
+                        }
+                } else {
+                        /* This entry needs the next block. */
+                        if (fatent->nr_bhs != 2)
+                                return 0;
+                        if (bhs[1]->b_blocknr != (blocknr + 1))
+                                return 0;
+                }
        }
        ops->ent_set_ptr(fatent, offset);
        return 1;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index ddde37025ca6..f06a4e525ece 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -10,13 +10,13 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include "fat.h"
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
                      unsigned int cmd, unsigned long arg)
@@ -29,10 +29,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
        {
                u32 attr;
-                if (inode->i_ino == MSDOS_ROOT_INO)
+                mutex_lock(&inode->i_mutex);
-                        attr = ATTR_DIR;
+                attr = fat_make_attrs(inode);
-                else
+                mutex_unlock(&inode->i_mutex);
-                        attr = fat_attr(inode);
                return put_user(attr, user_attr);
        }
@@ -62,20 +61,16 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
                /* Merge in ATTR_VOLUME and ATTR_DIR */
                attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
                        (is_dir ? ATTR_DIR : 0);
-                oldattr = fat_attr(inode);
+                oldattr = fat_make_attrs(inode);
                /* Equivalent to a chmod() */
                ia.ia_valid = ATTR_MODE | ATTR_CTIME;
                ia.ia_ctime = current_fs_time(inode->i_sb);
-                if (is_dir) {
+                if (is_dir)
-                        ia.ia_mode = MSDOS_MKMODE(attr,
+                        ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
-                                S_IRWXUGO & ~sbi->options.fs_dmask)
+                else {
-                                | S_IFDIR;
+                        ia.ia_mode = fat_make_mode(sbi, attr,
-                } else {
+                                S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
-                        ia.ia_mode = MSDOS_MKMODE(attr,
-                                (S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO))
-                                & ~sbi->options.fs_fmask)
-                                | S_IFREG;
                }
                /* The root directory has no attributes */
@@ -115,7 +110,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
                                inode->i_flags &= S_IMMUTABLE;
                }
-                MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
+                fat_save_attrs(inode, attr);
                mark_inode_dirty(inode);
 up:
                mnt_drop_write(filp->f_path.mnt);
@@ -274,7 +269,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
        /*
         * Note, the basic check is already done by a caller of
-         * (attr->ia_mode & ~MSDOS_VALID_MODE)
+         * (attr->ia_mode & ~FAT_VALID_MODE)
         */
        if (S_ISREG(inode->i_mode))
@@ -287,11 +282,18 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
        /*
         * Of the r and x bits, all (subject to umask) must be present. Of the
         * w bits, either all (subject to umask) or none must be present.
+         *
+         * If fat_mode_can_hold_ro(inode) is false, can't change w bits.
         */
        if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
                return -EPERM;
-        if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+        if (fat_mode_can_hold_ro(inode)) {
-                return -EPERM;
+                if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+                        return -EPERM;
+        } else {
+                if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
+                        return -EPERM;
+        }
        *mode_ptr &= S_IFMT | perm;
@@ -314,13 +316,15 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 }
 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+/* valid file mode bits */
+#define FAT_VALID_MODE  (S_IFREG | S_IFDIR | S_IRWXUGO)
 int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
        struct inode *inode = dentry->d_inode;
-        int error = 0;
        unsigned int ia_valid;
+        int error;
        /*
         * Expand the file. Since inode_setattr() updates ->i_size
@@ -356,7 +360,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
            ((attr->ia_valid & ATTR_GID) &&
             (attr->ia_gid != sbi->options.fs_gid)) ||
            ((attr->ia_valid & ATTR_MODE) &&
-             (attr->ia_mode & ~MSDOS_VALID_MODE)))
+             (attr->ia_mode & ~FAT_VALID_MODE)))
                error = -EPERM;
        if (error) {
@@ -374,7 +378,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
                        attr->ia_valid &= ~ATTR_MODE;
        }
-        error = inode_setattr(inode, attr);
+        if (attr->ia_valid)
+                error = inode_setattr(inode, attr);
 out:
        return error;
 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 19eafbe3c379..bdd8fb7be2ca 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -16,7 +16,6 @@
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
-#include <linux/msdos_fs.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
@@ -27,7 +26,9 @@
 #include <linux/uio.h>
 #include <linux/writeback.h>
 #include <linux/log2.h>
+#include <linux/hash.h>
 #include <asm/unaligned.h>
+#include "fat.h"
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
 /* if user don't select VFAT, this is undefined. */
@@ -63,7 +64,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
        sector_t phys;
        int err, offset;
-        err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+        err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
        if (err)
                return err;
        if (phys) {
@@ -93,7 +94,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
        *max_blocks = min(mapped_blocks, *max_blocks);
        MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
-        err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+        err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
        if (err)
                return err;
@@ -175,7 +176,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
        if (rw == WRITE) {
                /*
-                 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+                 * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
                 * so we need to update the ->mmu_private to block boundary.
                 *
                 * But we must fill the remaining area or hole by nul for
@@ -198,7 +199,14 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
-        return generic_block_bmap(mapping, block, fat_get_block);
+        sector_t blocknr;
+        /*
+         * fat_get_cluster() assumes the requested blocknr isn't truncated.
+         *
+         * So the lookup through generic_block_bmap()/fat_get_block() is
+         * done with the host inode's i_mutex held.
+         * NOTE(review): presumably i_mutex is what excludes a concurrent
+         * truncate here -- confirm against the truncate path.
+         */
+        mutex_lock(&mapping->host->i_mutex);
+        blocknr = generic_block_bmap(mapping, block, fat_get_block);
+        mutex_unlock(&mapping->host->i_mutex);
+        return blocknr;
 }
 static const struct address_space_operations fat_aops = {
@@ -247,25 +255,21 @@ static void fat_hash_init(struct super_block *sb)
                INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
 }
+/*
+ * Hash a directory entry position into a bucket index for
+ * sbi->inode_hashtable (used by fat_attach() and fat_iget()).
+ *
+ * NOTE(review): i_pos is a loff_t but is handed to hash_32(); presumably
+ * only its low 32 bits take part in the hash -- confirm this is intended.
+ */
-static inline unsigned long fat_hash(struct super_block *sb, loff_t i_pos)
+static inline unsigned long fat_hash(loff_t i_pos)
 {
-        unsigned long tmp = (unsigned long)i_pos | (unsigned long) sb;
+        return hash_32(i_pos, FAT_HASH_BITS);
-        tmp = tmp + (tmp >> FAT_HASH_BITS) + (tmp >> FAT_HASH_BITS * 2);
-        return tmp & FAT_HASH_MASK;
 }
 void fat_attach(struct inode *inode, loff_t i_pos)
 {
-        struct super_block *sb = inode->i_sb;
+        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-        struct msdos_sb_info *sbi = MSDOS_SB(sb);
+        /* The bucket depends only on i_pos, so it is picked before locking. */
+        struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
+        /*
+         * Record i_pos in the inode and link the inode into its hash
+         * bucket, both under inode_hash_lock.
+         */
         spin_lock(&sbi->inode_hash_lock);
         MSDOS_I(inode)->i_pos = i_pos;
-        hlist_add_head(&MSDOS_I(inode)->i_fat_hash,
+        hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
-                        sbi->inode_hashtable + fat_hash(sb, i_pos));
         spin_unlock(&sbi->inode_hash_lock);
 }
 EXPORT_SYMBOL_GPL(fat_attach);
 void fat_detach(struct inode *inode)
@@ -276,13 +280,12 @@ void fat_detach(struct inode *inode)
        hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
        spin_unlock(&sbi->inode_hash_lock);
 }
 EXPORT_SYMBOL_GPL(fat_detach);
 struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
-        struct hlist_head *head = sbi->inode_hashtable + fat_hash(sb, i_pos);
+        struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
        struct hlist_node *_p;
        struct msdos_inode_info *i;
        struct inode *inode = NULL;
@@ -341,8 +344,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
        if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
                inode->i_generation &= ~1;
-                inode->i_mode = MSDOS_MKMODE(de->attr,
+                inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO);
-                        S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
                inode->i_op = sbi->dir_ops;
                inode->i_fop = &fat_dir_operations;
@@ -359,10 +361,9 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
                inode->i_nlink = fat_subdirs(inode);
        } else { /* not a directory */
                inode->i_generation |= 1;
-                inode->i_mode = MSDOS_MKMODE(de->attr,
+                inode->i_mode = fat_make_mode(sbi, de->attr,
-                    ((sbi->options.showexec && !is_exec(de->name + 8))
+                        ((sbi->options.showexec && !is_exec(de->name + 8))
-                        ? S_IRUGO|S_IWUGO : S_IRWXUGO)
+                         ? S_IRUGO|S_IWUGO : S_IRWXUGO));
-                    & ~sbi->options.fs_fmask) | S_IFREG;
                MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
                if (sbi->fat_bits == 32)
                        MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16);
@@ -378,25 +379,16 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
                if (sbi->options.sys_immutable)
                        inode->i_flags |= S_IMMUTABLE;
        }
-        MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
+        fat_save_attrs(inode, de->attr);
        inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
                           & ~((loff_t)sbi->cluster_size - 1)) >> 9;
-        inode->i_mtime.tv_sec =
-                date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date),
+        fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
-                              sbi->options.tz_utc);
-        inode->i_mtime.tv_nsec = 0;
        if (sbi->options.isvfat) {
-                int secs = de->ctime_cs / 100;
+                fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
-                int csecs = de->ctime_cs % 100;
+                                  de->cdate, de->ctime_cs);
-                inode->i_ctime.tv_sec  =
+                fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
-                        date_dos2unix(le16_to_cpu(de->ctime),
-                                      le16_to_cpu(de->cdate),
-                                      sbi->options.tz_utc) + secs;
-                inode->i_ctime.tv_nsec = csecs * 10000000;
-                inode->i_atime.tv_sec =
-                        date_dos2unix(0, le16_to_cpu(de->adate),
-                                      sbi->options.tz_utc);
-                inode->i_atime.tv_nsec = 0;
        } else
                inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -443,13 +435,8 @@ static void fat_delete_inode(struct inode *inode)
 static void fat_clear_inode(struct inode *inode)
 {
-        struct super_block *sb = inode->i_sb;
-        struct msdos_sb_info *sbi = MSDOS_SB(sb);
-        spin_lock(&sbi->inode_hash_lock);
+        /*
+         * Call fat_cache_inval_inode() first, then unhash the inode.
+         * fat_detach() unlinks i_fat_hash under inode_hash_lock, so the
+         * lock is not taken here.
+         */
         fat_cache_inval_inode(inode);
-        hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
+        fat_detach(inode);
-        spin_unlock(&sbi->inode_hash_lock);
 }
 static void fat_write_super(struct super_block *sb)
@@ -555,6 +542,20 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
+/*
+ * Return MSDOS_I(inode)->i_pos.
+ *
+ * When BITS_PER_LONG == 32 the read is done under sbi->inode_hash_lock,
+ * the same lock fat_attach() holds while storing i_pos.  On other
+ * configurations the field is read without locking.
+ * NOTE(review): presumably this is because a 64-bit loff_t cannot be
+ * loaded atomically on 32-bit machines -- confirm.
+ */
+static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
+                                    struct inode *inode)
+{
+        loff_t i_pos;
+#if BITS_PER_LONG == 32
+        spin_lock(&sbi->inode_hash_lock);
+#endif
+        i_pos = MSDOS_I(inode)->i_pos;
+#if BITS_PER_LONG == 32
+        spin_unlock(&sbi->inode_hash_lock);
+#endif
+        return i_pos;
+}
 static int fat_write_inode(struct inode *inode, int wait)
 {
        struct super_block *sb = inode->i_sb;
@@ -564,9 +565,12 @@ static int fat_write_inode(struct inode *inode, int wait)
        loff_t i_pos;
        int err;
+        if (inode->i_ino == MSDOS_ROOT_INO)
+                return 0;
 retry:
-        i_pos = MSDOS_I(inode)->i_pos;
+        i_pos = fat_i_pos_read(sbi, inode);
-        if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
+        if (!i_pos)
                return 0;
        bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
@@ -588,19 +592,17 @@ retry:
                raw_entry->size = 0;
        else
                raw_entry->size = cpu_to_le32(inode->i_size);
-        raw_entry->attr = fat_attr(inode);
+        raw_entry->attr = fat_make_attrs(inode);
        raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
        raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
-        fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time,
+        fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
-                          &raw_entry->date, sbi->options.tz_utc);
+                          &raw_entry->date, NULL);
        if (sbi->options.isvfat) {
                __le16 atime;
-                fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime,
+                fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
-                                  &raw_entry->cdate, sbi->options.tz_utc);
+                                  &raw_entry->cdate, &raw_entry->ctime_cs);
-                fat_date_unix2dos(inode->i_atime.tv_sec, &atime,
+                fat_time_unix2fat(sbi, &inode->i_atime, &atime,
-                                  &raw_entry->adate, sbi->options.tz_utc);
+                                  &raw_entry->adate, NULL);
-                raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 +
-                        inode->i_ctime.tv_nsec / 10000000;
        }
        spin_unlock(&sbi->inode_hash_lock);
        mark_buffer_dirty(bh);
@@ -819,8 +821,10 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
                        seq_puts(m, ",uni_xlate");
                if (!opts->numtail)
                        seq_puts(m, ",nonumtail");
+                if (opts->rodir)
+                        seq_puts(m, ",rodir");
        }
-        if (sbi->options.flush)
+        if (opts->flush)
                seq_puts(m, ",flush");
        if (opts->tz_utc)
                seq_puts(m, ",tz=UTC");
@@ -836,7 +840,7 @@ enum {
        Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
        Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
        Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-        Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
+        Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err,
 };
 static const match_table_t fat_tokens = {
@@ -908,6 +912,7 @@ static const match_table_t vfat_tokens = {
        {Opt_nonumtail_yes, "nonumtail=yes"},
        {Opt_nonumtail_yes, "nonumtail=true"},
        {Opt_nonumtail_yes, "nonumtail"},
+        {Opt_rodir, "rodir"},
        {Opt_err, NULL}
 };
@@ -927,10 +932,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
        opts->allow_utime = -1;
        opts->codepage = fat_default_codepage;
        opts->iocharset = fat_default_iocharset;
-        if (is_vfat)
+        if (is_vfat) {
                opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95;
-        else
+                opts->rodir = 0;
+        } else {
                opts->shortname = 0;
+                opts->rodir = 1;
+        }
        opts->name_check = 'n';
        opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
        opts->utf8 = opts->unicode_xlate = 0;
@@ -1081,6 +1089,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
                case Opt_nonumtail_yes:         /* empty or 1 or yes or true */
                        opts->numtail = 0;      /* negated option */
                        break;
+                case Opt_rodir:
+                        opts->rodir = 1;
+                        break;
                /* obsolete mount options */
                case Opt_obsolate:
@@ -1126,7 +1137,7 @@ static int fat_read_root(struct inode *inode)
        inode->i_gid = sbi->options.fs_gid;
        inode->i_version++;
        inode->i_generation = 0;
-        inode->i_mode = (S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+        inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
        inode->i_op = sbi->dir_ops;
        inode->i_fop = &fat_dir_operations;
        if (sbi->fat_bits == 32) {
@@ -1143,7 +1154,7 @@ static int fat_read_root(struct inode *inode)
        MSDOS_I(inode)->i_logstart = 0;
        MSDOS_I(inode)->mmu_private = inode->i_size;
-        MSDOS_I(inode)->i_attrs = ATTR_NONE;
+        fat_save_attrs(inode, ATTR_DIR);
        inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
        inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
        inode->i_nlink = fat_subdirs(inode)+2;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 79fb98ad36d4..ac39ebcc1496 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -8,8 +8,8 @@
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 /*
 * fat_fs_panic reports a severe file system problem and sets the file system
@@ -124,8 +124,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
                        mark_inode_dirty(inode);
        }
        if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-                fat_fs_panic(sb, "clusters badly computed (%d != %lu)",
+                fat_fs_panic(sb, "clusters badly computed (%d != %llu)",
-                        new_fclus, inode->i_blocks >> (sbi->cluster_bits - 9));
+                             new_fclus,
+                             (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
                fat_cache_inval_inode(inode);
        }
        inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);
@@ -135,65 +136,131 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 extern struct timezone sys_tz;
+/*
+ * The epoch of FAT timestamp is 1980.
+ *     :  bits :     value
+ * date:  0 -  4: day   (1 -  31)
+ * date:  5 -  8: month (1 -  12)
+ * date:  9 - 15: year  (0 - 127) from 1980
+ * time:  0 -  4: sec   (0 -  29) 2sec counts
+ * time:  5 - 10: min   (0 -  59)
+ * time: 11 - 15: hour  (0 -  23)
+ */
+#define SECS_PER_MIN    60
+#define SECS_PER_HOUR   (60 * 60)
+#define SECS_PER_DAY    (SECS_PER_HOUR * 24)
+/* 1.1.1980 00:00:00 in UNIX seconds (DAYS_DELTA * SECS_PER_DAY) */
+#define UNIX_SECS_1980  315532800L
+/*
+ * 1.1.2108 00:00:00 in UNIX seconds, i.e. one past the last year (127)
+ * the FAT date field can hold.  The value does not fit in a 32-bit long,
+ * and it is only defined when BITS_PER_LONG == 64.
+ */
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108  4354819200L
+#endif
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA      (365 * 10 + 2)
+/* 120 (2100 - 1980) isn't leap year */
+#define YEAR_2100       120
+/* y is a FAT year, i.e. the number of years since 1980 */
+#define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100)
 /* Linear day numbers of the respective 1sts in non-leap years. */
-static int day_n[] = {
+/*
+ * The table is indexed by the 1-based month, so entry 0 is padding.
+ * The trailing zero entries cover month values 13 - 15, which the 4-bit
+ * month field of a FAT date can contain.
+ */
+static time_t days_in_year[] = {
-   /* Jan  Feb  Mar  Apr   May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+        /* pad, Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec */
-        0,  31,  59,  90,  120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0
+        0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
 };
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
+/*
+ * Convert a FAT time/date pair to a UNIX date (seconds since 1.1.1970).
+ *
+ * @sbi:     consulted for options.tz_utc; unless it is set,
+ *           sys_tz.tz_minuteswest minutes are added to the result
+ * @ts:      output; both tv_sec and tv_nsec are written
+ * @__time:  little-endian FAT time field
+ * @__date:  little-endian FAT date field; a month or a day of 0 is
+ *           treated as 1
+ * @time_cs: additional time in units of 10ms (time_cs / 100 whole
+ *           seconds plus (time_cs % 100) * 10ms); 0 adds nothing
+ */
-int date_dos2unix(unsigned short time, unsigned short date, int tz_utc)
+void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+                       __le16 __time, __le16 __date, u8 time_cs)
 {
-        int month, year, secs;
+        u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
+        time_t second, day, leap_day, month, year;
-        /*
+        year  = date >> 9;
-         * first subtract and mask after that... Otherwise, if
+        month = max(1, (date >> 5) & 0xf);
-         * date == 0, bad things happen
+        day   = max(1, date & 0x1f) - 1;
-         */
-        month = ((date >> 5) - 1) & 15;
+        leap_day = (year + 3) / 4;
-        year = date >> 9;
+        if (year > YEAR_2100)           /* 2100 isn't leap year */
-        secs = (time & 31)*2+60*((time >> 5) & 63)+(time >> 11)*3600+86400*
+                leap_day--;
-            ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 &&
+        if (IS_LEAP_YEAR(year) && month > 2)
-            month < 2 ? 1 : 0)+3653);
+                leap_day++;
-                        /* days since 1.1.70 plus 80's leap day */
-        if (!tz_utc)
+        second =  (time & 0x1f) << 1;
-                secs += sys_tz.tz_minuteswest*60;
+        second += ((time >> 5) & 0x3f) * SECS_PER_MIN;
-        return secs;
+        second += (time >> 11) * SECS_PER_HOUR;
+        second += (year * 365 + leap_day
+                   + days_in_year[month] + day
+                   + DAYS_DELTA) * SECS_PER_DAY;
+        if (!sbi->options.tz_utc)
+                second += sys_tz.tz_minuteswest * SECS_PER_MIN;
+        if (time_cs) {
+                ts->tv_sec = second + (time_cs / 100);
+                ts->tv_nsec = (time_cs % 100) * 10000000;
+        } else {
+                ts->tv_sec = second;
+                ts->tv_nsec = 0;
+        }
 }
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
+/*
+ * Convert linear UNIX date to a FAT time/date pair.
+ *
+ * @sbi:     consulted for options.tz_utc; unless it is set,
+ *           sys_tz.tz_minuteswest minutes are subtracted from ts->tv_sec
+ *           before converting
+ * @ts:      the UNIX time to convert
+ * @time:    output, little-endian FAT time field (2 second resolution)
+ * @date:    output, little-endian FAT date field
+ * @time_cs: optional output (may be NULL): the odd second of ts->tv_sec
+ *           plus ts->tv_nsec, in units of 10ms
+ *
+ * Times before UNIX_SECS_1980 are stored as 1.1.1980 00:00:00 with a
+ * *time_cs of 0.  When BITS_PER_LONG == 64, times at or after
+ * UNIX_SECS_2108 are stored as 31.12.2107 23:59:58 with a *time_cs of 199.
+ */
-void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc)
+void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+                       __le16 *time, __le16 *date, u8 *time_cs)
 {
-        int day, year, nl_day, month;
+        time_t second = ts->tv_sec;
+        time_t day, leap_day, month, year;
-        if (!tz_utc)
+        if (!sbi->options.tz_utc)
-                unix_date -= sys_tz.tz_minuteswest*60;
+                second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
         /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
-        if (unix_date < 315532800)
+        if (second < UNIX_SECS_1980) {
-                unix_date = 315532800;
+                *time = 0;
+                *date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
-        *time = cpu_to_le16((unix_date % 60)/2+(((unix_date/60) % 60) << 5)+
+                if (time_cs)
-            (((unix_date/3600) % 24) << 11));
+                        *time_cs = 0;
-        day = unix_date/86400-3652;
+                return;
-        year = day/365;
+        }
-        if ((year+3)/4+365*year > day)
+#if BITS_PER_LONG == 64
+        if (second >= UNIX_SECS_2108) {
+                *time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
+                *date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
+                if (time_cs)
+                        *time_cs = 199;
+                return;
+        }
+#endif
+        day = second / SECS_PER_DAY - DAYS_DELTA;
+        year = day / 365;
+        leap_day = (year + 3) / 4;
+        if (year > YEAR_2100)           /* 2100 isn't leap year */
+                leap_day--;
+        if (year * 365 + leap_day > day)
                 year--;
-        day -= (year+3)/4+365*year;
+        leap_day = (year + 3) / 4;
-        if (day == 59 && !(year & 3)) {
+        if (year > YEAR_2100)           /* 2100 isn't leap year */
-                nl_day = day;
+                leap_day--;
+        day -= year * 365 + leap_day;
+        if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
                 month = 2;
         } else {
-                nl_day = (year & 3) || day <= 59 ? day : day-1;
+                if (IS_LEAP_YEAR(year) && day > days_in_year[3])
-                for (month = 0; month < 12; month++) {
+                        day--;
-                        if (day_n[month] > nl_day)
+                for (month = 1; month < 12; month++) {
+                        if (days_in_year[month + 1] > day)
                                 break;
                 }
         }
-        *date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9));
+        day -= days_in_year[month];
-}
-EXPORT_SYMBOL_GPL(fat_date_unix2dos);
+        *time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11
+                            | ((second / SECS_PER_MIN) % 60) << 5
+                            | (second % SECS_PER_MIN) >> 1);
+        *date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
+        if (time_cs)
+                *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
+}
+EXPORT_SYMBOL_GPL(fat_time_unix2fat);
 int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
diff --git a/fs/msdos/namei.c b/fs/fat/namei_msdos.c
index e844b9809d27..7ba03a4acbe0 100644
--- a/fs/msdos/namei.c
+++ b/fs/fat/namei_msdos.c
@@ -9,8 +9,8 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
+#include "fat.h"
 /* Characters that are undesirable in an MS-DOS file name */
 static unsigned char bad_chars[] = "*?<>|\"";
@@ -203,33 +203,37 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
 {
        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
-        struct inode *inode = NULL;
+        struct inode *inode;
-        int res;
+        int err;
-        dentry->d_op = &msdos_dentry_operations;
        lock_super(sb);
-        res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-        if (res == -ENOENT)
+        err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-                goto add;
+        if (err) {
-        if (res < 0)
+                if (err == -ENOENT) {
-                goto out;
+                        inode = NULL;
+                        goto out;
+                }
+                goto error;
+        }
        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
-                res = PTR_ERR(inode);
+                err = PTR_ERR(inode);
-                goto out;
+                goto error;
        }
-add:
+out:
-        res = 0;
+        unlock_super(sb);
+        dentry->d_op = &msdos_dentry_operations;
        dentry = d_splice_alias(inode, dentry);
        if (dentry)
                dentry->d_op = &msdos_dentry_operations;
-out:
+        return dentry;
+error:
        unlock_super(sb);
-        if (!res)
+        return ERR_PTR(err);
-                return dentry;
-        return ERR_PTR(res);
 }
 /***** Creates a directory entry (name is already formatted). */
@@ -247,7 +251,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
        if (is_hid)
                de.attr |= ATTR_HIDDEN;
        de.lcase = 0;
-        fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+        fat_time_unix2fat(sbi, ts, &time, &date, NULL);
        de.cdate = de.adate = 0;
        de.ctime = 0;
        de.ctime_cs = 0;
diff --git a/fs/vfat/namei.c b/fs/fat/namei_vfat.c
index 155c10b4adbd..bf326d4356a3 100644
--- a/fs/vfat/namei.c
+++ b/fs/fat/namei_vfat.c
@@ -16,36 +16,75 @@
 */
 #include <linux/module.h>
 #include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
+#include "fat.h"
-static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+/*
+ * If a new entry was created in the parent, it could create the 8.3
+ * alias (the shortname of the longname).  So, the parent may have a
+ * negative dentry which matches the created 8.3 alias.
+ *
+ * If that happened, the negative dentry isn't actually negative
+ * anymore.  So, drop it.
+ */
+static int vfat_revalidate_shortname(struct dentry *dentry)
 {
        int ret = 1;
+        spin_lock(&dentry->d_lock);
-        if (!dentry->d_inode &&
+        if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-            nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
-                /*
-                 * negative dentry is dropped, in order to make sure
-                 * to use the name which a user desires if this is
-                 * create path.
-                 */
                ret = 0;
-        else {
+        spin_unlock(&dentry->d_lock);
-                spin_lock(&dentry->d_lock);
-                if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-                        ret = 0;
-                spin_unlock(&dentry->d_lock);
-        }
        return ret;
 }
+static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+        /* This is not a negative dentry. Always valid. */
+        if (dentry->d_inode)
+                return 1;
+        return vfat_revalidate_shortname(dentry);
+}
+static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
+{
+        /*
+         * This is not a negative dentry. Always valid.
+         *
+         * Note, rename() to an existing directory entry will have ->d_inode,
+         * and will use the existing name, not the name specified by the user.
+         *
+         * We may be able to drop this positive dentry here. But dropping a
+         * positive dentry isn't a good idea. So cases like
+         * rename("filename", "FILENAME") are unsupported for now.
+         */
+        if (dentry->d_inode)
+                return 1;
+        /*
+         * This may be nfsd (or something), anyway, we can't see the
+         * intent of this. So, since this can be for creation, drop it.
+         */
+        if (!nd)
+                return 0;
+        /*
+         * Drop the negative dentry, in order to make sure to use the
+         * case sensitive name which is specified by user if this is
+         * for creation.
+         */
+        if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+                if (nd->flags & LOOKUP_CREATE)
+                        return 0;
+        }
+        return vfat_revalidate_shortname(dentry);
+}
 /* returns the length of a struct qstr, ignoring trailing dots */
 static unsigned int vfat_striptail_len(struct qstr *qstr)
 {
@@ -127,25 +166,16 @@ static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
        return 1;
 }
-static struct dentry_operations vfat_dentry_ops[4] = {
+static struct dentry_operations vfat_ci_dentry_ops = {
-        {
+        .d_revalidate   = vfat_revalidate_ci,
-                .d_hash         = vfat_hashi,
+        .d_hash         = vfat_hashi,
-                .d_compare      = vfat_cmpi,
+        .d_compare      = vfat_cmpi,
-        },
+};
-        {
-                .d_revalidate   = vfat_revalidate,
+static struct dentry_operations vfat_dentry_ops = {
-                .d_hash         = vfat_hashi,
+        .d_revalidate   = vfat_revalidate,
-                .d_compare      = vfat_cmpi,
+        .d_hash         = vfat_hash,
-        },
+        .d_compare      = vfat_cmp,
-        {
-                .d_hash         = vfat_hash,
-                .d_compare      = vfat_cmp,
-        },
-        {
-                .d_revalidate   = vfat_revalidate,
-                .d_hash         = vfat_hash,
-                .d_compare      = vfat_cmp,
-        }
 };
 /* Characters that are undesirable in an MS-DOS file name */
@@ -569,6 +599,7 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name,
        unsigned char msdos_name[MSDOS_NAME];
        wchar_t *uname;
        __le16 time, date;
+        u8 time_cs;
        int err, ulen, usize, i;
        loff_t offset;
@@ -621,10 +652,10 @@ shortname:
        memcpy(de->name, msdos_name, MSDOS_NAME);
        de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
        de->lcase = lcase;
-        fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+        fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
        de->time = de->ctime = time;
        de->date = de->cdate = de->adate = date;
-        de->ctime_cs = 0;
+        de->ctime_cs = time_cs;
        de->start = cpu_to_le16(cluster);
        de->starthi = cpu_to_le16(cluster >> 16);
        de->size = 0;
@@ -683,46 +714,58 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 {
        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
-        struct inode *inode = NULL;
+        struct inode *inode;
        struct dentry *alias;
-        int err, table;
+        int err;
        lock_super(sb);
-        table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
-        dentry->d_op = &vfat_dentry_ops[table];
        err = vfat_find(dir, &dentry->d_name, &sinfo);
        if (err) {
-                table++;
+                if (err == -ENOENT) {
+                        inode = NULL;
+                        goto out;
+                }
                goto error;
        }
        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
-                unlock_super(sb);
+                err = PTR_ERR(inode);
-                return ERR_CAST(inode);
+                goto error;
        }
-        alias = d_find_alias(inode);
-        if (alias) {
-                if (d_invalidate(alias) == 0)
-                        dput(alias);
-                else {
-                        iput(inode);
-                        unlock_super(sb);
-                        return alias;
-                }
+        alias = d_find_alias(inode);
+        if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+                /*
+                 * This inode has a non-DCACHE_DISCONNECTED dentry. This
+                 * means the user did ->lookup() by another name
+                 * (longname vs. its 8.3 alias) in the past.
+                 *
+                 * Switch to the new one for locality reasons if possible.
+                 */
+                BUG_ON(d_unhashed(alias));
+                if (!S_ISDIR(inode->i_mode))
+                        d_move(alias, dentry);
+                iput(inode);
+                unlock_super(sb);
+                return alias;
        }
-error:
+out:
        unlock_super(sb);
-        dentry->d_op = &vfat_dentry_ops[table];
+        dentry->d_op = sb->s_root->d_op;
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        dentry = d_splice_alias(inode, dentry);
        if (dentry) {
-                dentry->d_op = &vfat_dentry_ops[table];
+                dentry->d_op = sb->s_root->d_op;
                dentry->d_time = dentry->d_parent->d_inode->i_version;
        }
        return dentry;
+error:
+        unlock_super(sb);
+        return ERR_PTR(err);
 }
 static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
@@ -1014,9 +1057,9 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
                return res;
        if (MSDOS_SB(sb)->options.name_check != 's')
-                sb->s_root->d_op = &vfat_dentry_ops[0];
+                sb->s_root->d_op = &vfat_ci_dentry_ops;
        else
-                sb->s_root->d_op = &vfat_dentry_ops[2];
+                sb->s_root->d_op = &vfat_dentry_ops;
        return 0;
 }
diff --git a/fs/file_table.c b/fs/file_table.c
index efc06faede6c..5ad0eca6eea2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -269,6 +269,10 @@ void __fput(struct file *file)
        eventpoll_release(file);
        locks_remove_flock(file);
+        if (unlikely(file->f_flags & FASYNC)) {
+                if (file->f_op && file->f_op->fasync)
+                        file->f_op->fasync(-1, file, 0);
+        }
        if (file->f_op && file->f_op->release)
                file->f_op->release(inode, file);
        security_file_free(file);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 87250b6a8682..b72361479be2 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1056,7 +1056,6 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
                end_requests(fc, &fc->pending);
                end_requests(fc, &fc->processing);
                spin_unlock(&fc->lock);
-                fasync_helper(-1, file, 0, &fc->fasync);
                fuse_conn_put(fc);
        }
diff --git a/fs/inotify.c b/fs/inotify.c
index 690e72595e6e..7bbed1b89825 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(get_inotify_watch);
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+        struct super_block *sb = watch->inode->i_sb;
+        spin_lock(&sb_lock);
+        if (sb->s_count >= S_BIAS) {
+                atomic_inc(&sb->s_active);
+                spin_unlock(&sb_lock);
+                atomic_inc(&watch->count);
+                return 1;
+        }
+        spin_unlock(&sb_lock);
+        return 0;
+}
 /**
 * put_inotify_watch - decrements the ref count on a given watch.  cleans up
 * watch references if the count reaches zero.  inotify_watch is freed by
@@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(put_inotify_watch);
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+        struct super_block *sb = watch->inode->i_sb;
+        put_inotify_watch(watch);
+        deactivate_super(sb);
+}
 /*
 * inotify_handle_get_wd - returns the next WD for use by the given handle
 *
@@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(inotify_init_watch);
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_unmount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super() yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_unmount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+        struct super_block *sb = watch->inode->i_sb;
+        s32 wd = watch->wd;
+        spin_lock(&sb_lock);
+        if (sb->s_count >= S_BIAS) {
+                atomic_inc(&sb->s_active);
+                spin_unlock(&sb_lock);
+                get_inotify_watch(watch);
+                mutex_unlock(&ih->mutex);
+                return 1;       /* the best outcome */
+        }
+        sb->s_count++;
+        spin_unlock(&sb_lock);
+        mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
+        down_read(&sb->s_umount);
+        if (likely(!sb->s_root)) {
+                /* fs is already shut down; the watch is dead */
+                drop_super(sb);
+                return 0;
+        }
+        /* raced with the final deactivate_super() */
+        mutex_lock(&ih->mutex);
+        if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+                /* the watch is dead */
+                mutex_unlock(&ih->mutex);
+                drop_super(sb);
+                return 0;
+        }
+        /* still alive or freed and reused with the same sb and wd; kill */
+        get_inotify_watch(watch);
+        mutex_unlock(&ih->mutex);
+        return 2;
+}
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+        struct super_block *sb = watch->inode->i_sb;
+        put_inotify_watch(watch);
+        switch (how) {
+        case 1:
+                deactivate_super(sb);
+                break;
+        case 2:
+                drop_super(sb);
+        }
+}
 /**
 * inotify_destroy - clean up and destroy an inotify instance
 * @ih: inotify handle
@@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih)
         * pretty.  We cannot do a simple iteration over the list, because we
         * do not know the inode until we iterate to the watch.  But we need to
         * hold inode->inotify_mutex before ih->mutex.  The following works.
+         *
+         * AV: it had to become even uglier to start working ;-/
         */
        while (1) {
                struct inotify_watch *watch;
                struct list_head *watches;
+                struct super_block *sb;
                struct inode *inode;
+                int how;
                mutex_lock(&ih->mutex);
                watches = &ih->watches;
@@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih)
                        break;
                }
                watch = list_first_entry(watches, struct inotify_watch, h_list);
-                get_inotify_watch(watch);
+                sb = watch->inode->i_sb;
-                mutex_unlock(&ih->mutex);
+                how = pin_to_kill(ih, watch);
+                if (!how)
+                        continue;
                inode = watch->inode;
                mutex_lock(&inode->inotify_mutex);
@@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih)
                mutex_unlock(&ih->mutex);
                mutex_unlock(&inode->inotify_mutex);
-                put_inotify_watch(watch);
+                unpin_and_kill(watch, how);
        }
        /* free this handle: the put matching the get in inotify_init() */
@@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch)
 int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
        struct inotify_watch *watch;
+        struct super_block *sb;
        struct inode *inode;
+        int how;
        mutex_lock(&ih->mutex);
        watch = idr_find(&ih->idr, wd);
@@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
                mutex_unlock(&ih->mutex);
                return -EINVAL;
        }
-        get_inotify_watch(watch);
+        sb = watch->inode->i_sb;
+        how = pin_to_kill(ih, watch);
+        if (!how)
+                return 0;
        inode = watch->inode;
-        mutex_unlock(&ih->mutex);
        mutex_lock(&inode->inotify_mutex);
        mutex_lock(&ih->mutex);
@@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
        mutex_unlock(&ih->mutex);
        mutex_unlock(&inode->inotify_mutex);
-        put_inotify_watch(watch);
+        unpin_and_kill(watch, how);
        return 0;
 }
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index d85c7d931cdf..d367e9b92862 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -537,9 +537,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
                inotify_dev_event_dequeue(dev);
        mutex_unlock(&dev->ev_mutex);
-        if (file->f_flags & FASYNC)
-                inotify_fasync(-1, file, 0);
        /* free this device: the put matching the get in inotify_init() */
        put_inotify_dev(dev);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 1bd8d4acc6f2..61f32f3868cd 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -115,7 +115,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 */
 void __log_wait_for_space(journal_t *journal)
 {
-        int nblocks;
+        int nblocks, space_left;
        assert_spin_locked(&journal->j_state_lock);
        nblocks = jbd_space_needed(journal);
@@ -128,25 +128,42 @@ void __log_wait_for_space(journal_t *journal)
                /*
                 * Test again, another process may have checkpointed while we
                 * were waiting for the checkpoint lock. If there are no
-                 * outstanding transactions there is nothing to checkpoint and
+                 * transactions ready to be checkpointed, try to recover
-                 * we can't make progress. Abort the journal in this case.
+                 * journal space by calling cleanup_journal_tail(), and if
+                 * that doesn't work, by waiting for the currently committing
+                 * transaction to complete.  If there is absolutely no way
+                 * to make progress, this is either a BUG or corrupted
+                 * filesystem, so abort the journal and leave a stack
+                 * trace for forensic evidence.
                 */
                spin_lock(&journal->j_state_lock);
                spin_lock(&journal->j_list_lock);
                nblocks = jbd_space_needed(journal);
-                if (__log_space_left(journal) < nblocks) {
+                space_left = __log_space_left(journal);
+                if (space_left < nblocks) {
                        int chkpt = journal->j_checkpoint_transactions != NULL;
+                        tid_t tid = 0;
+                        if (journal->j_committing_transaction)
+                                tid = journal->j_committing_transaction->t_tid;
                        spin_unlock(&journal->j_list_lock);
                        spin_unlock(&journal->j_state_lock);
                        if (chkpt) {
                                log_do_checkpoint(journal);
+                        } else if (cleanup_journal_tail(journal) == 0) {
+                                /* We were able to recover space; yay! */
+                                ;
+                        } else if (tid) {
+                                log_wait_commit(journal, tid);
                        } else {
-                                printk(KERN_ERR "%s: no transactions\n",
+                                printk(KERN_ERR "%s: needed %d blocks and "
-                                       __func__);
+                                       "only had %d space available\n",
+                                       __func__, nblocks, space_left);
+                                printk(KERN_ERR "%s: no way to get more "
+                                       "journal space\n", __func__);
+                                WARN_ON(1);
                                journal_abort(journal, 0);
                        }
                        spin_lock(&journal->j_state_lock);
                } else {
                        spin_unlock(&journal->j_list_lock);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d15cd6e7251e..60d4c32c8808 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -860,7 +860,6 @@ out:
 * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
- * @credits: store the number of taken credits here (if not NULL)
 *
 * Sometimes there is a need to distinguish between metadata which has
 * been committed to disk and that which has not.  The ext3fs code uses
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9203c3332f17..9497718fe920 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -116,7 +116,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 */
 void __jbd2_log_wait_for_space(journal_t *journal)
 {
-        int nblocks;
+        int nblocks, space_left;
        assert_spin_locked(&journal->j_state_lock);
        nblocks = jbd_space_needed(journal);
@@ -129,25 +129,43 @@ void __jbd2_log_wait_for_space(journal_t *journal)
                /*
                 * Test again, another process may have checkpointed while we
                 * were waiting for the checkpoint lock. If there are no
-                 * outstanding transactions there is nothing to checkpoint and
+                 * transactions ready to be checkpointed, try to recover
-                 * we can't make progress. Abort the journal in this case.
+                 * journal space by calling jbd2_cleanup_journal_tail(), and if
+                 * that doesn't work, by waiting for the currently committing
+                 * transaction to complete.  If there is absolutely no way
+                 * to make progress, this is either a BUG or corrupted
+                 * filesystem, so abort the journal and leave a stack
+                 * trace for forensic evidence.
                 */
                spin_lock(&journal->j_state_lock);
                spin_lock(&journal->j_list_lock);
                nblocks = jbd_space_needed(journal);
-                if (__jbd2_log_space_left(journal) < nblocks) {
+                space_left = __jbd2_log_space_left(journal);
+                if (space_left < nblocks) {
                        int chkpt = journal->j_checkpoint_transactions != NULL;
+                        tid_t tid = 0;
+                        if (journal->j_committing_transaction)
+                                tid = journal->j_committing_transaction->t_tid;
                        spin_unlock(&journal->j_list_lock);
                        spin_unlock(&journal->j_state_lock);
                        if (chkpt) {
                                jbd2_log_do_checkpoint(journal);
+                        } else if (jbd2_cleanup_journal_tail(journal) == 0) {
+                                /* We were able to recover space; yay! */
+                                ;
+                        } else if (tid) {
+                                jbd2_log_wait_commit(journal, tid);
                        } else {
-                                printk(KERN_ERR "%s: no transactions\n",
+                                printk(KERN_ERR "%s: needed %d blocks and "
-                                       __func__);
+                                       "only had %d space available\n",
+                                       __func__, nblocks, space_left);
+                                printk(KERN_ERR "%s: no way to get more "
+                                       "journal space in %s\n", __func__,
+                                       journal->j_devname);
+                                WARN_ON(1);
                                jbd2_journal_abort(journal, 0);
                        }
                        spin_lock(&journal->j_state_lock);
                } else {
                        spin_unlock(&journal->j_list_lock);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8b119e16aa36..ebc667bc54a8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -974,6 +974,9 @@ restart_loop:
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);
+        if (journal->j_commit_callback)
+                journal->j_commit_callback(journal, commit_transaction);
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -995,11 +998,8 @@ restart_loop:
        }
        spin_unlock(&journal->j_list_lock);
-        if (journal->j_commit_callback)
-                journal->j_commit_callback(journal, commit_transaction);
        trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-                   journal->j_devname, commit_transaction->t_tid,
+                   journal->j_devname, journal->j_commit_sequence,
                   journal->j_tail_sequence);
        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 783de118de92..e70d657a19f8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1089,6 +1089,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
        if (!journal->j_wbuf) {
                printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
                        __func__);
+                jbd2_stats_proc_exit(journal);
                kfree(journal);
                return NULL;
        }
@@ -1098,6 +1099,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
        if (err) {
                printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
                       __func__);
+                jbd2_stats_proc_exit(journal);
                kfree(journal);
                return NULL;
        }
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 8adebd3e43c6..3cceef4ad2b7 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -85,15 +85,15 @@ static int jffs2_garbage_collect_thread(void *_c)
        for (;;) {
                allow_signal(SIGHUP);
        again:
+                spin_lock(&c->erase_completion_lock);
                if (!jffs2_thread_should_wake(c)) {
                        set_current_state (TASK_INTERRUPTIBLE);
+                        spin_unlock(&c->erase_completion_lock);
                        D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n"));
-                        /* Yes, there's a race here; we checked jffs2_thread_should_wake()
-                           before setting current->state to TASK_INTERRUPTIBLE. But it doesn't
-                           matter - We don't care if we miss a wakeup, because the GC thread
-                           is only an optimisation anyway. */
                        schedule();
-                }
+                } else
+                        spin_unlock(&c->erase_completion_lock);
+                        
                /* This thread is purely an optimisation. But if it runs when
                   other things could be running, it actually makes things a
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 47b045797e42..90cb60d09787 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -19,7 +19,7 @@
 static void *lzo_mem;
 static void *lzo_compress_buf;
-static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(deflate_mutex);     /* for lzo_mem and lzo_compress_buf */
 static void free_workspace(void)
 {
@@ -49,18 +49,21 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
        mutex_lock(&deflate_mutex);
        ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem);
-        mutex_unlock(&deflate_mutex);
        if (ret != LZO_E_OK)
-                return -1;
+                goto fail;
        if (compress_size > *dstlen)
-                return -1;
+                goto fail;
        memcpy(cpage_out, lzo_compress_buf, compress_size);
-        *dstlen = compress_size;
+        mutex_unlock(&deflate_mutex);
+        *dstlen = compress_size;
        return 0;
+ fail:
+        mutex_unlock(&deflate_mutex);
+        return -1;
 }
 static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 0875b60b4bf7..21a052915aa9 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,9 +261,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
        jffs2_sum_reset_collected(c->summary); /* reset collected summary */
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
        /* adjust write buffer offset, else we get a non contiguous write bug */
        if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
                c->wbuf_ofs = 0xffffffff;
+#endif
        D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
diff --git a/fs/libfs.c b/fs/libfs.c
index 74688598bcf7..e960a8321902 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -814,7 +814,7 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 014f6ce48172..4dfdcbc6bf68 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -434,6 +434,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
         * reclaim all locks we hold on this server.
         */
        memset(&saddr, 0, sizeof(saddr));
+        saddr.sin_family = AF_INET;
        saddr.sin_addr.s_addr = argp->addr;
        nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 548b0bb2b84d..3ca89e2a9381 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -466,6 +466,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
         * reclaim all locks we hold on this server.
         */
        memset(&saddr, 0, sizeof(saddr));
+        saddr.sin_family = AF_INET;
        saddr.sin_addr.s_addr = argp->addr;
        nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
diff --git a/fs/msdos/Makefile b/fs/msdos/Makefile
deleted file mode 100644
index ea67646fcb95..000000000000
--- a/fs/msdos/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the Linux msdos filesystem routines.
-#
-obj-$(CONFIG_MSDOS_FS) += msdos.o
-msdos-y := namei.o
diff --git a/fs/namespace.c b/fs/namespace.c
index cce46702d33c..65b3dc844c87 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1815,8 +1815,8 @@ static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
                while (!list_empty(&graveyard)) {
                        m = list_first_entry(&graveyard, struct vfsmount,
                                                mnt_expire);
-                        touch_mnt_namespace(mnt->mnt_ns);
+                        touch_mnt_namespace(m->mnt_ns);
-                        umount_tree(mnt, 1, umounts);
+                        umount_tree(m, 1, umounts);
                }
        }
 }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b9195c02a863..d22eb383e1cf 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -5,7 +5,7 @@
 *
 *  nfs inode and superblock handling functions
 *
- *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
 *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
 *
 *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
@@ -908,21 +908,16 @@ static int nfs_size_need_update(const struct inode *inode, const struct nfs_fatt
        return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
 }
-static unsigned long nfs_attr_generation_counter;
+static atomic_long_t nfs_attr_generation_counter;
 static unsigned long nfs_read_attr_generation_counter(void)
 {
-        smp_rmb();
+        return atomic_long_read(&nfs_attr_generation_counter);
-        return nfs_attr_generation_counter;
 }
 unsigned long nfs_inc_attr_generation_counter(void)
 {
-        unsigned long ret;
+        return atomic_long_inc_return(&nfs_attr_generation_counter);
-        smp_rmb();
-        ret = ++nfs_attr_generation_counter;
-        smp_wmb();
-        return ret;
 }
 void nfs_fattr_init(struct nfs_fattr *fattr)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a3b0061dfd45..f48db679a1c6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -5,7 +5,7 @@
 *
 *  nfs superblock handling functions
 *
- *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
 *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
 *
 *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0bc56f6d9276..4433c8f00163 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1875,11 +1875,11 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
                return -ENOMEM;
        offset = *offsetp;
-        cdp->err = nfserr_eof; /* will be cleared on successful read */
        while (1) {
                unsigned int reclen;
+                cdp->err = nfserr_eof; /* will be cleared on successful read */
                buf.used = 0;
                buf.full = 0;
@@ -1912,8 +1912,6 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
                        de = (struct buffered_dirent *)((char *)de + reclen);
                }
                offset = vfs_llseek(file, 0, SEEK_CUR);
-                if (!buf.full)
-                        break;
        }
 done:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8d3225a78073..e2570a3bc2b2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -247,8 +247,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
        mlog_entry_void();
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
+                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }
@@ -312,8 +312,8 @@ static int ocfs2_simple_size_update(struct inode *inode,
        handle_t *handle = NULL;
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
+                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }
@@ -679,8 +679,7 @@ leave:
 /* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
- * ->commit_write(). */
 static int ocfs2_write_zero_page(struct inode *inode,
                                 u64 size)
 {
@@ -1056,8 +1055,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
                   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
+                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }
@@ -1260,8 +1259,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
        }
        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
+                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }
@@ -1353,8 +1352,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
                goto out;
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
+                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out;
        }
@@ -1867,6 +1866,13 @@ relock:
                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
                                                    ppos, count, ocount);
                if (written < 0) {
+                        /*
+                         * direct write may have instantiated a few
+                         * blocks outside i_size. Trim these off again.
+                         * Don't need i_size_read because we hold i_mutex.
+                         */
+                        if (*ppos + count > inode->i_size)
+                                vmtruncate(inode, inode->i_size);
                        ret = written;
                        goto out_dio;
                }
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4903688f72a9..7aa00d511874 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1106,6 +1106,12 @@ void ocfs2_clear_inode(struct inode *inode)
        oi->ip_last_trans = 0;
        oi->ip_dir_start_lookup = 0;
        oi->ip_blkno = 0ULL;
+        /*
+         * ip_jinode is used to track txns against this inode. We ensure that
+         * the journal is flushed before journal shutdown. Thus it is safe to
+         * have inodes get cleaned up after journal shutdown.
+         */
        jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
                                       &oi->ip_jinode);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 81e40677eecb..99fe9d584f3c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -690,6 +690,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
        /* Shutdown the kernel journal system */
        jbd2_journal_destroy(journal->j_journal);
+        journal->j_journal = NULL;
        OCFS2_I(inode)->ip_open_count--;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3dc18d67557c..eea1d24713ea 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -113,7 +113,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
         * ocfs2_write_begin_nolock().
         */
        if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
-                ret = -EINVAL;
+                /*
+                 * the page has been unmapped in ocfs2_data_downconvert_worker.
+                 * So return 0 here and let VFS retry.
+                 */
+                ret = 0;
                goto out;
        }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 485a6aa0ad39..f4967e634ffd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -378,8 +378,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        }
        inode = new_inode(dir->i_sb);
-        if (IS_ERR(inode)) {
+        if (!inode) {
-                status = PTR_ERR(inode);
+                status = -ENOMEM;
                mlog(ML_ERROR, "new_inode failed!\n");
                goto leave;
        }
@@ -491,8 +491,10 @@ leave:
                        brelse(*new_fe_bh);
                        *new_fe_bh = NULL;
                }
-                if (inode)
+                if (inode) {
+                        clear_nlink(inode);
                        iput(inode);
+                }
        }
        mlog_exit(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a21a465490c4..fef7ece32376 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -473,6 +473,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
                (____gd)->bg_signature);                                \
 } while (0)
+/*
+ * Evaluates to nonzero when strcmp() reports (ptr)->xb_signature equal to
+ * OCFS2_XATTR_BLOCK_SIGNATURE, and to zero otherwise.
+ */
+#define OCFS2_IS_VALID_XATTR_BLOCK(ptr)                                 \
+        (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
 static inline unsigned long ino_from_blkno(struct super_block *sb,
                                           u64 blkno)
 {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f24ce3d3f956..5f180cf7abbd 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -742,12 +742,12 @@ struct ocfs2_group_desc
 */
 struct ocfs2_xattr_entry {
        __le32  xe_name_hash;    /* hash value of xattr prefix+suffix. */
-        __le16  xe_name_offset;  /* byte offset from the 1st etnry in the local
+        __le16  xe_name_offset;  /* byte offset from the 1st entry in the
                                    local xattr storage(inode, xattr block or
                                    xattr bucket). */
        __u8    xe_name_len;     /* xattr name len, does't include prefix. */
-        __u8    xe_type;         /* the low 7 bits indicates the name prefix's
+        __u8    xe_type;         /* the low 7 bits indicate the name prefix
-                                  * type and the highest 1 bits indicate whether
+                                  * type and the highest bit indicates whether
                                  * the EA is stored in the local storage. */
        __le64  xe_value_size;   /* real xattr value length. */
 };
@@ -766,9 +766,10 @@ struct ocfs2_xattr_header {
                                                   xattr. */
        __le16  xh_name_value_len;              /* total length of name/value
                                                   length in this bucket. */
-        __le16  xh_num_buckets;                 /* bucket nums in one extent
+        __le16  xh_num_buckets;                 /* Number of xattr buckets
-                                                   record, only valid in the
+                                                   in this extent record,
-                                                   first bucket. */
+                                                   only valid in the first
+                                                   bucket. */
        __le64  xh_csum;
        struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
@@ -776,8 +777,8 @@ struct ocfs2_xattr_header {
 /*
 * On disk structure for xattr value root.
 *
- * It is used when one extended attribute's size is larger, and we will save it
+ * When an xattr's value is large enough, it is stored in an external
- * in an outside cluster. It will stored in a b-tree like file content.
+ * b-tree like file data.  The xattr value root points to this structure.
 */
 struct ocfs2_xattr_value_root {
 /*00*/  __le32  xr_clusters;              /* clusters covered by xattr value. */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 802c41492214..054e2efb0b7e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3,25 +3,20 @@
 *
 * xattr.c
 *
- * Copyright (C) 2008 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
 *
 * CREDITS:
- * Lots of code in this file is taken from ext3.
+ * Lots of code in this file is copied from linux/fs/ext3/xattr.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
+ * License version 2 as published by the Free Software Foundation.
- * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
 */
 #include <linux/capability.h>
@@ -83,7 +78,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
        NULL
 };
-static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
        [OCFS2_XATTR_INDEX_USER]        = &ocfs2_xattr_user_handler,
        [OCFS2_XATTR_INDEX_TRUSTED]     = &ocfs2_xattr_trusted_handler,
 };
@@ -116,6 +111,10 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
                                             int *block_off,
                                             int *new_offset);
+static int ocfs2_xattr_block_find(struct inode *inode,
+                                  int name_index,
+                                  const char *name,
+                                  struct ocfs2_xattr_search *xs);
 static int ocfs2_xattr_index_block_find(struct inode *inode,
                                        struct buffer_head *root_bh,
                                        int name_index,
@@ -137,6 +136,24 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
                                          struct buffer_head *xb_bh);
+/*
+ * Returns (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE, i.e.
+ * how many bucket-sized units fit in one cluster (assuming, as the field
+ * name suggests, that s_clustersize_bits is log2 of the cluster size in
+ * bytes - TODO confirm).  The quotient is converted to u16 on return.
+ */
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+        return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+/*
+ * Returns OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits): the
+ * number of blocks of this superblock's block size that make up one
+ * xattr bucket.  The quotient is converted to u16 on return.
+ */
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+        return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+/*
+ * Returns how many struct ocfs2_xattr_entry records fit in the part of a
+ * block (sb->s_blocksize bytes) that follows the fixed fields of struct
+ * ocfs2_xattr_header, i.e. everything from xh_entries onwards.
+ */
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+        u16 len = sb->s_blocksize -
+                 offsetof(struct ocfs2_xattr_header, xh_entries);
+        return len / sizeof(struct ocfs2_xattr_entry);
+}
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
        struct xattr_handler *handler = NULL;
@@ -542,14 +559,12 @@ static int ocfs2_xattr_block_list(struct inode *inode,
                mlog_errno(ret);
                return ret;
        }
-        /*Verify the signature of xattr block*/
-        if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-                   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-                ret = -EFAULT;
-                goto cleanup;
-        }
        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+                ret = -EIO;
+                goto cleanup;
+        }
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
@@ -749,47 +764,25 @@ static int ocfs2_xattr_block_get(struct inode *inode,
                                 size_t buffer_size,
                                 struct ocfs2_xattr_search *xs)
 {
-        struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-        struct buffer_head *blk_bh = NULL;
        struct ocfs2_xattr_block *xb;
        struct ocfs2_xattr_value_root *xv;
        size_t size;
        int ret = -ENODATA, name_offset, name_len, block_off, i;
-        if (!di->i_xattr_loc)
-                return ret;
        memset(&xs->bucket, 0, sizeof(xs->bucket));
-        ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+        ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
-        if (ret < 0) {
+        if (ret) {
                mlog_errno(ret);
-                return ret;
-        }
-        /*Verify the signature of xattr block*/
-        if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-                   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-                ret = -EFAULT;
                goto cleanup;
        }
-        xs->xattr_bh = blk_bh;
+        if (xs->not_found) {
-        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+                ret = -ENODATA;
-        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
-                xs->header = &xb->xb_attrs.xb_header;
-                xs->base = (void *)xs->header;
-                xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
-                xs->here = xs->header->xh_entries;
-                ret = ocfs2_xattr_find_entry(name_index, name, xs);
-        } else
-                ret = ocfs2_xattr_index_block_find(inode, blk_bh,
-                                                   name_index,
-                                                   name, xs);
-        if (ret)
                goto cleanup;
+        }
+        xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
        size = le64_to_cpu(xs->here->xe_value_size);
        if (buffer) {
                ret = -ERANGE;
@@ -828,7 +821,8 @@ cleanup:
                brelse(xs->bucket.bhs[i]);
        memset(&xs->bucket, 0, sizeof(xs->bucket));
-        brelse(blk_bh);
+        brelse(xs->xattr_bh);
+        xs->xattr_bh = NULL;
        return ret;
 }
@@ -837,11 +831,11 @@ cleanup:
 * Copy an extended attribute into the buffer provided.
 * Buffer is NULL to compute the size of buffer required.
 */
-int ocfs2_xattr_get(struct inode *inode,
+static int ocfs2_xattr_get(struct inode *inode,
-                    int name_index,
+                           int name_index,
-                    const char *name,
+                           const char *name,
-                    void *buffer,
+                           void *buffer,
-                    size_t buffer_size)
+                           size_t buffer_size)
 {
        int ret;
        struct ocfs2_dinode *di = NULL;
@@ -871,7 +865,7 @@ int ocfs2_xattr_get(struct inode *inode,
        down_read(&oi->ip_xattr_sem);
        ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
                                    buffer_size, &xis);
-        if (ret == -ENODATA)
+        if (ret == -ENODATA && di->i_xattr_loc)
                ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
                                            buffer_size, &xbs);
        up_read(&oi->ip_xattr_sem);
@@ -1229,7 +1223,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
        free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
        if (free < 0)
-                return -EFAULT;
+                return -EIO;
        if (!xs->not_found) {
                size_t size = 0;
@@ -1514,10 +1508,9 @@ static int ocfs2_xattr_free_block(struct inode *inode,
                goto out;
        }
-        /*Verify the signature of xattr block*/
+        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-        if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+                ret = -EIO;
-                ret = -EFAULT;
                goto out;
        }
@@ -1527,7 +1520,6 @@ static int ocfs2_xattr_free_block(struct inode *inode,
                goto out;
        }
-        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        blk = le64_to_cpu(xb->xb_blkno);
        bit = le16_to_cpu(xb->xb_suballoc_bit);
        bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1771,15 +1763,14 @@ static int ocfs2_xattr_block_find(struct inode *inode,
                mlog_errno(ret);
                return ret;
        }
-        /*Verify the signature of xattr block*/
-        if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-                   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                        ret = -EFAULT;
+                ret = -EIO;
-                        goto cleanup;
+                goto cleanup;
        }
        xs->xattr_bh = blk_bh;
-        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                xs->header = &xb->xb_attrs.xb_header;
@@ -1806,52 +1797,6 @@ cleanup:
 }
 /*
- * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
- * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
- * re-initialized.
- */
-static int ocfs2_restore_xattr_block(struct inode *inode,
-                                     struct ocfs2_xattr_search *xs)
-{
-        int ret;
-        handle_t *handle;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct ocfs2_xattr_block *xb =
-                (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
-        struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
-        u16 xb_flags = le16_to_cpu(xb->xb_flags);
-        BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
-                le16_to_cpu(el->l_next_free_rec) != 0);
-        handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                handle = NULL;
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
-               offsetof(struct ocfs2_xattr_block, xb_attrs));
-        xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
-        ocfs2_journal_dirty(handle, xs->xattr_bh);
-out_commit:
-        ocfs2_commit_trans(osb, handle);
-out:
-        return ret;
-}
-/*
 * ocfs2_xattr_block_set()
 *
 * Set, replace or remove an extended attribute into external block.
@@ -1961,8 +1906,6 @@ out:
        }
        ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
-        if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
-                ret = ocfs2_restore_xattr_block(inode, xs);
 end:
@@ -2398,7 +2341,8 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
        BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
        mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
-             "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+             "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
+             first_hash);
        ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
                                      p_blkno, first_hash, num_clusters, xs);
@@ -2422,7 +2366,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
        memset(&bucket, 0, sizeof(bucket));
        mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
-             clusters, blkno);
+             clusters, (unsigned long long)blkno);
        for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
                ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
@@ -2440,7 +2384,8 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
                if (i == 0)
                        num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
-                mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+                mlog(0, "iterating xattr bucket %llu, first hash %u\n",
+                     (unsigned long long)blkno,
                     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
                if (func) {
                        ret = func(inode, &bucket, para);
@@ -2776,7 +2721,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
         */
        blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
-        mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+        mlog(0, "allocate 1 cluster from %llu to xattr block\n",
+             (unsigned long long)blkno);
        xh_bh = sb_getblk(inode->i_sb, blkno);
        if (!xh_bh) {
@@ -2818,7 +2764,11 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
        if (data_bh)
                ocfs2_journal_dirty(handle, data_bh);
-        ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+        ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
        /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
        memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2941,8 +2891,8 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
        mlog(0, "adjust xattr bucket in %llu, count = %u, "
             "xh_free_start = %u, xh_name_value_len = %u.\n",
-             blkno, le16_to_cpu(xh->xh_count), xh_free_start,
+             (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
-             le16_to_cpu(xh->xh_name_value_len));
+             xh_free_start, le16_to_cpu(xh->xh_name_value_len));
        /*
         * sort all the entries by their offset.
@@ -3058,7 +3008,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
        prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
        mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-             prev_blkno, new_blkno);
+             (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
        /*
         * We need to update the 1st half of the new cluster and
@@ -3168,26 +3118,74 @@ static int ocfs2_read_xattr_bucket(struct inode *inode,
 }
 /*
- * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * Find the position at which to split a bucket in two.
+ * Entries with the same hash value have to stay together in one bucket,
+ * so the split can only happen where the hash value changes.
+ *
+ * If this ocfs2_xattr_header covers more than one hash value, find a
+ * place where the hash value changes.  Try to find the most even split.
+ * The most common case is that all entries have different hash values,
+ * and the first check we make will find a place to split.
+ *
+ * @xh: header supplying xh_entries[] and the entry count (xh_count).
+ *
+ * Returns the index of the first entry after the chosen boundary, or
+ * xh_count if no boundary was found (which includes xh_count < 2).
+ */
+static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh)
+{
+        struct ocfs2_xattr_entry *entries = xh->xh_entries;
+        int count = le16_to_cpu(xh->xh_count);
+        int delta, middle = count / 2;
+        /*
+         * We start at the middle.  Each step gets farther away in both
+         * directions.  We therefore hit the change in hash value
+         * nearest to the middle.  Note that this loop does not execute for
+         * count < 2, because middle is then 0.
+         */
+        for (delta = 0; delta < middle; delta++) {
+                /*
+                 * Check the boundary delta entries before middle.
+                 * cmp_xe() is presumably nonzero when the two entries'
+                 * hashes differ - confirm against its definition.
+                 */
+                if (cmp_xe(&entries[middle - delta - 1],
+                           &entries[middle - delta]))
+                        return middle - delta;
+                /*
+                 * For even counts, don't walk off the end: there is no
+                 * entries[count] to compare against.
+                 */
+                if ((middle + delta + 1) == count)
+                        continue;
+                /* Now try the boundary delta entries past middle */
+                if (cmp_xe(&entries[middle + delta],
+                           &entries[middle + delta + 1]))
+                        return middle + delta + 1;
+        }
+        /* No boundary found: every entry had the same hash */
+        return count;
+}
+/*
+ * Move some xattrs in old bucket(blk) to new bucket(new_blk).
 * first_hash will record the 1st hash of the new bucket.
+ *
+ * Normally half of the xattrs will be moved.  But we have to make
+ * sure that the xattrs with the same hash value are stored in the
+ * same bucket. If all the xattrs in this bucket have the same hash
+ * value, the new bucket will be initialized as an empty one and the
+ * first_hash will be initialized as (hash_value+1).
 */
-static int ocfs2_half_xattr_bucket(struct inode *inode,
+static int ocfs2_divide_xattr_bucket(struct inode *inode,
-                                   handle_t *handle,
+                                    handle_t *handle,
-                                   u64 blk,
+                                    u64 blk,
-                                   u64 new_blk,
+                                    u64 new_blk,
-                                   u32 *first_hash,
+                                    u32 *first_hash,
-                                   int new_bucket_head)
+                                    int new_bucket_head)
 {
        int ret, i;
-        u16 count, start, len, name_value_len, xe_len, name_offset;
+        int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        struct buffer_head **s_bhs, **t_bhs = NULL;
        struct ocfs2_xattr_header *xh;
        struct ocfs2_xattr_entry *xe;
        int blocksize = inode->i_sb->s_blocksize;
-        mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+        mlog(0, "move some of xattrs from bucket %llu to %llu\n",
-             blk, new_blk);
+             (unsigned long long)blk, (unsigned long long)new_blk);
        s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
        if (!s_bhs)
@@ -3220,21 +3218,44 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
        for (i = 0; i < blk_per_bucket; i++) {
                ret = ocfs2_journal_access(handle, inode, t_bhs[i],
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
+                                           new_bucket_head ?
+                                           OCFS2_JOURNAL_ACCESS_CREATE :
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }
+        xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+        count = le16_to_cpu(xh->xh_count);
+        start = ocfs2_xattr_find_divide_pos(xh);
+        if (start == count) {
+                xe = &xh->xh_entries[start-1];
+                /*
+                 * Initialize a new empty bucket here.
+                 * The hash value is set as one larger than
+                 * that of the last entry in the previous bucket.
+                 */
+                for (i = 0; i < blk_per_bucket; i++)
+                        memset(t_bhs[i]->b_data, 0, blocksize);
+                xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+                xh->xh_free_start = cpu_to_le16(blocksize);
+                xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
+                le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
+                goto set_num_buckets;
+        }
        /* copy the whole bucket to the new first. */
        for (i = 0; i < blk_per_bucket; i++)
                memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
        /* update the new bucket. */
        xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
-        count = le16_to_cpu(xh->xh_count);
-        start = count / 2;
        /*
         * Calculate the total name/value len and xh_free_start for
@@ -3291,6 +3312,7 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
                        xh->xh_free_start = xe->xe_name_offset;
        }
+set_num_buckets:
        /* set xh->xh_num_buckets for the new xh. */
        if (new_bucket_head)
                xh->xh_num_buckets = cpu_to_le16(1);
@@ -3308,9 +3330,13 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
                *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
        /*
-         * Now only update the 1st block of the old bucket.
+         * Now only update the 1st block of the old bucket.  If we
-         * Please note that the entry has been sorted already above.
+         * just added a new empty bucket, there is no need to modify
+         * it.
         */
+        if (start == count)
+                goto out;
        xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
        memset(&xh->xh_entries[start], 0,
               sizeof(struct ocfs2_xattr_entry) * (count - start));
@@ -3358,7 +3384,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
        BUG_ON(s_blkno == t_blkno);
        mlog(0, "cp bucket %llu to %llu, target is %d\n",
-             s_blkno, t_blkno, t_is_new);
+             (unsigned long long)s_blkno, (unsigned long long)t_blkno,
+             t_is_new);
        s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
                        GFP_NOFS);
@@ -3382,6 +3409,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
        for (i = 0; i < blk_per_bucket; i++) {
                ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+                                           t_is_new ?
+                                           OCFS2_JOURNAL_ACCESS_CREATE :
                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret)
                        goto out;
@@ -3428,7 +3457,8 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
        struct ocfs2_xattr_header *xh;
        u64 to_blk_start = to_blk;
-        mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+        mlog(0, "cp xattrs from cluster %llu to %llu\n",
+             (unsigned long long)src_blk, (unsigned long long)to_blk);
        /*
         * We need to update the new cluster and 1 more for the update of
@@ -3493,15 +3523,15 @@ out:
 }
 /*
- * Move half of the xattrs in this cluster to the new cluster.
+ * Move some xattrs in this cluster to the new cluster.
 * This function should only be called when bucket size == cluster size.
 * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
 */
-static int ocfs2_half_xattr_cluster(struct inode *inode,
+static int ocfs2_divide_xattr_cluster(struct inode *inode,
-                                    handle_t *handle,
+                                      handle_t *handle,
-                                    u64 prev_blk,
+                                      u64 prev_blk,
-                                    u64 new_blk,
+                                      u64 new_blk,
-                                    u32 *first_hash)
+                                      u32 *first_hash)
 {
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        int ret, credits = 2 * blk_per_bucket;
@@ -3515,8 +3545,8 @@ static int ocfs2_half_xattr_cluster(struct inode *inode,
        }
        /* Move half of the xattr in start_blk to the next bucket. */
-        return  ocfs2_half_xattr_bucket(inode, handle, prev_blk,
+        return  ocfs2_divide_xattr_bucket(inode, handle, prev_blk,
-                                        new_blk, first_hash, 1);
+                                          new_blk, first_hash, 1);
 }
 /*
@@ -3559,7 +3589,8 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-             prev_blk, prev_clusters, new_blk);
+             (unsigned long long)prev_blk, prev_clusters,
+             (unsigned long long)new_blk);
        if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
                ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -3578,9 +3609,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
                                                     last_blk, new_blk,
                                                     v_start);
                else {
-                        ret = ocfs2_half_xattr_cluster(inode, handle,
+                        ret = ocfs2_divide_xattr_cluster(inode, handle,
-                                                       last_blk, new_blk,
+                                                         last_blk, new_blk,
-                                                       v_start);
+                                                         v_start);
                        if ((*header_bh)->b_blocknr == last_blk && extend)
                                *extend = 0;
@@ -3629,7 +3660,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
        mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
             "previous xattr blkno = %llu\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-             prev_cpos, prev_blkno);
+             prev_cpos, (unsigned long long)prev_blkno);
        ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
@@ -3716,7 +3747,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
                }
        }
        mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
-             num_bits, block, v_start);
+             num_bits, (unsigned long long)block, v_start);
        ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
                                  num_bits, 0, meta_ac);
        if (ret < 0) {
@@ -3761,7 +3792,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
        u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
        mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-             "from %llu, len = %u\n", start_blk,
+             "from %llu, len = %u\n", (unsigned long long)start_blk,
             (unsigned long long)first_bh->b_blocknr, num_clusters);
        BUG_ON(bucket >= num_buckets);
@@ -3797,8 +3828,8 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
        }
        /* Move half of the xattr in start_blk to the next bucket. */
-        ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
+        ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
-                                      start_blk + blk_per_bucket, NULL, 0);
+                                        start_blk + blk_per_bucket, NULL, 0);
        le16_add_cpu(&first_xh->xh_num_buckets, 1);
        ocfs2_journal_dirty(handle, first_bh);
@@ -4146,7 +4177,7 @@ static int ocfs2_xattr_value_update_size(struct inode *inode,
        handle_t *handle = NULL;
        handle = ocfs2_start_trans(osb, 1);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
@@ -4313,7 +4344,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
        }
        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
@@ -4489,11 +4520,21 @@ out:
        return ret;
 }
-/* check whether the xattr bucket is filled up with the same hash value. */
+/*
+ * check whether the xattr bucket is filled up with the same hash value.
+ * If we want to insert the xattr with the same hash, return -ENOSPC.
+ * If we want to insert an xattr with a different hash value, go ahead
+ * and ocfs2_divide_xattr_bucket will handle this.
+ */
 static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
-                                              struct ocfs2_xattr_bucket *bucket)
+                                              struct ocfs2_xattr_bucket *bucket,
+                                              const char *name)
 {
        struct ocfs2_xattr_header *xh = bucket->xh;
+        u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+        if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
+                return 0;
        if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
            xh->xh_entries[0].xe_name_hash) {
@@ -4616,7 +4657,9 @@ try_again:
                 * one bucket's worth, so check it here whether we need to
                 * add a new bucket for the insert.
                 */
-                ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+                ret = ocfs2_check_xattr_bucket_collision(inode,
+                                                         &xs->bucket,
+                                                         xi->name);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -4727,14 +4770,11 @@ out:
 /*
 * 'trusted' attributes support
 */
-#define XATTR_TRUSTED_PREFIX "trusted."
 static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
                                       size_t list_size, const char *name,
                                       size_t name_len)
 {
-        const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
        if (list && total_len <= list_size) {
@@ -4771,18 +4811,14 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
        .set    = ocfs2_xattr_trusted_set,
 };
 /*
 * 'user' attributes support
 */
-#define XATTR_USER_PREFIX "user."
 static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
                                    size_t list_size, const char *name,
                                    size_t name_len)
 {
-        const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index c25c7c62a059..1d8314c7656d 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -3,24 +3,16 @@
 *
 * xattr.h
 *
- * Function prototypes
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
+ * License version 2 as published by the Free Software Foundation.
- * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
 */
 #ifndef OCFS2_XATTR_H
@@ -40,29 +32,11 @@ enum ocfs2_xattr_type {
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
-extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
-                           size_t, int);
-extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
 extern struct xattr_handler *ocfs2_xattr_handlers[];
-static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
-{
+int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
-        return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+                    size_t, int);
-}
+int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
-static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
-{
-        return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
-}
-static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
-{
-        u16 len = sb->s_blocksize -
-                 offsetof(struct ocfs2_xattr_header, xh_entries);
-        return len / sizeof(struct ocfs2_xattr_entry);
-}
 #endif /* OCFS2_XATTR_H */
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 633f7a0ebb2c..6d5b213b8a9b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -348,8 +348,8 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
                   whole_disk_show, NULL);
-int add_partition(struct gendisk *disk, int partno,
+struct hd_struct *add_partition(struct gendisk *disk, int partno,
-                  sector_t start, sector_t len, int flags)
+                                sector_t start, sector_t len, int flags)
 {
        struct hd_struct *p;
        dev_t devt = MKDEV(0, 0);
@@ -361,15 +361,15 @@ int add_partition(struct gendisk *disk, int partno,
        err = disk_expand_part_tbl(disk, partno);
        if (err)
-                return err;
+                return ERR_PTR(err);
        ptbl = disk->part_tbl;
        if (ptbl->part[partno])
-                return -EBUSY;
+                return ERR_PTR(-EBUSY);
        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
        if (!init_part_stats(p)) {
                err = -ENOMEM;
@@ -395,7 +395,7 @@ int add_partition(struct gendisk *disk, int partno,
        err = blk_alloc_devt(p, &devt);
        if (err)
-                goto out_free;
+                goto out_free_stats;
        pdev->devt = devt;
        /* delay uevent until 'holders' subdir is created */
@@ -424,18 +424,20 @@ int add_partition(struct gendisk *disk, int partno,
        if (!ddev->uevent_suppress)
                kobject_uevent(&pdev->kobj, KOBJ_ADD);
-        return 0;
+        return p;
+out_free_stats:
+        free_part_stats(p);
 out_free:
        kfree(p);
-        return err;
+        return ERR_PTR(err);
 out_del:
        kobject_put(p->holder_dir);
        device_del(pdev);
 out_put:
        put_device(pdev);
        blk_free_devt(devt);
-        return err;
+        return ERR_PTR(err);
 }
 /* Not exported, helper to add_disk(). */
@@ -566,15 +568,16 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
                               disk->disk_name, p, (unsigned long long) size);
                        size = get_capacity(disk) - from;
                }
-                res = add_partition(disk, p, from, size, state->parts[p].flags);
+                part = add_partition(disk, p, from, size,
-                if (res) {
+                                     state->parts[p].flags);
-                        printk(KERN_ERR " %s: p%d could not be added: %d\n",
+                if (IS_ERR(part)) {
-                                disk->disk_name, p, -res);
+                        printk(KERN_ERR " %s: p%d could not be added: %ld\n",
+                               disk->disk_name, p, -PTR_ERR(part));
                        continue;
                }
 #ifdef CONFIG_BLK_DEV_MD
                if (state->parts[p].flags & ADDPART_FLAG_RAID)
-                        md_autodetect_dev(bdev->bd_dev+p);
+                        md_autodetect_dev(part_to_dev(part)->devt);
 #endif
        }
        kfree(state);
diff --git a/fs/pipe.c b/fs/pipe.c
index fcba6542b8d0..7aea8b89baac 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -717,14 +717,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 static int
 pipe_read_release(struct inode *inode, struct file *filp)
 {
-        pipe_read_fasync(-1, filp, 0);
        return pipe_release(inode, 1, 0);
 }
 static int
 pipe_write_release(struct inode *inode, struct file *filp)
 {
-        pipe_write_fasync(-1, filp, 0);
        return pipe_release(inode, 0, 1);
 }
@@ -733,7 +731,6 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
 {
        int decr, decw;
-        pipe_rdwr_fasync(-1, filp, 0);
        decr = (filp->f_mode & FMODE_READ) != 0;
        decw = (filp->f_mode & FMODE_WRITE) != 0;
        return pipe_release(inode, decr, decw);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb9f4b05703d..6af7fba7abb1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -40,7 +40,7 @@
 *
 *
 * Alan Cox          :  security fixes.
- *                      <Alan.Cox@linux.org>
+ *                      <alan@lxorguk.ukuu.org.uk>
 *
 * Al Viro           :  safe handling of mm_struct
 *
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 94fcfff6863a..06ed10b7da9e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,6 +31,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
        inode->i_mode = table->mode;
+        inode->i_uid = inode->i_gid = 0;
        if (!table->child) {
                inode->i_mode |= S_IFREG;
                inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f146..df26aa88fa47 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,43 +1,45 @@
-#include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
-#include <linux/seq_file.h>
 #include <linux/time.h>
 #include <asm/cputime.h>
-static int uptime_proc_show(struct seq_file *m, void *v)
+static int proc_calc_metrics(char *page, char **start, off_t off,
+                                 int count, int *eof, int len)
+{
+        if (len <= off + count)
+                *eof = 1;
+        *start = page + off;
+        len -= off;
+        if (len > count)
+                len = count;
+        if (len < 0)
+                len = 0;
+        return len;
+}
+static int uptime_read_proc(char *page, char **start, off_t off, int count,
+                            int *eof, void *data)
 {
        struct timespec uptime;
        struct timespec idle;
+        int len;
        cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
        do_posix_clock_monotonic_gettime(&uptime);
        monotonic_to_bootbased(&uptime);
        cputime_to_timespec(idletime, &idle);
-        seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+        len = sprintf(page, "%lu.%02lu %lu.%02lu\n",
                        (unsigned long) uptime.tv_sec,
                        (uptime.tv_nsec / (NSEC_PER_SEC / 100)),
                        (unsigned long) idle.tv_sec,
                        (idle.tv_nsec / (NSEC_PER_SEC / 100)));
-        return 0;
+        return proc_calc_metrics(page, start, off, count, eof, len);
 }
-static int uptime_proc_open(struct inode *inode, struct file *file)
-{
-        return single_open(file, uptime_proc_show, NULL);
-}
-static const struct file_operations uptime_proc_fops = {
-        .open           = uptime_proc_open,
-        .read           = seq_read,
-        .llseek         = seq_lseek,
-        .release        = single_release,
-};
 static int __init proc_uptime_init(void)
 {
-        proc_create("uptime", 0, NULL, &uptime_proc_fops);
+        create_proc_read_entry("uptime", 0, NULL, uptime_read_proc, NULL);
        return 0;
 }
 module_init(proc_uptime_init);
diff --git a/fs/splice.c b/fs/splice.c
index a1e701c27156..1abab5cee4ba 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -731,8 +731,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
        };
        /*
-         * The actor worker might be calling ->prepare_write and
+         * The actor worker might be calling ->write_begin and
-         * ->commit_write. Most of the time, these expect i_mutex to
+         * ->write_end. Most of the time, these expect i_mutex to
         * be held. Since this may result in an ABBA deadlock with
         * pipe->inode, we have to order lock acquiry here.
         */
diff --git a/fs/vfat/Makefile b/fs/vfat/Makefile
deleted file mode 100644
index 40f2798a4f08..000000000000
--- a/fs/vfat/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux vfat-filesystem routines.
-#
-obj-$(CONFIG_VFAT_FS) += vfat.o
-vfat-y := namei.o
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9e561a9cefca..a11a8390bf6c 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1566,11 +1566,14 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        int nmap, error, w, count, c, got, i, mapi;
        xfs_trans_t *tp;
        xfs_mount_t *mp;
+        xfs_drfsbno_t   nblks;
        dp = args->dp;
        mp = dp->i_mount;
        w = args->whichfork;
        tp = args->trans;
+        nblks = dp->i_d.di_nblocks;
        /*
         * For new directories adjust the file offset and block count.
         */
@@ -1647,6 +1650,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        }
        if (mapp != &map)
                kmem_free(mapp);
+        /* account for newly allocated blocks in reserved blocks total */
+        args->total -= dp->i_d.di_nblocks - nblks;
        *new_blkno = (xfs_dablk_t)bno;
        return 0;
 }
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 80e0dc51361c..1afb12278b8d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -525,11 +525,13 @@ xfs_dir2_grow_inode(
        xfs_mount_t     *mp;
        int             nmap;           /* number of bmap entries */
        xfs_trans_t     *tp;
+        xfs_drfsbno_t   nblks;
        xfs_dir2_trace_args_s("grow_inode", args, space);
        dp = args->dp;
        tp = args->trans;
        mp = dp->i_mount;
+        nblks = dp->i_d.di_nblocks;
        /*
         * Set lowest possible block in the space requested.
         */
@@ -622,7 +624,11 @@ xfs_dir2_grow_inode(
         */
        if (mapp != &map)
                kmem_free(mapp);
+        /* account for newly allocated blocks in reserved blocks total */
+        args->total -= dp->i_d.di_nblocks - nblks;
        *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
        /*
         * Update file's size if this is the data space and it grew.
         */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dbd9cef852ec..a391b955df01 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1414,7 +1414,7 @@ xfs_itruncate_start(
        mp = ip->i_mount;
        /* wait for the completion of any pending DIOs */
-        if (new_size < ip->i_size)
+        if (new_size == 0 || new_size < ip->i_size)
                vn_iowait(ip);
        /*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0b02c6443551..3608a0f0a5f6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -563,6 +563,11 @@ xfs_log_mount(
        }
        mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+        if (!mp->m_log) {
+                cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!");
+                error = ENOMEM;
+                goto out;
+        }
        /*
         * Initialize the AIL now we have a log.
@@ -601,6 +606,7 @@ xfs_log_mount(
        return 0;
 error:
        xfs_log_unmount_dealloc(mp);
+out:
        return error;
 }       /* xfs_log_mount */
@@ -1217,7 +1223,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
        int                     i;
        int                     iclogsize;
-        log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
+        log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
+        if (!log)
+                return NULL;
        log->l_mp          = mp;
        log->l_targ        = log_target;
@@ -1249,6 +1257,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
        xlog_get_iclog_buffer_size(mp, log);
        bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
+        if (!bp)
+                goto out_free_log;
        XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
        XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
        XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
@@ -1275,13 +1285,17 @@ xlog_alloc_log(xfs_mount_t	*mp,
        iclogsize = log->l_iclog_size;
        ASSERT(log->l_iclog_size >= 4096);
        for (i=0; i < log->l_iclog_bufs; i++) {
-                *iclogp = (xlog_in_core_t *)
+                *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
-                          kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
+                if (!*iclogp)
+                        goto out_free_iclog;
                iclog = *iclogp;
                iclog->ic_prev = prev_iclog;
                prev_iclog = iclog;
                bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+                if (!bp)
+                        goto out_free_iclog;
                if (!XFS_BUF_CPSEMA(bp))
                        ASSERT(0);
                XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
@@ -1323,6 +1337,25 @@ xlog_alloc_log(xfs_mount_t	*mp,
        log->l_iclog->ic_prev = prev_iclog;     /* re-write 1st prev ptr */
        return log;
+out_free_iclog:
+        for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
+                prev_iclog = iclog->ic_next;
+                if (iclog->ic_bp) {
+                        sv_destroy(&iclog->ic_force_wait);
+                        sv_destroy(&iclog->ic_write_wait);
+                        xfs_buf_free(iclog->ic_bp);
+                        xlog_trace_iclog_dealloc(iclog);
+                }
+                kmem_free(iclog);
+        }
+        spinlock_destroy(&log->l_icloglock);
+        spinlock_destroy(&log->l_grant_lock);
+        xlog_trace_loggrant_dealloc(log);
+        xfs_buf_free(log->l_xbuf);
+out_free_log:
+        kmem_free(log);
+        return NULL;
 }       /* xlog_alloc_log */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 82d46ce69d5f..70e3ba32e6be 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1419,7 +1419,13 @@ xlog_recover_add_to_trans(
                return 0;
        item = trans->r_itemq;
        if (item == NULL) {
-                ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
+                /* we need to catch log corruptions here */
+                if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+                        xlog_warn("XFS: xlog_recover_add_to_trans: "
+                                  "bad header magic number");
+                        ASSERT(0);
+                        return XFS_ERROR(EIO);
+                }
                if (len == sizeof(xfs_trans_header_t))
                        xlog_recover_add_item(&trans->r_itemq);
                memcpy(&trans->r_theader, dp, len); /* d, s, l */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a4503f5e9497..15f5dd22fbb2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1245,6 +1245,9 @@ xfs_unmountfs(
        XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
+        if (mp->m_quotainfo)
+                XFS_QM_DONE(mp);
        /*
         * Flush out the log synchronously so that we know for sure
         * that nothing is pinned.  This is important because bflush()
@@ -1297,8 +1300,6 @@ xfs_unmountfs(
        xfs_errortag_clearall(mp, 0);
 #endif
        xfs_free_perag(mp);
-        if (mp->m_quotainfo)
-                XFS_QM_DONE(mp);
 }
 STATIC void