Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 19
-rw-r--r--  fs/block_dev.c | 8
-rw-r--r--  fs/char_dev.c | 6
-rw-r--r--  fs/coda/psdev.c | 8
-rw-r--r--  fs/compat.c | 4
-rw-r--r--  fs/configfs/dir.c | 5
-rw-r--r--  fs/configfs/file.c | 2
-rw-r--r--  fs/configfs/mount.c | 13
-rw-r--r--  fs/debugfs/inode.c | 13
-rw-r--r--  fs/dlm/lockspace.c | 50
-rw-r--r--  fs/ecryptfs/main.c | 129
-rw-r--r--  fs/fs-writeback.c | 2
-rw-r--r--  fs/fuse/inode.c | 26
-rw-r--r--  fs/gfs2/Makefile | 2
-rw-r--r--  fs/gfs2/bmap.c | 37
-rw-r--r--  fs/gfs2/bmap.h | 2
-rw-r--r--  fs/gfs2/daemon.c | 50
-rw-r--r--  fs/gfs2/daemon.h | 1
-rw-r--r--  fs/gfs2/dir.c | 4
-rw-r--r--  fs/gfs2/eaops.c | 84
-rw-r--r--  fs/gfs2/eattr.c | 2
-rw-r--r--  fs/gfs2/glock.c | 83
-rw-r--r--  fs/gfs2/glops.c | 110
-rw-r--r--  fs/gfs2/incore.h | 47
-rw-r--r--  fs/gfs2/inode.c | 41
-rw-r--r--  fs/gfs2/inode.h | 12
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 5
-rw-r--r--  fs/gfs2/locking/dlm/plock.c | 18
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 38
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 9
-rw-r--r--  fs/gfs2/log.c | 119
-rw-r--r--  fs/gfs2/log.h | 14
-rw-r--r--  fs/gfs2/lops.c | 71
-rw-r--r--  fs/gfs2/main.c | 3
-rw-r--r--  fs/gfs2/meta_io.c | 97
-rw-r--r--  fs/gfs2/meta_io.h | 1
-rw-r--r--  fs/gfs2/ops_address.c | 649
-rw-r--r--  fs/gfs2/ops_address.h | 7
-rw-r--r--  fs/gfs2/ops_file.c | 229
-rw-r--r--  fs/gfs2/ops_file.h | 24
-rw-r--r--  fs/gfs2/ops_fstype.c | 73
-rw-r--r--  fs/gfs2/ops_inode.c | 20
-rw-r--r--  fs/gfs2/ops_inode.h | 6
-rw-r--r--  fs/gfs2/ops_super.c | 1
-rw-r--r--  fs/gfs2/ops_vm.c | 169
-rw-r--r--  fs/gfs2/ops_vm.h | 18
-rw-r--r--  fs/gfs2/quota.c | 29
-rw-r--r--  fs/gfs2/recovery.c | 18
-rw-r--r--  fs/gfs2/rgrp.c | 104
-rw-r--r--  fs/gfs2/rgrp.h | 4
-rw-r--r--  fs/gfs2/super.c | 25
-rw-r--r--  fs/gfs2/sys.c | 36
-rw-r--r--  fs/gfs2/trans.c | 5
-rw-r--r--  fs/gfs2/trans.h | 1
-rw-r--r--  fs/hfs/btree.c | 7
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jfs/jfs_dtree.c | 27
-rw-r--r--  fs/jfs/jfs_dtree.h | 4
-rw-r--r--  fs/jfs/jfs_imap.c | 4
-rw-r--r--  fs/jfs/jfs_logmgr.c | 34
-rw-r--r--  fs/jfs/jfs_metapage.c | 43
-rw-r--r--  fs/jfs/jfs_mount.c | 2
-rw-r--r--  fs/jfs/jfs_umount.c | 4
-rw-r--r--  fs/jfs/namei.c | 4
-rw-r--r--  fs/jfs/resize.c | 2
-rw-r--r--  fs/jfs/super.c | 6
-rw-r--r--  fs/namei.c | 4
-rw-r--r--  fs/namespace.c | 11
-rw-r--r--  fs/nfsd/nfs3xdr.c | 5
-rw-r--r--  fs/nfsd/nfsxdr.c | 5
-rw-r--r--  fs/ocfs2/Makefile | 5
-rw-r--r--  fs/ocfs2/alloc.c | 8
-rw-r--r--  fs/ocfs2/aops.c | 137
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 65
-rw-r--r--  fs/ocfs2/buffer_head_io.h | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 2
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 4
-rw-r--r--  fs/ocfs2/cluster/sys.c | 83
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 4
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 8
-rw-r--r--  fs/ocfs2/cluster/ver.c | 2
-rw-r--r--  fs/ocfs2/dcache.c | 8
-rw-r--r--  fs/ocfs2/dir.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmfsver.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 19
-rw-r--r--  fs/ocfs2/dlm/dlmver.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 546
-rw-r--r--  fs/ocfs2/dlmglue.h | 31
-rw-r--r--  fs/ocfs2/endian.h | 5
-rw-r--r--  fs/ocfs2/export.c | 8
-rw-r--r--  fs/ocfs2/file.c | 163
-rw-r--r--  fs/ocfs2/file.h | 6
-rw-r--r--  fs/ocfs2/heartbeat.c | 80
-rw-r--r--  fs/ocfs2/heartbeat.h | 2
-rw-r--r--  fs/ocfs2/inode.c | 84
-rw-r--r--  fs/ocfs2/inode.h | 10
-rw-r--r--  fs/ocfs2/ioctl.c | 31
-rw-r--r--  fs/ocfs2/journal.c | 51
-rw-r--r--  fs/ocfs2/journal.h | 6
-rw-r--r--  fs/ocfs2/localalloc.c | 50
-rw-r--r--  fs/ocfs2/locks.c | 125
-rw-r--r--  fs/ocfs2/locks.h (renamed from fs/ocfs2/vote.h) | 29
-rw-r--r--  fs/ocfs2/mmap.c | 17
-rw-r--r--  fs/ocfs2/namei.c | 66
-rw-r--r--  fs/ocfs2/ocfs2.h | 35
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 22
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r--  fs/ocfs2/resize.c | 634
-rw-r--r--  fs/ocfs2/resize.h | 32
-rw-r--r--  fs/ocfs2/slot_map.c | 19
-rw-r--r--  fs/ocfs2/slot_map.h | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 20
-rw-r--r--  fs/ocfs2/suballoc.h | 8
-rw-r--r--  fs/ocfs2/super.c | 140
-rw-r--r--  fs/ocfs2/sysfile.c | 2
-rw-r--r--  fs/ocfs2/ver.c | 2
-rw-r--r--  fs/ocfs2/vote.c | 756
-rw-r--r--  fs/openpromfs/inode.c | 2
-rw-r--r--  fs/partitions/check.c | 327
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  fs/proc/base.c | 78
-rw-r--r--  fs/read_write.c | 63
-rw-r--r--  fs/splice.c | 8
-rw-r--r--  fs/sysfs/dir.c | 10
-rw-r--r--  fs/sysfs/file.c | 67
-rw-r--r--  fs/sysfs/symlink.c | 88
126 files changed, 3403 insertions, 3307 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 487236c6583..b6df18f1f67 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -440,14 +440,8 @@ config OCFS2_FS
440 Tools web page: http://oss.oracle.com/projects/ocfs2-tools 440 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
441 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ 441 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
442 442
443 Note: Features which OCFS2 does not support yet: 443 For more information on OCFS2, see the file
444 - extended attributes 444 <file:Documentation/filesystems/ocfs2.txt>.
445 - quotas
446 - cluster aware flock
447 - Directory change notification (F_NOTIFY)
448 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
449 - POSIX ACLs
450 - readpages / writepages (not user visible)
451 445
452config OCFS2_DEBUG_MASKLOG 446config OCFS2_DEBUG_MASKLOG
453 bool "OCFS2 logging support" 447 bool "OCFS2 logging support"
@@ -1028,8 +1022,8 @@ config HUGETLB_PAGE
1028 def_bool HUGETLBFS 1022 def_bool HUGETLBFS
1029 1023
1030config CONFIGFS_FS 1024config CONFIGFS_FS
1031 tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" 1025 tristate "Userspace-driven configuration filesystem"
1032 depends on SYSFS && EXPERIMENTAL 1026 depends on SYSFS
1033 help 1027 help
1034 configfs is a ram-based filesystem that provides the converse 1028 configfs is a ram-based filesystem that provides the converse
1035 of sysfs's functionality. Where sysfs is a filesystem-based 1029 of sysfs's functionality. Where sysfs is a filesystem-based
@@ -1112,8 +1106,8 @@ config HFS_FS
1112 help 1106 help
1113 If you say Y here, you will be able to mount Macintosh-formatted 1107 If you say Y here, you will be able to mount Macintosh-formatted
1114 floppy disks and hard drive partitions with full read-write access. 1108 floppy disks and hard drive partitions with full read-write access.
1115 Please read <file:fs/hfs/HFS.txt> to learn about the available mount 1109 Please read <file:Documentation/filesystems/hfs.txt> to learn about
1116 options. 1110 the available mount options.
1117 1111
1118 To compile this file system support as a module, choose M here: the 1112 To compile this file system support as a module, choose M here: the
1119 module will be called hfs. 1113 module will be called hfs.
@@ -2130,4 +2124,3 @@ source "fs/nls/Kconfig"
2130source "fs/dlm/Kconfig" 2124source "fs/dlm/Kconfig"
2131 2125
2132endmenu 2126endmenu
2133
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 993f78c5522..e48a630ae26 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -738,9 +738,9 @@ EXPORT_SYMBOL(bd_release);
738static struct kobject *bdev_get_kobj(struct block_device *bdev) 738static struct kobject *bdev_get_kobj(struct block_device *bdev)
739{ 739{
740 if (bdev->bd_contains != bdev) 740 if (bdev->bd_contains != bdev)
741 return kobject_get(&bdev->bd_part->kobj); 741 return kobject_get(&bdev->bd_part->dev.kobj);
742 else 742 else
743 return kobject_get(&bdev->bd_disk->kobj); 743 return kobject_get(&bdev->bd_disk->dev.kobj);
744} 744}
745 745
746static struct kobject *bdev_get_holder(struct block_device *bdev) 746static struct kobject *bdev_get_holder(struct block_device *bdev)
@@ -1176,7 +1176,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
1176 ret = -ENXIO; 1176 ret = -ENXIO;
1177 goto out_first; 1177 goto out_first;
1178 } 1178 }
1179 kobject_get(&p->kobj); 1179 kobject_get(&p->dev.kobj);
1180 bdev->bd_part = p; 1180 bdev->bd_part = p;
1181 bd_set_size(bdev, (loff_t) p->nr_sects << 9); 1181 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
1182 } 1182 }
@@ -1299,7 +1299,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
1299 module_put(owner); 1299 module_put(owner);
1300 1300
1301 if (bdev->bd_contains != bdev) { 1301 if (bdev->bd_contains != bdev) {
1302 kobject_put(&bdev->bd_part->kobj); 1302 kobject_put(&bdev->bd_part->dev.kobj);
1303 bdev->bd_part = NULL; 1303 bdev->bd_part = NULL;
1304 } 1304 }
1305 bdev->bd_disk = NULL; 1305 bdev->bd_disk = NULL;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c3bfa76765c..2c7a8b5b459 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -510,9 +510,8 @@ struct cdev *cdev_alloc(void)
510{ 510{
511 struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); 511 struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
512 if (p) { 512 if (p) {
513 p->kobj.ktype = &ktype_cdev_dynamic;
514 INIT_LIST_HEAD(&p->list); 513 INIT_LIST_HEAD(&p->list);
515 kobject_init(&p->kobj); 514 kobject_init(&p->kobj, &ktype_cdev_dynamic);
516 } 515 }
517 return p; 516 return p;
518} 517}
@@ -529,8 +528,7 @@ void cdev_init(struct cdev *cdev, const struct file_operations *fops)
529{ 528{
530 memset(cdev, 0, sizeof *cdev); 529 memset(cdev, 0, sizeof *cdev);
531 INIT_LIST_HEAD(&cdev->list); 530 INIT_LIST_HEAD(&cdev->list);
532 cdev->kobj.ktype = &ktype_cdev_default; 531 kobject_init(&cdev->kobj, &ktype_cdev_default);
533 kobject_init(&cdev->kobj);
534 cdev->ops = fops; 532 cdev->ops = fops;
535} 533}
536 534
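The char_dev.c hunks above track the kobject API rework in which kobject_init() takes the kobj_type as a second argument instead of the caller assigning kobj.ktype by hand before the call. A minimal sketch of that pattern for a dynamically allocated object follows; struct demo_obj, demo_ktype and demo_obj_alloc() are hypothetical names for illustration, not part of this patch.

#include <linux/kobject.h>
#include <linux/slab.h>

struct demo_obj {			/* hypothetical container object */
	int value;
	struct kobject kobj;
};

static void demo_release(struct kobject *kobj)
{
	/* called when the last reference is dropped via kobject_put() */
	kfree(container_of(kobj, struct demo_obj, kobj));
}

static struct kobj_type demo_ktype = {
	.release = demo_release,
};

static struct demo_obj *demo_obj_alloc(void)
{
	struct demo_obj *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (d)
		kobject_init(&d->kobj, &demo_ktype);	/* ktype bound at init time */
	/* the object would later be exposed with kobject_add(&d->kobj, parent,
	 * "name") and torn down with kobject_put(&d->kobj) */
	return d;
}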
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index dcc6aead70f..e3eb3556622 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,8 +362,8 @@ static int init_coda_psdev(void)
362 goto out_chrdev; 362 goto out_chrdev;
363 } 363 }
364 for (i = 0; i < MAX_CODADEVS; i++) 364 for (i = 0; i < MAX_CODADEVS; i++)
365 class_device_create(coda_psdev_class, NULL, 365 device_create(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i); 366 MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i);
367 coda_sysctl_init(); 367 coda_sysctl_init();
368 goto out; 368 goto out;
369 369
@@ -405,7 +405,7 @@ static int __init init_coda(void)
405 return 0; 405 return 0;
406out: 406out:
407 for (i = 0; i < MAX_CODADEVS; i++) 407 for (i = 0; i < MAX_CODADEVS; i++)
408 class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); 408 device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
409 class_destroy(coda_psdev_class); 409 class_destroy(coda_psdev_class);
410 unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); 410 unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
411 coda_sysctl_clean(); 411 coda_sysctl_clean();
@@ -424,7 +424,7 @@ static void __exit exit_coda(void)
424 printk("coda: failed to unregister filesystem\n"); 424 printk("coda: failed to unregister filesystem\n");
425 } 425 }
426 for (i = 0; i < MAX_CODADEVS; i++) 426 for (i = 0; i < MAX_CODADEVS; i++)
427 class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); 427 device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
428 class_destroy(coda_psdev_class); 428 class_destroy(coda_psdev_class);
429 unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); 429 unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
430 coda_sysctl_clean(); 430 coda_sysctl_clean();
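The coda change above is part of the class_device to struct device conversion: class_device_create()/class_device_destroy() become device_create()/device_destroy(). A hedged sketch of the same pattern for a generic character device; demo_class, DEMO_MAJOR and the node names are hypothetical, and the four-argument-plus-format device_create() shown matches the kernel version this series targets.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/kdev_t.h>

#define DEMO_MAJOR	240		/* hypothetical major number */
#define DEMO_NDEVS	4

static struct class *demo_class;	/* assumed created elsewhere with class_create() */

static int demo_create_nodes(void)
{
	int i;

	for (i = 0; i < DEMO_NDEVS; i++) {
		/* device_create() returns a struct device * or an ERR_PTR() */
		struct device *dev = device_create(demo_class, NULL,
						   MKDEV(DEMO_MAJOR, i),
						   "demo%d", i);
		if (IS_ERR(dev))
			return PTR_ERR(dev);
	}
	return 0;
}

static void demo_remove_nodes(void)
{
	int i;

	for (i = 0; i < DEMO_NDEVS; i++)
		device_destroy(demo_class, MKDEV(DEMO_MAJOR, i));
}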
diff --git a/fs/compat.c b/fs/compat.c
index 15078ce4c04..5216c3fd751 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1104,10 +1104,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1104 if (ret < 0) 1104 if (ret < 0)
1105 goto out; 1105 goto out;
1106 1106
1107 ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
1108 if (ret)
1109 goto out;
1110
1111 fnv = NULL; 1107 fnv = NULL;
1112 if (type == READ) { 1108 if (type == READ) {
1113 fn = file->f_op->read; 1109 fn = file->f_op->read;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 50ed691098b..a48dc7dd876 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -546,7 +546,7 @@ static int populate_groups(struct config_group *group)
546 * That said, taking our i_mutex is closer to mkdir 546 * That said, taking our i_mutex is closer to mkdir
547 * emulation, and shouldn't hurt. 547 * emulation, and shouldn't hurt.
548 */ 548 */
549 mutex_lock(&dentry->d_inode->i_mutex); 549 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
550 550
551 for (i = 0; group->default_groups[i]; i++) { 551 for (i = 0; group->default_groups[i]; i++) {
552 new_group = group->default_groups[i]; 552 new_group = group->default_groups[i];
@@ -1405,7 +1405,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1405 sd = configfs_sb->s_root->d_fsdata; 1405 sd = configfs_sb->s_root->d_fsdata;
1406 link_group(to_config_group(sd->s_element), group); 1406 link_group(to_config_group(sd->s_element), group);
1407 1407
1408 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); 1408 mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
1409 I_MUTEX_PARENT);
1409 1410
1410 name.name = group->cg_item.ci_name; 1411 name.name = group->cg_item.ci_name;
1411 name.len = strlen(name.name); 1412 name.len = strlen(name.name);
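The configfs/dir.c hunks above replace plain mutex_lock() calls on inode i_mutex with mutex_lock_nested() and lockdep subclasses (I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_NORMAL), so that taking a parent's and a child's i_mutex on the same path is not flagged as recursive locking. A minimal sketch of that idiom, with hypothetical helper names; the subclasses only affect lockdep, not the locking semantics.

#include <linux/fs.h>
#include <linux/mutex.h>

/* Lock a directory and one of its children in the order the VFS expects. */
static void demo_lock_pair(struct inode *parent, struct inode *child)
{
	mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
	mutex_lock_nested(&child->i_mutex, I_MUTEX_CHILD);
}

static void demo_unlock_pair(struct inode *parent, struct inode *child)
{
	mutex_unlock(&child->i_mutex);
	mutex_unlock(&parent->i_mutex);
}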
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index a3658f9a082..397cb503a18 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -320,7 +320,7 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
320 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; 320 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
321 int error = 0; 321 int error = 0;
322 322
323 mutex_lock(&dir->d_inode->i_mutex); 323 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
324 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); 324 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
325 mutex_unlock(&dir->d_inode->i_mutex); 325 mutex_unlock(&dir->d_inode->i_mutex);
326 326
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3bf0278ea84..de3b31d0a37 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -128,7 +128,7 @@ void configfs_release_fs(void)
128} 128}
129 129
130 130
131static decl_subsys(config, NULL, NULL); 131static struct kobject *config_kobj;
132 132
133static int __init configfs_init(void) 133static int __init configfs_init(void)
134{ 134{
@@ -140,9 +140,8 @@ static int __init configfs_init(void)
140 if (!configfs_dir_cachep) 140 if (!configfs_dir_cachep)
141 goto out; 141 goto out;
142 142
143 kobj_set_kset_s(&config_subsys, kernel_subsys); 143 config_kobj = kobject_create_and_add("config", kernel_kobj);
144 err = subsystem_register(&config_subsys); 144 if (!config_kobj) {
145 if (err) {
146 kmem_cache_destroy(configfs_dir_cachep); 145 kmem_cache_destroy(configfs_dir_cachep);
147 configfs_dir_cachep = NULL; 146 configfs_dir_cachep = NULL;
148 goto out; 147 goto out;
@@ -151,7 +150,7 @@ static int __init configfs_init(void)
151 err = register_filesystem(&configfs_fs_type); 150 err = register_filesystem(&configfs_fs_type);
152 if (err) { 151 if (err) {
153 printk(KERN_ERR "configfs: Unable to register filesystem!\n"); 152 printk(KERN_ERR "configfs: Unable to register filesystem!\n");
154 subsystem_unregister(&config_subsys); 153 kobject_put(config_kobj);
155 kmem_cache_destroy(configfs_dir_cachep); 154 kmem_cache_destroy(configfs_dir_cachep);
156 configfs_dir_cachep = NULL; 155 configfs_dir_cachep = NULL;
157 goto out; 156 goto out;
@@ -160,7 +159,7 @@ static int __init configfs_init(void)
160 err = configfs_inode_init(); 159 err = configfs_inode_init();
161 if (err) { 160 if (err) {
162 unregister_filesystem(&configfs_fs_type); 161 unregister_filesystem(&configfs_fs_type);
163 subsystem_unregister(&config_subsys); 162 kobject_put(config_kobj);
164 kmem_cache_destroy(configfs_dir_cachep); 163 kmem_cache_destroy(configfs_dir_cachep);
165 configfs_dir_cachep = NULL; 164 configfs_dir_cachep = NULL;
166 } 165 }
@@ -171,7 +170,7 @@ out:
171static void __exit configfs_exit(void) 170static void __exit configfs_exit(void)
172{ 171{
173 unregister_filesystem(&configfs_fs_type); 172 unregister_filesystem(&configfs_fs_type);
174 subsystem_unregister(&config_subsys); 173 kobject_put(config_kobj);
175 kmem_cache_destroy(configfs_dir_cachep); 174 kmem_cache_destroy(configfs_dir_cachep);
176 configfs_dir_cachep = NULL; 175 configfs_dir_cachep = NULL;
177 configfs_inode_exit(); 176 configfs_inode_exit();
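The configfs/mount.c change above shows the recurring pattern in this series: a static decl_subsys() plus subsystem_register()/subsystem_unregister() becomes a dynamically created kobject via kobject_create_and_add(), torn down with kobject_put(). A minimal sketch of that pattern as a standalone module; "demo" and demo_kobj are hypothetical.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/module.h>

static struct kobject *demo_kobj;	/* would appear as /sys/kernel/demo */

static int __init demo_init(void)
{
	/* allocates the kobject, assigns a default ktype and adds it under
	 * kernel_kobj (/sys/kernel); returns NULL on failure */
	demo_kobj = kobject_create_and_add("demo", kernel_kobj);
	if (!demo_kobj)
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	kobject_put(demo_kobj);		/* drops the reference and removes it from sysfs */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");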
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6a713b33992..d26e2826ba5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -426,20 +426,19 @@ exit:
426} 426}
427EXPORT_SYMBOL_GPL(debugfs_rename); 427EXPORT_SYMBOL_GPL(debugfs_rename);
428 428
429static decl_subsys(debug, NULL, NULL); 429static struct kobject *debug_kobj;
430 430
431static int __init debugfs_init(void) 431static int __init debugfs_init(void)
432{ 432{
433 int retval; 433 int retval;
434 434
435 kobj_set_kset_s(&debug_subsys, kernel_subsys); 435 debug_kobj = kobject_create_and_add("debug", kernel_kobj);
436 retval = subsystem_register(&debug_subsys); 436 if (!debug_kobj)
437 if (retval) 437 return -EINVAL;
438 return retval;
439 438
440 retval = register_filesystem(&debug_fs_type); 439 retval = register_filesystem(&debug_fs_type);
441 if (retval) 440 if (retval)
442 subsystem_unregister(&debug_subsys); 441 kobject_put(debug_kobj);
443 return retval; 442 return retval;
444} 443}
445 444
@@ -447,7 +446,7 @@ static void __exit debugfs_exit(void)
447{ 446{
448 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 447 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
449 unregister_filesystem(&debug_fs_type); 448 unregister_filesystem(&debug_fs_type);
450 subsystem_unregister(&debug_subsys); 449 kobject_put(debug_kobj);
451} 450}
452 451
453core_initcall(debugfs_init); 452core_initcall(debugfs_init);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 6353a838452..5c108c49cb8 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -166,26 +166,7 @@ static struct kobj_type dlm_ktype = {
166 .release = lockspace_kobj_release, 166 .release = lockspace_kobj_release,
167}; 167};
168 168
169static struct kset dlm_kset = { 169static struct kset *dlm_kset;
170 .ktype = &dlm_ktype,
171};
172
173static int kobject_setup(struct dlm_ls *ls)
174{
175 char lsname[DLM_LOCKSPACE_LEN];
176 int error;
177
178 memset(lsname, 0, DLM_LOCKSPACE_LEN);
179 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
180
181 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
182 if (error)
183 return error;
184
185 ls->ls_kobj.kset = &dlm_kset;
186 ls->ls_kobj.ktype = &dlm_ktype;
187 return 0;
188}
189 170
190static int do_uevent(struct dlm_ls *ls, int in) 171static int do_uevent(struct dlm_ls *ls, int in)
191{ 172{
@@ -220,24 +201,22 @@ static int do_uevent(struct dlm_ls *ls, int in)
220 201
221int dlm_lockspace_init(void) 202int dlm_lockspace_init(void)
222{ 203{
223 int error;
224
225 ls_count = 0; 204 ls_count = 0;
226 mutex_init(&ls_lock); 205 mutex_init(&ls_lock);
227 INIT_LIST_HEAD(&lslist); 206 INIT_LIST_HEAD(&lslist);
228 spin_lock_init(&lslist_lock); 207 spin_lock_init(&lslist_lock);
229 208
230 kobject_set_name(&dlm_kset.kobj, "dlm"); 209 dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj);
231 kobj_set_kset_s(&dlm_kset, kernel_subsys); 210 if (!dlm_kset) {
232 error = kset_register(&dlm_kset); 211 printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
233 if (error) 212 return -ENOMEM;
234 printk("dlm_lockspace_init: cannot register kset %d\n", error); 213 }
235 return error; 214 return 0;
236} 215}
237 216
238void dlm_lockspace_exit(void) 217void dlm_lockspace_exit(void)
239{ 218{
240 kset_unregister(&dlm_kset); 219 kset_unregister(dlm_kset);
241} 220}
242 221
243static int dlm_scand(void *data) 222static int dlm_scand(void *data)
@@ -549,13 +528,12 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
549 goto out_delist; 528 goto out_delist;
550 } 529 }
551 530
552 error = kobject_setup(ls); 531 ls->ls_kobj.kset = dlm_kset;
553 if (error) 532 error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
554 goto out_stop; 533 "%s", ls->ls_name);
555
556 error = kobject_register(&ls->ls_kobj);
557 if (error) 534 if (error)
558 goto out_stop; 535 goto out_stop;
536 kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
559 537
560 /* let kobject handle freeing of ls if there's an error */ 538 /* let kobject handle freeing of ls if there's an error */
561 do_unreg = 1; 539 do_unreg = 1;
@@ -601,7 +579,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
601 kfree(ls->ls_rsbtbl); 579 kfree(ls->ls_rsbtbl);
602 out_lsfree: 580 out_lsfree:
603 if (do_unreg) 581 if (do_unreg)
604 kobject_unregister(&ls->ls_kobj); 582 kobject_put(&ls->ls_kobj);
605 else 583 else
606 kfree(ls); 584 kfree(ls);
607 out: 585 out:
@@ -750,7 +728,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
750 dlm_clear_members(ls); 728 dlm_clear_members(ls);
751 dlm_clear_members_gone(ls); 729 dlm_clear_members_gone(ls);
752 kfree(ls->ls_node_array); 730 kfree(ls->ls_node_array);
753 kobject_unregister(&ls->ls_kobj); 731 kobject_put(&ls->ls_kobj);
754 /* The ls structure will be freed when the kobject is done with */ 732 /* The ls structure will be freed when the kobject is done with */
755 733
756 mutex_lock(&ls_lock); 734 mutex_lock(&ls_lock);
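The dlm/lockspace.c hunks above swap a statically declared struct kset for kset_create_and_add(), and replace kobject_register() with kobject_init_and_add() followed by an explicit KOBJ_ADD uevent (kobject_init_and_add() does not send one itself). A hedged sketch of that sequence with hypothetical names:

#include <linux/errno.h>
#include <linux/kobject.h>
#include <linux/slab.h>

static struct kset *demo_kset;		/* hypothetical kset under /sys/kernel */

struct demo_ls {			/* hypothetical per-object structure */
	struct kobject kobj;
	char name[32];
};

static void demo_ls_release(struct kobject *kobj)
{
	kfree(container_of(kobj, struct demo_ls, kobj));
}

static struct kobj_type demo_ls_ktype = {
	.release = demo_ls_release,
};

static int demo_kset_init(void)
{
	demo_kset = kset_create_and_add("demo", NULL, kernel_kobj);
	return demo_kset ? 0 : -ENOMEM;
}

static int demo_ls_register(struct demo_ls *ls)
{
	int error;

	ls->kobj.kset = demo_kset;	/* kset membership is set before adding */
	error = kobject_init_and_add(&ls->kobj, &demo_ls_ktype, NULL,
				     "%s", ls->name);
	if (error)
		return error;
	kobject_uevent(&ls->kobj, KOBJ_ADD);	/* the uevent is now explicit */
	return 0;
}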
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e5580bcb923..0249aa4ae18 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -734,127 +734,40 @@ static int ecryptfs_init_kmem_caches(void)
734 return 0; 734 return 0;
735} 735}
736 736
737struct ecryptfs_obj { 737static struct kobject *ecryptfs_kobj;
738 char *name;
739 struct list_head slot_list;
740 struct kobject kobj;
741};
742
743struct ecryptfs_attribute {
744 struct attribute attr;
745 ssize_t(*show) (struct ecryptfs_obj *, char *);
746 ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
747};
748 738
749static ssize_t 739static ssize_t version_show(struct kobject *kobj,
750ecryptfs_attr_store(struct kobject *kobj, 740 struct kobj_attribute *attr, char *buff)
751 struct attribute *attr, const char *buf, size_t len)
752{ 741{
753 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj, 742 return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
754 kobj);
755 struct ecryptfs_attribute *attribute =
756 container_of(attr, struct ecryptfs_attribute, attr);
757
758 return (attribute->store ? attribute->store(obj, buf, len) : 0);
759} 743}
760 744
761static ssize_t 745static struct kobj_attribute version_attr = __ATTR_RO(version);
762ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
763{
764 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
765 kobj);
766 struct ecryptfs_attribute *attribute =
767 container_of(attr, struct ecryptfs_attribute, attr);
768
769 return (attribute->show ? attribute->show(obj, buf) : 0);
770}
771 746
772static struct sysfs_ops ecryptfs_sysfs_ops = { 747static struct attribute *attributes[] = {
773 .show = ecryptfs_attr_show, 748 &version_attr.attr,
774 .store = ecryptfs_attr_store 749 NULL,
775}; 750};
776 751
777static struct kobj_type ecryptfs_ktype = { 752static struct attribute_group attr_group = {
778 .sysfs_ops = &ecryptfs_sysfs_ops 753 .attrs = attributes,
779}; 754};
780 755
781static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
782
783static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
784{
785 return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
786}
787
788static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
789
790static struct ecryptfs_version_str_map_elem {
791 u32 flag;
792 char *str;
793} ecryptfs_version_str_map[] = {
794 {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
795 {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
796 {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
797 {ECRYPTFS_VERSIONING_POLICY, "policy"},
798 {ECRYPTFS_VERSIONING_XATTR, "metadata in extended attribute"},
799 {ECRYPTFS_VERSIONING_MULTKEY, "multiple keys per file"}
800};
801
802static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
803{
804 int i;
805 int remaining = PAGE_SIZE;
806 int total_written = 0;
807
808 buff[0] = '\0';
809 for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
810 int entry_size;
811
812 if (!(ECRYPTFS_VERSIONING_MASK
813 & ecryptfs_version_str_map[i].flag))
814 continue;
815 entry_size = strlen(ecryptfs_version_str_map[i].str);
816 if ((entry_size + 2) > remaining)
817 goto out;
818 memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
819 buff[entry_size++] = '\n';
820 buff[entry_size] = '\0';
821 buff += entry_size;
822 total_written += entry_size;
823 remaining -= entry_size;
824 }
825out:
826 return total_written;
827}
828
829static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
830
831static int do_sysfs_registration(void) 756static int do_sysfs_registration(void)
832{ 757{
833 int rc; 758 int rc;
834 759
835 rc = subsystem_register(&ecryptfs_subsys); 760 ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj);
836 if (rc) { 761 if (!ecryptfs_kobj) {
837 printk(KERN_ERR 762 printk(KERN_ERR "Unable to create ecryptfs kset\n");
838 "Unable to register ecryptfs sysfs subsystem\n"); 763 rc = -ENOMEM;
839 goto out;
840 }
841 rc = sysfs_create_file(&ecryptfs_subsys.kobj,
842 &sysfs_attr_version.attr);
843 if (rc) {
844 printk(KERN_ERR
845 "Unable to create ecryptfs version attribute\n");
846 subsystem_unregister(&ecryptfs_subsys);
847 goto out; 764 goto out;
848 } 765 }
849 rc = sysfs_create_file(&ecryptfs_subsys.kobj, 766 rc = sysfs_create_group(ecryptfs_kobj, &attr_group);
850 &sysfs_attr_version_str.attr);
851 if (rc) { 767 if (rc) {
852 printk(KERN_ERR 768 printk(KERN_ERR
853 "Unable to create ecryptfs version_str attribute\n"); 769 "Unable to create ecryptfs version attributes\n");
854 sysfs_remove_file(&ecryptfs_subsys.kobj, 770 kobject_put(ecryptfs_kobj);
855 &sysfs_attr_version.attr);
856 subsystem_unregister(&ecryptfs_subsys);
857 goto out;
858 } 771 }
859out: 772out:
860 return rc; 773 return rc;
@@ -862,11 +775,8 @@ out:
862 775
863static void do_sysfs_unregistration(void) 776static void do_sysfs_unregistration(void)
864{ 777{
865 sysfs_remove_file(&ecryptfs_subsys.kobj, 778 sysfs_remove_group(ecryptfs_kobj, &attr_group);
866 &sysfs_attr_version.attr); 779 kobject_put(ecryptfs_kobj);
867 sysfs_remove_file(&ecryptfs_subsys.kobj,
868 &sysfs_attr_version_str.attr);
869 subsystem_unregister(&ecryptfs_subsys);
870} 780}
871 781
872static int __init ecryptfs_init(void) 782static int __init ecryptfs_init(void)
@@ -894,7 +804,6 @@ static int __init ecryptfs_init(void)
894 printk(KERN_ERR "Failed to register filesystem\n"); 804 printk(KERN_ERR "Failed to register filesystem\n");
895 goto out_free_kmem_caches; 805 goto out_free_kmem_caches;
896 } 806 }
897 kobj_set_kset_s(&ecryptfs_subsys, fs_subsys);
898 rc = do_sysfs_registration(); 807 rc = do_sysfs_registration();
899 if (rc) { 808 if (rc) {
900 printk(KERN_ERR "sysfs registration failed\n"); 809 printk(KERN_ERR "sysfs registration failed\n");
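The ecryptfs/main.c rewrite above drops the private ecryptfs_obj/ecryptfs_attribute machinery in favour of the generic kobj_attribute show prototype, a static attribute_group, and a kobject created under fs_kobj (/sys/fs). A minimal sketch of that sysfs idiom; the "demo" names and the version value are hypothetical.

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobject *demo_fs_kobj;	/* would appear as /sys/fs/demo */

static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
			    char *buf)
{
	return sprintf(buf, "%d\n", 1);	/* placeholder version value */
}

static struct kobj_attribute version_attr = __ATTR_RO(version);

static struct attribute *demo_attrs[] = {
	&version_attr.attr,
	NULL,
};

static struct attribute_group demo_attr_group = {
	.attrs = demo_attrs,
};

static int demo_sysfs_init(void)
{
	int rc;

	demo_fs_kobj = kobject_create_and_add("demo", fs_kobj);
	if (!demo_fs_kobj)
		return -ENOMEM;

	rc = sysfs_create_group(demo_fs_kobj, &demo_attr_group);
	if (rc)
		kobject_put(demo_fs_kobj);
	return rc;
}

static void demo_sysfs_exit(void)
{
	sysfs_remove_group(demo_fs_kobj, &demo_attr_group);
	kobject_put(demo_fs_kobj);
}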
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0fca82021d7..300324bd563 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -482,8 +482,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
482 if (wbc->nr_to_write <= 0) 482 if (wbc->nr_to_write <= 0)
483 break; 483 break;
484 } 484 }
485 if (!list_empty(&sb->s_more_io))
486 wbc->more_io = 1;
487 return; /* Leave any unwritten inodes on s_io */ 485 return; /* Leave any unwritten inodes on s_io */
488} 486}
489 487
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 84f9f7dfdf5..e5e80d1a468 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -744,9 +744,6 @@ static inline void unregister_fuseblk(void)
744} 744}
745#endif 745#endif
746 746
747static decl_subsys(fuse, NULL, NULL);
748static decl_subsys(connections, NULL, NULL);
749
750static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo) 747static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
751{ 748{
752 struct inode * inode = foo; 749 struct inode * inode = foo;
@@ -791,32 +788,37 @@ static void fuse_fs_cleanup(void)
791 kmem_cache_destroy(fuse_inode_cachep); 788 kmem_cache_destroy(fuse_inode_cachep);
792} 789}
793 790
791static struct kobject *fuse_kobj;
792static struct kobject *connections_kobj;
793
794static int fuse_sysfs_init(void) 794static int fuse_sysfs_init(void)
795{ 795{
796 int err; 796 int err;
797 797
798 kobj_set_kset_s(&fuse_subsys, fs_subsys); 798 fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
799 err = subsystem_register(&fuse_subsys); 799 if (!fuse_kobj) {
800 if (err) 800 err = -ENOMEM;
801 goto out_err; 801 goto out_err;
802 }
802 803
803 kobj_set_kset_s(&connections_subsys, fuse_subsys); 804 connections_kobj = kobject_create_and_add("connections", fuse_kobj);
804 err = subsystem_register(&connections_subsys); 805 if (!connections_kobj) {
805 if (err) 806 err = -ENOMEM;
806 goto out_fuse_unregister; 807 goto out_fuse_unregister;
808 }
807 809
808 return 0; 810 return 0;
809 811
810 out_fuse_unregister: 812 out_fuse_unregister:
811 subsystem_unregister(&fuse_subsys); 813 kobject_put(fuse_kobj);
812 out_err: 814 out_err:
813 return err; 815 return err;
814} 816}
815 817
816static void fuse_sysfs_cleanup(void) 818static void fuse_sysfs_cleanup(void)
817{ 819{
818 subsystem_unregister(&connections_subsys); 820 kobject_put(connections_kobj);
819 subsystem_unregister(&fuse_subsys); 821 kobject_put(fuse_kobj);
820} 822}
821 823
822static int __init fuse_init(void) 824static int __init fuse_init(void)
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 04ad0caebed..8fff11058ce 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 6 recovery.o rgrp.o super.o sys.o trans.o util.o
7 7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ 8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93fa427bb5f..e4effc47abf 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -59,7 +59,6 @@ struct strip_mine {
59static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, 59static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
60 u64 block, struct page *page) 60 u64 block, struct page *page)
61{ 61{
62 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
63 struct inode *inode = &ip->i_inode; 62 struct inode *inode = &ip->i_inode;
64 struct buffer_head *bh; 63 struct buffer_head *bh;
65 int release = 0; 64 int release = 0;
@@ -95,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
95 set_buffer_uptodate(bh); 94 set_buffer_uptodate(bh);
96 if (!gfs2_is_jdata(ip)) 95 if (!gfs2_is_jdata(ip))
97 mark_buffer_dirty(bh); 96 mark_buffer_dirty(bh);
98 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 97 if (!gfs2_is_writeback(ip))
99 gfs2_trans_add_bh(ip->i_gl, bh, 0); 98 gfs2_trans_add_bh(ip->i_gl, bh, 0);
100 99
101 if (release) { 100 if (release) {
@@ -453,8 +452,8 @@ static inline void bmap_unlock(struct inode *inode, int create)
453 * Returns: errno 452 * Returns: errno
454 */ 453 */
455 454
456int gfs2_block_map(struct inode *inode, u64 lblock, int create, 455int gfs2_block_map(struct inode *inode, sector_t lblock,
457 struct buffer_head *bh_map) 456 struct buffer_head *bh_map, int create)
458{ 457{
459 struct gfs2_inode *ip = GFS2_I(inode); 458 struct gfs2_inode *ip = GFS2_I(inode);
460 struct gfs2_sbd *sdp = GFS2_SB(inode); 459 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -470,6 +469,7 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
470 unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; 469 unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
471 struct metapath mp; 470 struct metapath mp;
472 u64 size; 471 u64 size;
472 struct buffer_head *dibh = NULL;
473 473
474 BUG_ON(maxlen == 0); 474 BUG_ON(maxlen == 0);
475 475
@@ -500,6 +500,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
500 error = gfs2_meta_inode_buffer(ip, &bh); 500 error = gfs2_meta_inode_buffer(ip, &bh);
501 if (error) 501 if (error)
502 goto out_fail; 502 goto out_fail;
503 dibh = bh;
504 get_bh(dibh);
503 505
504 for (x = 0; x < end_of_metadata; x++) { 506 for (x = 0; x < end_of_metadata; x++) {
505 lookup_block(ip, bh, x, &mp, create, &new, &dblock); 507 lookup_block(ip, bh, x, &mp, create, &new, &dblock);
@@ -518,13 +520,8 @@ int gfs2_block_map(struct inode *inode, u64 lblock, int create,
518 if (boundary) 520 if (boundary)
519 set_buffer_boundary(bh_map); 521 set_buffer_boundary(bh_map);
520 if (new) { 522 if (new) {
521 struct buffer_head *dibh; 523 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
522 error = gfs2_meta_inode_buffer(ip, &dibh); 524 gfs2_dinode_out(ip, dibh->b_data);
523 if (!error) {
524 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
525 gfs2_dinode_out(ip, dibh->b_data);
526 brelse(dibh);
527 }
528 set_buffer_new(bh_map); 525 set_buffer_new(bh_map);
529 goto out_brelse; 526 goto out_brelse;
530 } 527 }
@@ -545,6 +542,8 @@ out_brelse:
545out_ok: 542out_ok:
546 error = 0; 543 error = 0;
547out_fail: 544out_fail:
545 if (dibh)
546 brelse(dibh);
548 bmap_unlock(inode, create); 547 bmap_unlock(inode, create);
549 return error; 548 return error;
550} 549}
@@ -560,7 +559,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
560 BUG_ON(!new); 559 BUG_ON(!new);
561 560
562 bh.b_size = 1 << (inode->i_blkbits + 5); 561 bh.b_size = 1 << (inode->i_blkbits + 5);
563 ret = gfs2_block_map(inode, lblock, create, &bh); 562 ret = gfs2_block_map(inode, lblock, &bh, create);
564 *extlen = bh.b_size >> inode->i_blkbits; 563 *extlen = bh.b_size >> inode->i_blkbits;
565 *dblock = bh.b_blocknr; 564 *dblock = bh.b_blocknr;
566 if (buffer_new(&bh)) 565 if (buffer_new(&bh))
@@ -684,7 +683,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
684 if (metadata) 683 if (metadata)
685 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 684 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
686 685
687 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh); 686 error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
688 if (error) 687 if (error)
689 return error; 688 return error;
690 689
@@ -786,7 +785,7 @@ out_rg_gunlock:
786out_rlist: 785out_rlist:
787 gfs2_rlist_free(&rlist); 786 gfs2_rlist_free(&rlist);
788out: 787out:
789 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh); 788 gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
790 return error; 789 return error;
791} 790}
792 791
@@ -879,7 +878,6 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
879{ 878{
880 struct inode *inode = mapping->host; 879 struct inode *inode = mapping->host;
881 struct gfs2_inode *ip = GFS2_I(inode); 880 struct gfs2_inode *ip = GFS2_I(inode);
882 struct gfs2_sbd *sdp = GFS2_SB(inode);
883 loff_t from = inode->i_size; 881 loff_t from = inode->i_size;
884 unsigned long index = from >> PAGE_CACHE_SHIFT; 882 unsigned long index = from >> PAGE_CACHE_SHIFT;
885 unsigned offset = from & (PAGE_CACHE_SIZE-1); 883 unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -911,7 +909,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
911 err = 0; 909 err = 0;
912 910
913 if (!buffer_mapped(bh)) { 911 if (!buffer_mapped(bh)) {
914 gfs2_get_block(inode, iblock, bh, 0); 912 gfs2_block_map(inode, iblock, bh, 0);
915 /* unmapped? It's a hole - nothing to do */ 913 /* unmapped? It's a hole - nothing to do */
916 if (!buffer_mapped(bh)) 914 if (!buffer_mapped(bh))
917 goto unlock; 915 goto unlock;
@@ -931,7 +929,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
931 err = 0; 929 err = 0;
932 } 930 }
933 931
934 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 932 if (!gfs2_is_writeback(ip))
935 gfs2_trans_add_bh(ip->i_gl, bh, 0); 933 gfs2_trans_add_bh(ip->i_gl, bh, 0);
936 934
937 zero_user_page(page, offset, length, KM_USER0); 935 zero_user_page(page, offset, length, KM_USER0);
@@ -1224,8 +1222,13 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1224 do_div(lblock_stop, bsize); 1222 do_div(lblock_stop, bsize);
1225 } else { 1223 } else {
1226 unsigned int shift = sdp->sd_sb.sb_bsize_shift; 1224 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1225 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
1227 lblock = offset >> shift; 1226 lblock = offset >> shift;
1228 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1227 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1228 if (lblock_stop > end_of_file) {
1229 *alloc_required = 1;
1230 return 0;
1231 }
1229 } 1232 }
1230 1233
1231 for (; lblock < lblock_stop; lblock += extlen) { 1234 for (; lblock < lblock_stop; lblock += extlen) {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index ac2fd04370d..4e6cde2943b 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -15,7 +15,7 @@ struct gfs2_inode;
15struct page; 15struct page;
16 16
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh); 18int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
20 20
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 21int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
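After the bmap.h change above, gfs2_block_map() takes its arguments in the same order as the VFS get_block_t callback (inode, sector_t block, buffer_head, create flag), which plausibly lets it be handed straight to the generic buffer-head helpers. The fragment below is illustrative only, not code from this patch, and assumes a fs/gfs2 context that provides "bmap.h".

#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include "bmap.h"

/* Hypothetical writepage relying on gfs2_block_map() matching the
 * get_block_t prototype declared in <linux/buffer_head.h>. */
static int demo_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, gfs2_block_map, wbc);
}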
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3731ab0771d..e51991947d2 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -83,56 +83,6 @@ int gfs2_recoverd(void *data)
83} 83}
84 84
85/** 85/**
86 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
87 * @sdp: Pointer to GFS2 superblock
88 *
89 * Also, periodically check to make sure that we're using the most recent
90 * journal index.
91 */
92
93int gfs2_logd(void *data)
94{
95 struct gfs2_sbd *sdp = data;
96 struct gfs2_holder ji_gh;
97 unsigned long t;
98 int need_flush;
99
100 while (!kthread_should_stop()) {
101 /* Advance the log tail */
102
103 t = sdp->sd_log_flush_time +
104 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
105
106 gfs2_ail1_empty(sdp, DIO_ALL);
107 gfs2_log_lock(sdp);
108 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
109 gfs2_log_unlock(sdp);
110 if (need_flush || time_after_eq(jiffies, t)) {
111 gfs2_log_flush(sdp, NULL);
112 sdp->sd_log_flush_time = jiffies;
113 }
114
115 /* Check for latest journal index */
116
117 t = sdp->sd_jindex_refresh_time +
118 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
119
120 if (time_after_eq(jiffies, t)) {
121 if (!gfs2_jindex_hold(sdp, &ji_gh))
122 gfs2_glock_dq_uninit(&ji_gh);
123 sdp->sd_jindex_refresh_time = jiffies;
124 }
125
126 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
127 if (freezing(current))
128 refrigerator();
129 schedule_timeout_interruptible(t);
130 }
131
132 return 0;
133}
134
135/**
136 * gfs2_quotad - Write cached quota changes into the quota file 86 * gfs2_quotad - Write cached quota changes into the quota file
137 * @sdp: Pointer to GFS2 superblock 87 * @sdp: Pointer to GFS2 superblock
138 * 88 *
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 0de9b355795..4be084fb6a6 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -12,7 +12,6 @@
12 12
13int gfs2_glockd(void *data); 13int gfs2_glockd(void *data);
14int gfs2_recoverd(void *data); 14int gfs2_recoverd(void *data);
15int gfs2_logd(void *data);
16int gfs2_quotad(void *data); 15int gfs2_quotad(void *data);
17 16
18#endif /* __DAEMON_DOT_H__ */ 17#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9949bb746a5..57e2ed932ad 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1876,7 +1876,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1876 if (error) 1876 if (error)
1877 goto out; 1877 goto out;
1878 1878
1879 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh); 1879 error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
1880 if (error) 1880 if (error)
1881 goto out_qs; 1881 goto out_qs;
1882 1882
@@ -1949,7 +1949,7 @@ out_rg_gunlock:
1949 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); 1949 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1950out_rlist: 1950out_rlist:
1951 gfs2_rlist_free(&rlist); 1951 gfs2_rlist_free(&rlist);
1952 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh); 1952 gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
1953out_qs: 1953out_qs:
1954 gfs2_quota_unhold(dip); 1954 gfs2_quota_unhold(dip);
1955out: 1955out:
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index aa8dbf303f6..f114ba2b355 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -56,46 +56,6 @@ unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
56 return type; 56 return type;
57} 57}
58 58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) 59static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{ 60{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && 61 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
@@ -108,8 +68,6 @@ static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) 68 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP; 69 return -EOPNOTSUPP;
110 70
111
112
113 return gfs2_ea_get_i(ip, er); 71 return gfs2_ea_get_i(ip, er);
114} 72}
115 73
@@ -170,40 +128,10 @@ static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
170 return gfs2_ea_remove_i(ip, er); 128 return gfs2_ea_remove_i(ip, er);
171} 129}
172 130
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static const struct gfs2_eattr_operations gfs2_user_eaops = { 131static const struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get, 132 .eo_get = gfs2_ea_get_i,
205 .eo_set = user_eo_set, 133 .eo_set = gfs2_ea_set_i,
206 .eo_remove = user_eo_remove, 134 .eo_remove = gfs2_ea_remove_i,
207 .eo_name = "user", 135 .eo_name = "user",
208}; 136};
209 137
@@ -215,9 +143,9 @@ const struct gfs2_eattr_operations gfs2_system_eaops = {
215}; 143};
216 144
217static const struct gfs2_eattr_operations gfs2_security_eaops = { 145static const struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get, 146 .eo_get = gfs2_ea_get_i,
219 .eo_set = security_eo_set, 147 .eo_set = gfs2_ea_set_i,
220 .eo_remove = security_eo_remove, 148 .eo_remove = gfs2_ea_remove_i,
221 .eo_name = "security", 149 .eo_name = "security",
222}; 150};
223 151
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 2a7435b5c4d..bee99704ea1 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -1418,7 +1418,7 @@ out:
1418static int ea_dealloc_block(struct gfs2_inode *ip) 1418static int ea_dealloc_block(struct gfs2_inode *ip)
1419{ 1419{
1420 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1420 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1421 struct gfs2_alloc *al = &ip->i_alloc; 1421 struct gfs2_alloc *al = ip->i_alloc;
1422 struct gfs2_rgrpd *rgd; 1422 struct gfs2_rgrpd *rgd;
1423 struct buffer_head *dibh; 1423 struct buffer_head *dibh;
1424 int error; 1424 int error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a37efe4aae6..80e09c50590 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -217,7 +217,6 @@ int gfs2_glock_put(struct gfs2_glock *gl)
217 if (atomic_dec_and_test(&gl->gl_ref)) { 217 if (atomic_dec_and_test(&gl->gl_ref)) {
218 hlist_del(&gl->gl_list); 218 hlist_del(&gl->gl_list);
219 write_unlock(gl_lock_addr(gl->gl_hash)); 219 write_unlock(gl_lock_addr(gl->gl_hash));
220 BUG_ON(spin_is_locked(&gl->gl_spin));
221 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); 220 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
222 gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); 221 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
223 gfs2_assert(sdp, list_empty(&gl->gl_holders)); 222 gfs2_assert(sdp, list_empty(&gl->gl_holders));
@@ -346,7 +345,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
346 gl->gl_object = NULL; 345 gl->gl_object = NULL;
347 gl->gl_sbd = sdp; 346 gl->gl_sbd = sdp;
348 gl->gl_aspace = NULL; 347 gl->gl_aspace = NULL;
349 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
350 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 348 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
351 349
352 /* If this glock protects actual on-disk data or metadata blocks, 350 /* If this glock protects actual on-disk data or metadata blocks,
@@ -461,7 +459,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
461 459
462static void gfs2_demote_wake(struct gfs2_glock *gl) 460static void gfs2_demote_wake(struct gfs2_glock *gl)
463{ 461{
464 BUG_ON(!spin_is_locked(&gl->gl_spin));
465 gl->gl_demote_state = LM_ST_EXCLUSIVE; 462 gl->gl_demote_state = LM_ST_EXCLUSIVE;
466 clear_bit(GLF_DEMOTE, &gl->gl_flags); 463 clear_bit(GLF_DEMOTE, &gl->gl_flags);
467 smp_mb__after_clear_bit(); 464 smp_mb__after_clear_bit();
@@ -507,21 +504,12 @@ static int rq_mutex(struct gfs2_holder *gh)
507static int rq_promote(struct gfs2_holder *gh) 504static int rq_promote(struct gfs2_holder *gh)
508{ 505{
509 struct gfs2_glock *gl = gh->gh_gl; 506 struct gfs2_glock *gl = gh->gh_gl;
510 struct gfs2_sbd *sdp = gl->gl_sbd;
511 507
512 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { 508 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
513 if (list_empty(&gl->gl_holders)) { 509 if (list_empty(&gl->gl_holders)) {
514 gl->gl_req_gh = gh; 510 gl->gl_req_gh = gh;
515 set_bit(GLF_LOCK, &gl->gl_flags); 511 set_bit(GLF_LOCK, &gl->gl_flags);
516 spin_unlock(&gl->gl_spin); 512 spin_unlock(&gl->gl_spin);
517
518 if (atomic_read(&sdp->sd_reclaim_count) >
519 gfs2_tune_get(sdp, gt_reclaim_limit) &&
520 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
521 gfs2_reclaim_glock(sdp);
522 gfs2_reclaim_glock(sdp);
523 }
524
525 gfs2_glock_xmote_th(gh->gh_gl, gh); 513 gfs2_glock_xmote_th(gh->gh_gl, gh);
526 spin_lock(&gl->gl_spin); 514 spin_lock(&gl->gl_spin);
527 } 515 }
@@ -567,7 +555,10 @@ static int rq_demote(struct gfs2_glock *gl)
567 gfs2_demote_wake(gl); 555 gfs2_demote_wake(gl);
568 return 0; 556 return 0;
569 } 557 }
558
570 set_bit(GLF_LOCK, &gl->gl_flags); 559 set_bit(GLF_LOCK, &gl->gl_flags);
560 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
561
571 if (gl->gl_demote_state == LM_ST_UNLOCKED || 562 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
572 gl->gl_state != LM_ST_EXCLUSIVE) { 563 gl->gl_state != LM_ST_EXCLUSIVE) {
573 spin_unlock(&gl->gl_spin); 564 spin_unlock(&gl->gl_spin);
@@ -576,7 +567,9 @@ static int rq_demote(struct gfs2_glock *gl)
576 spin_unlock(&gl->gl_spin); 567 spin_unlock(&gl->gl_spin);
577 gfs2_glock_xmote_th(gl, NULL); 568 gfs2_glock_xmote_th(gl, NULL);
578 } 569 }
570
579 spin_lock(&gl->gl_spin); 571 spin_lock(&gl->gl_spin);
572 clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
580 573
581 return 0; 574 return 0;
582} 575}
@@ -598,23 +591,18 @@ static void run_queue(struct gfs2_glock *gl)
598 if (!list_empty(&gl->gl_waiters1)) { 591 if (!list_empty(&gl->gl_waiters1)) {
599 gh = list_entry(gl->gl_waiters1.next, 592 gh = list_entry(gl->gl_waiters1.next,
600 struct gfs2_holder, gh_list); 593 struct gfs2_holder, gh_list);
601 594 blocked = rq_mutex(gh);
602 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
603 blocked = rq_mutex(gh);
604 else
605 gfs2_assert_warn(gl->gl_sbd, 0);
606
607 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { 595 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
608 blocked = rq_demote(gl); 596 blocked = rq_demote(gl);
597 if (gl->gl_waiters2 && !blocked) {
598 set_bit(GLF_DEMOTE, &gl->gl_flags);
599 gl->gl_demote_state = LM_ST_UNLOCKED;
600 }
601 gl->gl_waiters2 = 0;
609 } else if (!list_empty(&gl->gl_waiters3)) { 602 } else if (!list_empty(&gl->gl_waiters3)) {
610 gh = list_entry(gl->gl_waiters3.next, 603 gh = list_entry(gl->gl_waiters3.next,
611 struct gfs2_holder, gh_list); 604 struct gfs2_holder, gh_list);
612 605 blocked = rq_promote(gh);
613 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
614 blocked = rq_promote(gh);
615 else
616 gfs2_assert_warn(gl->gl_sbd, 0);
617
618 } else 606 } else
619 break; 607 break;
620 608
@@ -632,27 +620,21 @@ static void run_queue(struct gfs2_glock *gl)
632 620
633static void gfs2_glmutex_lock(struct gfs2_glock *gl) 621static void gfs2_glmutex_lock(struct gfs2_glock *gl)
634{ 622{
635 struct gfs2_holder gh;
636
637 gfs2_holder_init(gl, 0, 0, &gh);
638 set_bit(HIF_MUTEX, &gh.gh_iflags);
639 if (test_and_set_bit(HIF_WAIT, &gh.gh_iflags))
640 BUG();
641
642 spin_lock(&gl->gl_spin); 623 spin_lock(&gl->gl_spin);
643 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { 624 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
625 struct gfs2_holder gh;
626
627 gfs2_holder_init(gl, 0, 0, &gh);
628 set_bit(HIF_WAIT, &gh.gh_iflags);
644 list_add_tail(&gh.gh_list, &gl->gl_waiters1); 629 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
630 spin_unlock(&gl->gl_spin);
631 wait_on_holder(&gh);
632 gfs2_holder_uninit(&gh);
645 } else { 633 } else {
646 gl->gl_owner_pid = current->pid; 634 gl->gl_owner_pid = current->pid;
647 gl->gl_ip = (unsigned long)__builtin_return_address(0); 635 gl->gl_ip = (unsigned long)__builtin_return_address(0);
648 clear_bit(HIF_WAIT, &gh.gh_iflags); 636 spin_unlock(&gl->gl_spin);
649 smp_mb();
650 wake_up_bit(&gh.gh_iflags, HIF_WAIT);
651 } 637 }
652 spin_unlock(&gl->gl_spin);
653
654 wait_on_holder(&gh);
655 gfs2_holder_uninit(&gh);
656} 638}
657 639
658/** 640/**
@@ -691,7 +673,6 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
691 gl->gl_owner_pid = 0; 673 gl->gl_owner_pid = 0;
692 gl->gl_ip = 0; 674 gl->gl_ip = 0;
693 run_queue(gl); 675 run_queue(gl);
694 BUG_ON(!spin_is_locked(&gl->gl_spin));
695 spin_unlock(&gl->gl_spin); 676 spin_unlock(&gl->gl_spin);
696} 677}
697 678
@@ -722,7 +703,10 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
722 } 703 }
723 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 704 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
724 gl->gl_demote_state != state) { 705 gl->gl_demote_state != state) {
725 gl->gl_demote_state = LM_ST_UNLOCKED; 706 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
707 gl->gl_waiters2 = 1;
708 else
709 gl->gl_demote_state = LM_ST_UNLOCKED;
726 } 710 }
727 spin_unlock(&gl->gl_spin); 711 spin_unlock(&gl->gl_spin);
728} 712}
@@ -943,8 +927,8 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
943 const struct gfs2_glock_operations *glops = gl->gl_ops; 927 const struct gfs2_glock_operations *glops = gl->gl_ops;
944 unsigned int ret; 928 unsigned int ret;
945 929
946 if (glops->go_drop_th) 930 if (glops->go_xmote_th)
947 glops->go_drop_th(gl); 931 glops->go_xmote_th(gl);
948 932
949 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 933 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
950 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); 934 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -1156,8 +1140,6 @@ restart:
1156 return -EIO; 1140 return -EIO;
1157 } 1141 }
1158 1142
1159 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1160
1161 spin_lock(&gl->gl_spin); 1143 spin_lock(&gl->gl_spin);
1162 add_to_queue(gh); 1144 add_to_queue(gh);
1163 run_queue(gl); 1145 run_queue(gl);
@@ -1248,12 +1230,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1248 list_del_init(&gh->gh_list); 1230 list_del_init(&gh->gh_list);
1249 1231
1250 if (list_empty(&gl->gl_holders)) { 1232 if (list_empty(&gl->gl_holders)) {
1251 spin_unlock(&gl->gl_spin); 1233 if (glops->go_unlock) {
1252 1234 spin_unlock(&gl->gl_spin);
1253 if (glops->go_unlock)
1254 glops->go_unlock(gh); 1235 glops->go_unlock(gh);
1255 1236 spin_lock(&gl->gl_spin);
1256 spin_lock(&gl->gl_spin); 1237 }
1257 gl->gl_stamp = jiffies; 1238 gl->gl_stamp = jiffies;
1258 } 1239 }
1259 1240
@@ -1910,8 +1891,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1910 print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); 1891 print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
1911 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); 1892 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
1912 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); 1893 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
1913 print_dbg(gi, " le = %s\n",
1914 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
1915 print_dbg(gi, " reclaim = %s\n", 1894 print_dbg(gi, " reclaim = %s\n",
1916 (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); 1895 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
1917 if (gl->gl_aspace) 1896 if (gl->gl_aspace)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 4670dcb2a87..c663b7a0f41 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,7 +56,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
56 bd = list_entry(head->next, struct gfs2_bufdata, 56 bd = list_entry(head->next, struct gfs2_bufdata,
57 bd_ail_gl_list); 57 bd_ail_gl_list);
58 bh = bd->bd_bh; 58 bh = bd->bd_bh;
59 gfs2_remove_from_ail(NULL, bd); 59 gfs2_remove_from_ail(bd);
60 bd->bd_bh = NULL; 60 bd->bd_bh = NULL;
61 bh->b_private = NULL; 61 bh->b_private = NULL;
62 bd->bd_blkno = bh->b_blocknr; 62 bd->bd_blkno = bh->b_blocknr;
@@ -86,15 +86,10 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
86 if (!ip || !S_ISREG(inode->i_mode)) 86 if (!ip || !S_ISREG(inode->i_mode))
87 return; 87 return;
88 88
89 if (!test_bit(GIF_PAGED, &ip->i_flags))
90 return;
91
92 unmap_shared_mapping_range(inode->i_mapping, 0, 0); 89 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
93
94 if (test_bit(GIF_SW_PAGED, &ip->i_flags)) 90 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
95 set_bit(GLF_DIRTY, &gl->gl_flags); 91 set_bit(GLF_DIRTY, &gl->gl_flags);
96 92
97 clear_bit(GIF_SW_PAGED, &ip->i_flags);
98} 93}
99 94
100/** 95/**
@@ -143,44 +138,34 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
143static void inode_go_sync(struct gfs2_glock *gl) 138static void inode_go_sync(struct gfs2_glock *gl)
144{ 139{
145 struct gfs2_inode *ip = gl->gl_object; 140 struct gfs2_inode *ip = gl->gl_object;
141 struct address_space *metamapping = gl->gl_aspace->i_mapping;
142 int error;
143
144 if (gl->gl_state != LM_ST_UNLOCKED)
145 gfs2_pte_inval(gl);
146 if (gl->gl_state != LM_ST_EXCLUSIVE)
147 return;
146 148
147 if (ip && !S_ISREG(ip->i_inode.i_mode)) 149 if (ip && !S_ISREG(ip->i_inode.i_mode))
148 ip = NULL; 150 ip = NULL;
149 151
150 if (test_bit(GLF_DIRTY, &gl->gl_flags)) { 152 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
151 if (ip && !gfs2_is_jdata(ip))
152 filemap_fdatawrite(ip->i_inode.i_mapping);
153 gfs2_log_flush(gl->gl_sbd, gl); 153 gfs2_log_flush(gl->gl_sbd, gl);
154 if (ip && gfs2_is_jdata(ip)) 154 filemap_fdatawrite(metamapping);
155 filemap_fdatawrite(ip->i_inode.i_mapping);
156 gfs2_meta_sync(gl);
157 if (ip) { 155 if (ip) {
158 struct address_space *mapping = ip->i_inode.i_mapping; 156 struct address_space *mapping = ip->i_inode.i_mapping;
159 int error = filemap_fdatawait(mapping); 157 filemap_fdatawrite(mapping);
158 error = filemap_fdatawait(mapping);
160 mapping_set_error(mapping, error); 159 mapping_set_error(mapping, error);
161 } 160 }
161 error = filemap_fdatawait(metamapping);
162 mapping_set_error(metamapping, error);
162 clear_bit(GLF_DIRTY, &gl->gl_flags); 163 clear_bit(GLF_DIRTY, &gl->gl_flags);
163 gfs2_ail_empty_gl(gl); 164 gfs2_ail_empty_gl(gl);
164 } 165 }
165} 166}
166 167
167/** 168/**
168 * inode_go_xmote_th - promote/demote a glock
169 * @gl: the glock
170 * @state: the requested state
171 * @flags:
172 *
173 */
174
175static void inode_go_xmote_th(struct gfs2_glock *gl)
176{
177 if (gl->gl_state != LM_ST_UNLOCKED)
178 gfs2_pte_inval(gl);
179 if (gl->gl_state == LM_ST_EXCLUSIVE)
180 inode_go_sync(gl);
181}
182
183/**
184 * inode_go_xmote_bh - After promoting/demoting a glock 169 * inode_go_xmote_bh - After promoting/demoting a glock
185 * @gl: the glock 170 * @gl: the glock
186 * 171 *
@@ -201,22 +186,6 @@ static void inode_go_xmote_bh(struct gfs2_glock *gl)
201} 186}
202 187
203/** 188/**
204 * inode_go_drop_th - unlock a glock
205 * @gl: the glock
206 *
207 * Invoked from rq_demote().
208 * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
209 * is being purged from our node's glock cache; we're dropping lock.
210 */
211
212static void inode_go_drop_th(struct gfs2_glock *gl)
213{
214 gfs2_pte_inval(gl);
215 if (gl->gl_state == LM_ST_EXCLUSIVE)
216 inode_go_sync(gl);
217}
218
219/**
220 * inode_go_inval - prepare a inode glock to be released 189 * inode_go_inval - prepare a inode glock to be released
221 * @gl: the glock 190 * @gl: the glock
222 * @flags: 191 * @flags:
@@ -234,10 +203,8 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
234 set_bit(GIF_INVALID, &ip->i_flags); 203 set_bit(GIF_INVALID, &ip->i_flags);
235 } 204 }
236 205
237 if (ip && S_ISREG(ip->i_inode.i_mode)) { 206 if (ip && S_ISREG(ip->i_inode.i_mode))
238 truncate_inode_pages(ip->i_inode.i_mapping, 0); 207 truncate_inode_pages(ip->i_inode.i_mapping, 0);
239 clear_bit(GIF_PAGED, &ip->i_flags);
240 }
241} 208}
242 209
243/** 210/**
@@ -294,23 +261,6 @@ static int inode_go_lock(struct gfs2_holder *gh)
294} 261}
295 262
296/** 263/**
297 * inode_go_unlock - operation done before an inode lock is unlocked by a
298 * process
299 * @gl: the glock
300 * @flags:
301 *
302 */
303
304static void inode_go_unlock(struct gfs2_holder *gh)
305{
306 struct gfs2_glock *gl = gh->gh_gl;
307 struct gfs2_inode *ip = gl->gl_object;
308
309 if (ip)
310 gfs2_meta_cache_flush(ip);
311}
312
313/**
314 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock 264 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
315 * @gl: the glock 265 * @gl: the glock
316 * 266 *
@@ -350,14 +300,14 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
350} 300}
351 301
352/** 302/**
353 * trans_go_xmote_th - promote/demote the transaction glock 303 * trans_go_sync - promote/demote the transaction glock
354 * @gl: the glock 304 * @gl: the glock
355 * @state: the requested state 305 * @state: the requested state
356 * @flags: 306 * @flags:
357 * 307 *
358 */ 308 */
359 309
360static void trans_go_xmote_th(struct gfs2_glock *gl) 310static void trans_go_sync(struct gfs2_glock *gl)
361{ 311{
362 struct gfs2_sbd *sdp = gl->gl_sbd; 312 struct gfs2_sbd *sdp = gl->gl_sbd;
363 313
@@ -384,7 +334,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
384 334
385 if (gl->gl_state != LM_ST_UNLOCKED && 335 if (gl->gl_state != LM_ST_UNLOCKED &&
386 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { 336 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
387 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
388 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 337 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
389 338
390 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 339 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -402,24 +351,6 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
402} 351}
403 352
404/** 353/**
405 * trans_go_drop_th - unlock the transaction glock
406 * @gl: the glock
407 *
408 * We want to sync the device even with localcaching. Remember
409 * that localcaching journal replay only marks buffers dirty.
410 */
411
412static void trans_go_drop_th(struct gfs2_glock *gl)
413{
414 struct gfs2_sbd *sdp = gl->gl_sbd;
415
416 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
417 gfs2_meta_syncfs(sdp);
418 gfs2_log_shutdown(sdp);
419 }
420}
421
422/**
423 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock 354 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
424 * @gl: the glock 355 * @gl: the glock
425 * 356 *
@@ -433,25 +364,21 @@ static int quota_go_demote_ok(struct gfs2_glock *gl)
433 364
434const struct gfs2_glock_operations gfs2_meta_glops = { 365const struct gfs2_glock_operations gfs2_meta_glops = {
435 .go_xmote_th = meta_go_sync, 366 .go_xmote_th = meta_go_sync,
436 .go_drop_th = meta_go_sync,
437 .go_type = LM_TYPE_META, 367 .go_type = LM_TYPE_META,
438}; 368};
439 369
440const struct gfs2_glock_operations gfs2_inode_glops = { 370const struct gfs2_glock_operations gfs2_inode_glops = {
441 .go_xmote_th = inode_go_xmote_th, 371 .go_xmote_th = inode_go_sync,
442 .go_xmote_bh = inode_go_xmote_bh, 372 .go_xmote_bh = inode_go_xmote_bh,
443 .go_drop_th = inode_go_drop_th,
444 .go_inval = inode_go_inval, 373 .go_inval = inode_go_inval,
445 .go_demote_ok = inode_go_demote_ok, 374 .go_demote_ok = inode_go_demote_ok,
446 .go_lock = inode_go_lock, 375 .go_lock = inode_go_lock,
447 .go_unlock = inode_go_unlock,
448 .go_type = LM_TYPE_INODE, 376 .go_type = LM_TYPE_INODE,
449 .go_min_hold_time = HZ / 10, 377 .go_min_hold_time = HZ / 10,
450}; 378};
451 379
452const struct gfs2_glock_operations gfs2_rgrp_glops = { 380const struct gfs2_glock_operations gfs2_rgrp_glops = {
453 .go_xmote_th = meta_go_sync, 381 .go_xmote_th = meta_go_sync,
454 .go_drop_th = meta_go_sync,
455 .go_inval = meta_go_inval, 382 .go_inval = meta_go_inval,
456 .go_demote_ok = rgrp_go_demote_ok, 383 .go_demote_ok = rgrp_go_demote_ok,
457 .go_lock = rgrp_go_lock, 384 .go_lock = rgrp_go_lock,
@@ -461,9 +388,8 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
461}; 388};
462 389
463const struct gfs2_glock_operations gfs2_trans_glops = { 390const struct gfs2_glock_operations gfs2_trans_glops = {
464 .go_xmote_th = trans_go_xmote_th, 391 .go_xmote_th = trans_go_sync,
465 .go_xmote_bh = trans_go_xmote_bh, 392 .go_xmote_bh = trans_go_xmote_bh,
466 .go_drop_th = trans_go_drop_th,
467 .go_type = LM_TYPE_NONDISK, 393 .go_type = LM_TYPE_NONDISK,
468}; 394};
469 395
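With go_drop_th removed, inode_go_sync() becomes the single flush path run via go_xmote_th before an inode glock is demoted: it invalidates PTEs, flushes the log for the glock, starts writeback on the glock's metadata address space and on the inode's data mapping, then waits on both and records any I/O error. A minimal sketch of that write/wait/record pattern; metamapping and mapping correspond to gl->gl_aspace->i_mapping and ip->i_inode.i_mapping in the diff:

    int error;

    filemap_fdatawrite(metamapping);         /* start metadata writeback */
    filemap_fdatawrite(mapping);             /* start data writeback */
    error = filemap_fdatawait(mapping);      /* wait for data I/O */
    mapping_set_error(mapping, error);       /* remember any failure */
    error = filemap_fdatawait(metamapping);  /* wait for metadata I/O */
    mapping_set_error(metamapping, error);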
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eaddfb5a8e6..513aaf0dc0a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -131,7 +131,6 @@ struct gfs2_bufdata {
131struct gfs2_glock_operations { 131struct gfs2_glock_operations {
132 void (*go_xmote_th) (struct gfs2_glock *gl); 132 void (*go_xmote_th) (struct gfs2_glock *gl);
133 void (*go_xmote_bh) (struct gfs2_glock *gl); 133 void (*go_xmote_bh) (struct gfs2_glock *gl);
134 void (*go_drop_th) (struct gfs2_glock *gl);
135 void (*go_inval) (struct gfs2_glock *gl, int flags); 134 void (*go_inval) (struct gfs2_glock *gl, int flags);
136 int (*go_demote_ok) (struct gfs2_glock *gl); 135 int (*go_demote_ok) (struct gfs2_glock *gl);
137 int (*go_lock) (struct gfs2_holder *gh); 136 int (*go_lock) (struct gfs2_holder *gh);
@@ -141,10 +140,6 @@ struct gfs2_glock_operations {
141}; 140};
142 141
143enum { 142enum {
144 /* Actions */
145 HIF_MUTEX = 0,
146 HIF_PROMOTE = 1,
147
148 /* States */ 143 /* States */
149 HIF_HOLDER = 6, 144 HIF_HOLDER = 6,
150 HIF_FIRST = 7, 145 HIF_FIRST = 7,
@@ -171,6 +166,8 @@ enum {
171 GLF_DEMOTE = 3, 166 GLF_DEMOTE = 3,
172 GLF_PENDING_DEMOTE = 4, 167 GLF_PENDING_DEMOTE = 4,
173 GLF_DIRTY = 5, 168 GLF_DIRTY = 5,
169 GLF_DEMOTE_IN_PROGRESS = 6,
170 GLF_LFLUSH = 7,
174}; 171};
175 172
176struct gfs2_glock { 173struct gfs2_glock {
@@ -190,6 +187,7 @@ struct gfs2_glock {
190 struct list_head gl_holders; 187 struct list_head gl_holders;
191 struct list_head gl_waiters1; /* HIF_MUTEX */ 188 struct list_head gl_waiters1; /* HIF_MUTEX */
192 struct list_head gl_waiters3; /* HIF_PROMOTE */ 189 struct list_head gl_waiters3; /* HIF_PROMOTE */
190 int gl_waiters2; /* GIF_DEMOTE */
193 191
194 const struct gfs2_glock_operations *gl_ops; 192 const struct gfs2_glock_operations *gl_ops;
195 193
@@ -210,7 +208,6 @@ struct gfs2_glock {
210 struct gfs2_sbd *gl_sbd; 208 struct gfs2_sbd *gl_sbd;
211 209
212 struct inode *gl_aspace; 210 struct inode *gl_aspace;
213 struct gfs2_log_element gl_le;
214 struct list_head gl_ail_list; 211 struct list_head gl_ail_list;
215 atomic_t gl_ail_count; 212 atomic_t gl_ail_count;
216 struct delayed_work gl_work; 213 struct delayed_work gl_work;
@@ -239,7 +236,6 @@ struct gfs2_alloc {
239enum { 236enum {
240 GIF_INVALID = 0, 237 GIF_INVALID = 0,
241 GIF_QD_LOCKED = 1, 238 GIF_QD_LOCKED = 1,
242 GIF_PAGED = 2,
243 GIF_SW_PAGED = 3, 239 GIF_SW_PAGED = 3,
244}; 240};
245 241
@@ -268,14 +264,10 @@ struct gfs2_inode {
268 struct gfs2_glock *i_gl; /* Move into i_gh? */ 264 struct gfs2_glock *i_gl; /* Move into i_gh? */
269 struct gfs2_holder i_iopen_gh; 265 struct gfs2_holder i_iopen_gh;
270 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 266 struct gfs2_holder i_gh; /* for prepare/commit_write only */
271 struct gfs2_alloc i_alloc; 267 struct gfs2_alloc *i_alloc;
272 u64 i_last_rg_alloc; 268 u64 i_last_rg_alloc;
273 269
274 spinlock_t i_spin;
275 struct rw_semaphore i_rw_mutex; 270 struct rw_semaphore i_rw_mutex;
276 unsigned long i_last_pfault;
277
278 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
279}; 271};
280 272
281/* 273/*
@@ -287,19 +279,12 @@ static inline struct gfs2_inode *GFS2_I(struct inode *inode)
287 return container_of(inode, struct gfs2_inode, i_inode); 279 return container_of(inode, struct gfs2_inode, i_inode);
288} 280}
289 281
290/* To be removed? */ 282static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode)
291static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
292{ 283{
293 return inode->i_sb->s_fs_info; 284 return inode->i_sb->s_fs_info;
294} 285}
295 286
296enum {
297 GFF_DID_DIRECT_ALLOC = 0,
298 GFF_EXLOCK = 1,
299};
300
301struct gfs2_file { 287struct gfs2_file {
302 unsigned long f_flags; /* GFF_... */
303 struct mutex f_fl_mutex; 288 struct mutex f_fl_mutex;
304 struct gfs2_holder f_fl_gh; 289 struct gfs2_holder f_fl_gh;
305}; 290};
@@ -373,8 +358,17 @@ struct gfs2_ail {
373 u64 ai_sync_gen; 358 u64 ai_sync_gen;
374}; 359};
375 360
361struct gfs2_journal_extent {
362 struct list_head extent_list;
363
364 unsigned int lblock; /* First logical block */
365 u64 dblock; /* First disk block */
366 u64 blocks;
367};
368
376struct gfs2_jdesc { 369struct gfs2_jdesc {
377 struct list_head jd_list; 370 struct list_head jd_list;
371 struct list_head extent_list;
378 372
379 struct inode *jd_inode; 373 struct inode *jd_inode;
380 unsigned int jd_jid; 374 unsigned int jd_jid;
@@ -421,13 +415,9 @@ struct gfs2_args {
421struct gfs2_tune { 415struct gfs2_tune {
422 spinlock_t gt_spin; 416 spinlock_t gt_spin;
423 417
424 unsigned int gt_ilimit;
425 unsigned int gt_ilimit_tries;
426 unsigned int gt_ilimit_min;
427 unsigned int gt_demote_secs; /* Cache retention for unheld glock */ 418 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
428 unsigned int gt_incore_log_blocks; 419 unsigned int gt_incore_log_blocks;
429 unsigned int gt_log_flush_secs; 420 unsigned int gt_log_flush_secs;
430 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
431 421
432 unsigned int gt_recoverd_secs; 422 unsigned int gt_recoverd_secs;
433 unsigned int gt_logd_secs; 423 unsigned int gt_logd_secs;
@@ -443,10 +433,8 @@ struct gfs2_tune {
443 unsigned int gt_new_files_jdata; 433 unsigned int gt_new_files_jdata;
444 unsigned int gt_new_files_directio; 434 unsigned int gt_new_files_directio;
445 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 435 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
446 unsigned int gt_lockdump_size;
447 unsigned int gt_stall_secs; /* Detects trouble! */ 436 unsigned int gt_stall_secs; /* Detects trouble! */
448 unsigned int gt_complain_secs; 437 unsigned int gt_complain_secs;
449 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
450 unsigned int gt_statfs_quantum; 438 unsigned int gt_statfs_quantum;
451 unsigned int gt_statfs_slow; 439 unsigned int gt_statfs_slow;
452}; 440};
@@ -539,7 +527,6 @@ struct gfs2_sbd {
539 /* StatFS stuff */ 527 /* StatFS stuff */
540 528
541 spinlock_t sd_statfs_spin; 529 spinlock_t sd_statfs_spin;
542 struct mutex sd_statfs_mutex;
543 struct gfs2_statfs_change_host sd_statfs_master; 530 struct gfs2_statfs_change_host sd_statfs_master;
544 struct gfs2_statfs_change_host sd_statfs_local; 531 struct gfs2_statfs_change_host sd_statfs_local;
545 unsigned long sd_statfs_sync_time; 532 unsigned long sd_statfs_sync_time;
@@ -602,20 +589,18 @@ struct gfs2_sbd {
602 unsigned int sd_log_commited_databuf; 589 unsigned int sd_log_commited_databuf;
603 unsigned int sd_log_commited_revoke; 590 unsigned int sd_log_commited_revoke;
604 591
605 unsigned int sd_log_num_gl;
606 unsigned int sd_log_num_buf; 592 unsigned int sd_log_num_buf;
607 unsigned int sd_log_num_revoke; 593 unsigned int sd_log_num_revoke;
608 unsigned int sd_log_num_rg; 594 unsigned int sd_log_num_rg;
609 unsigned int sd_log_num_databuf; 595 unsigned int sd_log_num_databuf;
610 596
611 struct list_head sd_log_le_gl;
612 struct list_head sd_log_le_buf; 597 struct list_head sd_log_le_buf;
613 struct list_head sd_log_le_revoke; 598 struct list_head sd_log_le_revoke;
614 struct list_head sd_log_le_rg; 599 struct list_head sd_log_le_rg;
615 struct list_head sd_log_le_databuf; 600 struct list_head sd_log_le_databuf;
616 struct list_head sd_log_le_ordered; 601 struct list_head sd_log_le_ordered;
617 602
618 unsigned int sd_log_blks_free; 603 atomic_t sd_log_blks_free;
619 struct mutex sd_log_reserve_mutex; 604 struct mutex sd_log_reserve_mutex;
620 605
621 u64 sd_log_sequence; 606 u64 sd_log_sequence;
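Two structural changes in incore.h drive much of the rest of the patch: gfs2_inode.i_alloc becomes a pointer (allocated only for the duration of a block reservation) and sd_log_blks_free becomes an atomic_t, so the free-log-block count can be sampled without holding the log lock. A minimal sketch of the atomic counter usage; the helper names here are illustrative and do not appear in the patch, which open-codes these operations:

    static int log_blocks_low(struct gfs2_sbd *sdp, unsigned int needed)
    {
            return atomic_read(&sdp->sd_log_blks_free) <= needed;
    }

    static void log_blocks_release(struct gfs2_sbd *sdp, unsigned int blks)
    {
            atomic_add(blks, &sdp->sd_log_blks_free);
    }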
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5f6dc32946c..728d3169e7b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -31,7 +31,6 @@
31#include "log.h" 31#include "log.h"
32#include "meta_io.h" 32#include "meta_io.h"
33#include "ops_address.h" 33#include "ops_address.h"
34#include "ops_file.h"
35#include "ops_inode.h" 34#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
@@ -132,15 +131,21 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
132 131
133void gfs2_set_iop(struct inode *inode) 132void gfs2_set_iop(struct inode *inode)
134{ 133{
134 struct gfs2_sbd *sdp = GFS2_SB(inode);
135 umode_t mode = inode->i_mode; 135 umode_t mode = inode->i_mode;
136 136
137 if (S_ISREG(mode)) { 137 if (S_ISREG(mode)) {
138 inode->i_op = &gfs2_file_iops; 138 inode->i_op = &gfs2_file_iops;
139 inode->i_fop = &gfs2_file_fops; 139 if (sdp->sd_args.ar_localflocks)
140 inode->i_mapping->a_ops = &gfs2_file_aops; 140 inode->i_fop = &gfs2_file_fops_nolock;
141 else
142 inode->i_fop = &gfs2_file_fops;
141 } else if (S_ISDIR(mode)) { 143 } else if (S_ISDIR(mode)) {
142 inode->i_op = &gfs2_dir_iops; 144 inode->i_op = &gfs2_dir_iops;
143 inode->i_fop = &gfs2_dir_fops; 145 if (sdp->sd_args.ar_localflocks)
146 inode->i_fop = &gfs2_dir_fops_nolock;
147 else
148 inode->i_fop = &gfs2_dir_fops;
144 } else if (S_ISLNK(mode)) { 149 } else if (S_ISLNK(mode)) {
145 inode->i_op = &gfs2_symlink_iops; 150 inode->i_op = &gfs2_symlink_iops;
146 } else { 151 } else {
@@ -291,12 +296,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
291 di->di_entries = be32_to_cpu(str->di_entries); 296 di->di_entries = be32_to_cpu(str->di_entries);
292 297
293 di->di_eattr = be64_to_cpu(str->di_eattr); 298 di->di_eattr = be64_to_cpu(str->di_eattr);
294 return 0; 299 if (S_ISREG(ip->i_inode.i_mode))
295} 300 gfs2_set_aops(&ip->i_inode);
296 301
297static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh) 302 return 0;
298{
299 ip->i_cache[0] = bh;
300} 303}
301 304
302/** 305/**
@@ -366,7 +369,8 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
366 if (error) 369 if (error)
367 goto out_rg_gunlock; 370 goto out_rg_gunlock;
368 371
369 gfs2_trans_add_gl(ip->i_gl); 372 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
373 set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
370 374
371 gfs2_free_di(rgd, ip); 375 gfs2_free_di(rgd, ip);
372 376
@@ -707,9 +711,10 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
707 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 711 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
708 int error; 712 int error;
709 713
710 gfs2_alloc_get(dip); 714 if (gfs2_alloc_get(dip) == NULL)
715 return -ENOMEM;
711 716
712 dip->i_alloc.al_requested = RES_DINODE; 717 dip->i_alloc->al_requested = RES_DINODE;
713 error = gfs2_inplace_reserve(dip); 718 error = gfs2_inplace_reserve(dip);
714 if (error) 719 if (error)
715 goto out; 720 goto out;
@@ -855,7 +860,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
855 860
856 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); 861 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
857 if (alloc_required < 0) 862 if (alloc_required < 0)
858 goto fail; 863 goto fail_quota_locks;
859 if (alloc_required) { 864 if (alloc_required) {
860 error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); 865 error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
861 if (error) 866 if (error)
@@ -896,7 +901,7 @@ fail_end_trans:
896 gfs2_trans_end(sdp); 901 gfs2_trans_end(sdp);
897 902
898fail_ipreserv: 903fail_ipreserv:
899 if (dip->i_alloc.al_rgd) 904 if (dip->i_alloc->al_rgd)
900 gfs2_inplace_release(dip); 905 gfs2_inplace_release(dip);
901 906
902fail_quota_locks: 907fail_quota_locks:
@@ -966,7 +971,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
966 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; 971 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
967 int error; 972 int error;
968 u64 generation; 973 u64 generation;
969 struct buffer_head *bh=NULL; 974 struct buffer_head *bh = NULL;
970 975
971 if (!name->len || name->len > GFS2_FNAMESIZE) 976 if (!name->len || name->len > GFS2_FNAMESIZE)
972 return ERR_PTR(-ENAMETOOLONG); 977 return ERR_PTR(-ENAMETOOLONG);
@@ -1003,8 +1008,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1003 if (IS_ERR(inode)) 1008 if (IS_ERR(inode))
1004 goto fail_gunlock2; 1009 goto fail_gunlock2;
1005 1010
1006 gfs2_inode_bh(GFS2_I(inode), bh);
1007
1008 error = gfs2_inode_refresh(GFS2_I(inode)); 1011 error = gfs2_inode_refresh(GFS2_I(inode));
1009 if (error) 1012 if (error)
1010 goto fail_gunlock2; 1013 goto fail_gunlock2;
@@ -1021,6 +1024,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1021 if (error) 1024 if (error)
1022 goto fail_gunlock2; 1025 goto fail_gunlock2;
1023 1026
1027 if (bh)
1028 brelse(bh);
1024 if (!inode) 1029 if (!inode)
1025 return ERR_PTR(-ENOMEM); 1030 return ERR_PTR(-ENOMEM);
1026 return inode; 1031 return inode;
@@ -1032,6 +1037,8 @@ fail_gunlock2:
1032fail_gunlock: 1037fail_gunlock:
1033 gfs2_glock_dq(ghs); 1038 gfs2_glock_dq(ghs);
1034fail: 1039fail:
1040 if (bh)
1041 brelse(bh);
1035 return ERR_PTR(error); 1042 return ERR_PTR(error);
1036} 1043}
1037 1044
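Because i_alloc is now allocated on demand, gfs2_alloc_get() can fail, and alloc_dinode() above shows the pattern callers now follow: check for NULL before touching the reservation, and dereference dip->i_alloc rather than taking the address of an embedded struct. A condensed sketch of that sequence (the out label is assumed to end with gfs2_alloc_put(), as in the full function):

    if (gfs2_alloc_get(dip) == NULL)
            return -ENOMEM;              /* reservation state could not be allocated */

    dip->i_alloc->al_requested = RES_DINODE;
    error = gfs2_inplace_reserve(dip);
    if (error)
            goto out;                    /* 'out' releases via gfs2_alloc_put() */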
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 351ac87ab38..d4465066261 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -20,6 +20,18 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
20 return ip->i_di.di_flags & GFS2_DIF_JDATA; 20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21} 21}
22 22
23static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
24{
25 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
26 return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip);
27}
28
29static inline int gfs2_is_ordered(const struct gfs2_inode *ip)
30{
31 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
32 return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip);
33}
34
23static inline int gfs2_is_dir(const struct gfs2_inode *ip) 35static inline int gfs2_is_dir(const struct gfs2_inode *ip)
24{ 36{
25 return S_ISDIR(ip->i_inode.i_mode); 37 return S_ISDIR(ip->i_inode.i_mode);
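The new gfs2_is_writeback()/gfs2_is_ordered() helpers classify an inode's data mode from the mount arguments, with jdata taking precedence over both. The patch itself selects per-mode behaviour by installing different address space operations via gfs2_set_aops() in inode.c; the dispatcher below is only an illustration of how the helpers partition the three cases, and the jdata writepage name is assumed:

    static int writepage_by_mode(struct gfs2_inode *ip, struct page *page,
                                 struct writeback_control *wbc)
    {
            if (gfs2_is_jdata(ip))
                    return gfs2_jdata_writepage(page, wbc);     /* assumed name */
            if (gfs2_is_writeback(ip))
                    return gfs2_writeback_writepage(page, wbc);
            return gfs2_ordered_writepage(page, wbc);
    }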
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 41c5b04caab..f2efff42422 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -67,6 +67,11 @@ static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
67 memset(data, 0, 256); 67 memset(data, 0, 256);
68 strncpy(data, data_arg, 255); 68 strncpy(data, data_arg, 255);
69 69
70 if (!strlen(data)) {
71 log_error("no mount options, (u)mount helpers not installed");
72 return -EINVAL;
73 }
74
70 for (options = data; (x = strsep(&options, ":")); ) { 75 for (options = data; (x = strsep(&options, ":")); ) {
71 if (!*x) 76 if (!*x)
72 continue; 77 continue;
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index 1f7b038530b..2ebd374b314 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -89,15 +89,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
89 op->info.number = name->ln_number; 89 op->info.number = name->ln_number;
90 op->info.start = fl->fl_start; 90 op->info.start = fl->fl_start;
91 op->info.end = fl->fl_end; 91 op->info.end = fl->fl_end;
92 op->info.owner = (__u64)(long) fl->fl_owner;
93 if (fl->fl_lmops && fl->fl_lmops->fl_grant) { 92 if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
93 /* fl_owner is lockd which doesn't distinguish
94 processes on the nfs client */
95 op->info.owner = (__u64) fl->fl_pid;
94 xop->callback = fl->fl_lmops->fl_grant; 96 xop->callback = fl->fl_lmops->fl_grant;
95 locks_init_lock(&xop->flc); 97 locks_init_lock(&xop->flc);
96 locks_copy_lock(&xop->flc, fl); 98 locks_copy_lock(&xop->flc, fl);
97 xop->fl = fl; 99 xop->fl = fl;
98 xop->file = file; 100 xop->file = file;
99 } else 101 } else {
102 op->info.owner = (__u64)(long) fl->fl_owner;
100 xop->callback = NULL; 103 xop->callback = NULL;
104 }
101 105
102 send_op(op); 106 send_op(op);
103 107
@@ -203,7 +207,10 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
203 op->info.number = name->ln_number; 207 op->info.number = name->ln_number;
204 op->info.start = fl->fl_start; 208 op->info.start = fl->fl_start;
205 op->info.end = fl->fl_end; 209 op->info.end = fl->fl_end;
206 op->info.owner = (__u64)(long) fl->fl_owner; 210 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
211 op->info.owner = (__u64) fl->fl_pid;
212 else
213 op->info.owner = (__u64)(long) fl->fl_owner;
207 214
208 send_op(op); 215 send_op(op);
209 wait_event(recv_wq, (op->done != 0)); 216 wait_event(recv_wq, (op->done != 0));
@@ -242,7 +249,10 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
242 op->info.number = name->ln_number; 249 op->info.number = name->ln_number;
243 op->info.start = fl->fl_start; 250 op->info.start = fl->fl_start;
244 op->info.end = fl->fl_end; 251 op->info.end = fl->fl_end;
245 op->info.owner = (__u64)(long) fl->fl_owner; 252 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
253 op->info.owner = (__u64) fl->fl_pid;
254 else
255 op->info.owner = (__u64)(long) fl->fl_owner;
246 256
247 send_op(op); 257 send_op(op);
248 wait_event(recv_wq, (op->done != 0)); 258 wait_event(recv_wq, (op->done != 0));
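The owner-selection change above is applied identically in gdlm_plock(), gdlm_punlock() and gdlm_plock_get(): when the request arrives through lockd (fl_lmops->fl_grant is set), fl_owner identifies the whole NFS client, so the remote pid is used instead. Restated as a helper purely for clarity; no such helper exists in the patch:

    static __u64 plock_owner(const struct file_lock *fl)
    {
            /* lockd uses one fl_owner per client; only fl_pid tells
             * that client's processes apart */
            if (fl->fl_lmops && fl->fl_lmops->fl_grant)
                    return (__u64)fl->fl_pid;
            return (__u64)(long)fl->fl_owner;
    }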
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index ae9e6a25fe2..a87b0983976 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -189,51 +189,39 @@ static struct kobj_type gdlm_ktype = {
189 .sysfs_ops = &gdlm_attr_ops, 189 .sysfs_ops = &gdlm_attr_ops,
190}; 190};
191 191
192static struct kset gdlm_kset = { 192static struct kset *gdlm_kset;
193 .ktype = &gdlm_ktype,
194};
195 193
196int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) 194int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
197{ 195{
198 int error; 196 int error;
199 197
200 error = kobject_set_name(&ls->kobj, "%s", "lock_module"); 198 ls->kobj.kset = gdlm_kset;
201 if (error) { 199 error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
202 log_error("can't set kobj name %d", error); 200 "lock_module");
203 return error;
204 }
205
206 ls->kobj.kset = &gdlm_kset;
207 ls->kobj.ktype = &gdlm_ktype;
208 ls->kobj.parent = fskobj;
209
210 error = kobject_register(&ls->kobj);
211 if (error) 201 if (error)
212 log_error("can't register kobj %d", error); 202 log_error("can't register kobj %d", error);
203 kobject_uevent(&ls->kobj, KOBJ_ADD);
213 204
214 return error; 205 return error;
215} 206}
216 207
217void gdlm_kobject_release(struct gdlm_ls *ls) 208void gdlm_kobject_release(struct gdlm_ls *ls)
218{ 209{
219 kobject_unregister(&ls->kobj); 210 kobject_put(&ls->kobj);
220} 211}
221 212
222int gdlm_sysfs_init(void) 213int gdlm_sysfs_init(void)
223{ 214{
224 int error; 215 gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
225 216 if (!gdlm_kset) {
226 kobject_set_name(&gdlm_kset.kobj, "lock_dlm"); 217 printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__);
227 kobj_set_kset_s(&gdlm_kset, kernel_subsys); 218 return -ENOMEM;
228 error = kset_register(&gdlm_kset); 219 }
229 if (error) 220 return 0;
230 printk("lock_dlm: cannot register kset %d\n", error);
231
232 return error;
233} 221}
234 222
235void gdlm_sysfs_exit(void) 223void gdlm_sysfs_exit(void)
236{ 224{
237 kset_unregister(&gdlm_kset); 225 kset_unregister(gdlm_kset);
238} 226}
239 227
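The sysfs.c conversion above replaces the statically embedded kset and kobject_register() with the dynamic kset API. The shape of the new-style registration, sketched with placeholder function names (gdlm_ktype and the parent kobject are assumed to exist, as in the diff):

    static struct kset *example_kset;

    static int example_init(void)
    {
            example_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
            return example_kset ? 0 : -ENOMEM;
    }

    static int example_add(struct kobject *kobj, struct kobject *parent)
    {
            int error;

            kobj->kset = example_kset;
            error = kobject_init_and_add(kobj, &gdlm_ktype, parent, "lock_module");
            if (!error)
                    kobject_uevent(kobj, KOBJ_ADD);   /* announce to userspace */
            return error;
    }

    static void example_exit(void)
    {
            kset_unregister(example_kset);
    }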
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index bd938f06481..521694fc19d 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -273,18 +273,13 @@ static int gdlm_thread(void *data, int blist)
273 struct gdlm_ls *ls = (struct gdlm_ls *) data; 273 struct gdlm_ls *ls = (struct gdlm_ls *) data;
274 struct gdlm_lock *lp = NULL; 274 struct gdlm_lock *lp = NULL;
275 uint8_t complete, blocking, submit, drop; 275 uint8_t complete, blocking, submit, drop;
276 DECLARE_WAITQUEUE(wait, current);
277 276
278 /* Only thread1 is allowed to do blocking callbacks since gfs 277 /* Only thread1 is allowed to do blocking callbacks since gfs
279 may wait for a completion callback within a blocking cb. */ 278 may wait for a completion callback within a blocking cb. */
280 279
281 while (!kthread_should_stop()) { 280 while (!kthread_should_stop()) {
282 set_current_state(TASK_INTERRUPTIBLE); 281 wait_event_interruptible(ls->thread_wait,
283 add_wait_queue(&ls->thread_wait, &wait); 282 !no_work(ls, blist) || kthread_should_stop());
284 if (no_work(ls, blist))
285 schedule();
286 remove_wait_queue(&ls->thread_wait, &wait);
287 set_current_state(TASK_RUNNING);
288 283
289 complete = blocking = submit = drop = 0; 284 complete = blocking = submit = drop = 0;
290 285
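The thread.c hunk swaps an open-coded add_wait_queue()/schedule() loop for wait_event_interruptible(), which handles the waitqueue and task-state bookkeeping itself; kthread_should_stop() stays in the condition so a stop request also wakes the thread. The resulting loop shape:

    while (!kthread_should_stop()) {
            wait_event_interruptible(ls->thread_wait,
                                     !no_work(ls, blist) ||
                                     kthread_should_stop());
            /* ... dequeue and process lock requests as before ... */
    }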
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 7df70247325..161ab6f2058 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -16,6 +16,8 @@
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/lm_interface.h> 17#include <linux/lm_interface.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/kthread.h>
20#include <linux/freezer.h>
19 21
20#include "gfs2.h" 22#include "gfs2.h"
21#include "incore.h" 23#include "incore.h"
@@ -68,14 +70,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
68 * 70 *
69 */ 71 */
70 72
71void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd) 73void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
72{ 74{
73 bd->bd_ail = NULL; 75 bd->bd_ail = NULL;
74 list_del_init(&bd->bd_ail_st_list); 76 list_del_init(&bd->bd_ail_st_list);
75 list_del_init(&bd->bd_ail_gl_list); 77 list_del_init(&bd->bd_ail_gl_list);
76 atomic_dec(&bd->bd_gl->gl_ail_count); 78 atomic_dec(&bd->bd_gl->gl_ail_count);
77 if (mapping)
78 gfs2_meta_cache_flush(GFS2_I(mapping->host));
79 brelse(bd->bd_bh); 79 brelse(bd->bd_bh);
80} 80}
81 81
@@ -92,8 +92,6 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
92 struct buffer_head *bh; 92 struct buffer_head *bh;
93 int retry; 93 int retry;
94 94
95 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
96
97 do { 95 do {
98 retry = 0; 96 retry = 0;
99 97
@@ -210,7 +208,7 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
210 gfs2_log_unlock(sdp); 208 gfs2_log_unlock(sdp);
211} 209}
212 210
213int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 211static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
214{ 212{
215 struct gfs2_ail *ai, *s; 213 struct gfs2_ail *ai, *s;
216 int ret; 214 int ret;
@@ -248,7 +246,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
248 bd = list_entry(head->prev, struct gfs2_bufdata, 246 bd = list_entry(head->prev, struct gfs2_bufdata,
249 bd_ail_st_list); 247 bd_ail_st_list);
250 gfs2_assert(sdp, bd->bd_ail == ai); 248 gfs2_assert(sdp, bd->bd_ail == ai);
251 gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd); 249 gfs2_remove_from_ail(bd);
252 } 250 }
253} 251}
254 252
@@ -303,7 +301,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
303 301
304 mutex_lock(&sdp->sd_log_reserve_mutex); 302 mutex_lock(&sdp->sd_log_reserve_mutex);
305 gfs2_log_lock(sdp); 303 gfs2_log_lock(sdp);
306 while(sdp->sd_log_blks_free <= (blks + reserved_blks)) { 304 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
307 gfs2_log_unlock(sdp); 305 gfs2_log_unlock(sdp);
308 gfs2_ail1_empty(sdp, 0); 306 gfs2_ail1_empty(sdp, 0);
309 gfs2_log_flush(sdp, NULL); 307 gfs2_log_flush(sdp, NULL);
@@ -312,7 +310,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
312 gfs2_ail1_start(sdp, 0); 310 gfs2_ail1_start(sdp, 0);
313 gfs2_log_lock(sdp); 311 gfs2_log_lock(sdp);
314 } 312 }
315 sdp->sd_log_blks_free -= blks; 313 atomic_sub(blks, &sdp->sd_log_blks_free);
316 gfs2_log_unlock(sdp); 314 gfs2_log_unlock(sdp);
317 mutex_unlock(&sdp->sd_log_reserve_mutex); 315 mutex_unlock(&sdp->sd_log_reserve_mutex);
318 316
@@ -332,27 +330,23 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
332{ 330{
333 331
334 gfs2_log_lock(sdp); 332 gfs2_log_lock(sdp);
335 sdp->sd_log_blks_free += blks; 333 atomic_add(blks, &sdp->sd_log_blks_free);
336 gfs2_assert_withdraw(sdp, 334 gfs2_assert_withdraw(sdp,
337 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); 335 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
338 gfs2_log_unlock(sdp); 336 gfs2_log_unlock(sdp);
339 up_read(&sdp->sd_log_flush_lock); 337 up_read(&sdp->sd_log_flush_lock);
340} 338}
341 339
342static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 340static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
343{ 341{
344 struct inode *inode = sdp->sd_jdesc->jd_inode; 342 struct gfs2_journal_extent *je;
345 int error; 343
346 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; 344 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
347 345 if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
348 bh_map.b_size = 1 << inode->i_blkbits; 346 return je->dblock + lbn - je->lblock;
349 error = gfs2_block_map(inode, lbn, 0, &bh_map); 347 }
350 if (error || !bh_map.b_blocknr) 348
351 printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, 349 return -1;
352 (unsigned long long)bh_map.b_blocknr, lbn);
353 gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
354
355 return bh_map.b_blocknr;
356} 350}
357 351
358/** 352/**
@@ -561,8 +555,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
561 ail2_empty(sdp, new_tail); 555 ail2_empty(sdp, new_tail);
562 556
563 gfs2_log_lock(sdp); 557 gfs2_log_lock(sdp);
564 sdp->sd_log_blks_free += dist; 558 atomic_add(dist, &sdp->sd_log_blks_free);
565 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); 559 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
566 gfs2_log_unlock(sdp); 560 gfs2_log_unlock(sdp);
567 561
568 sdp->sd_log_tail = new_tail; 562 sdp->sd_log_tail = new_tail;
@@ -652,7 +646,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
652 get_bh(bh); 646 get_bh(bh);
653 gfs2_log_unlock(sdp); 647 gfs2_log_unlock(sdp);
654 lock_buffer(bh); 648 lock_buffer(bh);
655 if (test_clear_buffer_dirty(bh)) { 649 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
656 bh->b_end_io = end_buffer_write_sync; 650 bh->b_end_io = end_buffer_write_sync;
657 submit_bh(WRITE, bh); 651 submit_bh(WRITE, bh);
658 } else { 652 } else {
@@ -694,20 +688,16 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
694 * 688 *
695 */ 689 */
696 690
697void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 691void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
698{ 692{
699 struct gfs2_ail *ai; 693 struct gfs2_ail *ai;
700 694
701 down_write(&sdp->sd_log_flush_lock); 695 down_write(&sdp->sd_log_flush_lock);
702 696
703 if (gl) { 697 /* Log might have been flushed while we waited for the flush lock */
704 gfs2_log_lock(sdp); 698 if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
705 if (list_empty(&gl->gl_le.le_list)) { 699 up_write(&sdp->sd_log_flush_lock);
706 gfs2_log_unlock(sdp); 700 return;
707 up_write(&sdp->sd_log_flush_lock);
708 return;
709 }
710 gfs2_log_unlock(sdp);
711 } 701 }
712 702
713 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); 703 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
@@ -739,7 +729,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
739 log_flush_commit(sdp); 729 log_flush_commit(sdp);
740 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 730 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
741 gfs2_log_lock(sdp); 731 gfs2_log_lock(sdp);
742 sdp->sd_log_blks_free--; /* Adjust for unreserved buffer */ 732 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
743 gfs2_log_unlock(sdp); 733 gfs2_log_unlock(sdp);
744 log_write_header(sdp, 0, PULL); 734 log_write_header(sdp, 0, PULL);
745 } 735 }
@@ -767,7 +757,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
767static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 757static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
768{ 758{
769 unsigned int reserved; 759 unsigned int reserved;
770 unsigned int old; 760 unsigned int unused;
771 761
772 gfs2_log_lock(sdp); 762 gfs2_log_lock(sdp);
773 763
@@ -779,14 +769,11 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
779 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
780 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); 770 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
781 reserved = calc_reserved(sdp); 771 reserved = calc_reserved(sdp);
782 old = sdp->sd_log_blks_free; 772 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
783 sdp->sd_log_blks_free += tr->tr_reserved - 773 gfs2_assert_withdraw(sdp, unused >= 0);
784 (reserved - sdp->sd_log_blks_reserved); 774 atomic_add(unused, &sdp->sd_log_blks_free);
785 775 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
786 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
787 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <=
788 sdp->sd_jdesc->jd_blocks); 776 sdp->sd_jdesc->jd_blocks);
789
790 sdp->sd_log_blks_reserved = reserved; 777 sdp->sd_log_blks_reserved = reserved;
791 778
792 gfs2_log_unlock(sdp); 779 gfs2_log_unlock(sdp);
@@ -825,7 +812,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
825 down_write(&sdp->sd_log_flush_lock); 812 down_write(&sdp->sd_log_flush_lock);
826 813
827 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 814 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
828 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
829 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); 815 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
830 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 816 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
831 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); 817 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
@@ -838,7 +824,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
838 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 824 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
839 (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); 825 (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);
840 826
841 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks); 827 gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
842 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); 828 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
843 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); 829 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
844 830
@@ -866,3 +852,42 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
866 } 852 }
867} 853}
868 854
855
856/**
857 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
858 * @sdp: Pointer to GFS2 superblock
859 *
860 * Also, periodically check to make sure that we're using the most recent
861 * journal index.
862 */
863
864int gfs2_logd(void *data)
865{
866 struct gfs2_sbd *sdp = data;
867 unsigned long t;
868 int need_flush;
869
870 while (!kthread_should_stop()) {
871 /* Advance the log tail */
872
873 t = sdp->sd_log_flush_time +
874 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
875
876 gfs2_ail1_empty(sdp, DIO_ALL);
877 gfs2_log_lock(sdp);
878 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
879 gfs2_log_unlock(sdp);
880 if (need_flush || time_after_eq(jiffies, t)) {
881 gfs2_log_flush(sdp, NULL);
882 sdp->sd_log_flush_time = jiffies;
883 }
884
885 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
886 if (freezing(current))
887 refrigerator();
888 schedule_timeout_interruptible(t);
889 }
890
891 return 0;
892}
893
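log_bmap() above no longer calls gfs2_block_map() for every log block; it walks the journal's precomputed extent list (gfs2_journal_extent in incore.h) and translates the logical block by offsetting into the matching run. A standalone sketch of that translation, using a plain array instead of the kernel list for brevity:

    #include <stdint.h>

    struct journal_extent {
            unsigned int lblock;   /* first logical block of the run */
            uint64_t dblock;       /* first disk block of the run */
            uint64_t blocks;       /* length of the run */
    };

    static uint64_t extent_bmap(const struct journal_extent *ext, int n,
                                unsigned int lbn)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (lbn >= ext[i].lblock &&
                        lbn < ext[i].lblock + ext[i].blocks)
                            return ext[i].dblock + (lbn - ext[i].lblock);
            }
            return (uint64_t)-1;   /* unmapped; the kernel version also returns -1 */
    }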
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index dae28240062..77115281650 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,8 +48,6 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
48unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 48unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
49 unsigned int ssize); 49 unsigned int ssize);
50 50
51int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 51int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); 52void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp); 53void gfs2_log_incr_head(struct gfs2_sbd *sdp);
@@ -57,11 +55,19 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp);
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 55struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 56struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real); 57 struct buffer_head *real);
60void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 58void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
59
60static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
61{
62 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
63 __gfs2_log_flush(sbd, gl);
64}
65
61void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); 66void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
62void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd); 67void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
63 68
64void gfs2_log_shutdown(struct gfs2_sbd *sdp); 69void gfs2_log_shutdown(struct gfs2_sbd *sdp);
65void gfs2_meta_syncfs(struct gfs2_sbd *sdp); 70void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
71int gfs2_logd(void *data);
66 72
67#endif /* __LOG_DOT_H__ */ 73#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c27cea761c..fae59d69d01 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -87,6 +87,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
87 } 87 }
88 bd->bd_ail = ai; 88 bd->bd_ail = ai;
89 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 89 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
90 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
90 gfs2_log_unlock(sdp); 91 gfs2_log_unlock(sdp);
91 unlock_buffer(bh); 92 unlock_buffer(bh);
92} 93}
@@ -124,49 +125,6 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
124 return bh; 125 return bh;
125} 126}
126 127
127static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
128{
129 struct gfs2_glock *gl;
130 struct gfs2_trans *tr = current->journal_info;
131
132 tr->tr_touched = 1;
133
134 gl = container_of(le, struct gfs2_glock, gl_le);
135 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
136 return;
137
138 if (!list_empty(&le->le_list))
139 return;
140
141 gfs2_glock_hold(gl);
142 set_bit(GLF_DIRTY, &gl->gl_flags);
143 sdp->sd_log_num_gl++;
144 list_add(&le->le_list, &sdp->sd_log_le_gl);
145}
146
147static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
148{
149 gfs2_log_lock(sdp);
150 __glock_lo_add(sdp, le);
151 gfs2_log_unlock(sdp);
152}
153
154static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
155{
156 struct list_head *head = &sdp->sd_log_le_gl;
157 struct gfs2_glock *gl;
158
159 while (!list_empty(head)) {
160 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
161 list_del_init(&gl->gl_le.le_list);
162 sdp->sd_log_num_gl--;
163
164 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
165 gfs2_glock_put(gl);
166 }
167 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
168}
169
170static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 128static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
171{ 129{
172 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); 130 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
@@ -182,7 +140,8 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
182 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 140 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
183 if (!list_empty(&le->le_list)) 141 if (!list_empty(&le->le_list))
184 goto out; 142 goto out;
185 __glock_lo_add(sdp, &bd->bd_gl->gl_le); 143 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
144 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
186 gfs2_meta_check(sdp, bd->bd_bh); 145 gfs2_meta_check(sdp, bd->bd_bh);
187 gfs2_pin(sdp, bd->bd_bh); 146 gfs2_pin(sdp, bd->bd_bh);
188 sdp->sd_log_num_buf++; 147 sdp->sd_log_num_buf++;
@@ -556,17 +515,20 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
556 515
557 lock_buffer(bd->bd_bh); 516 lock_buffer(bd->bd_bh);
558 gfs2_log_lock(sdp); 517 gfs2_log_lock(sdp);
559 if (!list_empty(&bd->bd_list_tr)) 518 if (tr) {
560 goto out; 519 if (!list_empty(&bd->bd_list_tr))
561 tr->tr_touched = 1; 520 goto out;
562 if (gfs2_is_jdata(ip)) { 521 tr->tr_touched = 1;
563 tr->tr_num_buf++; 522 if (gfs2_is_jdata(ip)) {
564 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 523 tr->tr_num_buf++;
524 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
525 }
565 } 526 }
566 if (!list_empty(&le->le_list)) 527 if (!list_empty(&le->le_list))
567 goto out; 528 goto out;
568 529
569 __glock_lo_add(sdp, &bd->bd_gl->gl_le); 530 set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
531 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
570 if (gfs2_is_jdata(ip)) { 532 if (gfs2_is_jdata(ip)) {
571 gfs2_pin(sdp, bd->bd_bh); 533 gfs2_pin(sdp, bd->bd_bh);
572 tr->tr_num_databuf_new++; 534 tr->tr_num_databuf_new++;
@@ -773,12 +735,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
773} 735}
774 736
775 737
776const struct gfs2_log_operations gfs2_glock_lops = {
777 .lo_add = glock_lo_add,
778 .lo_after_commit = glock_lo_after_commit,
779 .lo_name = "glock",
780};
781
782const struct gfs2_log_operations gfs2_buf_lops = { 738const struct gfs2_log_operations gfs2_buf_lops = {
783 .lo_add = buf_lo_add, 739 .lo_add = buf_lo_add,
784 .lo_incore_commit = buf_lo_incore_commit, 740 .lo_incore_commit = buf_lo_incore_commit,
@@ -816,7 +772,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
816}; 772};
817 773
818const struct gfs2_log_operations *gfs2_log_ops[] = { 774const struct gfs2_log_operations *gfs2_log_ops[] = {
819 &gfs2_glock_lops,
820 &gfs2_databuf_lops, 775 &gfs2_databuf_lops,
821 &gfs2_buf_lops, 776 &gfs2_buf_lops,
822 &gfs2_rg_lops, 777 &gfs2_rg_lops,
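With the glock log-element operations gone, the buffer add paths above mark the owning glock directly instead of queueing it on sd_log_le_gl: GLF_LFLUSH says the glock still has journaled buffers in flight, GLF_DIRTY that it needs syncing before demotion. gfs2_unpin() clears GLF_LFLUSH once the buffer reaches the AIL, and the inline gfs2_log_flush() wrapper in log.h tests the bit to skip needless flushes. The replacement for __glock_lo_add() reduces to:

    set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);  /* journaled buffers pending */
    set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);   /* sync before demote */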
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7ecfe0d3a49..9c7765c12d6 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -29,9 +29,8 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
29 struct gfs2_inode *ip = foo; 29 struct gfs2_inode *ip = foo;
30 30
31 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
32 spin_lock_init(&ip->i_spin);
33 init_rwsem(&ip->i_rw_mutex); 32 init_rwsem(&ip->i_rw_mutex);
34 memset(ip->i_cache, 0, sizeof(ip->i_cache)); 33 ip->i_alloc = NULL;
35} 34}
36 35
37static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) 36static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 4da423985e4..85aea27b4a8 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -50,6 +50,7 @@ static int gfs2_aspace_writepage(struct page *page,
50static const struct address_space_operations aspace_aops = { 50static const struct address_space_operations aspace_aops = {
51 .writepage = gfs2_aspace_writepage, 51 .writepage = gfs2_aspace_writepage,
52 .releasepage = gfs2_releasepage, 52 .releasepage = gfs2_releasepage,
53 .sync_page = block_sync_page,
53}; 54};
54 55
55/** 56/**
@@ -221,13 +222,14 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
221 struct buffer_head **bhp) 222 struct buffer_head **bhp)
222{ 223{
223 *bhp = getbuf(gl, blkno, CREATE); 224 *bhp = getbuf(gl, blkno, CREATE);
224 if (!buffer_uptodate(*bhp)) 225 if (!buffer_uptodate(*bhp)) {
225 ll_rw_block(READ_META, 1, bhp); 226 ll_rw_block(READ_META, 1, bhp);
226 if (flags & DIO_WAIT) { 227 if (flags & DIO_WAIT) {
227 int error = gfs2_meta_wait(gl->gl_sbd, *bhp); 228 int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
228 if (error) { 229 if (error) {
229 brelse(*bhp); 230 brelse(*bhp);
230 return error; 231 return error;
232 }
231 } 233 }
232 } 234 }
233 235
@@ -282,7 +284,7 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
282 return; 284 return;
283 } 285 }
284 286
285 bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL), 287 bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
286 bd->bd_bh = bh; 288 bd->bd_bh = bh;
287 bd->bd_gl = gl; 289 bd->bd_gl = gl;
288 290
@@ -317,7 +319,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
317 } 319 }
318 if (bd) { 320 if (bd) {
319 if (bd->bd_ail) { 321 if (bd->bd_ail) {
320 gfs2_remove_from_ail(NULL, bd); 322 gfs2_remove_from_ail(bd);
321 bh->b_private = NULL; 323 bh->b_private = NULL;
322 bd->bd_bh = NULL; 324 bd->bd_bh = NULL;
323 bd->bd_blkno = bh->b_blocknr; 325 bd->bd_blkno = bh->b_blocknr;
@@ -358,32 +360,6 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
358} 360}
359 361
360/** 362/**
361 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
362 * @ip: The GFS2 inode
363 *
364 * This releases buffers that are in the most-recently-used array of
365 * blocks used for indirect block addressing for this inode.
366 */
367
368void gfs2_meta_cache_flush(struct gfs2_inode *ip)
369{
370 struct buffer_head **bh_slot;
371 unsigned int x;
372
373 spin_lock(&ip->i_spin);
374
375 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
376 bh_slot = &ip->i_cache[x];
377 if (*bh_slot) {
378 brelse(*bh_slot);
379 *bh_slot = NULL;
380 }
381 }
382
383 spin_unlock(&ip->i_spin);
384}
385
386/**
387 * gfs2_meta_indirect_buffer - Get a metadata buffer 363 * gfs2_meta_indirect_buffer - Get a metadata buffer
388 * @ip: The GFS2 inode 364 * @ip: The GFS2 inode
389 * @height: The level of this buf in the metadata (indir addr) tree (if any) 365 * @height: The level of this buf in the metadata (indir addr) tree (if any)
@@ -391,8 +367,6 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
391 * @new: Non-zero if we may create a new buffer 367 * @new: Non-zero if we may create a new buffer
392 * @bhp: the buffer is returned here 368 * @bhp: the buffer is returned here
393 * 369 *
394 * Try to use the gfs2_inode's MRU metadata tree cache.
395 *
396 * Returns: errno 370 * Returns: errno
397 */ 371 */
398 372
@@ -401,58 +375,25 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
401{ 375{
402 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 376 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
403 struct gfs2_glock *gl = ip->i_gl; 377 struct gfs2_glock *gl = ip->i_gl;
404 struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height; 378 struct buffer_head *bh;
405 int in_cache = 0; 379 int ret = 0;
406
407 BUG_ON(!gl);
408 BUG_ON(!sdp);
409
410 spin_lock(&ip->i_spin);
411 if (*bh_slot && (*bh_slot)->b_blocknr == num) {
412 bh = *bh_slot;
413 get_bh(bh);
414 in_cache = 1;
415 }
416 spin_unlock(&ip->i_spin);
417
418 if (!bh)
419 bh = getbuf(gl, num, CREATE);
420
421 if (!bh)
422 return -ENOBUFS;
423 380
424 if (new) { 381 if (new) {
425 if (gfs2_assert_warn(sdp, height)) 382 BUG_ON(height == 0);
426 goto err; 383 bh = gfs2_meta_new(gl, num);
427 meta_prep_new(bh);
428 gfs2_trans_add_bh(ip->i_gl, bh, 1); 384 gfs2_trans_add_bh(ip->i_gl, bh, 1);
429 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); 385 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
430 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); 386 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
431 } else { 387 } else {
432 u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; 388 u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
433 if (!buffer_uptodate(bh)) { 389 ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
434 ll_rw_block(READ_META, 1, &bh); 390 if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
435 if (gfs2_meta_wait(sdp, bh)) 391 brelse(bh);
436 goto err; 392 ret = -EIO;
437 } 393 }
438 if (gfs2_metatype_check(sdp, bh, mtype))
439 goto err;
440 }
441
442 if (!in_cache) {
443 spin_lock(&ip->i_spin);
444 if (*bh_slot)
445 brelse(*bh_slot);
446 *bh_slot = bh;
447 get_bh(bh);
448 spin_unlock(&ip->i_spin);
449 } 394 }
450
451 *bhp = bh; 395 *bhp = bh;
452 return 0; 396 return ret;
453err:
454 brelse(bh);
455 return -EIO;
456} 397}
457 398
458/** 399/**
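Removing gfs2_meta_cache_flush() and the per-inode i_cache MRU array leaves caching of indirect metadata to the glock's address space, so gfs2_meta_indirect_buffer() shrinks to a read plus a metatype sanity check. The surviving read path, condensed from the diff:

    u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
    int ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);

    if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
            brelse(bh);                /* wrong on-disk type: drop and fail */
            ret = -EIO;
    }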
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b7048222ebb..73e3b1c76fe 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,7 +56,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
56 56
57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); 57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
58 58
59void gfs2_meta_cache_flush(struct gfs2_inode *ip);
60int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, 59int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
61 int new, struct buffer_head **bhp); 60 int new, struct buffer_head **bhp);
62 61
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 9679f8b9870..38dbe99a30e 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -20,6 +20,8 @@
20#include <linux/swap.h> 20#include <linux/swap.h>
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h> 22#include <linux/lm_interface.h>
23#include <linux/backing-dev.h>
24#include <linux/pagevec.h>
23 25
24#include "gfs2.h" 26#include "gfs2.h"
25#include "incore.h" 27#include "incore.h"
@@ -32,7 +34,6 @@
32#include "quota.h" 34#include "quota.h"
33#include "trans.h" 35#include "trans.h"
34#include "rgrp.h" 36#include "rgrp.h"
35#include "ops_file.h"
36#include "super.h" 37#include "super.h"
37#include "util.h" 38#include "util.h"
38#include "glops.h" 39#include "glops.h"
@@ -58,22 +59,6 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
58} 59}
59 60
60/** 61/**
61 * gfs2_get_block - Fills in a buffer head with details about a block
62 * @inode: The inode
63 * @lblock: The block number to look up
64 * @bh_result: The buffer head to return the result in
65 * @create: Non-zero if we may add block to the file
66 *
67 * Returns: errno
68 */
69
70int gfs2_get_block(struct inode *inode, sector_t lblock,
71 struct buffer_head *bh_result, int create)
72{
73 return gfs2_block_map(inode, lblock, create, bh_result);
74}
75
76/**
77 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block 62 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
78 * @inode: The inode 63 * @inode: The inode
79 * @lblock: The block number to look up 64 * @lblock: The block number to look up
@@ -88,7 +73,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
88{ 73{
89 int error; 74 int error;
90 75
91 error = gfs2_block_map(inode, lblock, 0, bh_result); 76 error = gfs2_block_map(inode, lblock, bh_result, 0);
92 if (error) 77 if (error)
93 return error; 78 return error;
94 if (!buffer_mapped(bh_result)) 79 if (!buffer_mapped(bh_result))
@@ -99,20 +84,19 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
99static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, 84static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
100 struct buffer_head *bh_result, int create) 85 struct buffer_head *bh_result, int create)
101{ 86{
102 return gfs2_block_map(inode, lblock, 0, bh_result); 87 return gfs2_block_map(inode, lblock, bh_result, 0);
103} 88}
104 89
105/** 90/**
106 * gfs2_writepage - Write complete page 91 * gfs2_writepage_common - Common bits of writepage
107 * @page: Page to write 92 * @page: The page to be written
93 * @wbc: The writeback control
108 * 94 *
109 * Returns: errno 95 * Returns: 1 if writepage is ok, otherwise an error code or zero if no error.
110 *
111 * Some of this is copied from block_write_full_page() although we still
112 * call it to do most of the work.
113 */ 96 */
114 97
115static int gfs2_writepage(struct page *page, struct writeback_control *wbc) 98static int gfs2_writepage_common(struct page *page,
99 struct writeback_control *wbc)
116{ 100{
117 struct inode *inode = page->mapping->host; 101 struct inode *inode = page->mapping->host;
118 struct gfs2_inode *ip = GFS2_I(inode); 102 struct gfs2_inode *ip = GFS2_I(inode);
@@ -120,41 +104,133 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
120 loff_t i_size = i_size_read(inode); 104 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 105 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset; 106 unsigned offset;
123 int error; 107 int ret = -EIO;
124 int done_trans = 0;
125 108
126 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) { 109 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
127 unlock_page(page); 110 goto out;
128 return -EIO; 111 ret = 0;
129 }
130 if (current->journal_info) 112 if (current->journal_info)
131 goto out_ignore; 113 goto redirty;
132
133 /* Is the page fully outside i_size? (truncate in progress) */ 114 /* Is the page fully outside i_size? (truncate in progress) */
134 offset = i_size & (PAGE_CACHE_SIZE-1); 115 offset = i_size & (PAGE_CACHE_SIZE-1);
135 if (page->index > end_index || (page->index == end_index && !offset)) { 116 if (page->index > end_index || (page->index == end_index && !offset)) {
136 page->mapping->a_ops->invalidatepage(page, 0); 117 page->mapping->a_ops->invalidatepage(page, 0);
137 unlock_page(page); 118 goto out;
138 return 0; /* don't care */ 119 }
120 return 1;
121redirty:
122 redirty_page_for_writepage(wbc, page);
123out:
124 unlock_page(page);
125 return 0;
126}
127
128/**
129 * gfs2_writeback_writepage - Write page for writeback mappings
130 * @page: The page
131 * @wbc: The writeback control
132 *
133 */
134
135static int gfs2_writeback_writepage(struct page *page,
136 struct writeback_control *wbc)
137{
138 int ret;
139
140 ret = gfs2_writepage_common(page, wbc);
141 if (ret <= 0)
142 return ret;
143
144 ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
145 if (ret == -EAGAIN)
146 ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
147 return ret;
148}
149
150/**
151 * gfs2_ordered_writepage - Write page for ordered data files
152 * @page: The page to write
153 * @wbc: The writeback control
154 *
155 */
156
157static int gfs2_ordered_writepage(struct page *page,
158 struct writeback_control *wbc)
159{
160 struct inode *inode = page->mapping->host;
161 struct gfs2_inode *ip = GFS2_I(inode);
162 int ret;
163
164 ret = gfs2_writepage_common(page, wbc);
165 if (ret <= 0)
166 return ret;
167
168 if (!page_has_buffers(page)) {
169 create_empty_buffers(page, inode->i_sb->s_blocksize,
170 (1 << BH_Dirty)|(1 << BH_Uptodate));
139 } 171 }
172 gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1);
173 return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
174}
140 175
141 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) && 176/**
142 PageChecked(page)) { 177 * __gfs2_jdata_writepage - The core of jdata writepage
178 * @page: The page to write
179 * @wbc: The writeback control
180 *
181 * This is shared between writepage and writepages and implements the
182 * core of the writepage operation. If a transaction is required then
183 * PageChecked will have been set and the transaction will have
184 * already been started before this is called.
185 */
186
187static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
188{
189 struct inode *inode = page->mapping->host;
190 struct gfs2_inode *ip = GFS2_I(inode);
191 struct gfs2_sbd *sdp = GFS2_SB(inode);
192
193 if (PageChecked(page)) {
143 ClearPageChecked(page); 194 ClearPageChecked(page);
144 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
145 if (error)
146 goto out_ignore;
147 if (!page_has_buffers(page)) { 195 if (!page_has_buffers(page)) {
148 create_empty_buffers(page, inode->i_sb->s_blocksize, 196 create_empty_buffers(page, inode->i_sb->s_blocksize,
149 (1 << BH_Dirty)|(1 << BH_Uptodate)); 197 (1 << BH_Dirty)|(1 << BH_Uptodate));
150 } 198 }
151 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); 199 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
200 }
201 return block_write_full_page(page, gfs2_get_block_noalloc, wbc);
202}
203
204/**
205 * gfs2_jdata_writepage - Write complete page
206 * @page: Page to write
207 *
208 * Returns: errno
209 *
210 */
211
212static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
213{
214 struct inode *inode = page->mapping->host;
215 struct gfs2_sbd *sdp = GFS2_SB(inode);
216 int error;
217 int done_trans = 0;
218
219 error = gfs2_writepage_common(page, wbc);
220 if (error <= 0)
221 return error;
222
223 if (PageChecked(page)) {
224 if (wbc->sync_mode != WB_SYNC_ALL)
225 goto out_ignore;
226 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
227 if (error)
228 goto out_ignore;
152 done_trans = 1; 229 done_trans = 1;
153 } 230 }
154 error = block_write_full_page(page, gfs2_get_block_noalloc, wbc); 231 error = __gfs2_jdata_writepage(page, wbc);
155 if (done_trans) 232 if (done_trans)
156 gfs2_trans_end(sdp); 233 gfs2_trans_end(sdp);
157 gfs2_meta_cache_flush(ip);
158 return error; 234 return error;
159 235
160out_ignore: 236out_ignore:
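
The end-of-file test in gfs2_writepage_common() above packs two checks into one expression: a page is past EOF if its index is beyond the page holding the last byte, or if it is exactly that page but i_size lands on a page boundary. Below is a minimal userspace sketch of the same arithmetic, assuming 4096-byte pages and using plain stand-ins for the kernel's PAGE_CACHE_* macros; it is an illustration, not the kernel code.

/* Sketch of the past-EOF page check used by gfs2_writepage_common(),
 * rebuilt in userspace with an assumed 4096-byte page size. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Return 1 if a page at 'index' lies entirely beyond 'i_size'. */
static int page_beyond_eof(uint64_t index, uint64_t i_size)
{
    uint64_t end_index = i_size >> PAGE_SHIFT;      /* candidate last data page */
    unsigned offset = i_size & (PAGE_SIZE - 1);     /* bytes of data in that page */

    return index > end_index || (index == end_index && !offset);
}

int main(void)
{
    /* i_size = 8192: pages 0 and 1 are full, page 2 is past EOF.
     * i_size = 8193: page 2 still holds one byte and must be kept. */
    printf("%d %d %d\n", page_beyond_eof(1, 8192),
           page_beyond_eof(2, 8192), page_beyond_eof(2, 8193));
    return 0;
}

The third case is why the !offset term matters: with i_size one byte into page 2, that page still holds data and must not be invalidated.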
@@ -164,29 +240,190 @@ out_ignore:
164} 240}
165 241
166/** 242/**
167 * gfs2_writepages - Write a bunch of dirty pages back to disk 243 * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
168 * @mapping: The mapping to write 244 * @mapping: The mapping to write
169 * @wbc: Write-back control 245 * @wbc: Write-back control
170 * 246 *
171 * For journaled files and/or ordered writes this just falls back to the 247 * For the data=writeback case we can already ignore buffer heads
172 * kernel's default writepages path for now. We will probably want to change
173 * that eventually (i.e. when we look at allocate on flush).
174 *
175 * For the data=writeback case though we can already ignore buffer heads
176 * and write whole extents at once. This is a big reduction in the 248 * and write whole extents at once. This is a big reduction in the
177 * number of I/O requests we send and the bmap calls we make in this case. 249 * number of I/O requests we send and the bmap calls we make in this case.
178 */ 250 */
179static int gfs2_writepages(struct address_space *mapping, 251static int gfs2_writeback_writepages(struct address_space *mapping,
180 struct writeback_control *wbc) 252 struct writeback_control *wbc)
253{
254 return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
255}
256
257/**
258 * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
259 * @mapping: The mapping
260 * @wbc: The writeback control
261 * @pvec: The vector of pages
262 * @nr_pages: The number of pages to write
263 * @end: The last page index to consider
264 *
265 * Returns: non-zero if loop should terminate, zero otherwise
266 */
267
268static int gfs2_write_jdata_pagevec(struct address_space *mapping,
269 struct writeback_control *wbc,
270 struct pagevec *pvec,
271 int nr_pages, pgoff_t end)
181{ 272{
182 struct inode *inode = mapping->host; 273 struct inode *inode = mapping->host;
183 struct gfs2_inode *ip = GFS2_I(inode);
184 struct gfs2_sbd *sdp = GFS2_SB(inode); 274 struct gfs2_sbd *sdp = GFS2_SB(inode);
275 loff_t i_size = i_size_read(inode);
276 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
277 unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
278 unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
279 struct backing_dev_info *bdi = mapping->backing_dev_info;
280 int i;
281 int ret;
282
283 ret = gfs2_trans_begin(sdp, nrblocks, 0);
284 if (ret < 0)
285 return ret;
286
287 for(i = 0; i < nr_pages; i++) {
288 struct page *page = pvec->pages[i];
289
290 lock_page(page);
291
292 if (unlikely(page->mapping != mapping)) {
293 unlock_page(page);
294 continue;
295 }
296
297 if (!wbc->range_cyclic && page->index > end) {
298 ret = 1;
299 unlock_page(page);
300 continue;
301 }
302
303 if (wbc->sync_mode != WB_SYNC_NONE)
304 wait_on_page_writeback(page);
305
306 if (PageWriteback(page) ||
307 !clear_page_dirty_for_io(page)) {
308 unlock_page(page);
309 continue;
310 }
311
312 /* Is the page fully outside i_size? (truncate in progress) */
313 if (page->index > end_index || (page->index == end_index && !offset)) {
314 page->mapping->a_ops->invalidatepage(page, 0);
315 unlock_page(page);
316 continue;
317 }
318
319 ret = __gfs2_jdata_writepage(page, wbc);
320
321 if (ret || (--(wbc->nr_to_write) <= 0))
322 ret = 1;
323 if (wbc->nonblocking && bdi_write_congested(bdi)) {
324 wbc->encountered_congestion = 1;
325 ret = 1;
326 }
327
328 }
329 gfs2_trans_end(sdp);
330 return ret;
331}
332
333/**
334 * gfs2_write_cache_jdata - Like write_cache_pages but different
335 * @mapping: The mapping to write
336 * @wbc: The writeback control
339 *
340 * The reason that we use our own function here is that we need to
341 * start transactions before we grab page locks. This allows us
342 * to get the ordering right.
343 */
344
345static int gfs2_write_cache_jdata(struct address_space *mapping,
346 struct writeback_control *wbc)
347{
348 struct backing_dev_info *bdi = mapping->backing_dev_info;
349 int ret = 0;
350 int done = 0;
351 struct pagevec pvec;
352 int nr_pages;
353 pgoff_t index;
354 pgoff_t end;
355 int scanned = 0;
356 int range_whole = 0;
357
358 if (wbc->nonblocking && bdi_write_congested(bdi)) {
359 wbc->encountered_congestion = 1;
360 return 0;
361 }
362
363 pagevec_init(&pvec, 0);
364 if (wbc->range_cyclic) {
365 index = mapping->writeback_index; /* Start from prev offset */
366 end = -1;
367 } else {
368 index = wbc->range_start >> PAGE_CACHE_SHIFT;
369 end = wbc->range_end >> PAGE_CACHE_SHIFT;
370 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
371 range_whole = 1;
372 scanned = 1;
373 }
185 374
186 if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip)) 375retry:
187 return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); 376 while (!done && (index <= end) &&
377 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
378 PAGECACHE_TAG_DIRTY,
379 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
380 scanned = 1;
381 ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
382 if (ret)
383 done = 1;
384 if (ret > 0)
385 ret = 0;
386
387 pagevec_release(&pvec);
388 cond_resched();
389 }
390
391 if (!scanned && !done) {
392 /*
393 * We hit the last page and there is more work to be done: wrap
394 * back to the start of the file
395 */
396 scanned = 1;
397 index = 0;
398 goto retry;
399 }
400
401 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
402 mapping->writeback_index = index;
403 return ret;
404}
405
406
407/**
408 * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk
409 * @mapping: The mapping to write
410 * @wbc: The writeback control
411 *
412 */
188 413
189 return generic_writepages(mapping, wbc); 414static int gfs2_jdata_writepages(struct address_space *mapping,
415 struct writeback_control *wbc)
416{
417 struct gfs2_inode *ip = GFS2_I(mapping->host);
418 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
419 int ret;
420
421 ret = gfs2_write_cache_jdata(mapping, wbc);
422 if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
423 gfs2_log_flush(sdp, ip->i_gl);
424 ret = gfs2_write_cache_jdata(mapping, wbc);
425 }
426 return ret;
190} 427}
191 428
192/** 429/**
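
gfs2_write_cache_jdata() above follows the shape of write_cache_pages() but opens the transaction before taking any page locks, and the part that is easy to misread is the range_cyclic scan: it resumes from mapping->writeback_index and wraps back to page zero only when the forward scan found nothing. Here is a deliberately simplified userspace model of just that scan; a dirty bitmap stands in for the radix-tree tag lookup, and the transaction, pagevec and congestion handling are left out.

#include <stdio.h>

#define NPAGES 16

static int dirty[NPAGES];

/* Model of the range_cyclic loop: scan forward from the saved index,
 * wrap to zero once if nothing was found, then remember where we stopped. */
static void writeback_cyclic(unsigned *writeback_index)
{
    unsigned index = *writeback_index;   /* start from the previous offset */
    int scanned = 0;

retry:
    for (; index < NPAGES; index++) {
        if (!dirty[index])
            continue;
        scanned = 1;
        dirty[index] = 0;                /* "write" the page */
        printf("wrote page %u\n", index);
    }
    if (!scanned) {
        /* Hit the end without finding work: wrap back to the start once. */
        scanned = 1;
        index = 0;
        goto retry;
    }
    *writeback_index = index;            /* resume point for the next call */
}

int main(void)
{
    unsigned wb_index = 12;

    dirty[2] = 1;                        /* the only dirty page is behind us */
    writeback_cyclic(&wb_index);         /* forward scan is empty, wraps, writes page 2 */
    printf("writeback_index is now %u\n", wb_index);
    return 0;
}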
@@ -231,62 +468,107 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
231 468
232 469
233/** 470/**
234 * gfs2_readpage - readpage with locking 471 * __gfs2_readpage - readpage
235 * @file: The file to read a page for. N.B. This may be NULL if we are 472 * @file: The file to read a page for
236 * reading an internal file.
237 * @page: The page to read 473 * @page: The page to read
238 * 474 *
239 * Returns: errno 475 * This is the core of gfs2's readpage. It's used by the internal file
476 * reading code, as in that case we already hold the glock. It is also
477 * called by gfs2_readpage() once the required lock has been granted.
478 *
240 */ 479 */
241 480
242static int gfs2_readpage(struct file *file, struct page *page) 481static int __gfs2_readpage(void *file, struct page *page)
243{ 482{
244 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 483 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
245 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 484 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
246 struct gfs2_file *gf = NULL;
247 struct gfs2_holder gh;
248 int error; 485 int error;
249 int do_unlock = 0;
250
251 if (likely(file != &gfs2_internal_file_sentinel)) {
252 if (file) {
253 gf = file->private_data;
254 if (test_bit(GFF_EXLOCK, &gf->f_flags))
255 /* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
256 goto skip_lock;
257 }
258 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
259 do_unlock = 1;
260 error = gfs2_glock_nq_atime(&gh);
261 if (unlikely(error))
262 goto out_unlock;
263 }
264 486
265skip_lock:
266 if (gfs2_is_stuffed(ip)) { 487 if (gfs2_is_stuffed(ip)) {
267 error = stuffed_readpage(ip, page); 488 error = stuffed_readpage(ip, page);
268 unlock_page(page); 489 unlock_page(page);
269 } else 490 } else {
270 error = mpage_readpage(page, gfs2_get_block); 491 error = mpage_readpage(page, gfs2_block_map);
492 }
271 493
272 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 494 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
273 error = -EIO; 495 return -EIO;
496
497 return error;
498}
499
500/**
501 * gfs2_readpage - read a page of a file
502 * @file: The file to read
503 * @page: The page of the file
504 *
505 * This deals with the locking required. We use a trylock in order to
506 * avoid the page lock / glock ordering problems, returning AOP_TRUNCATED_PAGE
507 * in the event that we are unable to get the lock.
508 */
509
510static int gfs2_readpage(struct file *file, struct page *page)
511{
512 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
513 struct gfs2_holder gh;
514 int error;
274 515
275 if (do_unlock) { 516 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
276 gfs2_glock_dq_m(1, &gh); 517 error = gfs2_glock_nq_atime(&gh);
277 gfs2_holder_uninit(&gh); 518 if (unlikely(error)) {
519 unlock_page(page);
520 goto out;
278 } 521 }
522 error = __gfs2_readpage(file, page);
523 gfs2_glock_dq(&gh);
279out: 524out:
280 return error; 525 gfs2_holder_uninit(&gh);
281out_unlock:
282 unlock_page(page);
283 if (error == GLR_TRYFAILED) { 526 if (error == GLR_TRYFAILED) {
284 error = AOP_TRUNCATED_PAGE;
285 yield(); 527 yield();
528 return AOP_TRUNCATED_PAGE;
286 } 529 }
287 if (do_unlock) 530 return error;
288 gfs2_holder_uninit(&gh); 531}
289 goto out; 532
533/**
534 * gfs2_internal_read - read an internal file
535 * @ip: The gfs2 inode
536 * @ra_state: The readahead state (or NULL for no readahead)
537 * @buf: The buffer to fill
538 * @pos: The file position
539 * @size: The amount to read
540 *
541 */
542
543int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
544 char *buf, loff_t *pos, unsigned size)
545{
546 struct address_space *mapping = ip->i_inode.i_mapping;
547 unsigned long index = *pos / PAGE_CACHE_SIZE;
548 unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
549 unsigned copied = 0;
550 unsigned amt;
551 struct page *page;
552 void *p;
553
554 do {
555 amt = size - copied;
556 if (offset + size > PAGE_CACHE_SIZE)
557 amt = PAGE_CACHE_SIZE - offset;
558 page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
559 if (IS_ERR(page))
560 return PTR_ERR(page);
561 p = kmap_atomic(page, KM_USER0);
562 memcpy(buf + copied, p + offset, amt);
563 kunmap_atomic(p, KM_USER0);
564 mark_page_accessed(page);
565 page_cache_release(page);
566 copied += amt;
567 index++;
568 offset = 0;
569 } while(copied < size);
570 (*pos) += size;
571 return size;
290} 572}
291 573
292/** 574/**
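
gfs2_internal_read() above replaces the old do_generic_mapping_read() based helper with a direct loop over read_cache_page(). The heart of it is splitting an arbitrary (pos, size) request into chunks that never cross a page boundary; the sketch below shows that split on its own, with a flat buffer standing in for the page cache and an assumed 4096-byte page size. The ra_state argument is not used by the new loop, which is why the quota code later in this patch can pass NULL.

/* Sketch of the page-by-page copy loop in gfs2_internal_read(). */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL

static void internal_read(const char *file, char *buf, unsigned long pos,
                          unsigned size)
{
    unsigned long index = pos / PAGE_SIZE;       /* page number */
    unsigned offset = pos & (PAGE_SIZE - 1);     /* offset inside that page */
    unsigned copied = 0;

    do {
        unsigned amt = size - copied;
        if (offset + amt > PAGE_SIZE)
            amt = PAGE_SIZE - offset;            /* clamp to the page boundary */
        memcpy(buf + copied, file + index * PAGE_SIZE + offset, amt);
        copied += amt;
        index++;
        offset = 0;                              /* later pages start at 0 */
    } while (copied < size);
}

int main(void)
{
    static char file[3 * PAGE_SIZE];
    char out[8] = "";

    memcpy(file + PAGE_SIZE - 3, "abcdefg", 7);  /* data straddles two pages */
    internal_read(file, out, PAGE_SIZE - 3, 7);
    printf("%s\n", out);                         /* prints "abcdefg" */
    return 0;
}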
@@ -300,10 +582,9 @@ out_unlock:
300 * Any I/O we ignore at this time will be done via readpage later. 582 * Any I/O we ignore at this time will be done via readpage later.
301 * 2. We don't handle stuffed files here we let readpage do the honours. 583 * 2. We don't handle stuffed files here we let readpage do the honours.
302 * 3. mpage_readpages() does most of the heavy lifting in the common case. 584 * 3. mpage_readpages() does most of the heavy lifting in the common case.
303 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. 585 * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places.
304 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
305 * well as read-ahead.
306 */ 586 */
587
307static int gfs2_readpages(struct file *file, struct address_space *mapping, 588static int gfs2_readpages(struct file *file, struct address_space *mapping,
308 struct list_head *pages, unsigned nr_pages) 589 struct list_head *pages, unsigned nr_pages)
309{ 590{
@@ -311,42 +592,20 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
311 struct gfs2_inode *ip = GFS2_I(inode); 592 struct gfs2_inode *ip = GFS2_I(inode);
312 struct gfs2_sbd *sdp = GFS2_SB(inode); 593 struct gfs2_sbd *sdp = GFS2_SB(inode);
313 struct gfs2_holder gh; 594 struct gfs2_holder gh;
314 int ret = 0; 595 int ret;
315 int do_unlock = 0;
316 596
317 if (likely(file != &gfs2_internal_file_sentinel)) { 597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
318 if (file) { 598 ret = gfs2_glock_nq_atime(&gh);
319 struct gfs2_file *gf = file->private_data; 599 if (unlikely(ret))
320 if (test_bit(GFF_EXLOCK, &gf->f_flags)) 600 goto out_uninit;
321 goto skip_lock;
322 }
323 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
324 LM_FLAG_TRY_1CB|GL_ATIME, &gh);
325 do_unlock = 1;
326 ret = gfs2_glock_nq_atime(&gh);
327 if (ret == GLR_TRYFAILED)
328 goto out_noerror;
329 if (unlikely(ret))
330 goto out_unlock;
331 }
332skip_lock:
333 if (!gfs2_is_stuffed(ip)) 601 if (!gfs2_is_stuffed(ip))
334 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); 602 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map);
335 603 gfs2_glock_dq(&gh);
336 if (do_unlock) { 604out_uninit:
337 gfs2_glock_dq_m(1, &gh); 605 gfs2_holder_uninit(&gh);
338 gfs2_holder_uninit(&gh);
339 }
340out:
341 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 606 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
342 ret = -EIO; 607 ret = -EIO;
343 return ret; 608 return ret;
344out_noerror:
345 ret = 0;
346out_unlock:
347 if (do_unlock)
348 gfs2_holder_uninit(&gh);
349 goto out;
350} 609}
351 610
352/** 611/**
@@ -382,20 +641,11 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
382 if (unlikely(error)) 641 if (unlikely(error))
383 goto out_uninit; 642 goto out_uninit;
384 643
385 error = -ENOMEM;
386 page = __grab_cache_page(mapping, index);
387 *pagep = page;
388 if (!page)
389 goto out_unlock;
390
391 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); 644 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
392
393 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 645 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
394 if (error) 646 if (error)
395 goto out_putpage; 647 goto out_unlock;
396
397 648
398 ip->i_alloc.al_requested = 0;
399 if (alloc_required) { 649 if (alloc_required) {
400 al = gfs2_alloc_get(ip); 650 al = gfs2_alloc_get(ip);
401 651
@@ -424,40 +674,47 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
424 if (error) 674 if (error)
425 goto out_trans_fail; 675 goto out_trans_fail;
426 676
677 error = -ENOMEM;
678 page = __grab_cache_page(mapping, index);
679 *pagep = page;
680 if (unlikely(!page))
681 goto out_endtrans;
682
427 if (gfs2_is_stuffed(ip)) { 683 if (gfs2_is_stuffed(ip)) {
684 error = 0;
428 if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { 685 if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
429 error = gfs2_unstuff_dinode(ip, page); 686 error = gfs2_unstuff_dinode(ip, page);
430 if (error == 0) 687 if (error == 0)
431 goto prepare_write; 688 goto prepare_write;
432 } else if (!PageUptodate(page)) 689 } else if (!PageUptodate(page)) {
433 error = stuffed_readpage(ip, page); 690 error = stuffed_readpage(ip, page);
691 }
434 goto out; 692 goto out;
435 } 693 }
436 694
437prepare_write: 695prepare_write:
438 error = block_prepare_write(page, from, to, gfs2_get_block); 696 error = block_prepare_write(page, from, to, gfs2_block_map);
439
440out: 697out:
441 if (error) { 698 if (error == 0)
442 gfs2_trans_end(sdp); 699 return 0;
700
701 page_cache_release(page);
702 if (pos + len > ip->i_inode.i_size)
703 vmtruncate(&ip->i_inode, ip->i_inode.i_size);
704out_endtrans:
705 gfs2_trans_end(sdp);
443out_trans_fail: 706out_trans_fail:
444 if (alloc_required) { 707 if (alloc_required) {
445 gfs2_inplace_release(ip); 708 gfs2_inplace_release(ip);
446out_qunlock: 709out_qunlock:
447 gfs2_quota_unlock(ip); 710 gfs2_quota_unlock(ip);
448out_alloc_put: 711out_alloc_put:
449 gfs2_alloc_put(ip); 712 gfs2_alloc_put(ip);
450 } 713 }
451out_putpage:
452 page_cache_release(page);
453 if (pos + len > ip->i_inode.i_size)
454 vmtruncate(&ip->i_inode, ip->i_inode.i_size);
455out_unlock: 714out_unlock:
456 gfs2_glock_dq_m(1, &ip->i_gh); 715 gfs2_glock_dq(&ip->i_gh);
457out_uninit: 716out_uninit:
458 gfs2_holder_uninit(&ip->i_gh); 717 gfs2_holder_uninit(&ip->i_gh);
459 }
460
461 return error; 718 return error;
462} 719}
463 720
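
The reordering in gfs2_write_begin() above (quota, reservation and transaction first, page grab last) means the error paths must unwind in exactly the reverse order, which is what the chain of goto labels does. The stand-alone illustration below shows that idiom with printf stubs in place of the real glock, quota, transaction and page calls; the names are invented for the example.

#include <stdio.h>

static int acquire(const char *what)  { printf("get %s\n", what); return 0; }
static void release(const char *what) { printf("put %s\n", what); }

static int write_begin(int fail_on_page)
{
    int err;

    err = acquire("glock");
    if (err)
        return err;
    err = acquire("quota+reservation");
    if (err)
        goto out_glock;
    err = acquire("transaction");
    if (err)
        goto out_reservation;
    err = fail_on_page ? -1 : acquire("page");
    if (err)
        goto out_trans;
    return 0;                      /* success: write_end releases later */

out_trans:
    release("transaction");
out_reservation:
    release("quota+reservation");
out_glock:
    release("glock");
    return err;
}

int main(void)
{
    write_begin(1);                /* exercise the unwind path */
    return 0;
}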
@@ -565,7 +822,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
565 struct gfs2_inode *ip = GFS2_I(inode); 822 struct gfs2_inode *ip = GFS2_I(inode);
566 struct gfs2_sbd *sdp = GFS2_SB(inode); 823 struct gfs2_sbd *sdp = GFS2_SB(inode);
567 struct buffer_head *dibh; 824 struct buffer_head *dibh;
568 struct gfs2_alloc *al = &ip->i_alloc; 825 struct gfs2_alloc *al = ip->i_alloc;
569 struct gfs2_dinode *di; 826 struct gfs2_dinode *di;
570 unsigned int from = pos & (PAGE_CACHE_SIZE - 1); 827 unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
571 unsigned int to = from + len; 828 unsigned int to = from + len;
@@ -585,19 +842,16 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
585 if (gfs2_is_stuffed(ip)) 842 if (gfs2_is_stuffed(ip))
586 return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); 843 return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
587 844
588 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 845 if (!gfs2_is_writeback(ip))
589 gfs2_page_add_databufs(ip, page, from, to); 846 gfs2_page_add_databufs(ip, page, from, to);
590 847
591 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 848 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
592 849
593 if (likely(ret >= 0)) { 850 if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
594 copied = ret; 851 di = (struct gfs2_dinode *)dibh->b_data;
595 if ((pos + copied) > inode->i_size) { 852 ip->i_di.di_size = inode->i_size;
596 di = (struct gfs2_dinode *)dibh->b_data; 853 di->di_size = cpu_to_be64(inode->i_size);
597 ip->i_di.di_size = inode->i_size; 854 mark_inode_dirty(inode);
598 di->di_size = cpu_to_be64(inode->i_size);
599 mark_inode_dirty(inode);
600 }
601 } 855 }
602 856
603 if (inode == sdp->sd_rindex) 857 if (inode == sdp->sd_rindex)
@@ -606,7 +860,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
606 brelse(dibh); 860 brelse(dibh);
607 gfs2_trans_end(sdp); 861 gfs2_trans_end(sdp);
608failed: 862failed:
609 if (al->al_requested) { 863 if (al) {
610 gfs2_inplace_release(ip); 864 gfs2_inplace_release(ip);
611 gfs2_quota_unlock(ip); 865 gfs2_quota_unlock(ip);
612 gfs2_alloc_put(ip); 866 gfs2_alloc_put(ip);
@@ -625,11 +879,7 @@ failed:
625 879
626static int gfs2_set_page_dirty(struct page *page) 880static int gfs2_set_page_dirty(struct page *page)
627{ 881{
628 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 882 SetPageChecked(page);
629 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
630
631 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
632 SetPageChecked(page);
633 return __set_page_dirty_buffers(page); 883 return __set_page_dirty_buffers(page);
634} 884}
635 885
@@ -653,7 +903,7 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
653 return 0; 903 return 0;
654 904
655 if (!gfs2_is_stuffed(ip)) 905 if (!gfs2_is_stuffed(ip))
656 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block); 906 dblock = generic_block_bmap(mapping, lblock, gfs2_block_map);
657 907
658 gfs2_glock_dq_uninit(&i_gh); 908 gfs2_glock_dq_uninit(&i_gh);
659 909
@@ -719,13 +969,9 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
719{ 969{
720 /* 970 /*
721 * Should we return an error here? I can't see that O_DIRECT for 971 * Should we return an error here? I can't see that O_DIRECT for
722 * a journaled file makes any sense. For now we'll silently fall 972 * a stuffed file makes any sense. For now we'll silently fall
723 * back to buffered I/O, likewise we do the same for stuffed 973 * back to buffered I/O
724 * files since they are (a) small and (b) unaligned.
725 */ 974 */
726 if (gfs2_is_jdata(ip))
727 return 0;
728
729 if (gfs2_is_stuffed(ip)) 975 if (gfs2_is_stuffed(ip))
730 return 0; 976 return 0;
731 977
@@ -836,9 +1082,23 @@ cannot_release:
836 return 0; 1082 return 0;
837} 1083}
838 1084
839const struct address_space_operations gfs2_file_aops = { 1085static const struct address_space_operations gfs2_writeback_aops = {
840 .writepage = gfs2_writepage, 1086 .writepage = gfs2_writeback_writepage,
841 .writepages = gfs2_writepages, 1087 .writepages = gfs2_writeback_writepages,
1088 .readpage = gfs2_readpage,
1089 .readpages = gfs2_readpages,
1090 .sync_page = block_sync_page,
1091 .write_begin = gfs2_write_begin,
1092 .write_end = gfs2_write_end,
1093 .bmap = gfs2_bmap,
1094 .invalidatepage = gfs2_invalidatepage,
1095 .releasepage = gfs2_releasepage,
1096 .direct_IO = gfs2_direct_IO,
1097 .migratepage = buffer_migrate_page,
1098};
1099
1100static const struct address_space_operations gfs2_ordered_aops = {
1101 .writepage = gfs2_ordered_writepage,
842 .readpage = gfs2_readpage, 1102 .readpage = gfs2_readpage,
843 .readpages = gfs2_readpages, 1103 .readpages = gfs2_readpages,
844 .sync_page = block_sync_page, 1104 .sync_page = block_sync_page,
@@ -849,5 +1109,34 @@ const struct address_space_operations gfs2_file_aops = {
849 .invalidatepage = gfs2_invalidatepage, 1109 .invalidatepage = gfs2_invalidatepage,
850 .releasepage = gfs2_releasepage, 1110 .releasepage = gfs2_releasepage,
851 .direct_IO = gfs2_direct_IO, 1111 .direct_IO = gfs2_direct_IO,
1112 .migratepage = buffer_migrate_page,
852}; 1113};
853 1114
1115static const struct address_space_operations gfs2_jdata_aops = {
1116 .writepage = gfs2_jdata_writepage,
1117 .writepages = gfs2_jdata_writepages,
1118 .readpage = gfs2_readpage,
1119 .readpages = gfs2_readpages,
1120 .sync_page = block_sync_page,
1121 .write_begin = gfs2_write_begin,
1122 .write_end = gfs2_write_end,
1123 .set_page_dirty = gfs2_set_page_dirty,
1124 .bmap = gfs2_bmap,
1125 .invalidatepage = gfs2_invalidatepage,
1126 .releasepage = gfs2_releasepage,
1127};
1128
1129void gfs2_set_aops(struct inode *inode)
1130{
1131 struct gfs2_inode *ip = GFS2_I(inode);
1132
1133 if (gfs2_is_writeback(ip))
1134 inode->i_mapping->a_ops = &gfs2_writeback_aops;
1135 else if (gfs2_is_ordered(ip))
1136 inode->i_mapping->a_ops = &gfs2_ordered_aops;
1137 else if (gfs2_is_jdata(ip))
1138 inode->i_mapping->a_ops = &gfs2_jdata_aops;
1139 else
1140 BUG();
1141}
1142
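
The three address_space_operations tables above replace the single gfs2_file_aops, and gfs2_set_aops() picks one per inode so the per-page paths no longer have to test the journaling mode. Below is a minimal sketch of that dispatch-table pattern, with an invented mode enum and toy writepage functions rather than the kernel's types.

#include <stdio.h>

enum data_mode { MODE_WRITEBACK, MODE_ORDERED, MODE_JDATA };

struct aops {
    const char *name;
    int (*writepage)(int page);
};

static int writeback_writepage(int page) { printf("writeback %d\n", page); return 0; }
static int ordered_writepage(int page)   { printf("ordered %d\n", page);   return 0; }
static int jdata_writepage(int page)     { printf("jdata %d\n", page);     return 0; }

static const struct aops writeback_aops = { "writeback", writeback_writepage };
static const struct aops ordered_aops   = { "ordered",   ordered_writepage };
static const struct aops jdata_aops     = { "jdata",     jdata_writepage };

static const struct aops *set_aops(enum data_mode mode)
{
    switch (mode) {
    case MODE_WRITEBACK: return &writeback_aops;
    case MODE_ORDERED:   return &ordered_aops;
    case MODE_JDATA:     return &jdata_aops;
    }
    return NULL;
}

int main(void)
{
    const struct aops *a = set_aops(MODE_ORDERED);

    a->writepage(42);   /* dispatches without re-checking the mode */
    return 0;
}

Choosing the table once at inode setup (and again when the jdata flag changes, as do_gfs2_set_flags() does further down in this patch) keeps the hot paths free of mode tests.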
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
index fa1b5b3d28b..5da21285bba 100644
--- a/fs/gfs2/ops_address.h
+++ b/fs/gfs2/ops_address.h
@@ -14,9 +14,10 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16 16
17extern const struct address_space_operations gfs2_file_aops;
18extern int gfs2_get_block(struct inode *inode, sector_t lblock,
19 struct buffer_head *bh_result, int create);
20extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); 17extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
18extern int gfs2_internal_read(struct gfs2_inode *ip,
19 struct file_ra_state *ra_state,
20 char *buf, loff_t *pos, unsigned size);
21extern void gfs2_set_aops(struct inode *inode);
21 22
22#endif /* __OPS_ADDRESS_DOT_H__ */ 23#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index bb11fd6752d..f4842f2548c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -33,57 +33,12 @@
33#include "lm.h" 33#include "lm.h"
34#include "log.h" 34#include "log.h"
35#include "meta_io.h" 35#include "meta_io.h"
36#include "ops_file.h"
37#include "ops_vm.h"
38#include "quota.h" 36#include "quota.h"
39#include "rgrp.h" 37#include "rgrp.h"
40#include "trans.h" 38#include "trans.h"
41#include "util.h" 39#include "util.h"
42#include "eaops.h" 40#include "eaops.h"
43 41#include "ops_address.h"
44/*
45 * Most fields left uninitialised to catch anybody who tries to
46 * use them. f_flags set to prevent file_accessed() from touching
47 * any other part of this. Its use is purely as a flag so that we
48 * know (in readpage()) whether or not do to locking.
49 */
50struct file gfs2_internal_file_sentinel = {
51 .f_flags = O_NOATIME|O_RDONLY,
52};
53
54static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
55 unsigned long offset, unsigned long size)
56{
57 char *kaddr;
58 unsigned long count = desc->count;
59
60 if (size > count)
61 size = count;
62
63 kaddr = kmap(page);
64 memcpy(desc->arg.data, kaddr + offset, size);
65 kunmap(page);
66
67 desc->count = count - size;
68 desc->written += size;
69 desc->arg.buf += size;
70 return size;
71}
72
73int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
74 char *buf, loff_t *pos, unsigned size)
75{
76 struct inode *inode = &ip->i_inode;
77 read_descriptor_t desc;
78 desc.written = 0;
79 desc.arg.data = buf;
80 desc.count = size;
81 desc.error = 0;
82 do_generic_mapping_read(inode->i_mapping, ra_state,
83 &gfs2_internal_file_sentinel, pos, &desc,
84 gfs2_read_actor);
85 return desc.written ? desc.written : desc.error;
86}
87 42
88/** 43/**
89 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -214,7 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
214 if (put_user(fsflags, ptr)) 169 if (put_user(fsflags, ptr))
215 error = -EFAULT; 170 error = -EFAULT;
216 171
217 gfs2_glock_dq_m(1, &gh); 172 gfs2_glock_dq(&gh);
218 gfs2_holder_uninit(&gh); 173 gfs2_holder_uninit(&gh);
219 return error; 174 return error;
220} 175}
@@ -291,7 +246,16 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
291 if (error) 246 if (error)
292 goto out; 247 goto out;
293 } 248 }
294 249 if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
250 if (flags & GFS2_DIF_JDATA)
251 gfs2_log_flush(sdp, ip->i_gl);
252 error = filemap_fdatawrite(inode->i_mapping);
253 if (error)
254 goto out;
255 error = filemap_fdatawait(inode->i_mapping);
256 if (error)
257 goto out;
258 }
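
The new block above enforces an ordering when the journaled-data flag is toggled: flush the log if the inode is currently jdata, push out any dirty pages, wait for them, and only then start the transaction that rewrites the flags. The sketch below restates that sequence with stand-in functions (they are not the GFS2 calls) so the early-exit structure is easy to see.

#include <stdio.h>

static int log_flush(void)    { puts("log flush");        return 0; }
static int fdatawrite(void)   { puts("write dirty data"); return 0; }
static int fdatawait(void)    { puts("wait for data");    return 0; }
static int update_flags(void) { puts("rewrite dinode");   return 0; }

static int toggle_jdata(int currently_jdata)
{
    int err;

    if (currently_jdata) {       /* leaving jdata: the journal goes out first */
        err = log_flush();
        if (err)
            return err;
    }
    err = fdatawrite();          /* no pages may stay dirty under the old mode */
    if (err)
        return err;
    err = fdatawait();
    if (err)
        return err;
    return update_flags();       /* only now is it safe to switch mode */
}

int main(void)
{
    return toggle_jdata(1);
}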
295 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 259 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
296 if (error) 260 if (error)
297 goto out; 261 goto out;
@@ -303,6 +267,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
303 gfs2_dinode_out(ip, bh->b_data); 267 gfs2_dinode_out(ip, bh->b_data);
304 brelse(bh); 268 brelse(bh);
305 gfs2_set_inode_flags(inode); 269 gfs2_set_inode_flags(inode);
270 gfs2_set_aops(inode);
306out_trans_end: 271out_trans_end:
307 gfs2_trans_end(sdp); 272 gfs2_trans_end(sdp);
308out: 273out:
@@ -338,6 +303,128 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
338 return -ENOTTY; 303 return -ENOTTY;
339} 304}
340 305
306/**
307 * gfs2_allocate_page_backing - Use bmap to allocate blocks
308 * @page: The (locked) page to allocate backing for
309 *
310 * We try to allocate all the blocks required for the page in
311 * one go. This might fail for various reasons, so we keep
312 * trying until all the blocks to back this page are allocated.
313 * If some of the blocks are already allocated, that's ok too.
314 */
315
316static int gfs2_allocate_page_backing(struct page *page)
317{
318 struct inode *inode = page->mapping->host;
319 struct buffer_head bh;
320 unsigned long size = PAGE_CACHE_SIZE;
321 u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
322
323 do {
324 bh.b_state = 0;
325 bh.b_size = size;
326 gfs2_block_map(inode, lblock, &bh, 1);
327 if (!buffer_mapped(&bh))
328 return -EIO;
329 size -= bh.b_size;
330 lblock += (bh.b_size >> inode->i_blkbits);
331 } while(size > 0);
332 return 0;
333}
334
335/**
336 * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
337 * @vma: The virtual memory area
338 * @page: The page which is about to become writable
339 *
340 * When the page becomes writable, we need to ensure that we have
341 * blocks allocated on disk to back that page.
342 */
343
344static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
345{
346 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
347 struct gfs2_inode *ip = GFS2_I(inode);
348 struct gfs2_sbd *sdp = GFS2_SB(inode);
349 unsigned long last_index;
350 u64 pos = page->index << PAGE_CACHE_SHIFT;
351 unsigned int data_blocks, ind_blocks, rblocks;
352 int alloc_required = 0;
353 struct gfs2_holder gh;
354 struct gfs2_alloc *al;
355 int ret;
356
357 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
358 ret = gfs2_glock_nq_atime(&gh);
359 if (ret)
360 goto out;
361
362 set_bit(GIF_SW_PAGED, &ip->i_flags);
363 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
364 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
365 if (ret || !alloc_required)
366 goto out_unlock;
367 ret = -ENOMEM;
368 al = gfs2_alloc_get(ip);
369 if (al == NULL)
370 goto out_unlock;
371
372 ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
373 if (ret)
374 goto out_alloc_put;
375 ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
376 if (ret)
377 goto out_quota_unlock;
378 al->al_requested = data_blocks + ind_blocks;
379 ret = gfs2_inplace_reserve(ip);
380 if (ret)
381 goto out_quota_unlock;
382
383 rblocks = RES_DINODE + ind_blocks;
384 if (gfs2_is_jdata(ip))
385 rblocks += data_blocks ? data_blocks : 1;
386 if (ind_blocks || data_blocks)
387 rblocks += RES_STATFS + RES_QUOTA;
388 ret = gfs2_trans_begin(sdp, rblocks, 0);
389 if (ret)
390 goto out_trans_fail;
391
392 lock_page(page);
393 ret = -EINVAL;
394 last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT;
395 if (page->index > last_index)
396 goto out_unlock_page;
397 ret = 0;
398 if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping)
399 goto out_unlock_page;
400 if (gfs2_is_stuffed(ip)) {
401 ret = gfs2_unstuff_dinode(ip, page);
402 if (ret)
403 goto out_unlock_page;
404 }
405 ret = gfs2_allocate_page_backing(page);
406
407out_unlock_page:
408 unlock_page(page);
409 gfs2_trans_end(sdp);
410out_trans_fail:
411 gfs2_inplace_release(ip);
412out_quota_unlock:
413 gfs2_quota_unlock(ip);
414out_alloc_put:
415 gfs2_alloc_put(ip);
416out_unlock:
417 gfs2_glock_dq(&gh);
418out:
419 gfs2_holder_uninit(&gh);
420 return ret;
421}
422
423static struct vm_operations_struct gfs2_vm_ops = {
424 .fault = filemap_fault,
425 .page_mkwrite = gfs2_page_mkwrite,
426};
427
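
gfs2_allocate_page_backing() above leans on the multi-block gfs2_block_map(): each call may map an extent shorter than what remains of the page, so the loop keeps going until the whole page is covered. Here is a userspace sketch of that loop with an assumed 1k block size and a fake mapper that hands back at most two blocks per call; it only models the control flow, not the allocator.

#include <stdio.h>

#define BLOCK_SIZE 1024UL
#define PAGE_SIZE  4096UL

/* Pretend mapper: returns how many bytes worth of blocks were mapped. */
static unsigned long map_extent(unsigned long lblock, unsigned long want,
                                unsigned long max_extent)
{
    unsigned long got = want < max_extent * BLOCK_SIZE ? want
                                                       : max_extent * BLOCK_SIZE;
    printf("mapped %lu bytes at logical block %lu\n", got, lblock);
    return got;
}

static int allocate_page_backing(unsigned long lblock)
{
    unsigned long size = PAGE_SIZE;

    do {
        unsigned long got = map_extent(lblock, size, 2 /* blocks */);
        if (!got)
            return -1;                   /* nothing mapped: give up */
        size -= got;
        lblock += got / BLOCK_SIZE;      /* advance past this extent */
    } while (size > 0);
    return 0;
}

int main(void)
{
    return allocate_page_backing(8);     /* needs two calls with 2-block extents */
}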
341 428
342/** 429/**
343 * gfs2_mmap - 430 * gfs2_mmap -
@@ -360,14 +447,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
360 return error; 447 return error;
361 } 448 }
362 449
363 /* This is VM_MAYWRITE instead of VM_WRITE because a call 450 vma->vm_ops = &gfs2_vm_ops;
364 to mprotect() can turn on VM_WRITE later. */
365
366 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
367 (VM_MAYSHARE | VM_MAYWRITE))
368 vma->vm_ops = &gfs2_vm_ops_sharewrite;
369 else
370 vma->vm_ops = &gfs2_vm_ops_private;
371 451
372 gfs2_glock_dq_uninit(&i_gh); 452 gfs2_glock_dq_uninit(&i_gh);
373 453
@@ -538,15 +618,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
538 if (__mandatory_lock(&ip->i_inode)) 618 if (__mandatory_lock(&ip->i_inode))
539 return -ENOLCK; 619 return -ENOLCK;
540 620
541 if (sdp->sd_args.ar_localflocks) {
542 if (IS_GETLK(cmd)) {
543 posix_test_lock(file, fl);
544 return 0;
545 } else {
546 return posix_lock_file_wait(file, fl);
547 }
548 }
549
550 if (cmd == F_CANCELLK) { 621 if (cmd == F_CANCELLK) {
551 /* Hack: */ 622 /* Hack: */
552 cmd = F_SETLK; 623 cmd = F_SETLK;
@@ -632,16 +703,12 @@ static void do_unflock(struct file *file, struct file_lock *fl)
632static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) 703static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
633{ 704{
634 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 705 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
635 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
636 706
637 if (!(fl->fl_flags & FL_FLOCK)) 707 if (!(fl->fl_flags & FL_FLOCK))
638 return -ENOLCK; 708 return -ENOLCK;
639 if (__mandatory_lock(&ip->i_inode)) 709 if (__mandatory_lock(&ip->i_inode))
640 return -ENOLCK; 710 return -ENOLCK;
641 711
642 if (sdp->sd_args.ar_localflocks)
643 return flock_lock_file_wait(file, fl);
644
645 if (fl->fl_type == F_UNLCK) { 712 if (fl->fl_type == F_UNLCK) {
646 do_unflock(file, fl); 713 do_unflock(file, fl);
647 return 0; 714 return 0;
@@ -678,3 +745,27 @@ const struct file_operations gfs2_dir_fops = {
678 .flock = gfs2_flock, 745 .flock = gfs2_flock,
679}; 746};
680 747
748const struct file_operations gfs2_file_fops_nolock = {
749 .llseek = gfs2_llseek,
750 .read = do_sync_read,
751 .aio_read = generic_file_aio_read,
752 .write = do_sync_write,
753 .aio_write = generic_file_aio_write,
754 .unlocked_ioctl = gfs2_ioctl,
755 .mmap = gfs2_mmap,
756 .open = gfs2_open,
757 .release = gfs2_close,
758 .fsync = gfs2_fsync,
759 .splice_read = generic_file_splice_read,
760 .splice_write = generic_file_splice_write,
761 .setlease = gfs2_setlease,
762};
763
764const struct file_operations gfs2_dir_fops_nolock = {
765 .readdir = gfs2_readdir,
766 .unlocked_ioctl = gfs2_ioctl,
767 .open = gfs2_open,
768 .release = gfs2_close,
769 .fsync = gfs2_fsync,
770};
771
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
deleted file mode 100644
index 7e5d8ec9c84..00000000000
--- a/fs/gfs2/ops_file.h
+++ /dev/null
@@ -1,24 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12
13#include <linux/fs.h>
14struct gfs2_inode;
15
16extern struct file gfs2_internal_file_sentinel;
17extern int gfs2_internal_read(struct gfs2_inode *ip,
18 struct file_ra_state *ra_state,
19 char *buf, loff_t *pos, unsigned size);
20extern void gfs2_set_inode_flags(struct inode *inode);
21extern const struct file_operations gfs2_file_fops;
22extern const struct file_operations gfs2_dir_fops;
23
24#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 17de58e83d9..43d511bba52 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -21,6 +21,7 @@
21 21
22#include "gfs2.h" 22#include "gfs2.h"
23#include "incore.h" 23#include "incore.h"
24#include "bmap.h"
24#include "daemon.h" 25#include "daemon.h"
25#include "glock.h" 26#include "glock.h"
26#include "glops.h" 27#include "glops.h"
@@ -59,7 +60,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
59 60
60 mutex_init(&sdp->sd_inum_mutex); 61 mutex_init(&sdp->sd_inum_mutex);
61 spin_lock_init(&sdp->sd_statfs_spin); 62 spin_lock_init(&sdp->sd_statfs_spin);
62 mutex_init(&sdp->sd_statfs_mutex);
63 63
64 spin_lock_init(&sdp->sd_rindex_spin); 64 spin_lock_init(&sdp->sd_rindex_spin);
65 mutex_init(&sdp->sd_rindex_mutex); 65 mutex_init(&sdp->sd_rindex_mutex);
@@ -77,7 +77,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
77 77
78 spin_lock_init(&sdp->sd_log_lock); 78 spin_lock_init(&sdp->sd_log_lock);
79 79
80 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
81 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 80 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
82 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 81 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
83 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 82 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
@@ -303,6 +302,67 @@ out:
303 return error; 302 return error;
304} 303}
305 304
305/**
306 * map_journal_extents - create a reusable "extent" mapping from all logical
307 * blocks to all physical blocks for the given journal. This will save
308 * us time when writing journal blocks. Most journals will have only one
309 * extent that maps all their logical blocks. That's because gfs2.mkfs
310 * arranges the journal blocks sequentially to maximize performance.
311 * So the extent would map the first block for the entire file length.
312 * However, gfs2_jadd can happen while file activity is happening, so
313 * those journals may not be sequential. Less likely is the case where
314 * the users created their own journals by mounting the metafs and
315 * laying it out. But it's still possible. These journals might have
316 * several extents.
317 *
318 * TODO: This should be done in bigger chunks rather than one block at a time,
319 * but since it's only done at mount time, I'm not worried about the
320 * time it takes.
321 */
322static int map_journal_extents(struct gfs2_sbd *sdp)
323{
324 struct gfs2_jdesc *jd = sdp->sd_jdesc;
325 unsigned int lb;
326 u64 db, prev_db; /* logical block, disk block, prev disk block */
327 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
328 struct gfs2_journal_extent *jext = NULL;
329 struct buffer_head bh;
330 int rc = 0;
331
332 prev_db = 0;
333
334 for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
335 bh.b_state = 0;
336 bh.b_blocknr = 0;
337 bh.b_size = 1 << ip->i_inode.i_blkbits;
338 rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
339 db = bh.b_blocknr;
340 if (rc || !db) {
341 printk(KERN_INFO "GFS2 journal mapping error %d: lb="
342 "%u db=%llu\n", rc, lb, (unsigned long long)db);
343 break;
344 }
345 if (!prev_db || db != prev_db + 1) {
346 jext = kzalloc(sizeof(struct gfs2_journal_extent),
347 GFP_KERNEL);
348 if (!jext) {
349 printk(KERN_INFO "GFS2 error: out of memory "
350 "mapping journal extents.\n");
351 rc = -ENOMEM;
352 break;
353 }
354 jext->dblock = db;
355 jext->lblock = lb;
356 jext->blocks = 1;
357 list_add_tail(&jext->extent_list, &jd->extent_list);
358 } else {
359 jext->blocks++;
360 }
361 prev_db = db;
362 }
363 return rc;
364}
365
306static int init_journal(struct gfs2_sbd *sdp, int undo) 366static int init_journal(struct gfs2_sbd *sdp, int undo)
307{ 367{
308 struct gfs2_holder ji_gh; 368 struct gfs2_holder ji_gh;
@@ -340,7 +400,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
340 400
341 if (sdp->sd_args.ar_spectator) { 401 if (sdp->sd_args.ar_spectator) {
342 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); 402 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
343 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; 403 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
344 } else { 404 } else {
345 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { 405 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
346 fs_err(sdp, "can't mount journal #%u\n", 406 fs_err(sdp, "can't mount journal #%u\n",
@@ -377,7 +437,10 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
377 sdp->sd_jdesc->jd_jid, error); 437 sdp->sd_jdesc->jd_jid, error);
378 goto fail_jinode_gh; 438 goto fail_jinode_gh;
379 } 439 }
380 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; 440 atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
441
442 /* Map the extents for this journal's blocks */
443 map_journal_extents(sdp);
381 } 444 }
382 445
383 if (sdp->sd_lockstruct.ls_first) { 446 if (sdp->sd_lockstruct.ls_first) {
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 291f0c7eaa3..9f71372c175 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -61,7 +61,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
61 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0); 61 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
62 if (!IS_ERR(inode)) { 62 if (!IS_ERR(inode)) {
63 gfs2_trans_end(sdp); 63 gfs2_trans_end(sdp);
64 if (dip->i_alloc.al_rgd) 64 if (dip->i_alloc->al_rgd)
65 gfs2_inplace_release(dip); 65 gfs2_inplace_release(dip);
66 gfs2_quota_unlock(dip); 66 gfs2_quota_unlock(dip);
67 gfs2_alloc_put(dip); 67 gfs2_alloc_put(dip);
@@ -113,8 +113,18 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
113 if (inode && IS_ERR(inode)) 113 if (inode && IS_ERR(inode))
114 return ERR_PTR(PTR_ERR(inode)); 114 return ERR_PTR(PTR_ERR(inode));
115 115
116 if (inode) 116 if (inode) {
117 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
118 struct gfs2_holder gh;
119 int error;
120 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
121 if (error) {
122 iput(inode);
123 return ERR_PTR(error);
124 }
125 gfs2_glock_dq_uninit(&gh);
117 return d_splice_alias(inode, dentry); 126 return d_splice_alias(inode, dentry);
127 }
118 d_add(dentry, inode); 128 d_add(dentry, inode);
119 129
120 return NULL; 130 return NULL;
@@ -366,7 +376,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
366 } 376 }
367 377
368 gfs2_trans_end(sdp); 378 gfs2_trans_end(sdp);
369 if (dip->i_alloc.al_rgd) 379 if (dip->i_alloc->al_rgd)
370 gfs2_inplace_release(dip); 380 gfs2_inplace_release(dip);
371 gfs2_quota_unlock(dip); 381 gfs2_quota_unlock(dip);
372 gfs2_alloc_put(dip); 382 gfs2_alloc_put(dip);
@@ -442,7 +452,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
442 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */ 452 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
443 453
444 gfs2_trans_end(sdp); 454 gfs2_trans_end(sdp);
445 if (dip->i_alloc.al_rgd) 455 if (dip->i_alloc->al_rgd)
446 gfs2_inplace_release(dip); 456 gfs2_inplace_release(dip);
447 gfs2_quota_unlock(dip); 457 gfs2_quota_unlock(dip);
448 gfs2_alloc_put(dip); 458 gfs2_alloc_put(dip);
@@ -548,7 +558,7 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
548 } 558 }
549 559
550 gfs2_trans_end(sdp); 560 gfs2_trans_end(sdp);
551 if (dip->i_alloc.al_rgd) 561 if (dip->i_alloc->al_rgd)
552 gfs2_inplace_release(dip); 562 gfs2_inplace_release(dip);
553 gfs2_quota_unlock(dip); 563 gfs2_quota_unlock(dip);
554 gfs2_alloc_put(dip); 564 gfs2_alloc_put(dip);
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index 34f0caac1a0..fd8cee231e1 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -16,5 +16,11 @@ extern const struct inode_operations gfs2_file_iops;
16extern const struct inode_operations gfs2_dir_iops; 16extern const struct inode_operations gfs2_dir_iops;
17extern const struct inode_operations gfs2_symlink_iops; 17extern const struct inode_operations gfs2_symlink_iops;
18extern const struct inode_operations gfs2_dev_iops; 18extern const struct inode_operations gfs2_dev_iops;
19extern const struct file_operations gfs2_file_fops;
20extern const struct file_operations gfs2_dir_fops;
21extern const struct file_operations gfs2_file_fops_nolock;
22extern const struct file_operations gfs2_dir_fops_nolock;
23
24extern void gfs2_set_inode_flags(struct inode *inode);
19 25
20#endif /* __OPS_INODE_DOT_H__ */ 26#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 950f31460e8..5e524217944 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -487,7 +487,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
487 if (ip) { 487 if (ip) {
488 ip->i_flags = 0; 488 ip->i_flags = 0;
489 ip->i_gl = NULL; 489 ip->i_gl = NULL;
490 ip->i_last_pfault = jiffies;
491 } 490 }
492 return &ip->i_inode; 491 return &ip->i_inode;
493} 492}
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
deleted file mode 100644
index 927d739d468..00000000000
--- a/fs/gfs2/ops_vm.c
+++ /dev/null
@@ -1,169 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/mm.h>
15#include <linux/pagemap.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "ops_vm.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "util.h"
29
30static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
31{
32 struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
33
34 set_bit(GIF_PAGED, &ip->i_flags);
35 return filemap_fault(vma, vmf);
36}
37
38static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
39{
40 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
41 unsigned long index = page->index;
42 u64 lblock = index << (PAGE_CACHE_SHIFT -
43 sdp->sd_sb.sb_bsize_shift);
44 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
45 struct gfs2_alloc *al;
46 unsigned int data_blocks, ind_blocks;
47 unsigned int x;
48 int error;
49
50 al = gfs2_alloc_get(ip);
51
52 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
53 if (error)
54 goto out;
55
56 error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
57 if (error)
58 goto out_gunlock_q;
59
60 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
61
62 al->al_requested = data_blocks + ind_blocks;
63
64 error = gfs2_inplace_reserve(ip);
65 if (error)
66 goto out_gunlock_q;
67
68 error = gfs2_trans_begin(sdp, al->al_rgd->rd_length +
69 ind_blocks + RES_DINODE +
70 RES_STATFS + RES_QUOTA, 0);
71 if (error)
72 goto out_ipres;
73
74 if (gfs2_is_stuffed(ip)) {
75 error = gfs2_unstuff_dinode(ip, NULL);
76 if (error)
77 goto out_trans;
78 }
79
80 for (x = 0; x < blocks; ) {
81 u64 dblock;
82 unsigned int extlen;
83 int new = 1;
84
85 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
86 if (error)
87 goto out_trans;
88
89 lblock += extlen;
90 x += extlen;
91 }
92
93 gfs2_assert_warn(sdp, al->al_alloced);
94
95out_trans:
96 gfs2_trans_end(sdp);
97out_ipres:
98 gfs2_inplace_release(ip);
99out_gunlock_q:
100 gfs2_quota_unlock(ip);
101out:
102 gfs2_alloc_put(ip);
103 return error;
104}
105
106static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
107 struct vm_fault *vmf)
108{
109 struct file *file = vma->vm_file;
110 struct gfs2_file *gf = file->private_data;
111 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
112 struct gfs2_holder i_gh;
113 int alloc_required;
114 int error;
115 int ret = 0;
116
117 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
118 if (error)
119 goto out;
120
121 set_bit(GIF_PAGED, &ip->i_flags);
122 set_bit(GIF_SW_PAGED, &ip->i_flags);
123
124 error = gfs2_write_alloc_required(ip,
125 (u64)vmf->pgoff << PAGE_CACHE_SHIFT,
126 PAGE_CACHE_SIZE, &alloc_required);
127 if (error) {
128 ret = VM_FAULT_OOM; /* XXX: are these right? */
129 goto out_unlock;
130 }
131
132 set_bit(GFF_EXLOCK, &gf->f_flags);
133 ret = filemap_fault(vma, vmf);
134 clear_bit(GFF_EXLOCK, &gf->f_flags);
135 if (ret & VM_FAULT_ERROR)
136 goto out_unlock;
137
138 if (alloc_required) {
139 /* XXX: do we need to drop page lock around alloc_page_backing?*/
140 error = alloc_page_backing(ip, vmf->page);
141 if (error) {
142 /*
143 * VM_FAULT_LOCKED should always be the case for
144 * filemap_fault, but it may not be in a future
145 * implementation.
146 */
147 if (ret & VM_FAULT_LOCKED)
148 unlock_page(vmf->page);
149 page_cache_release(vmf->page);
150 ret = VM_FAULT_OOM;
151 goto out_unlock;
152 }
153 set_page_dirty(vmf->page);
154 }
155
156out_unlock:
157 gfs2_glock_dq_uninit(&i_gh);
158out:
159 return ret;
160}
161
162struct vm_operations_struct gfs2_vm_ops_private = {
163 .fault = gfs2_private_fault,
164};
165
166struct vm_operations_struct gfs2_vm_ops_sharewrite = {
167 .fault = gfs2_sharewrite_fault,
168};
169
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
deleted file mode 100644
index 4ae8f43ed5e..00000000000
--- a/fs/gfs2/ops_vm.h
+++ /dev/null
@@ -1,18 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13#include <linux/mm.h>
14
15extern struct vm_operations_struct gfs2_vm_ops_private;
16extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
17
18#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index addb51e0f13..a08dabd6ce9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -59,7 +59,6 @@
59#include "super.h" 59#include "super.h"
60#include "trans.h" 60#include "trans.h"
61#include "inode.h" 61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h" 62#include "ops_address.h"
64#include "util.h" 63#include "util.h"
65 64
@@ -274,10 +273,10 @@ static int bh_get(struct gfs2_quota_data *qd)
274 } 273 }
275 274
276 block = qd->qd_slot / sdp->sd_qc_per_block; 275 block = qd->qd_slot / sdp->sd_qc_per_block;
277 offset = qd->qd_slot % sdp->sd_qc_per_block;; 276 offset = qd->qd_slot % sdp->sd_qc_per_block;
278 277
279 bh_map.b_size = 1 << ip->i_inode.i_blkbits; 278 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
280 error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map); 279 error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
281 if (error) 280 if (error)
282 goto fail; 281 goto fail;
283 error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); 282 error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
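
The slot arithmetic in bh_get() above is a plain division and remainder: the quota slot number selects a block of the quota-change file and an entry within that block. The tiny check below walks through the same mapping; the entries-per-block value is an assumption for illustration, not GFS2's actual figure.

#include <stdio.h>

int main(void)
{
    unsigned qc_per_block = 62;             /* assumed entries per block */
    unsigned slot = 200;

    unsigned block  = slot / qc_per_block;  /* which block of the file */
    unsigned offset = slot % qc_per_block;  /* which entry in that block */

    printf("slot %u -> block %u, entry %u\n", slot, block, offset);
    return 0;
}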
@@ -454,7 +453,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
454int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) 453int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
455{ 454{
456 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 455 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
457 struct gfs2_alloc *al = &ip->i_alloc; 456 struct gfs2_alloc *al = ip->i_alloc;
458 struct gfs2_quota_data **qd = al->al_qd; 457 struct gfs2_quota_data **qd = al->al_qd;
459 int error; 458 int error;
460 459
@@ -502,7 +501,7 @@ out:
502void gfs2_quota_unhold(struct gfs2_inode *ip) 501void gfs2_quota_unhold(struct gfs2_inode *ip)
503{ 502{
504 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 503 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
505 struct gfs2_alloc *al = &ip->i_alloc; 504 struct gfs2_alloc *al = ip->i_alloc;
506 unsigned int x; 505 unsigned int x;
507 506
508 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); 507 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
@@ -646,7 +645,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
646 } 645 }
647 646
648 if (!buffer_mapped(bh)) { 647 if (!buffer_mapped(bh)) {
649 gfs2_get_block(inode, iblock, bh, 1); 648 gfs2_block_map(inode, iblock, bh, 1);
650 if (!buffer_mapped(bh)) 649 if (!buffer_mapped(bh))
651 goto unlock; 650 goto unlock;
652 } 651 }
@@ -793,11 +792,9 @@ static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
793 struct gfs2_holder i_gh; 792 struct gfs2_holder i_gh;
794 struct gfs2_quota_host q; 793 struct gfs2_quota_host q;
795 char buf[sizeof(struct gfs2_quota)]; 794 char buf[sizeof(struct gfs2_quota)];
796 struct file_ra_state ra_state;
797 int error; 795 int error;
798 struct gfs2_quota_lvb *qlvb; 796 struct gfs2_quota_lvb *qlvb;
799 797
800 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
801restart: 798restart:
802 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); 799 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
803 if (error) 800 if (error)
@@ -820,8 +817,8 @@ restart:
820 817
821 memset(buf, 0, sizeof(struct gfs2_quota)); 818 memset(buf, 0, sizeof(struct gfs2_quota));
822 pos = qd2offset(qd); 819 pos = qd2offset(qd);
823 error = gfs2_internal_read(ip, &ra_state, buf, 820 error = gfs2_internal_read(ip, NULL, buf, &pos,
824 &pos, sizeof(struct gfs2_quota)); 821 sizeof(struct gfs2_quota));
825 if (error < 0) 822 if (error < 0)
826 goto fail_gunlock; 823 goto fail_gunlock;
827 824
@@ -856,7 +853,7 @@ fail:
856int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) 853int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
857{ 854{
858 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 855 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
859 struct gfs2_alloc *al = &ip->i_alloc; 856 struct gfs2_alloc *al = ip->i_alloc;
860 unsigned int x; 857 unsigned int x;
861 int error = 0; 858 int error = 0;
862 859
@@ -924,7 +921,7 @@ static int need_sync(struct gfs2_quota_data *qd)
924 921
925void gfs2_quota_unlock(struct gfs2_inode *ip) 922void gfs2_quota_unlock(struct gfs2_inode *ip)
926{ 923{
927 struct gfs2_alloc *al = &ip->i_alloc; 924 struct gfs2_alloc *al = ip->i_alloc;
928 struct gfs2_quota_data *qda[4]; 925 struct gfs2_quota_data *qda[4];
929 unsigned int count = 0; 926 unsigned int count = 0;
930 unsigned int x; 927 unsigned int x;
@@ -972,7 +969,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
972int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) 969int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
973{ 970{
974 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 971 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
975 struct gfs2_alloc *al = &ip->i_alloc; 972 struct gfs2_alloc *al = ip->i_alloc;
976 struct gfs2_quota_data *qd; 973 struct gfs2_quota_data *qd;
977 s64 value; 974 s64 value;
978 unsigned int x; 975 unsigned int x;
@@ -1016,10 +1013,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
1016void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 1013void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1017 u32 uid, u32 gid) 1014 u32 uid, u32 gid)
1018{ 1015{
1019 struct gfs2_alloc *al = &ip->i_alloc; 1016 struct gfs2_alloc *al = ip->i_alloc;
1020 struct gfs2_quota_data *qd; 1017 struct gfs2_quota_data *qd;
1021 unsigned int x; 1018 unsigned int x;
1022 unsigned int found = 0;
1023 1019
1024 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) 1020 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
1025 return; 1021 return;
@@ -1032,7 +1028,6 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1032 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || 1028 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1033 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { 1029 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1034 do_qc(qd, change); 1030 do_qc(qd, change);
1035 found++;
1036 } 1031 }
1037 } 1032 }
1038} 1033}
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index beb6c7ac008..b249e294a95 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -391,7 +391,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
391 lblock = head->lh_blkno; 391 lblock = head->lh_blkno;
392 gfs2_replay_incr_blk(sdp, &lblock); 392 gfs2_replay_incr_blk(sdp, &lblock);
393 bh_map.b_size = 1 << ip->i_inode.i_blkbits; 393 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
394 error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map); 394 error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0);
395 if (error) 395 if (error)
396 return error; 396 return error;
397 if (!bh_map.b_blocknr) { 397 if (!bh_map.b_blocknr) {
@@ -504,13 +504,21 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
504 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) 504 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
505 ro = 1; 505 ro = 1;
506 } else { 506 } else {
507 if (sdp->sd_vfs->s_flags & MS_RDONLY) 507 if (sdp->sd_vfs->s_flags & MS_RDONLY) {
508 ro = 1; 508 /* check if device itself is read-only */
509 ro = bdev_read_only(sdp->sd_vfs->s_bdev);
510 if (!ro) {
511 fs_info(sdp, "recovery required on "
512 "read-only filesystem.\n");
513 fs_info(sdp, "write access will be "
514 "enabled during recovery.\n");
515 }
516 }
509 } 517 }
510 518
511 if (ro) { 519 if (ro) {
512 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n", 520 fs_warn(sdp, "jid=%u: Can't replay: read-only block "
513 jd->jd_jid); 521 "device\n", jd->jd_jid);
514 error = -EROFS; 522 error = -EROFS;
515 goto fail_gunlock_tr; 523 goto fail_gunlock_tr;
516 } 524 }
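The point of the recovery.c hunk above is that MS_RDONLY describes the mount, not the media: a filesystem mounted read-only on a writable device can still have its journal replayed, so the code now asks the block layer with bdev_read_only() before refusing. Below is a minimal kernel-style sketch of just that distinction, using only stock super_block fields; it is an illustration of the check, not the gfs2 code.

#include <linux/fs.h>		/* struct super_block, MS_RDONLY, bdev_read_only() */

/* Journal replay has to write. A read-only *mount* is acceptable as long
 * as the underlying block device itself still accepts writes. */
static int demo_can_replay(struct super_block *sb)
{
	if (!(sb->s_flags & MS_RDONLY))
		return 1;			/* read-write mount */
	return !bdev_read_only(sb->s_bdev);	/* ro mount, possibly rw device */
}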
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 708c287e1d0..3552110b2e5 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -25,10 +25,10 @@
25#include "rgrp.h" 25#include "rgrp.h"
26#include "super.h" 26#include "super.h"
27#include "trans.h" 27#include "trans.h"
28#include "ops_file.h"
29#include "util.h" 28#include "util.h"
30#include "log.h" 29#include "log.h"
31#include "inode.h" 30#include "inode.h"
31#include "ops_address.h"
32 32
33#define BFITNOENT ((u32)~0) 33#define BFITNOENT ((u32)~0)
34#define NO_BLOCK ((u64)~0) 34#define NO_BLOCK ((u64)~0)
@@ -126,41 +126,43 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 * Return: the block number (bitmap buffer scope) that was found 126 * Return: the block number (bitmap buffer scope) that was found
127 */ 127 */
128 128
129static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer, 129static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
130 unsigned int buflen, u32 goal, 130 unsigned char old_state)
131 unsigned char old_state)
132{ 131{
133 unsigned char *byte, *end, alloc; 132 unsigned char *byte;
134 u32 blk = goal; 133 u32 blk = goal;
135 unsigned int bit; 134 unsigned int bit, bitlong;
135 unsigned long *plong, plong55;
136 136
137 byte = buffer + (goal / GFS2_NBBY); 137 byte = buffer + (goal / GFS2_NBBY);
138 plong = (unsigned long *)(buffer + (goal / GFS2_NBBY));
138 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; 139 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
139 end = buffer + buflen; 140 bitlong = bit;
140 alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0; 141#if BITS_PER_LONG == 32
141 142 plong55 = 0x55555555;
142 while (byte < end) { 143#else
143 /* If we're looking for a free block we can eliminate all 144 plong55 = 0x5555555555555555;
144 bitmap settings with 0x55, which represents four data 145#endif
145 blocks in a row. If we're looking for a data block, we can 146 while (byte < buffer + buflen) {
146 eliminate 0x00 which corresponds to four free blocks. */ 147
147 if ((*byte & 0x55) == alloc) { 148 if (bitlong == 0 && old_state == 0 && *plong == plong55) {
148 blk += (8 - bit) >> 1; 149 plong++;
149 150 byte += sizeof(unsigned long);
150 bit = 0; 151 blk += sizeof(unsigned long) * GFS2_NBBY;
151 byte++;
152
153 continue; 152 continue;
154 } 153 }
155
156 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) 154 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
157 return blk; 155 return blk;
158
159 bit += GFS2_BIT_SIZE; 156 bit += GFS2_BIT_SIZE;
160 if (bit >= 8) { 157 if (bit >= 8) {
161 bit = 0; 158 bit = 0;
162 byte++; 159 byte++;
163 } 160 }
161 bitlong += GFS2_BIT_SIZE;
162 if (bitlong >= sizeof(unsigned long) * 8) {
163 bitlong = 0;
164 plong++;
165 }
164 166
165 blk++; 167 blk++;
166 } 168 }
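The rewritten gfs2_bitfit() above keeps the bitmap layout described in the deleted comment: four blocks per byte (GFS2_NBBY), two bits per block (GFS2_BIT_SIZE), so a byte of 0x55 is four in-use data blocks in a row, and when searching for a free block (old_state == 0) whole unsigned longs of that pattern can now be skipped in a single comparison against the 0x5555... constant. The userspace sketch below only illustrates the two-bit lookup and the 0x55 shortcut; the demo_ names are placeholders, and the real function additionally handles the search goal, clone bitmaps and BFITNOENT.

#include <stdio.h>
#include <string.h>

#define DEMO_NBBY	4	/* blocks per byte, like GFS2_NBBY */
#define DEMO_BIT_SIZE	2	/* bits per block,  like GFS2_BIT_SIZE */
#define DEMO_BIT_MASK	0x3	/* two-bit mask,    like GFS2_BIT_MASK */
#define DEMO_NOENT	(~0u)	/* "nothing found", like BFITNOENT */

/* Find the first block whose two-bit state equals wanted_state. A byte of
 * 0x55 is four in-use data blocks, so when we want a free block (state 0)
 * such bytes can be skipped without looking at individual bits; the patched
 * kernel code applies the same trick an unsigned long at a time. */
static unsigned int demo_bitfit(const unsigned char *buf, unsigned int buflen,
				unsigned char wanted_state)
{
	unsigned int byteno, bit;

	for (byteno = 0; byteno < buflen; byteno++) {
		if (wanted_state == 0 && buf[byteno] == 0x55)
			continue;	/* four allocated blocks, skip them */
		for (bit = 0; bit < 8; bit += DEMO_BIT_SIZE)
			if (((buf[byteno] >> bit) & DEMO_BIT_MASK) == wanted_state)
				return byteno * DEMO_NBBY + bit / DEMO_BIT_SIZE;
	}
	return DEMO_NOENT;
}

int main(void)
{
	unsigned char bitmap[8];

	memset(bitmap, 0x55, sizeof(bitmap));	/* every block allocated... */
	bitmap[5] = 0x45;			/* ...except one in byte 5 */

	/* 0x45 = binary 01 00 01 01: block 5*4 + 2 is free */
	printf("first free block: %u\n", demo_bitfit(bitmap, sizeof(bitmap), 0));
	return 0;
}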
@@ -817,11 +819,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
817 819
818struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) 820struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
819{ 821{
820 struct gfs2_alloc *al = &ip->i_alloc; 822 BUG_ON(ip->i_alloc != NULL);
821 823 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL);
822 /* FIXME: Should assert that the correct locks are held here... */ 824 return ip->i_alloc;
823 memset(al, 0, sizeof(*al));
824 return al;
825} 825}
826 826
827/** 827/**
@@ -1059,26 +1059,34 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1059 struct inode *inode = NULL; 1059 struct inode *inode = NULL;
1060 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1060 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1061 struct gfs2_rgrpd *rgd, *begin = NULL; 1061 struct gfs2_rgrpd *rgd, *begin = NULL;
1062 struct gfs2_alloc *al = &ip->i_alloc; 1062 struct gfs2_alloc *al = ip->i_alloc;
1063 int flags = LM_FLAG_TRY; 1063 int flags = LM_FLAG_TRY;
1064 int skipped = 0; 1064 int skipped = 0;
1065 int loops = 0; 1065 int loops = 0;
1066 int error; 1066 int error, rg_locked;
1067 1067
1068 /* Try recently successful rgrps */ 1068 /* Try recently successful rgrps */
1069 1069
1070 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); 1070 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
1071 1071
1072 while (rgd) { 1072 while (rgd) {
1073 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 1073 rg_locked = 0;
1074 LM_FLAG_TRY, &al->al_rgd_gh); 1074
1075 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
1076 rg_locked = 1;
1077 error = 0;
1078 } else {
1079 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1080 LM_FLAG_TRY, &al->al_rgd_gh);
1081 }
1075 switch (error) { 1082 switch (error) {
1076 case 0: 1083 case 0:
1077 if (try_rgrp_fit(rgd, al)) 1084 if (try_rgrp_fit(rgd, al))
1078 goto out; 1085 goto out;
1079 if (rgd->rd_flags & GFS2_RDF_CHECK) 1086 if (rgd->rd_flags & GFS2_RDF_CHECK)
1080 inode = try_rgrp_unlink(rgd, last_unlinked); 1087 inode = try_rgrp_unlink(rgd, last_unlinked);
1081 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1088 if (!rg_locked)
1089 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1082 if (inode) 1090 if (inode)
1083 return inode; 1091 return inode;
1084 rgd = recent_rgrp_next(rgd, 1); 1092 rgd = recent_rgrp_next(rgd, 1);
@@ -1098,15 +1106,23 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1098 begin = rgd = forward_rgrp_get(sdp); 1106 begin = rgd = forward_rgrp_get(sdp);
1099 1107
1100 for (;;) { 1108 for (;;) {
1101 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, 1109 rg_locked = 0;
1102 &al->al_rgd_gh); 1110
1111 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
1112 rg_locked = 1;
1113 error = 0;
1114 } else {
1115 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
1116 &al->al_rgd_gh);
1117 }
1103 switch (error) { 1118 switch (error) {
1104 case 0: 1119 case 0:
1105 if (try_rgrp_fit(rgd, al)) 1120 if (try_rgrp_fit(rgd, al))
1106 goto out; 1121 goto out;
1107 if (rgd->rd_flags & GFS2_RDF_CHECK) 1122 if (rgd->rd_flags & GFS2_RDF_CHECK)
1108 inode = try_rgrp_unlink(rgd, last_unlinked); 1123 inode = try_rgrp_unlink(rgd, last_unlinked);
1109 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1124 if (!rg_locked)
1125 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1110 if (inode) 1126 if (inode)
1111 return inode; 1127 return inode;
1112 break; 1128 break;
@@ -1158,7 +1174,7 @@ out:
1158int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) 1174int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1159{ 1175{
1160 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1176 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1161 struct gfs2_alloc *al = &ip->i_alloc; 1177 struct gfs2_alloc *al = ip->i_alloc;
1162 struct inode *inode; 1178 struct inode *inode;
1163 int error = 0; 1179 int error = 0;
1164 u64 last_unlinked = NO_BLOCK; 1180 u64 last_unlinked = NO_BLOCK;
@@ -1204,7 +1220,7 @@ try_again:
1204void gfs2_inplace_release(struct gfs2_inode *ip) 1220void gfs2_inplace_release(struct gfs2_inode *ip)
1205{ 1221{
1206 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1222 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1207 struct gfs2_alloc *al = &ip->i_alloc; 1223 struct gfs2_alloc *al = ip->i_alloc;
1208 1224
1209 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) 1225 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1210 fs_warn(sdp, "al_alloced = %u, al_requested = %u " 1226 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
@@ -1213,7 +1229,8 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1213 al->al_line); 1229 al->al_line);
1214 1230
1215 al->al_rgd = NULL; 1231 al->al_rgd = NULL;
1216 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1232 if (al->al_rgd_gh.gh_gl)
1233 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1217 if (ip != GFS2_I(sdp->sd_rindex)) 1234 if (ip != GFS2_I(sdp->sd_rindex))
1218 gfs2_glock_dq_uninit(&al->al_ri_gh); 1235 gfs2_glock_dq_uninit(&al->al_ri_gh);
1219} 1236}
@@ -1301,11 +1318,10 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1301 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone 1318 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1302 bitmaps, so we must search the originals for that. */ 1319 bitmaps, so we must search the originals for that. */
1303 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) 1320 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1304 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset, 1321 blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset,
1305 bi->bi_len, goal, old_state); 1322 bi->bi_len, goal, old_state);
1306 else 1323 else
1307 blk = gfs2_bitfit(rgd, 1324 blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
1308 bi->bi_bh->b_data + bi->bi_offset,
1309 bi->bi_len, goal, old_state); 1325 bi->bi_len, goal, old_state);
1310 if (blk != BFITNOENT) 1326 if (blk != BFITNOENT)
1311 break; 1327 break;
@@ -1394,7 +1410,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1394u64 gfs2_alloc_data(struct gfs2_inode *ip) 1410u64 gfs2_alloc_data(struct gfs2_inode *ip)
1395{ 1411{
1396 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1412 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1397 struct gfs2_alloc *al = &ip->i_alloc; 1413 struct gfs2_alloc *al = ip->i_alloc;
1398 struct gfs2_rgrpd *rgd = al->al_rgd; 1414 struct gfs2_rgrpd *rgd = al->al_rgd;
1399 u32 goal, blk; 1415 u32 goal, blk;
1400 u64 block; 1416 u64 block;
@@ -1439,7 +1455,7 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
1439u64 gfs2_alloc_meta(struct gfs2_inode *ip) 1455u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1440{ 1456{
1441 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1457 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1442 struct gfs2_alloc *al = &ip->i_alloc; 1458 struct gfs2_alloc *al = ip->i_alloc;
1443 struct gfs2_rgrpd *rgd = al->al_rgd; 1459 struct gfs2_rgrpd *rgd = al->al_rgd;
1444 u32 goal, blk; 1460 u32 goal, blk;
1445 u64 block; 1461 u64 block;
@@ -1485,7 +1501,7 @@ u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1485u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) 1501u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1486{ 1502{
1487 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1503 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1488 struct gfs2_alloc *al = &dip->i_alloc; 1504 struct gfs2_alloc *al = dip->i_alloc;
1489 struct gfs2_rgrpd *rgd = al->al_rgd; 1505 struct gfs2_rgrpd *rgd = al->al_rgd;
1490 u32 blk; 1506 u32 blk;
1491 u64 block; 1507 u64 block;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4c6adfc6f2..149bb161f4b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -32,7 +32,9 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip) 33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{ 34{
35 return; /* So we can see where ip->i_alloc is used */ 35 BUG_ON(ip->i_alloc == NULL);
36 kfree(ip->i_alloc);
37 ip->i_alloc = NULL;
36} 38}
37 39
38int gfs2_inplace_reserve_i(struct gfs2_inode *ip, 40int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
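With the rgrp.c and rgrp.h hunks above, the allocation context stops being embedded in gfs2_inode and is allocated on demand, so i_alloc is now a pointer that only exists for the duration of a reservation. The rough sketch below shows the intended get/put lifetime; the demo_ types stand in for the real gfs2 structures, and, as in the patch, callers are expected to cope with kzalloc() returning NULL.

#include <linux/kernel.h>
#include <linux/slab.h>

struct demo_alloc {			/* stand-in for struct gfs2_alloc */
	unsigned int al_requested;
};

struct demo_inode {			/* stand-in for struct gfs2_inode */
	struct demo_alloc *i_alloc;
};

static struct demo_alloc *demo_alloc_get(struct demo_inode *ip)
{
	BUG_ON(ip->i_alloc != NULL);	/* no nested reservations */
	ip->i_alloc = kzalloc(sizeof(*ip->i_alloc), GFP_KERNEL);
	return ip->i_alloc;		/* may be NULL; caller must check */
}

static void demo_alloc_put(struct demo_inode *ip)
{
	BUG_ON(ip->i_alloc == NULL);	/* every put must pair with a get */
	kfree(ip->i_alloc);
	ip->i_alloc = NULL;
}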
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index dd3e737f528..ef0562c3bc7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -51,13 +51,9 @@ void gfs2_tune_init(struct gfs2_tune *gt)
51{ 51{
52 spin_lock_init(&gt->gt_spin); 52 spin_lock_init(&gt->gt_spin);
53 53
54 gt->gt_ilimit = 100;
55 gt->gt_ilimit_tries = 3;
56 gt->gt_ilimit_min = 1;
57 gt->gt_demote_secs = 300; 54 gt->gt_demote_secs = 300;
58 gt->gt_incore_log_blocks = 1024; 55 gt->gt_incore_log_blocks = 1024;
59 gt->gt_log_flush_secs = 60; 56 gt->gt_log_flush_secs = 60;
60 gt->gt_jindex_refresh_secs = 60;
61 gt->gt_recoverd_secs = 60; 57 gt->gt_recoverd_secs = 60;
62 gt->gt_logd_secs = 1; 58 gt->gt_logd_secs = 1;
63 gt->gt_quotad_secs = 5; 59 gt->gt_quotad_secs = 5;
@@ -71,10 +67,8 @@ void gfs2_tune_init(struct gfs2_tune *gt)
71 gt->gt_new_files_jdata = 0; 67 gt->gt_new_files_jdata = 0;
72 gt->gt_new_files_directio = 0; 68 gt->gt_new_files_directio = 0;
73 gt->gt_max_readahead = 1 << 18; 69 gt->gt_max_readahead = 1 << 18;
74 gt->gt_lockdump_size = 131072;
75 gt->gt_stall_secs = 600; 70 gt->gt_stall_secs = 600;
76 gt->gt_complain_secs = 10; 71 gt->gt_complain_secs = 10;
77 gt->gt_reclaim_limit = 5000;
78 gt->gt_statfs_quantum = 30; 72 gt->gt_statfs_quantum = 30;
79 gt->gt_statfs_slow = 0; 73 gt->gt_statfs_slow = 0;
80} 74}
@@ -393,6 +387,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
393 if (!jd) 387 if (!jd)
394 break; 388 break;
395 389
390 INIT_LIST_HEAD(&jd->extent_list);
396 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); 391 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
397 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 392 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
398 if (!jd->jd_inode) 393 if (!jd->jd_inode)
@@ -422,8 +417,9 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
422 417
423void gfs2_jindex_free(struct gfs2_sbd *sdp) 418void gfs2_jindex_free(struct gfs2_sbd *sdp)
424{ 419{
425 struct list_head list; 420 struct list_head list, *head;
426 struct gfs2_jdesc *jd; 421 struct gfs2_jdesc *jd;
422 struct gfs2_journal_extent *jext;
427 423
428 spin_lock(&sdp->sd_jindex_spin); 424 spin_lock(&sdp->sd_jindex_spin);
429 list_add(&list, &sdp->sd_jindex_list); 425 list_add(&list, &sdp->sd_jindex_list);
@@ -433,6 +429,14 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
433 429
434 while (!list_empty(&list)) { 430 while (!list_empty(&list)) {
435 jd = list_entry(list.next, struct gfs2_jdesc, jd_list); 431 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
432 head = &jd->extent_list;
433 while (!list_empty(head)) {
434 jext = list_entry(head->next,
435 struct gfs2_journal_extent,
436 extent_list);
437 list_del(&jext->extent_list);
438 kfree(jext);
439 }
436 list_del(&jd->jd_list); 440 list_del(&jd->jd_list);
437 iput(jd->jd_inode); 441 iput(jd->jd_inode);
438 kfree(jd); 442 kfree(jd);
@@ -543,7 +547,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
543 if (error) 547 if (error)
544 return error; 548 return error;
545 549
546 gfs2_meta_cache_flush(ip);
547 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 550 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
548 551
549 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 552 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -686,9 +689,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
686 if (error) 689 if (error)
687 return; 690 return;
688 691
689 mutex_lock(&sdp->sd_statfs_mutex);
690 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); 692 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
691 mutex_unlock(&sdp->sd_statfs_mutex);
692 693
693 spin_lock(&sdp->sd_statfs_spin); 694 spin_lock(&sdp->sd_statfs_spin);
694 l_sc->sc_total += total; 695 l_sc->sc_total += total;
@@ -736,9 +737,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp)
736 if (error) 737 if (error)
737 goto out_bh2; 738 goto out_bh2;
738 739
739 mutex_lock(&sdp->sd_statfs_mutex);
740 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); 740 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
741 mutex_unlock(&sdp->sd_statfs_mutex);
742 741
743 spin_lock(&sdp->sd_statfs_spin); 742 spin_lock(&sdp->sd_statfs_spin);
744 m_sc->sc_total += l_sc->sc_total; 743 m_sc->sc_total += l_sc->sc_total;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 06e0b7768d9..eaa3b7b2f99 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -32,7 +32,8 @@ spinlock_t gfs2_sys_margs_lock;
32 32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{ 34{
35 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id); 35 return snprintf(buf, PAGE_SIZE, "%u:%u\n",
36 MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev));
36} 37}
37 38
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) 39static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
@@ -221,9 +222,7 @@ static struct kobj_type gfs2_ktype = {
221 .sysfs_ops = &gfs2_attr_ops, 222 .sysfs_ops = &gfs2_attr_ops,
222}; 223};
223 224
224static struct kset gfs2_kset = { 225static struct kset *gfs2_kset;
225 .ktype = &gfs2_ktype,
226};
227 226
228/* 227/*
229 * display struct lm_lockstruct fields 228 * display struct lm_lockstruct fields
@@ -427,13 +426,11 @@ TUNE_ATTR_2(name, name##_store)
427TUNE_ATTR(demote_secs, 0); 426TUNE_ATTR(demote_secs, 0);
428TUNE_ATTR(incore_log_blocks, 0); 427TUNE_ATTR(incore_log_blocks, 0);
429TUNE_ATTR(log_flush_secs, 0); 428TUNE_ATTR(log_flush_secs, 0);
430TUNE_ATTR(jindex_refresh_secs, 0);
431TUNE_ATTR(quota_warn_period, 0); 429TUNE_ATTR(quota_warn_period, 0);
432TUNE_ATTR(quota_quantum, 0); 430TUNE_ATTR(quota_quantum, 0);
433TUNE_ATTR(atime_quantum, 0); 431TUNE_ATTR(atime_quantum, 0);
434TUNE_ATTR(max_readahead, 0); 432TUNE_ATTR(max_readahead, 0);
435TUNE_ATTR(complain_secs, 0); 433TUNE_ATTR(complain_secs, 0);
436TUNE_ATTR(reclaim_limit, 0);
437TUNE_ATTR(statfs_slow, 0); 434TUNE_ATTR(statfs_slow, 0);
438TUNE_ATTR(new_files_jdata, 0); 435TUNE_ATTR(new_files_jdata, 0);
439TUNE_ATTR(new_files_directio, 0); 436TUNE_ATTR(new_files_directio, 0);
@@ -450,13 +447,11 @@ static struct attribute *tune_attrs[] = {
450 &tune_attr_demote_secs.attr, 447 &tune_attr_demote_secs.attr,
451 &tune_attr_incore_log_blocks.attr, 448 &tune_attr_incore_log_blocks.attr,
452 &tune_attr_log_flush_secs.attr, 449 &tune_attr_log_flush_secs.attr,
453 &tune_attr_jindex_refresh_secs.attr,
454 &tune_attr_quota_warn_period.attr, 450 &tune_attr_quota_warn_period.attr,
455 &tune_attr_quota_quantum.attr, 451 &tune_attr_quota_quantum.attr,
456 &tune_attr_atime_quantum.attr, 452 &tune_attr_atime_quantum.attr,
457 &tune_attr_max_readahead.attr, 453 &tune_attr_max_readahead.attr,
458 &tune_attr_complain_secs.attr, 454 &tune_attr_complain_secs.attr,
459 &tune_attr_reclaim_limit.attr,
460 &tune_attr_statfs_slow.attr, 455 &tune_attr_statfs_slow.attr,
461 &tune_attr_quota_simul_sync.attr, 456 &tune_attr_quota_simul_sync.attr,
462 &tune_attr_quota_cache_secs.attr, 457 &tune_attr_quota_cache_secs.attr,
@@ -495,14 +490,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
495{ 490{
496 int error; 491 int error;
497 492
498 sdp->sd_kobj.kset = &gfs2_kset; 493 sdp->sd_kobj.kset = gfs2_kset;
499 sdp->sd_kobj.ktype = &gfs2_ktype; 494 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
500 495 "%s", sdp->sd_table_name);
501 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
502 if (error)
503 goto fail;
504
505 error = kobject_register(&sdp->sd_kobj);
506 if (error) 496 if (error)
507 goto fail; 497 goto fail;
508 498
@@ -522,6 +512,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
522 if (error) 512 if (error)
523 goto fail_args; 513 goto fail_args;
524 514
515 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
525 return 0; 516 return 0;
526 517
527fail_args: 518fail_args:
@@ -531,7 +522,7 @@ fail_counters:
531fail_lockstruct: 522fail_lockstruct:
532 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 523 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
533fail_reg: 524fail_reg:
534 kobject_unregister(&sdp->sd_kobj); 525 kobject_put(&sdp->sd_kobj);
535fail: 526fail:
536 fs_err(sdp, "error %d adding sysfs files", error); 527 fs_err(sdp, "error %d adding sysfs files", error);
537 return error; 528 return error;
@@ -543,21 +534,22 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
543 sysfs_remove_group(&sdp->sd_kobj, &args_group); 534 sysfs_remove_group(&sdp->sd_kobj, &args_group);
544 sysfs_remove_group(&sdp->sd_kobj, &counters_group); 535 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
545 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 536 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
546 kobject_unregister(&sdp->sd_kobj); 537 kobject_put(&sdp->sd_kobj);
547} 538}
548 539
549int gfs2_sys_init(void) 540int gfs2_sys_init(void)
550{ 541{
551 gfs2_sys_margs = NULL; 542 gfs2_sys_margs = NULL;
552 spin_lock_init(&gfs2_sys_margs_lock); 543 spin_lock_init(&gfs2_sys_margs_lock);
553 kobject_set_name(&gfs2_kset.kobj, "gfs2"); 544 gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
554 kobj_set_kset_s(&gfs2_kset, fs_subsys); 545 if (!gfs2_kset)
555 return kset_register(&gfs2_kset); 546 return -ENOMEM;
547 return 0;
556} 548}
557 549
558void gfs2_sys_uninit(void) 550void gfs2_sys_uninit(void)
559{ 551{
560 kfree(gfs2_sys_margs); 552 kfree(gfs2_sys_margs);
561 kset_unregister(&gfs2_kset); 553 kset_unregister(gfs2_kset);
562} 554}
563 555
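The sys.c hunks above track the kobject API rework in which statically declared ksets and kobject_register()/kobject_unregister() went away: objects are now created with kset_create_and_add() and kobject_init_and_add(), the KOBJ_ADD uevent is sent explicitly, and teardown is a kobject_put(). The condensed sketch below shows that registration sequence with a demo ktype; it is a pattern illustration, not the gfs2 code, and error handling is trimmed to the essentials.

#include <linux/fs.h>		/* fs_kobj */
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/slab.h>

static struct kset *demo_kset;
static struct kobj_type demo_ktype;	/* .release / .sysfs_ops filled in elsewhere */

struct demo_obj {
	struct kobject kobj;
};

static int demo_register(struct demo_obj *d, const char *name)
{
	int err;

	d->kobj.kset = demo_kset;	/* with no explicit parent, the kset is the parent */
	err = kobject_init_and_add(&d->kobj, &demo_ktype, NULL, "%s", name);
	if (err) {
		kobject_put(&d->kobj);	/* drop the half-initialised kobject */
		return err;
	}
	kobject_uevent(&d->kobj, KOBJ_ADD);	/* the uevent is now the caller's job */
	return 0;
}

static int __init demo_init(void)
{
	demo_kset = kset_create_and_add("demo", NULL, fs_kobj);
	return demo_kset ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	kset_unregister(demo_kset);
}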
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 717983e2c2a..73e5d92a657 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -114,11 +114,6 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
114 gfs2_log_flush(sdp, NULL); 114 gfs2_log_flush(sdp, NULL);
115} 115}
116 116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/** 117/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction 118 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to 119 * @gl: the glock the buffer belongs to
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 043d5f4b9c4..e826f0dab80 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -30,7 +30,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
30 30
31void gfs2_trans_end(struct gfs2_sbd *sdp); 31void gfs2_trans_end(struct gfs2_sbd *sdp);
32 32
33void gfs2_trans_add_gl(struct gfs2_glock *gl);
34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); 33void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); 34void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); 35void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 31284c77bba..110dd3515dc 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -61,7 +61,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
61 mapping = tree->inode->i_mapping; 61 mapping = tree->inode->i_mapping;
62 page = read_mapping_page(mapping, 0, NULL); 62 page = read_mapping_page(mapping, 0, NULL);
63 if (IS_ERR(page)) 63 if (IS_ERR(page))
64 goto free_tree; 64 goto free_inode;
65 65
66 /* Load the header */ 66 /* Load the header */
67 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 67 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -99,11 +99,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
99 page_cache_release(page); 99 page_cache_release(page);
100 return tree; 100 return tree;
101 101
102 fail_page: 102fail_page:
103 page_cache_release(page); 103 page_cache_release(page);
104 free_tree: 104free_inode:
105 tree->inode->i_mapping->a_ops = &hfs_aops; 105 tree->inode->i_mapping->a_ops = &hfs_aops;
106 iput(tree->inode); 106 iput(tree->inode);
107free_tree:
107 kfree(tree); 108 kfree(tree);
108 return NULL; 109 return NULL;
109} 110}
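The hfs_btree_open() relabelling above splits one catch-all cleanup label into layered ones, so each failure path releases exactly what it has acquired so far: a failure after the inode exists goes through the iput(), an earlier failure only frees the tree. A generic sketch of that unwind shape is below; demo_iget() and struct demo_tree are made-up placeholders, not hfs symbols.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/slab.h>

struct demo_tree {
	struct inode *inode;
};

extern struct inode *demo_iget(void);	/* hypothetical helper */

static struct demo_tree *demo_tree_open(void)
{
	struct demo_tree *tree;
	struct page *page;

	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
	if (!tree)
		return NULL;

	tree->inode = demo_iget();
	if (IS_ERR(tree->inode))
		goto free_tree;			/* only the allocation to undo */

	page = read_mapping_page(tree->inode->i_mapping, 0, NULL);
	if (IS_ERR(page))
		goto free_inode;		/* the inode now needs iput() too */

	/* success path would use the page and return tree here */
	return tree;

free_inode:
	iput(tree->inode);
free_tree:
	kfree(tree);
	return NULL;
}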
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 08ff6c7028c..038ed743619 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -288,10 +288,12 @@ handle_t *journal_start(journal_t *journal, int nblocks)
288 jbd_free_handle(handle); 288 jbd_free_handle(handle);
289 current->journal_info = NULL; 289 current->journal_info = NULL;
290 handle = ERR_PTR(err); 290 handle = ERR_PTR(err);
291 goto out;
291 } 292 }
292 293
293 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); 294 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
294 295
296out:
295 return handle; 297 return handle;
296} 298}
297 299
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index df25ecc418a..4dcc0581999 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -284,11 +284,11 @@ static struct dir_table_slot *find_index(struct inode *ip, u32 index,
284 release_metapage(*mp); 284 release_metapage(*mp);
285 *mp = NULL; 285 *mp = NULL;
286 } 286 }
287 if (*mp == 0) { 287 if (!(*mp)) {
288 *lblock = blkno; 288 *lblock = blkno;
289 *mp = read_index_page(ip, blkno); 289 *mp = read_index_page(ip, blkno);
290 } 290 }
291 if (*mp == 0) { 291 if (!(*mp)) {
292 jfs_err("free_index: error reading directory table"); 292 jfs_err("free_index: error reading directory table");
293 return NULL; 293 return NULL;
294 } 294 }
@@ -413,7 +413,8 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
413 } 413 }
414 ip->i_size = PSIZE; 414 ip->i_size = PSIZE;
415 415
416 if ((mp = get_index_page(ip, 0)) == 0) { 416 mp = get_index_page(ip, 0);
417 if (!mp) {
417 jfs_err("add_index: get_metapage failed!"); 418 jfs_err("add_index: get_metapage failed!");
418 xtTruncate(tid, ip, 0, COMMIT_PWMAP); 419 xtTruncate(tid, ip, 0, COMMIT_PWMAP);
419 memcpy(&jfs_ip->i_dirtable, temp_table, 420 memcpy(&jfs_ip->i_dirtable, temp_table,
@@ -461,7 +462,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
461 } else 462 } else
462 mp = read_index_page(ip, blkno); 463 mp = read_index_page(ip, blkno);
463 464
464 if (mp == 0) { 465 if (!mp) {
465 jfs_err("add_index: get/read_metapage failed!"); 466 jfs_err("add_index: get/read_metapage failed!");
466 goto clean_up; 467 goto clean_up;
467 } 468 }
@@ -499,7 +500,7 @@ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
499 500
500 dirtab_slot = find_index(ip, index, &mp, &lblock); 501 dirtab_slot = find_index(ip, index, &mp, &lblock);
501 502
502 if (dirtab_slot == 0) 503 if (!dirtab_slot)
503 return; 504 return;
504 505
505 dirtab_slot->flag = DIR_INDEX_FREE; 506 dirtab_slot->flag = DIR_INDEX_FREE;
@@ -526,7 +527,7 @@ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
526 527
527 dirtab_slot = find_index(ip, index, mp, lblock); 528 dirtab_slot = find_index(ip, index, mp, lblock);
528 529
529 if (dirtab_slot == 0) 530 if (!dirtab_slot)
530 return; 531 return;
531 532
532 DTSaddress(dirtab_slot, bn); 533 DTSaddress(dirtab_slot, bn);
@@ -552,7 +553,7 @@ static int read_index(struct inode *ip, u32 index,
552 struct dir_table_slot *slot; 553 struct dir_table_slot *slot;
553 554
554 slot = find_index(ip, index, &mp, &lblock); 555 slot = find_index(ip, index, &mp, &lblock);
555 if (slot == 0) { 556 if (!slot) {
556 return -EIO; 557 return -EIO;
557 } 558 }
558 559
@@ -592,10 +593,8 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
592 struct component_name ciKey; 593 struct component_name ciKey;
593 struct super_block *sb = ip->i_sb; 594 struct super_block *sb = ip->i_sb;
594 595
595 ciKey.name = 596 ciKey.name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS);
596 (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), 597 if (!ciKey.name) {
597 GFP_NOFS);
598 if (ciKey.name == 0) {
599 rc = -ENOMEM; 598 rc = -ENOMEM;
600 goto dtSearch_Exit2; 599 goto dtSearch_Exit2;
601 } 600 }
@@ -957,10 +956,8 @@ static int dtSplitUp(tid_t tid,
957 smp = split->mp; 956 smp = split->mp;
958 sp = DT_PAGE(ip, smp); 957 sp = DT_PAGE(ip, smp);
959 958
960 key.name = 959 key.name = kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), GFP_NOFS);
961 (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), 960 if (!key.name) {
962 GFP_NOFS);
963 if (key.name == 0) {
964 DT_PUTPAGE(smp); 961 DT_PUTPAGE(smp);
965 rc = -ENOMEM; 962 rc = -ENOMEM;
966 goto dtSplitUp_Exit; 963 goto dtSplitUp_Exit;
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 8561c6ecece..cdac2d5bafe 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -74,7 +74,7 @@ struct idtentry {
74#define DTIHDRDATALEN 11 74#define DTIHDRDATALEN 11
75 75
76/* compute number of slots for entry */ 76/* compute number of slots for entry */
77#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 ) 77#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15))
78 78
79 79
80/* 80/*
@@ -133,7 +133,7 @@ struct dir_table_slot {
133 ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) 133 ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
134 134
135/* compute number of slots for entry */ 135/* compute number of slots for entry */
136#define NDTLEAF_LEGACY(klen) ( ((2 + (klen)) + (15 - 1)) / 15 ) 136#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15))
137#define NDTLEAF NDTINTERNAL 137#define NDTLEAF NDTINTERNAL
138 138
139 139
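The two macro changes above are purely cosmetic: DIV_ROUND_UP(n, d) expands to ((n) + (d) - 1) / (d), which is exactly the open-coded rounding it replaces. For example, a 20-character key gives NDTINTERNAL = DIV_ROUND_UP(4 + 20, 15) = (24 + 14) / 15 = 2 directory slots. The small self-contained check below inlines the macro locally so it runs outside the kernel; the _DEMO names are placeholders for the jfs macros.

#include <stdio.h>

/* Same definition as the kernel's DIV_ROUND_UP in linux/kernel.h */
#define DEMO_DIV_ROUND_UP(n, d)		(((n) + (d) - 1) / (d))

/* Slot counts from fs/jfs/jfs_dtree.h, rewritten with the macro */
#define NDTINTERNAL_DEMO(klen)		DEMO_DIV_ROUND_UP(4 + (klen), 15)
#define NDTLEAF_LEGACY_DEMO(klen)	DEMO_DIV_ROUND_UP(2 + (klen), 15)

int main(void)
{
	int klen;

	for (klen = 1; klen <= 30; klen += 10)
		printf("klen=%2d  internal=%d  leaf(legacy)=%d\n", klen,
		       NDTINTERNAL_DEMO(klen), NDTLEAF_LEGACY_DEMO(klen));
	return 0;
}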
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3870ba8b908..9bf29f77173 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -381,7 +381,7 @@ int diRead(struct inode *ip)
381 381
382 /* read the page of disk inode */ 382 /* read the page of disk inode */
383 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); 383 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
384 if (mp == 0) { 384 if (!mp) {
385 jfs_err("diRead: read_metapage failed"); 385 jfs_err("diRead: read_metapage failed");
386 return -EIO; 386 return -EIO;
387 } 387 }
@@ -654,7 +654,7 @@ int diWrite(tid_t tid, struct inode *ip)
654 /* read the page of disk inode */ 654 /* read the page of disk inode */
655 retry: 655 retry:
656 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); 656 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
657 if (mp == 0) 657 if (!mp)
658 return -EIO; 658 return -EIO;
659 659
660 /* get the pointer to the disk inode */ 660 /* get the pointer to the disk inode */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 15a3974cdee..325a9679b95 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -208,6 +208,17 @@ static struct lmStat {
208} lmStat; 208} lmStat;
209#endif 209#endif
210 210
211static void write_special_inodes(struct jfs_log *log,
212 int (*writer)(struct address_space *))
213{
214 struct jfs_sb_info *sbi;
215
216 list_for_each_entry(sbi, &log->sb_list, log_list) {
217 writer(sbi->ipbmap->i_mapping);
218 writer(sbi->ipimap->i_mapping);
219 writer(sbi->direct_inode->i_mapping);
220 }
221}
211 222
212/* 223/*
213 * NAME: lmLog() 224 * NAME: lmLog()
@@ -935,22 +946,13 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
935 struct lrd lrd; 946 struct lrd lrd;
936 int lsn; 947 int lsn;
937 struct logsyncblk *lp; 948 struct logsyncblk *lp;
938 struct jfs_sb_info *sbi;
939 unsigned long flags; 949 unsigned long flags;
940 950
941 /* push dirty metapages out to disk */ 951 /* push dirty metapages out to disk */
942 if (hard_sync) 952 if (hard_sync)
943 list_for_each_entry(sbi, &log->sb_list, log_list) { 953 write_special_inodes(log, filemap_fdatawrite);
944 filemap_fdatawrite(sbi->ipbmap->i_mapping);
945 filemap_fdatawrite(sbi->ipimap->i_mapping);
946 filemap_fdatawrite(sbi->direct_inode->i_mapping);
947 }
948 else 954 else
949 list_for_each_entry(sbi, &log->sb_list, log_list) { 955 write_special_inodes(log, filemap_flush);
950 filemap_flush(sbi->ipbmap->i_mapping);
951 filemap_flush(sbi->ipimap->i_mapping);
952 filemap_flush(sbi->direct_inode->i_mapping);
953 }
954 956
955 /* 957 /*
956 * forward syncpt 958 * forward syncpt
@@ -1536,7 +1538,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1536{ 1538{
1537 int i; 1539 int i;
1538 struct tblock *target = NULL; 1540 struct tblock *target = NULL;
1539 struct jfs_sb_info *sbi;
1540 1541
1541 /* jfs_write_inode may call us during read-only mount */ 1542 /* jfs_write_inode may call us during read-only mount */
1542 if (!log) 1543 if (!log)
@@ -1598,11 +1599,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1598 if (wait < 2) 1599 if (wait < 2)
1599 return; 1600 return;
1600 1601
1601 list_for_each_entry(sbi, &log->sb_list, log_list) { 1602 write_special_inodes(log, filemap_fdatawrite);
1602 filemap_fdatawrite(sbi->ipbmap->i_mapping);
1603 filemap_fdatawrite(sbi->ipimap->i_mapping);
1604 filemap_fdatawrite(sbi->direct_inode->i_mapping);
1605 }
1606 1603
1607 /* 1604 /*
1608 * If there was recent activity, we may need to wait 1605 * If there was recent activity, we may need to wait
@@ -1611,6 +1608,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1611 if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { 1608 if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
1612 for (i = 0; i < 200; i++) { /* Too much? */ 1609 for (i = 0; i < 200; i++) { /* Too much? */
1613 msleep(250); 1610 msleep(250);
1611 write_special_inodes(log, filemap_fdatawrite);
1614 if (list_empty(&log->cqueue) && 1612 if (list_empty(&log->cqueue) &&
1615 list_empty(&log->synclist)) 1613 list_empty(&log->synclist))
1616 break; 1614 break;
@@ -2347,7 +2345,7 @@ int jfsIOWait(void *arg)
2347 2345
2348 do { 2346 do {
2349 spin_lock_irq(&log_redrive_lock); 2347 spin_lock_irq(&log_redrive_lock);
2350 while ((bp = log_redrive_list) != 0) { 2348 while ((bp = log_redrive_list)) {
2351 log_redrive_list = bp->l_redrive_next; 2349 log_redrive_list = bp->l_redrive_next;
2352 bp->l_redrive_next = NULL; 2350 bp->l_redrive_next = NULL;
2353 spin_unlock_irq(&log_redrive_lock); 2351 spin_unlock_irq(&log_redrive_lock);
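The jfs_logmgr.c refactor above replaces three copies of the same loop with write_special_inodes(), which takes the writeback routine as a function pointer; filemap_fdatawrite() and filemap_flush() already share the int (*)(struct address_space *) signature, so each caller shrinks to one line. A compressed view of how the helper is used after this patch (not a literal quote of the file):

	/* hard sync: integrity writeback (WB_SYNC_ALL) on the special inodes */
	write_special_inodes(log, filemap_fdatawrite);

	/* lazy sync: best-effort writeback (WB_SYNC_NONE) only */
	write_special_inodes(log, filemap_flush);

The same helper is also added inside the jfs_flush_journal() wait loop, so dirty metadata keeps being pushed while the code waits for the commit and sync lists to drain.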
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index f5cd8d38af7..d1e64f2f2fc 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -39,11 +39,11 @@ static struct {
39#endif 39#endif
40 40
41#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) 41#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag)
42#define trylock_metapage(mp) test_and_set_bit(META_locked, &(mp)->flag) 42#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag)
43 43
44static inline void unlock_metapage(struct metapage *mp) 44static inline void unlock_metapage(struct metapage *mp)
45{ 45{
46 clear_bit(META_locked, &mp->flag); 46 clear_bit_unlock(META_locked, &mp->flag);
47 wake_up(&mp->wait); 47 wake_up(&mp->wait);
48} 48}
49 49
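The switch above from test_and_set_bit()/clear_bit() to test_and_set_bit_lock()/clear_bit_unlock() makes the lock semantics of META_locked explicit: the lock variant has acquire ordering (and can be cheaper than the fully ordered test_and_set_bit()), while clear_bit_unlock() provides the release barrier that a plain clear_bit() does not guarantee. A minimal sketch of that idiom with a hypothetical flag word; the wait-queue handling jfs layers on top is omitted.

#include <linux/bitops.h>

#define DEMO_LOCKED	0	/* bit number used as the lock */

/* Returns 0 on success, non-zero if the bit was already set. A successful
 * test_and_set_bit_lock() has acquire semantics, so accesses to the data
 * it protects cannot be reordered before taking the lock. */
static inline int demo_trylock(unsigned long *flags)
{
	return test_and_set_bit_lock(DEMO_LOCKED, flags);
}

/* clear_bit_unlock() has release semantics, so all stores made while the
 * bit was held are visible before the bit is seen clear by others. */
static inline void demo_unlock(unsigned long *flags)
{
	clear_bit_unlock(DEMO_LOCKED, flags);
}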
@@ -88,7 +88,7 @@ struct meta_anchor {
88}; 88};
89#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) 89#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
90 90
91static inline struct metapage *page_to_mp(struct page *page, uint offset) 91static inline struct metapage *page_to_mp(struct page *page, int offset)
92{ 92{
93 if (!PagePrivate(page)) 93 if (!PagePrivate(page))
94 return NULL; 94 return NULL;
@@ -153,7 +153,7 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
153} 153}
154 154
155#else 155#else
156static inline struct metapage *page_to_mp(struct page *page, uint offset) 156static inline struct metapage *page_to_mp(struct page *page, int offset)
157{ 157{
158 return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; 158 return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
159} 159}
@@ -249,7 +249,7 @@ static inline void drop_metapage(struct page *page, struct metapage *mp)
249 */ 249 */
250 250
251static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, 251static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
252 unsigned int *len) 252 int *len)
253{ 253{
254 int rc = 0; 254 int rc = 0;
255 int xflag; 255 int xflag;
@@ -352,25 +352,27 @@ static void metapage_write_end_io(struct bio *bio, int err)
352static int metapage_writepage(struct page *page, struct writeback_control *wbc) 352static int metapage_writepage(struct page *page, struct writeback_control *wbc)
353{ 353{
354 struct bio *bio = NULL; 354 struct bio *bio = NULL;
355 unsigned int block_offset; /* block offset of mp within page */ 355 int block_offset; /* block offset of mp within page */
356 struct inode *inode = page->mapping->host; 356 struct inode *inode = page->mapping->host;
357 unsigned int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; 357 int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage;
358 unsigned int len; 358 int len;
359 unsigned int xlen; 359 int xlen;
360 struct metapage *mp; 360 struct metapage *mp;
361 int redirty = 0; 361 int redirty = 0;
362 sector_t lblock; 362 sector_t lblock;
363 int nr_underway = 0;
363 sector_t pblock; 364 sector_t pblock;
364 sector_t next_block = 0; 365 sector_t next_block = 0;
365 sector_t page_start; 366 sector_t page_start;
366 unsigned long bio_bytes = 0; 367 unsigned long bio_bytes = 0;
367 unsigned long bio_offset = 0; 368 unsigned long bio_offset = 0;
368 unsigned int offset; 369 int offset;
369 370
370 page_start = (sector_t)page->index << 371 page_start = (sector_t)page->index <<
371 (PAGE_CACHE_SHIFT - inode->i_blkbits); 372 (PAGE_CACHE_SHIFT - inode->i_blkbits);
372 BUG_ON(!PageLocked(page)); 373 BUG_ON(!PageLocked(page));
373 BUG_ON(PageWriteback(page)); 374 BUG_ON(PageWriteback(page));
375 set_page_writeback(page);
374 376
375 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { 377 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
376 mp = page_to_mp(page, offset); 378 mp = page_to_mp(page, offset);
@@ -413,11 +415,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
413 if (!bio->bi_size) 415 if (!bio->bi_size)
414 goto dump_bio; 416 goto dump_bio;
415 submit_bio(WRITE, bio); 417 submit_bio(WRITE, bio);
418 nr_underway++;
416 bio = NULL; 419 bio = NULL;
417 } else { 420 } else
418 set_page_writeback(page);
419 inc_io(page); 421 inc_io(page);
420 }
421 xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits; 422 xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
422 pblock = metapage_get_blocks(inode, lblock, &xlen); 423 pblock = metapage_get_blocks(inode, lblock, &xlen);
423 if (!pblock) { 424 if (!pblock) {
@@ -427,7 +428,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
427 continue; 428 continue;
428 } 429 }
429 set_bit(META_io, &mp->flag); 430 set_bit(META_io, &mp->flag);
430 len = min(xlen, (uint) JFS_SBI(inode->i_sb)->nbperpage); 431 len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
431 432
432 bio = bio_alloc(GFP_NOFS, 1); 433 bio = bio_alloc(GFP_NOFS, 1);
433 bio->bi_bdev = inode->i_sb->s_bdev; 434 bio->bi_bdev = inode->i_sb->s_bdev;
@@ -449,12 +450,16 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
449 goto dump_bio; 450 goto dump_bio;
450 451
451 submit_bio(WRITE, bio); 452 submit_bio(WRITE, bio);
453 nr_underway++;
452 } 454 }
453 if (redirty) 455 if (redirty)
454 redirty_page_for_writepage(wbc, page); 456 redirty_page_for_writepage(wbc, page);
455 457
456 unlock_page(page); 458 unlock_page(page);
457 459
460 if (nr_underway == 0)
461 end_page_writeback(page);
462
458 return 0; 463 return 0;
459add_failed: 464add_failed:
460 /* We should never reach here, since we're only adding one vec */ 465 /* We should never reach here, since we're only adding one vec */
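The metapage_writepage() changes above move set_page_writeback() to the top of the function and count submitted bios in nr_underway; if the loop ends up submitting nothing, the page would otherwise stay flagged as under writeback forever, so the function now ends writeback itself. The skeleton below shows only that accounting, with the jfs bio construction, redirty handling and error paths stripped out.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

static int demo_writepage(struct page *page, struct writeback_control *wbc)
{
	int nr_underway = 0;

	BUG_ON(!PageLocked(page));
	set_page_writeback(page);		/* marked up front... */

	/* for each dirty sub-block: build a bio, submit_bio(WRITE, bio),
	 * then nr_underway++ for every bio actually submitted */

	unlock_page(page);

	if (nr_underway == 0)			/* ...and undone here if nothing */
		end_page_writeback(page);	/* was submitted for this page */
	return 0;
}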
@@ -475,13 +480,13 @@ static int metapage_readpage(struct file *fp, struct page *page)
475{ 480{
476 struct inode *inode = page->mapping->host; 481 struct inode *inode = page->mapping->host;
477 struct bio *bio = NULL; 482 struct bio *bio = NULL;
478 unsigned int block_offset; 483 int block_offset;
479 unsigned int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; 484 int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
480 sector_t page_start; /* address of page in fs blocks */ 485 sector_t page_start; /* address of page in fs blocks */
481 sector_t pblock; 486 sector_t pblock;
482 unsigned int xlen; 487 int xlen;
483 unsigned int len; 488 unsigned int len;
484 unsigned int offset; 489 int offset;
485 490
486 BUG_ON(!PageLocked(page)); 491 BUG_ON(!PageLocked(page));
487 page_start = (sector_t)page->index << 492 page_start = (sector_t)page->index <<
@@ -530,7 +535,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
530{ 535{
531 struct metapage *mp; 536 struct metapage *mp;
532 int ret = 1; 537 int ret = 1;
533 unsigned int offset; 538 int offset;
534 539
535 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { 540 for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
536 mp = page_to_mp(page, offset); 541 mp = page_to_mp(page, offset);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 644429acb8c..7b698f2ec45 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -147,7 +147,7 @@ int jfs_mount(struct super_block *sb)
147 */ 147 */
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { 148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); 149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 if (ipaimap2 == 0) { 150 if (!ipaimap2) {
151 jfs_err("jfs_mount: Failed to read AGGREGATE_I"); 151 jfs_err("jfs_mount: Failed to read AGGREGATE_I");
152 rc = -EIO; 152 rc = -EIO;
153 goto errout35; 153 goto errout35;
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 7971f37534a..adcf92d3b60 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
68 /* 68 /*
69 * Wait for outstanding transactions to be written to log: 69 * Wait for outstanding transactions to be written to log:
70 */ 70 */
71 jfs_flush_journal(log, 2); 71 jfs_flush_journal(log, 1);
72 72
73 /* 73 /*
74 * close fileset inode allocation map (aka fileset inode) 74 * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
146 * 146 *
147 * remove file system from log active file system list. 147 * remove file system from log active file system list.
148 */ 148 */
149 jfs_flush_journal(log, 2); 149 jfs_flush_journal(log, 1);
150 150
151 /* 151 /*
152 * Make sure all metadata makes it to disk 152 * Make sure all metadata makes it to disk
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4e0a8493cef..f8718de3505 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1103,8 +1103,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1103 * Make sure dest inode number (if any) is what we think it is 1103 * Make sure dest inode number (if any) is what we think it is
1104 */ 1104 */
1105 rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); 1105 rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
1106 if (rc == 0) { 1106 if (!rc) {
1107 if ((new_ip == 0) || (ino != new_ip->i_ino)) { 1107 if ((!new_ip) || (ino != new_ip->i_ino)) {
1108 rc = -ESTALE; 1108 rc = -ESTALE;
1109 goto out3; 1109 goto out3;
1110 } 1110 }
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 71984ee9534..7f24a0bb08c 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -172,7 +172,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
172 */ 172 */
173 t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) 173 t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
174 << L2BPERDMAP; 174 << L2BPERDMAP;
175 t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50; 175 t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50;
176 newFSCKSize = t32 << sbi->l2nbperpage; 176 newFSCKSize = t32 << sbi->l2nbperpage;
177 newFSCKAddress = newLogAddress - newFSCKSize; 177 newFSCKAddress = newLogAddress - newFSCKSize;
178 178
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 314bb4ff1ba..70a14001c98 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -598,6 +598,12 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
598 seq_printf(seq, ",umask=%03o", sbi->umask); 598 seq_printf(seq, ",umask=%03o", sbi->umask);
599 if (sbi->flag & JFS_NOINTEGRITY) 599 if (sbi->flag & JFS_NOINTEGRITY)
600 seq_puts(seq, ",nointegrity"); 600 seq_puts(seq, ",nointegrity");
601 if (sbi->nls_tab)
602 seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset);
603 if (sbi->flag & JFS_ERR_CONTINUE)
604 seq_printf(seq, ",errors=continue");
605 if (sbi->flag & JFS_ERR_PANIC)
606 seq_printf(seq, ",errors=panic");
601 607
602#ifdef CONFIG_QUOTA 608#ifdef CONFIG_QUOTA
603 if (sbi->flag & JFS_USRQUOTA) 609 if (sbi->flag & JFS_USRQUOTA)
diff --git a/fs/namei.c b/fs/namei.c
index 3b993db26ce..73e2e665817 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1605,7 +1605,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1605 if (S_ISLNK(inode->i_mode)) 1605 if (S_ISLNK(inode->i_mode))
1606 return -ELOOP; 1606 return -ELOOP;
1607 1607
1608 if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) 1608 if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
1609 return -EISDIR; 1609 return -EISDIR;
1610 1610
1611 /* 1611 /*
@@ -1620,7 +1620,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1620 return -EACCES; 1620 return -EACCES;
1621 1621
1622 flag &= ~O_TRUNC; 1622 flag &= ~O_TRUNC;
1623 } else if (IS_RDONLY(inode) && (flag & FMODE_WRITE)) 1623 } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
1624 return -EROFS; 1624 return -EROFS;
1625 1625
1626 error = vfs_permission(nd, acc_mode); 1626 error = vfs_permission(nd, acc_mode);
diff --git a/fs/namespace.c b/fs/namespace.c
index 06083885b21..61bf376e29e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,8 +41,8 @@ static struct kmem_cache *mnt_cache __read_mostly;
41static struct rw_semaphore namespace_sem; 41static struct rw_semaphore namespace_sem;
42 42
43/* /sys/fs */ 43/* /sys/fs */
44decl_subsys(fs, NULL, NULL); 44struct kobject *fs_kobj;
45EXPORT_SYMBOL_GPL(fs_subsys); 45EXPORT_SYMBOL_GPL(fs_kobj);
46 46
47static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 47static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
48{ 48{
@@ -1861,10 +1861,9 @@ void __init mnt_init(void)
1861 if (err) 1861 if (err)
1862 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 1862 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
1863 __FUNCTION__, err); 1863 __FUNCTION__, err);
1864 err = subsystem_register(&fs_subsys); 1864 fs_kobj = kobject_create_and_add("fs", NULL);
1865 if (err) 1865 if (!fs_kobj)
1866 printk(KERN_WARNING "%s: subsystem_register error: %d\n", 1866 printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__);
1867 __FUNCTION__, err);
1868 init_rootfs(); 1867 init_rootfs();
1869 init_mount_tree(); 1868 init_mount_tree();
1870} 1869}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2d116d2298f..f917fd25858 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -388,8 +388,11 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
388 * Round the length of the data which was specified up to 388 * Round the length of the data which was specified up to
389 * the next multiple of XDR units and then compare that 389 * the next multiple of XDR units and then compare that
390 * against the length which was actually received. 390 * against the length which was actually received.
391 * Note that when RPCSEC/GSS (for example) is used, the
392 * data buffer can be padded so dlen might be larger
393 * than required. It must never be smaller.
391 */ 394 */
392 if (dlen != XDR_QUADLEN(len)*4) 395 if (dlen < XDR_QUADLEN(len)*4)
393 return 0; 396 return 0;
394 397
395 if (args->count > max_blocksize) { 398 if (args->count > max_blocksize) {
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 986f9b32083..b86e3658a0a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -313,8 +313,11 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
313 * Round the length of the data which was specified up to 313 * Round the length of the data which was specified up to
314 * the next multiple of XDR units and then compare that 314 * the next multiple of XDR units and then compare that
315 * against the length which was actually received. 315 * against the length which was actually received.
316 * Note that when RPCSEC/GSS (for example) is used, the
317 * data buffer can be padded so dlen might be larger
318 * than required. It must never be smaller.
316 */ 319 */
317 if (dlen != XDR_QUADLEN(len)*4) 320 if (dlen < XDR_QUADLEN(len)*4)
318 return 0; 321 return 0;
319 322
320 rqstp->rq_vec[0].iov_base = (void*)p; 323 rqstp->rq_vec[0].iov_base = (void*)p;
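Both write-args hunks above relax the same sanity check, and the added comments give the reason: the RPC layer may hand nfsd a buffer with padding after the data (RPCSEC_GSS being the usual case), so dlen may legitimately exceed the XDR-rounded length and only a shortfall indicates a bad request. XDR_QUADLEN(len) is simply the byte length rounded up to 4-byte XDR units; the standalone illustration below inlines that rounding as a local macro.

#include <stdio.h>

/* Same rounding as the kernel's XDR_QUADLEN(): bytes -> 4-byte XDR units */
#define DEMO_XDR_QUADLEN(l)	(((l) + 3) >> 2)

int main(void)
{
	unsigned int len = 5;				/* client claims 5 data bytes  */
	unsigned int wire = DEMO_XDR_QUADLEN(len) * 4;	/* 8 bytes after XDR rounding  */
	unsigned int dlen_padded = wire + 4;		/* e.g. GSS left 4 pad bytes   */
	unsigned int dlen_short = wire - 4;		/* truncated request           */

	printf("need %u bytes on the wire\n", wire);
	printf("dlen=%u accepted? %s\n", dlen_padded, dlen_padded < wire ? "no" : "yes");
	printf("dlen=%u accepted? %s\n", dlen_short, dlen_short < wire ? "no" : "yes");
	return 0;
}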
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b..4d4ce48bb42 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -19,16 +19,17 @@ ocfs2-objs := \
19 ioctl.o \ 19 ioctl.o \
20 journal.o \ 20 journal.o \
21 localalloc.o \ 21 localalloc.o \
22 locks.o \
22 mmap.o \ 23 mmap.o \
23 namei.o \ 24 namei.o \
25 resize.o \
24 slot_map.o \ 26 slot_map.o \
25 suballoc.o \ 27 suballoc.o \
26 super.o \ 28 super.o \
27 symlink.o \ 29 symlink.o \
28 sysfile.o \ 30 sysfile.o \
29 uptodate.o \ 31 uptodate.o \
30 ver.o \ 32 ver.o
31 vote.o
32 33
33obj-$(CONFIG_OCFS2_FS) += cluster/ 34obj-$(CONFIG_OCFS2_FS) += cluster/
34obj-$(CONFIG_OCFS2_FS) += dlm/ 35obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 23c8cda43f1..e6df06ac640 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4731,7 +4731,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4731 4731
4732 mutex_lock(&data_alloc_inode->i_mutex); 4732 mutex_lock(&data_alloc_inode->i_mutex);
4733 4733
4734 status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 4734 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
4735 if (status < 0) { 4735 if (status < 0) {
4736 mlog_errno(status); 4736 mlog_errno(status);
4737 goto out_mutex; 4737 goto out_mutex;
@@ -4753,7 +4753,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
4753 4753
4754out_unlock: 4754out_unlock:
4755 brelse(data_alloc_bh); 4755 brelse(data_alloc_bh);
4756 ocfs2_meta_unlock(data_alloc_inode, 1); 4756 ocfs2_inode_unlock(data_alloc_inode, 1);
4757 4757
4758out_mutex: 4758out_mutex:
4759 mutex_unlock(&data_alloc_inode->i_mutex); 4759 mutex_unlock(&data_alloc_inode->i_mutex);
@@ -5077,7 +5077,7 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5077 5077
5078 mutex_lock(&inode->i_mutex); 5078 mutex_lock(&inode->i_mutex);
5079 5079
5080 ret = ocfs2_meta_lock(inode, &di_bh, 1); 5080 ret = ocfs2_inode_lock(inode, &di_bh, 1);
5081 if (ret) { 5081 if (ret) {
5082 mlog_errno(ret); 5082 mlog_errno(ret);
5083 goto out_mutex; 5083 goto out_mutex;
@@ -5118,7 +5118,7 @@ out_journal:
5118 ocfs2_commit_trans(osb, handle); 5118 ocfs2_commit_trans(osb, handle);
5119 5119
5120out_unlock: 5120out_unlock:
5121 ocfs2_meta_unlock(inode, 1); 5121 ocfs2_inode_unlock(inode, 1);
5122 brelse(di_bh); 5122 brelse(di_bh);
5123out_mutex: 5123out_mutex:
5124 mutex_unlock(&inode->i_mutex); 5124 mutex_unlock(&inode->i_mutex);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 56f7790cad4..bc7b4cbbe8e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -26,6 +26,7 @@
26#include <asm/byteorder.h> 26#include <asm/byteorder.h>
27#include <linux/swap.h> 27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h>
29 30
30#define MLOG_MASK_PREFIX ML_FILE_IO 31#define MLOG_MASK_PREFIX ML_FILE_IO
31#include <cluster/masklog.h> 32#include <cluster/masklog.h>
@@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
139{ 140{
140 int err = 0; 141 int err = 0;
141 unsigned int ext_flags; 142 unsigned int ext_flags;
142 u64 p_blkno, past_eof; 143 u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
144 u64 p_blkno, count, past_eof;
143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
144 146
145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 147 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
@@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
155 goto bail; 157 goto bail;
156 } 158 }
157 159
158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 160 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
159 &ext_flags); 161 &ext_flags);
160 if (err) { 162 if (err) {
161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 163 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
@@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
164 goto bail; 166 goto bail;
165 } 167 }
166 168
169 if (max_blocks < count)
170 count = max_blocks;
171
167 /* 172 /*
168 * ocfs2 never allocates in this function - the only time we 173 * ocfs2 never allocates in this function - the only time we
169 * need to use BH_New is when we're extending i_size on a file 174 * need to use BH_New is when we're extending i_size on a file
@@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 183 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
179 map_bh(bh_result, inode->i_sb, p_blkno); 184 map_bh(bh_result, inode->i_sb, p_blkno);
180 185
186 bh_result->b_size = count << inode->i_blkbits;
187
181 if (!ocfs2_sparse_alloc(osb)) { 188 if (!ocfs2_sparse_alloc(osb)) {
182 if (p_blkno == 0) { 189 if (p_blkno == 0) {
183 err = -EIO; 190 err = -EIO;
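The count/b_size handling above follows the usual multi-block get_block contract: the caller pre-loads bh_result->b_size with how many bytes it would like mapped, and the callback shrinks it to the contiguous run it actually found, which is what lets mpage_readpages() (added below) map several blocks per call. A rough caller-side sketch of that contract, written as if it sat next to ocfs2_get_block() in aops.c; it is illustrative only, not code from this patch:

    /*
     * Ask for up to 'want' blocks starting at 'iblock'; on success,
     * b_size reports how many bytes ocfs2_get_block() actually mapped.
     */
    static unsigned int example_map_run(struct inode *inode, sector_t iblock,
                                        unsigned int want)
    {
            struct buffer_head bh = { .b_size = want << inode->i_blkbits };

            if (ocfs2_get_block(inode, iblock, &bh, 0) || !buffer_mapped(&bh))
                    return 0;

            return bh.b_size >> inode->i_blkbits;   /* contiguous blocks found */
    }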
@@ -210,7 +217,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
210 struct buffer_head *di_bh) 217 struct buffer_head *di_bh)
211{ 218{
212 void *kaddr; 219 void *kaddr;
213 unsigned int size; 220 loff_t size;
214 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 221 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
215 222
216 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { 223 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
@@ -224,8 +231,9 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
224 if (size > PAGE_CACHE_SIZE || 231 if (size > PAGE_CACHE_SIZE ||
225 size > ocfs2_max_inline_data(inode->i_sb)) { 232 size > ocfs2_max_inline_data(inode->i_sb)) {
226 ocfs2_error(inode->i_sb, 233 ocfs2_error(inode->i_sb,
227 "Inode %llu has with inline data has bad size: %u", 234 "Inode %llu has with inline data has bad size: %Lu",
228 (unsigned long long)OCFS2_I(inode)->ip_blkno, size); 235 (unsigned long long)OCFS2_I(inode)->ip_blkno,
236 (unsigned long long)size);
229 return -EROFS; 237 return -EROFS;
230 } 238 }
231 239
@@ -275,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
275 283
276 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 284 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
277 285
278 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 286 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
279 if (ret != 0) { 287 if (ret != 0) {
280 if (ret == AOP_TRUNCATED_PAGE) 288 if (ret == AOP_TRUNCATED_PAGE)
281 unlock = 0; 289 unlock = 0;
@@ -285,7 +293,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
285 293
286 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 294 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
287 ret = AOP_TRUNCATED_PAGE; 295 ret = AOP_TRUNCATED_PAGE;
288 goto out_meta_unlock; 296 goto out_inode_unlock;
289 } 297 }
290 298
291 /* 299 /*
@@ -305,25 +313,16 @@ static int ocfs2_readpage(struct file *file, struct page *page)
305 goto out_alloc; 313 goto out_alloc;
306 } 314 }
307 315
308 ret = ocfs2_data_lock_with_page(inode, 0, page);
309 if (ret != 0) {
310 if (ret == AOP_TRUNCATED_PAGE)
311 unlock = 0;
312 mlog_errno(ret);
313 goto out_alloc;
314 }
315
316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
317 ret = ocfs2_readpage_inline(inode, page); 317 ret = ocfs2_readpage_inline(inode, page);
318 else 318 else
319 ret = block_read_full_page(page, ocfs2_get_block); 319 ret = block_read_full_page(page, ocfs2_get_block);
320 unlock = 0; 320 unlock = 0;
321 321
322 ocfs2_data_unlock(inode, 0);
323out_alloc: 322out_alloc:
324 up_read(&OCFS2_I(inode)->ip_alloc_sem); 323 up_read(&OCFS2_I(inode)->ip_alloc_sem);
325out_meta_unlock: 324out_inode_unlock:
326 ocfs2_meta_unlock(inode, 0); 325 ocfs2_inode_unlock(inode, 0);
327out: 326out:
328 if (unlock) 327 if (unlock)
329 unlock_page(page); 328 unlock_page(page);
@@ -331,6 +330,62 @@ out:
331 return ret; 330 return ret;
332} 331}
333 332
333/*
334 * This is used only for read-ahead. Failures or difficult to handle
335 * situations are safe to ignore.
336 *
337 * Right now, we don't bother with BH_Boundary - in-inode extent lists
338 * are quite large (243 extents on 4k blocks), so most inodes don't
339 * grow out to a tree. If need be, detecting boundary extents could
340 * trivially be added in a future version of ocfs2_get_block().
341 */
342static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
343 struct list_head *pages, unsigned nr_pages)
344{
345 int ret, err = -EIO;
346 struct inode *inode = mapping->host;
347 struct ocfs2_inode_info *oi = OCFS2_I(inode);
348 loff_t start;
349 struct page *last;
350
351 /*
352 * Use the nonblocking flag for the dlm code to avoid page
353 * lock inversion, but don't bother with retrying.
354 */
355 ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
356 if (ret)
357 return err;
358
359 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
360 ocfs2_inode_unlock(inode, 0);
361 return err;
362 }
363
364 /*
365 * Don't bother with inline-data. There isn't anything
366 * to read-ahead in that case anyway...
367 */
368 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
369 goto out_unlock;
370
371 /*
372 * Check whether a remote node truncated this file - we just
373 * drop out in that case as it's not worth handling here.
374 */
375 last = list_entry(pages->prev, struct page, lru);
376 start = (loff_t)last->index << PAGE_CACHE_SHIFT;
377 if (start >= i_size_read(inode))
378 goto out_unlock;
379
380 err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
381
382out_unlock:
383 up_read(&oi->ip_alloc_sem);
384 ocfs2_inode_unlock(inode, 0);
385
386 return err;
387}
388
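As a rough sanity check on the "243 extents on 4k blocks" figure in the comment above: an on-disk ocfs2_extent_rec is 16 bytes, and if the ocfs2_dinode header plus the extent-list header take about 208 bytes of the 4096-byte inode block (the ballpark I'd expect, though it isn't re-derived from the structures here), then

    (4096 - 208) / 16 = 3888 / 16 = 243 in-inode extent records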
334/* Note: Because we don't support holes, our allocation has 389/* Note: Because we don't support holes, our allocation has
335 * already happened (allocation writes zeros to the file data) 390 * already happened (allocation writes zeros to the file data)
336 * so we don't have to worry about ordered writes in 391 * so we don't have to worry about ordered writes in
@@ -452,7 +507,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
452 * accessed concurrently from multiple nodes. 507 * accessed concurrently from multiple nodes.
453 */ 508 */
454 if (!INODE_JOURNAL(inode)) { 509 if (!INODE_JOURNAL(inode)) {
455 err = ocfs2_meta_lock(inode, NULL, 0); 510 err = ocfs2_inode_lock(inode, NULL, 0);
456 if (err) { 511 if (err) {
457 if (err != -ENOENT) 512 if (err != -ENOENT)
458 mlog_errno(err); 513 mlog_errno(err);
@@ -467,7 +522,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
467 522
468 if (!INODE_JOURNAL(inode)) { 523 if (!INODE_JOURNAL(inode)) {
469 up_read(&OCFS2_I(inode)->ip_alloc_sem); 524 up_read(&OCFS2_I(inode)->ip_alloc_sem);
470 ocfs2_meta_unlock(inode, 0); 525 ocfs2_inode_unlock(inode, 0);
471 } 526 }
472 527
473 if (err) { 528 if (err) {
@@ -638,34 +693,12 @@ static ssize_t ocfs2_direct_IO(int rw,
638 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 693 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
639 return 0; 694 return 0;
640 695
641 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
642 /*
643 * We get PR data locks even for O_DIRECT. This
644 * allows concurrent O_DIRECT I/O but doesn't let
645 * O_DIRECT with extending and buffered zeroing writes
646 * race. If they did race then the buffered zeroing
647 * could be written back after the O_DIRECT I/O. It's
648 * one thing to tell people not to mix buffered and
649 * O_DIRECT writes, but expecting them to understand
650 * that file extension is also an implicit buffered
651 * write is too much. By getting the PR we force
652 * writeback of the buffered zeroing before
653 * proceeding.
654 */
655 ret = ocfs2_data_lock(inode, 0);
656 if (ret < 0) {
657 mlog_errno(ret);
658 goto out;
659 }
660 ocfs2_data_unlock(inode, 0);
661 }
662
663 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 696 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
664 inode->i_sb->s_bdev, iov, offset, 697 inode->i_sb->s_bdev, iov, offset,
665 nr_segs, 698 nr_segs,
666 ocfs2_direct_IO_get_blocks, 699 ocfs2_direct_IO_get_blocks,
667 ocfs2_dio_end_io); 700 ocfs2_dio_end_io);
668out: 701
669 mlog_exit(ret); 702 mlog_exit(ret);
670 return ret; 703 return ret;
671} 704}
@@ -1754,7 +1787,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1754 struct buffer_head *di_bh = NULL; 1787 struct buffer_head *di_bh = NULL;
1755 struct inode *inode = mapping->host; 1788 struct inode *inode = mapping->host;
1756 1789
1757 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1790 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1758 if (ret) { 1791 if (ret) {
1759 mlog_errno(ret); 1792 mlog_errno(ret);
1760 return ret; 1793 return ret;
@@ -1769,30 +1802,22 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1769 */ 1802 */
1770 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1803 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1771 1804
1772 ret = ocfs2_data_lock(inode, 1);
1773 if (ret) {
1774 mlog_errno(ret);
1775 goto out_fail;
1776 }
1777
1778 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, 1805 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1779 fsdata, di_bh, NULL); 1806 fsdata, di_bh, NULL);
1780 if (ret) { 1807 if (ret) {
1781 mlog_errno(ret); 1808 mlog_errno(ret);
1782 goto out_fail_data; 1809 goto out_fail;
1783 } 1810 }
1784 1811
1785 brelse(di_bh); 1812 brelse(di_bh);
1786 1813
1787 return 0; 1814 return 0;
1788 1815
1789out_fail_data:
1790 ocfs2_data_unlock(inode, 1);
1791out_fail: 1816out_fail:
1792 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1817 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1793 1818
1794 brelse(di_bh); 1819 brelse(di_bh);
1795 ocfs2_meta_unlock(inode, 1); 1820 ocfs2_inode_unlock(inode, 1);
1796 1821
1797 return ret; 1822 return ret;
1798} 1823}
@@ -1908,15 +1933,15 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
1908 1933
1909 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 1934 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1910 1935
1911 ocfs2_data_unlock(inode, 1);
1912 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1936 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1913 ocfs2_meta_unlock(inode, 1); 1937 ocfs2_inode_unlock(inode, 1);
1914 1938
1915 return ret; 1939 return ret;
1916} 1940}
1917 1941
1918const struct address_space_operations ocfs2_aops = { 1942const struct address_space_operations ocfs2_aops = {
1919 .readpage = ocfs2_readpage, 1943 .readpage = ocfs2_readpage,
1944 .readpages = ocfs2_readpages,
1920 .writepage = ocfs2_writepage, 1945 .writepage = ocfs2_writepage,
1921 .write_begin = ocfs2_write_begin, 1946 .write_begin = ocfs2_write_begin,
1922 .write_end = ocfs2_write_end, 1947 .write_end = ocfs2_write_end,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f..f136639f5b4 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
79 * information for this bh as it's not marked locally 79 * information for this bh as it's not marked locally
80 * uptodate. */ 80 * uptodate. */
81 ret = -EIO; 81 ret = -EIO;
82 brelse(bh); 82 put_bh(bh);
83 } 83 }
84 84
85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 85 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
@@ -256,7 +256,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
256 * for this bh as it's not marked locally 256 * for this bh as it's not marked locally
257 * uptodate. */ 257 * uptodate. */
258 status = -EIO; 258 status = -EIO;
259 brelse(bh); 259 put_bh(bh);
260 bhs[i] = NULL; 260 bhs[i] = NULL;
261 continue; 261 continue;
262 } 262 }
@@ -280,3 +280,64 @@ bail:
280 mlog_exit(status); 280 mlog_exit(status);
281 return status; 281 return status;
282} 282}
283
284/* Check whether the blkno is the super block or one of the backups. */
285static void ocfs2_check_super_or_backup(struct super_block *sb,
286 sector_t blkno)
287{
288 int i;
289 u64 backup_blkno;
290
291 if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
292 return;
293
294 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
295 backup_blkno = ocfs2_backup_super_blkno(sb, i);
296 if (backup_blkno == blkno)
297 return;
298 }
299
300 BUG();
301}
302
303/*
 304 * Writing the super block and backups doesn't need to collaborate with the
 305 * journal, so we don't need to lock ip_io_mutex and the inode doesn't need
 306 * to be passed into this function.
307 */
308int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
309 struct buffer_head *bh)
310{
311 int ret = 0;
312
313 mlog_entry_void();
314
315 BUG_ON(buffer_jbd(bh));
316 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
317
318 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
319 ret = -EROFS;
320 goto out;
321 }
322
323 lock_buffer(bh);
324 set_buffer_uptodate(bh);
325
326 /* remove from dirty list before I/O. */
327 clear_buffer_dirty(bh);
328
329 get_bh(bh); /* for end_buffer_write_sync() */
330 bh->b_end_io = end_buffer_write_sync;
331 submit_bh(WRITE, bh);
332
333 wait_on_buffer(bh);
334
335 if (!buffer_uptodate(bh)) {
336 ret = -EIO;
337 put_bh(bh);
338 }
339
340out:
341 mlog_exit(ret);
342 return ret;
343}
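A hypothetical caller sketch for the new helper (not taken from this patch; the function name and the buffer-head array are assumptions for illustration). It writes the primary super block and a set of backup copies in turn, stopping at the first error:

    static int example_flush_supers(struct ocfs2_super *osb,
                                    struct buffer_head **bhs,
                                    unsigned int nr)
    {
            unsigned int i;
            int ret = 0;

            /* each bh must point at the super block or one of its backups,
             * otherwise ocfs2_write_super_or_backup() will BUG() */
            for (i = 0; i < nr && !ret; i++)
                    ret = ocfs2_write_super_or_backup(osb, bhs[i]);

            return ret;
    }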
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac..c2e78614c3e 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -47,6 +47,8 @@ int ocfs2_read_blocks(struct ocfs2_super *osb,
47 int flags, 47 int flags,
48 struct inode *inode); 48 struct inode *inode);
49 49
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh);
50 52
51#define OCFS2_BH_CACHED 1 53#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 54#define OCFS2_BH_READAHEAD 8
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 35397dd5ecd..e511339886b 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -35,7 +35,7 @@
35#define O2HB_LIVE_THRESHOLD 2 35#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */ 36/* number of equal samples to be seen as dead */
37extern unsigned int o2hb_dead_threshold; 37extern unsigned int o2hb_dead_threshold;
38#define O2HB_DEFAULT_DEAD_THRESHOLD 7 38#define O2HB_DEFAULT_DEAD_THRESHOLD 31
39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ 39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
40#define O2HB_MIN_DEAD_THRESHOLD 2 40#define O2HB_MIN_DEAD_THRESHOLD 2
41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) 41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
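To put the threshold bump in concrete terms, and assuming O2HB_REGION_TIMEOUT_MS keeps its usual 2000 ms value (that define is not shown in this hunk), the write-timeout formula above works out to:

    old: (7  - 1) * 2000 ms = 12000 ms  (~12 s of missed heartbeats before a node is declared dead)
    new: (31 - 1) * 2000 ms = 60000 ms  (~60 s)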
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a4882c8df94..23c732f2752 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -146,7 +146,7 @@ static struct kset mlog_kset = {
146 .kobj = {.ktype = &mlog_ktype}, 146 .kobj = {.ktype = &mlog_ktype},
147}; 147};
148 148
149int mlog_sys_init(struct kset *o2cb_subsys) 149int mlog_sys_init(struct kset *o2cb_kset)
150{ 150{
151 int i = 0; 151 int i = 0;
152 152
@@ -157,7 +157,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
157 mlog_attr_ptrs[i] = NULL; 157 mlog_attr_ptrs[i] = NULL;
158 158
159 kobject_set_name(&mlog_kset.kobj, "logmask"); 159 kobject_set_name(&mlog_kset.kobj, "logmask");
160 kobj_set_kset_s(&mlog_kset, *o2cb_subsys); 160 mlog_kset.kobj.kset = o2cb_kset;
161 return kset_register(&mlog_kset); 161 return kset_register(&mlog_kset);
162} 162}
163 163
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 64f6f378fd0..a4b07730b2e 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,96 +28,55 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/kobject.h> 29#include <linux/kobject.h>
30#include <linux/sysfs.h> 30#include <linux/sysfs.h>
31#include <linux/fs.h>
31 32
32#include "ocfs2_nodemanager.h" 33#include "ocfs2_nodemanager.h"
33#include "masklog.h" 34#include "masklog.h"
34#include "sys.h" 35#include "sys.h"
35 36
36struct o2cb_attribute {
37 struct attribute attr;
38 ssize_t (*show)(char *buf);
39 ssize_t (*store)(const char *buf, size_t count);
40};
41
42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44
45#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
46 37
47static ssize_t o2cb_interface_revision_show(char *buf) 38static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
39 char *buf)
48{ 40{
49 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); 41 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
50} 42}
51 43static struct kobj_attribute attr_version =
52static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); 44 __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
53 45
54static struct attribute *o2cb_attrs[] = { 46static struct attribute *o2cb_attrs[] = {
55 &o2cb_attr_interface_revision.attr, 47 &attr_version.attr,
56 NULL, 48 NULL,
57}; 49};
58 50
59static ssize_t 51static struct attribute_group o2cb_attr_group = {
60o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer); 52 .attrs = o2cb_attrs,
61static ssize_t
62o2cb_store(struct kobject * kobj, struct attribute * attr,
63 const char * buffer, size_t count);
64static struct sysfs_ops o2cb_sysfs_ops = {
65 .show = o2cb_show,
66 .store = o2cb_store,
67}; 53};
68 54
69static struct kobj_type o2cb_subsys_type = { 55static struct kset *o2cb_kset;
70 .default_attrs = o2cb_attrs,
71 .sysfs_ops = &o2cb_sysfs_ops,
72};
73
74/* gives us o2cb_subsys */
75static decl_subsys(o2cb, NULL, NULL);
76
77static ssize_t
78o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
79{
80 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
81 struct kset *sbs = to_kset(kobj);
82
83 BUG_ON(sbs != &o2cb_subsys);
84
85 if (o2cb_attr->show)
86 return o2cb_attr->show(buffer);
87 return -EIO;
88}
89
90static ssize_t
91o2cb_store(struct kobject * kobj, struct attribute * attr,
92 const char * buffer, size_t count)
93{
94 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
95 struct kset *sbs = to_kset(kobj);
96
97 BUG_ON(sbs != &o2cb_subsys);
98
99 if (o2cb_attr->store)
100 return o2cb_attr->store(buffer, count);
101 return -EIO;
102}
103 56
104void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
105{ 58{
106 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
107 subsystem_unregister(&o2cb_subsys); 60 kset_unregister(o2cb_kset);
108} 61}
109 62
110int o2cb_sys_init(void) 63int o2cb_sys_init(void)
111{ 64{
112 int ret; 65 int ret;
113 66
114 o2cb_subsys.kobj.ktype = &o2cb_subsys_type; 67 o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
115 ret = subsystem_register(&o2cb_subsys); 68 if (!o2cb_kset)
69 return -ENOMEM;
70
71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
116 if (ret) 72 if (ret)
117 return ret; 73 goto error;
118 74
119 ret = mlog_sys_init(&o2cb_subsys); 75 ret = mlog_sys_init(o2cb_kset);
120 if (ret) 76 if (ret)
121 subsystem_unregister(&o2cb_subsys); 77 goto error;
78 return 0;
79error:
80 kset_unregister(o2cb_kset);
122 return ret; 81 return ret;
123} 82}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index da880fc215f..f36f66aab3d 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
60/* same as hb delay, we're waiting for another node to recognize our hb */ 60/* same as hb delay, we're waiting for another node to recognize our hb */
61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 61#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
62 62
63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
65 65
66 66
67/* TODO: figure this out.... */ 67/* TODO: figure this out.... */
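Very roughly, and assuming both defaults are left untouched, the change in tolerance works out as:

    old: 10000 ms idle / 5000 ms keepalive = 2 silent keepalive intervals before the link is torn down
    new: 30000 ms idle / 2000 ms keepalive = 15 silent keepalive intervals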
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 9606111fe89..b2e832aca56 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,12 @@
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * New in version 10:
42 * - Meta/data locks combined
43 *
44 * New in version 9:
45 * - All votes removed
46 *
41 * New in version 8: 47 * New in version 8:
42 * - Replace delete inode votes with a cluster lock 48 * - Replace delete inode votes with a cluster lock
43 * 49 *
@@ -60,7 +66,7 @@
60 * - full 64 bit i_size in the metadata lock lvbs 66 * - full 64 bit i_size in the metadata lock lvbs
61 * - introduction of "rw" lock and pushing meta/data locking down 67 * - introduction of "rw" lock and pushing meta/data locking down
62 */ 68 */
63#define O2NET_PROTOCOL_VERSION 8ULL 69#define O2NET_PROTOCOL_VERSION 10ULL
64struct o2net_handshake { 70struct o2net_handshake {
65 __be64 protocol_version; 71 __be64 protocol_version;
66 __be64 connector_id; 72 __be64 connector_id;
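Because the handshake exchanges this value, the bump to 10 is also a compatibility fence: a peer still speaking version 8 (pre vote removal) is rejected at connect time instead of being allowed to send now-removed messages. A minimal sketch of the shape of that check, using a hypothetical helper; the real comparison lives in o2net's connect path and is not part of this hunk:

    /* Illustrative only: peers advertising a different protocol_version
     * are disconnected, so version-8 and version-10 nodes never end up
     * in the same domain. */
    static bool example_handshake_ok(const struct o2net_handshake *hand)
    {
            return hand->protocol_version == cpu_to_be64(O2NET_PROTOCOL_VERSION);
    }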
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
index 7286c48bb30..a56eee6abad 100644
--- a/fs/ocfs2/cluster/ver.c
+++ b/fs/ocfs2/cluster/ver.c
@@ -28,7 +28,7 @@
28 28
29#include "ver.h" 29#include "ver.h"
30 30
31#define CLUSTER_BUILD_VERSION "1.3.3" 31#define CLUSTER_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION 33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 9923278ea6d..b1cc7c381e8 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -128,9 +128,9 @@ static int ocfs2_match_dentry(struct dentry *dentry,
128/* 128/*
129 * Walk the inode alias list, and find a dentry which has a given 129 * Walk the inode alias list, and find a dentry which has a given
130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it 130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
131 * is looking for a dentry_lock reference. The vote thread is looking 131 * is looking for a dentry_lock reference. The downconvert thread is
132 * to unhash aliases, so we allow it to skip any that already have 132 * looking to unhash aliases, so we allow it to skip any that already
133 * that property. 133 * have that property.
134 */ 134 */
135struct dentry *ocfs2_find_local_alias(struct inode *inode, 135struct dentry *ocfs2_find_local_alias(struct inode *inode,
136 u64 parent_blkno, 136 u64 parent_blkno,
@@ -266,7 +266,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
266 dl->dl_count = 0; 266 dl->dl_count = 0;
267 /* 267 /*
268 * Does this have to happen below, for all attaches, in case 268 * Does this have to happen below, for all attaches, in case
269 * the struct inode gets blown away by votes? 269 * the struct inode gets blown away by the downconvert thread?
270 */ 270 */
271 dl->dl_inode = igrab(inode); 271 dl->dl_inode = igrab(inode);
272 dl->dl_parent_blkno = parent_blkno; 272 dl->dl_parent_blkno = parent_blkno;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 63b28fdceb4..6b0107f2134 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -846,14 +846,14 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
846 mlog_entry("dirino=%llu\n", 846 mlog_entry("dirino=%llu\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno); 847 (unsigned long long)OCFS2_I(inode)->ip_blkno);
848 848
849 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 849 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
850 if (lock_level && error >= 0) { 850 if (lock_level && error >= 0) {
851 /* We release EX lock which used to update atime 851 /* We release EX lock which used to update atime
852 * and get PR lock again to reduce contention 852 * and get PR lock again to reduce contention
853 * on commonly accessed directories. */ 853 * on commonly accessed directories. */
854 ocfs2_meta_unlock(inode, 1); 854 ocfs2_inode_unlock(inode, 1);
855 lock_level = 0; 855 lock_level = 0;
856 error = ocfs2_meta_lock(inode, NULL, 0); 856 error = ocfs2_inode_lock(inode, NULL, 0);
857 } 857 }
858 if (error < 0) { 858 if (error < 0) {
859 if (error != -ENOENT) 859 if (error != -ENOENT)
@@ -865,7 +865,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
866 dirent, filldir, NULL); 866 dirent, filldir, NULL);
867 867
868 ocfs2_meta_unlock(inode, lock_level); 868 ocfs2_inode_unlock(inode, lock_level);
869 869
870bail_nolock: 870bail_nolock:
871 mlog_exit(error); 871 mlog_exit(error);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
index d2be3ad841f..a733b3321f8 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmfsver.h" 29#include "dlmfsver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2fde7bf9143..91f747b8a53 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2270,6 +2270,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2270 } 2270 }
2271 } 2271 }
2272 2272
2273 /* Clean up join state on node death. */
2274 if (dlm->joining_node == idx) {
2275 mlog(0, "Clearing join state for node %u\n", idx);
2276 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2277 }
2278
2273 /* check to see if the node is already considered dead */ 2279 /* check to see if the node is already considered dead */
2274 if (!test_bit(idx, dlm->live_nodes_map)) { 2280 if (!test_bit(idx, dlm->live_nodes_map)) {
2275 mlog(0, "for domain %s, node %d is already dead. " 2281 mlog(0, "for domain %s, node %d is already dead. "
@@ -2288,12 +2294,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2288 2294
2289 clear_bit(idx, dlm->live_nodes_map); 2295 clear_bit(idx, dlm->live_nodes_map);
2290 2296
2291 /* Clean up join state on node death. */
2292 if (dlm->joining_node == idx) {
2293 mlog(0, "Clearing join state for node %u\n", idx);
2294 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2295 }
2296
2297 /* make sure local cleanup occurs before the heartbeat events */ 2297 /* make sure local cleanup occurs before the heartbeat events */
2298 if (!test_bit(idx, dlm->recovery_map)) 2298 if (!test_bit(idx, dlm->recovery_map))
2299 dlm_do_local_recovery_cleanup(dlm, idx); 2299 dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2321,6 +2321,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
2321 if (!dlm_grab(dlm)) 2321 if (!dlm_grab(dlm))
2322 return; 2322 return;
2323 2323
2324 /*
2325 * This will notify any dlm users that a node in our domain
2326 * went away without notifying us first.
2327 */
2328 if (test_bit(idx, dlm->domain_map))
2329 dlm_fire_domain_eviction_callbacks(dlm, idx);
2330
2324 spin_lock(&dlm->spinlock); 2331 spin_lock(&dlm->spinlock);
2325 __dlm_hb_node_down(dlm, idx); 2332 __dlm_hb_node_down(dlm, idx);
2326 spin_unlock(&dlm->spinlock); 2333 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
index 7ef2653f8f4..dfc0da4d158 100644
--- a/fs/ocfs2/dlm/dlmver.c
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -28,7 +28,7 @@
28 28
29#include "dlmver.h" 29#include "dlmver.h"
30 30
31#define DLM_BUILD_VERSION "1.3.3" 31#define DLM_BUILD_VERSION "1.5.0"
32 32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION 33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34 34
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4e97dcceaf8..3867244fb14 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
55#include "slot_map.h" 55#include "slot_map.h"
56#include "super.h" 56#include "super.h"
57#include "uptodate.h" 57#include "uptodate.h"
58#include "vote.h"
59 58
60#include "buffer_head_io.h" 59#include "buffer_head_io.h"
61 60
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
69 68
70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 69static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
72 72
73/* 73/*
74 * Return value from ->downconvert_worker functions. 74 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); 153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
154 154
155 /* 155 /*
156 * Optionally called in the downconvert (or "vote") thread 156 * Optionally called in the downconvert thread after a
157 * after a successful downconvert. The lockres will not be 157 * successful downconvert. The lockres will not be referenced
158 * referenced after this callback is called, so it is safe to 158 * after this callback is called, so it is safe to free
159 * free memory, etc. 159 * memory, etc.
160 * 160 *
161 * The exact semantics of when this is called are controlled 161 * The exact semantics of when this is called are controlled
162 * by ->downconvert_worker() 162 * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 228static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
229 .get_osb = ocfs2_get_inode_osb, 229 .get_osb = ocfs2_get_inode_osb,
230 .check_downconvert = ocfs2_check_meta_downconvert, 230 .check_downconvert = ocfs2_check_meta_downconvert,
231 .set_lvb = ocfs2_set_meta_lvb, 231 .set_lvb = ocfs2_set_meta_lvb,
232 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
233};
234
235static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
236 .get_osb = ocfs2_get_inode_osb,
237 .downconvert_worker = ocfs2_data_convert_worker, 232 .downconvert_worker = ocfs2_data_convert_worker,
238 .flags = 0, 233 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
239}; 234};
240 235
241static struct ocfs2_lock_res_ops ocfs2_super_lops = { 236static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .flags = 0, 253 .flags = 0,
259}; 254};
260 255
256static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
257 .get_osb = ocfs2_get_file_osb,
258 .flags = 0,
259};
260
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 262{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 263 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
264 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
265 lockres->l_type == OCFS2_LOCK_TYPE_RW || 264 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
266 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 265 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
267} 266}
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
310 "resource %s: %s\n", dlm_errname(_stat), _func, \ 309 "resource %s: %s\n", dlm_errname(_stat), _func, \
311 _lockres->l_name, dlm_errmsg(_stat)); \ 310 _lockres->l_name, dlm_errmsg(_stat)); \
312} while (0) 311} while (0)
313static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 312static int ocfs2_downconvert_thread(void *arg);
314 struct ocfs2_lock_res *lockres); 313static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
315static int ocfs2_meta_lock_update(struct inode *inode, 314 struct ocfs2_lock_res *lockres);
315static int ocfs2_inode_lock_update(struct inode *inode,
316 struct buffer_head **bh); 316 struct buffer_head **bh);
317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
318static inline int ocfs2_highest_compat_lock_level(int level); 318static inline int ocfs2_highest_compat_lock_level(int level);
319static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
320 int new_level);
321static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
322 struct ocfs2_lock_res *lockres,
323 int new_level,
324 int lvb);
325static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
326 struct ocfs2_lock_res *lockres);
327static int ocfs2_cancel_convert(struct ocfs2_super *osb,
328 struct ocfs2_lock_res *lockres);
329
319 330
320static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 331static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
321 u64 blkno, 332 u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
402 ops = &ocfs2_inode_rw_lops; 413 ops = &ocfs2_inode_rw_lops;
403 break; 414 break;
404 case OCFS2_LOCK_TYPE_META: 415 case OCFS2_LOCK_TYPE_META:
405 ops = &ocfs2_inode_meta_lops; 416 ops = &ocfs2_inode_inode_lops;
406 break;
407 case OCFS2_LOCK_TYPE_DATA:
408 ops = &ocfs2_inode_data_lops;
409 break; 417 break;
410 case OCFS2_LOCK_TYPE_OPEN: 418 case OCFS2_LOCK_TYPE_OPEN:
411 ops = &ocfs2_inode_open_lops; 419 ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
428 return OCFS2_SB(inode->i_sb); 436 return OCFS2_SB(inode->i_sb);
429} 437}
430 438
439static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
440{
441 struct ocfs2_file_private *fp = lockres->l_priv;
442
443 return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
444}
445
431static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) 446static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
432{ 447{
433 __be64 inode_blkno_be; 448 __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
508 &ocfs2_rename_lops, osb); 523 &ocfs2_rename_lops, osb);
509} 524}
510 525
526void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
527 struct ocfs2_file_private *fp)
528{
529 struct inode *inode = fp->fp_file->f_mapping->host;
530 struct ocfs2_inode_info *oi = OCFS2_I(inode);
531
532 ocfs2_lock_res_init_once(lockres);
533 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
534 inode->i_generation, lockres->l_name);
535 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
536 OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
537 fp);
538 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
539}
540
511void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 541void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
512{ 542{
513 mlog_entry_void(); 543 mlog_entry_void();
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
724 lockres->l_name, level, lockres->l_level, 754 lockres->l_name, level, lockres->l_level,
725 ocfs2_lock_type_string(lockres->l_type)); 755 ocfs2_lock_type_string(lockres->l_type));
726 756
757 /*
758 * We can skip the bast for locks which don't enable caching -
759 * they'll be dropped at the earliest possible time anyway.
760 */
761 if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
762 return;
763
727 spin_lock_irqsave(&lockres->l_lock, flags); 764 spin_lock_irqsave(&lockres->l_lock, flags);
728 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 765 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
729 if (needs_downconvert) 766 if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
732 769
733 wake_up(&lockres->l_event); 770 wake_up(&lockres->l_event);
734 771
735 ocfs2_kick_vote_thread(osb); 772 ocfs2_wake_downconvert_thread(osb);
736} 773}
737 774
738static void ocfs2_locking_ast(void *opaque) 775static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
935 972
936} 973}
937 974
975static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
976 struct ocfs2_lock_res *lockres)
977{
978 int ret;
979
980 ret = wait_for_completion_interruptible(&mw->mw_complete);
981 if (ret)
982 lockres_remove_mask_waiter(lockres, mw);
983 else
984 ret = mw->mw_status;
985 /* Re-arm the completion in case we want to wait on it again */
986 INIT_COMPLETION(mw->mw_complete);
987 return ret;
988}
989
938static int ocfs2_cluster_lock(struct ocfs2_super *osb, 990static int ocfs2_cluster_lock(struct ocfs2_super *osb,
939 struct ocfs2_lock_res *lockres, 991 struct ocfs2_lock_res *lockres,
940 int level, 992 int level,
@@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1089 mlog_entry_void(); 1141 mlog_entry_void();
1090 spin_lock_irqsave(&lockres->l_lock, flags); 1142 spin_lock_irqsave(&lockres->l_lock, flags);
1091 ocfs2_dec_holders(lockres, level); 1143 ocfs2_dec_holders(lockres, level);
1092 ocfs2_vote_on_unlock(osb, lockres); 1144 ocfs2_downconvert_on_unlock(osb, lockres);
1093 spin_unlock_irqrestore(&lockres->l_lock, flags); 1145 spin_unlock_irqrestore(&lockres->l_lock, flags);
1094 mlog_exit_void(); 1146 mlog_exit_void();
1095} 1147}
@@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1147 * We don't want to use LKM_LOCAL on a meta data lock as they 1199 * We don't want to use LKM_LOCAL on a meta data lock as they
1148 * don't use a generation in their lock names. 1200 * don't use a generation in their lock names.
1149 */ 1201 */
1150 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); 1202 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
1151 if (ret) {
1152 mlog_errno(ret);
1153 goto bail;
1154 }
1155
1156 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1157 if (ret) { 1203 if (ret) {
1158 mlog_errno(ret); 1204 mlog_errno(ret);
1159 goto bail; 1205 goto bail;
@@ -1311,76 +1357,221 @@ out:
1311 mlog_exit_void(); 1357 mlog_exit_void();
1312} 1358}
1313 1359
1314int ocfs2_data_lock_full(struct inode *inode, 1360static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
1315 int write, 1361 int level)
1316 int arg_flags)
1317{ 1362{
1318 int status = 0, level; 1363 int ret;
1319 struct ocfs2_lock_res *lockres; 1364 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 unsigned long flags;
1366 struct ocfs2_mask_waiter mw;
1321 1367
1322 BUG_ON(!inode); 1368 ocfs2_init_mask_waiter(&mw);
1323 1369
1324 mlog_entry_void(); 1370retry_cancel:
1371 spin_lock_irqsave(&lockres->l_lock, flags);
1372 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
1373 ret = ocfs2_prepare_cancel_convert(osb, lockres);
1374 if (ret) {
1375 spin_unlock_irqrestore(&lockres->l_lock, flags);
1376 ret = ocfs2_cancel_convert(osb, lockres);
1377 if (ret < 0) {
1378 mlog_errno(ret);
1379 goto out;
1380 }
1381 goto retry_cancel;
1382 }
1383 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1384 spin_unlock_irqrestore(&lockres->l_lock, flags);
1325 1385
1326 mlog(0, "inode %llu take %s DATA lock\n", 1386 ocfs2_wait_for_mask(&mw);
1327 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1387 goto retry_cancel;
1328 write ? "EXMODE" : "PRMODE"); 1388 }
1329 1389
1330 /* We'll allow faking a readonly data lock for 1390 ret = -ERESTARTSYS;
1331 * rodevices. */ 1391 /*
1332 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { 1392 * We may still have gotten the lock, in which case there's no
1333 if (write) { 1393 * point to restarting the syscall.
1334 status = -EROFS; 1394 */
1335 mlog_errno(status); 1395 if (lockres->l_level == level)
1396 ret = 0;
1397
1398 mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
1399 lockres->l_flags, lockres->l_level, lockres->l_action);
1400
1401 spin_unlock_irqrestore(&lockres->l_lock, flags);
1402
1403out:
1404 return ret;
1405}
1406
1407/*
1408 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1409 * flock() calls. The locking approach this requires is sufficiently
1410 * different from all other cluster lock types that we implement a
 1411 * separate path to the "low-level" dlm calls. In particular:
1412 *
 1413 * - No optimization of lock levels is done - we take exactly
1414 * what's been requested.
1415 *
1416 * - No lock caching is employed. We immediately downconvert to
 1417 * no-lock at unlock time. (This also means flock locks never go on
 1418 * the blocking list.)
1419 *
1420 * - Since userspace can trivially deadlock itself with flock, we make
1421 * sure to allow cancellation of a misbehaving applications flock()
1422 * request.
1423 *
1424 * - Access to any flock lockres doesn't require concurrency, so we
1425 * can simplify the code by requiring the caller to guarantee
1426 * serialization of dlmglue flock calls.
1427 */
1428int ocfs2_file_lock(struct file *file, int ex, int trylock)
1429{
1430 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
1431 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
1432 unsigned long flags;
1433 struct ocfs2_file_private *fp = file->private_data;
1434 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1435 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1436 struct ocfs2_mask_waiter mw;
1437
1438 ocfs2_init_mask_waiter(&mw);
1439
1440 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1441 (lockres->l_level > LKM_NLMODE)) {
1442 mlog(ML_ERROR,
1443 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1444 "level: %u\n", lockres->l_name, lockres->l_flags,
1445 lockres->l_level);
1446 return -EINVAL;
1447 }
1448
1449 spin_lock_irqsave(&lockres->l_lock, flags);
1450 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1451 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1452 spin_unlock_irqrestore(&lockres->l_lock, flags);
1453
1454 /*
1455 * Get the lock at NLMODE to start - that way we
1456 * can cancel the upconvert request if need be.
1457 */
1458 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
1459 if (ret < 0) {
1460 mlog_errno(ret);
1461 goto out;
1336 } 1462 }
1337 goto out; 1463
1464 ret = ocfs2_wait_for_mask(&mw);
1465 if (ret) {
1466 mlog_errno(ret);
1467 goto out;
1468 }
1469 spin_lock_irqsave(&lockres->l_lock, flags);
1338 } 1470 }
1339 1471
1340 if (ocfs2_mount_local(osb)) 1472 lockres->l_action = OCFS2_AST_CONVERT;
1341 goto out; 1473 lkm_flags |= LKM_CONVERT;
1474 lockres->l_requested = level;
1475 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1342 1476
1343 lockres = &OCFS2_I(inode)->ip_data_lockres; 1477 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1478 spin_unlock_irqrestore(&lockres->l_lock, flags);
1344 1479
1345 level = write ? LKM_EXMODE : LKM_PRMODE; 1480 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
1481 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1482 ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
1483 if (ret != DLM_NORMAL) {
1484 if (trylock && ret == DLM_NOTQUEUED)
1485 ret = -EAGAIN;
1486 else {
1487 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1488 ret = -EINVAL;
1489 }
1346 1490
1347 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 1491 ocfs2_recover_from_dlm_error(lockres, 1);
1348 0, arg_flags); 1492 lockres_remove_mask_waiter(lockres, &mw);
1349 if (status < 0 && status != -EAGAIN) 1493 goto out;
1350 mlog_errno(status); 1494 }
1495
1496 ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
1497 if (ret == -ERESTARTSYS) {
1498 /*
1499 * Userspace can cause deadlock itself with
1500 * flock(). Current behavior locally is to allow the
1501 * deadlock, but abort the system call if a signal is
1502 * received. We follow this example, otherwise a
1503 * poorly written program could sit in kernel until
1504 * reboot.
1505 *
1506 * Handling this is a bit more complicated for Ocfs2
1507 * though. We can't exit this function with an
1508 * outstanding lock request, so a cancel convert is
1509 * required. We intentionally overwrite 'ret' - if the
1510 * cancel fails and the lock was granted, it's easier
 1511 * to just bubble success back up to the user.
1512 */
1513 ret = ocfs2_flock_handle_signal(lockres, level);
1514 }
1351 1515
1352out: 1516out:
1353 mlog_exit(status); 1517
1354 return status; 1518 mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
1519 lockres->l_name, ex, trylock, ret);
1520 return ret;
1355} 1521}
1356 1522
1357/* see ocfs2_meta_lock_with_page() */ 1523void ocfs2_file_unlock(struct file *file)
1358int ocfs2_data_lock_with_page(struct inode *inode,
1359 int write,
1360 struct page *page)
1361{ 1524{
1362 int ret; 1525 int ret;
1526 unsigned long flags;
1527 struct ocfs2_file_private *fp = file->private_data;
1528 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1529 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1530 struct ocfs2_mask_waiter mw;
1363 1531
1364 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); 1532 ocfs2_init_mask_waiter(&mw);
1365 if (ret == -EAGAIN) { 1533
1366 unlock_page(page); 1534 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1367 if (ocfs2_data_lock(inode, write) == 0) 1535 return;
1368 ocfs2_data_unlock(inode, write); 1536
1369 ret = AOP_TRUNCATED_PAGE; 1537 if (lockres->l_level == LKM_NLMODE)
1538 return;
1539
1540 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
1541 lockres->l_name, lockres->l_flags, lockres->l_level,
1542 lockres->l_action);
1543
1544 spin_lock_irqsave(&lockres->l_lock, flags);
1545 /*
1546 * Fake a blocking ast for the downconvert code.
1547 */
1548 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1549 lockres->l_blocking = LKM_EXMODE;
1550
1551 ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1552 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1553 spin_unlock_irqrestore(&lockres->l_lock, flags);
1554
1555 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
1556 if (ret) {
1557 mlog_errno(ret);
1558 return;
1370 } 1559 }
1371 1560
1372 return ret; 1561 ret = ocfs2_wait_for_mask(&mw);
1562 if (ret)
1563 mlog_errno(ret);
1373} 1564}
1374 1565
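For context on how the pair above is meant to be driven, here is a hedged sketch of a ->flock file operation built on ocfs2_file_lock()/ocfs2_file_unlock(). The fp_mutex serializer and the exact error handling are assumptions for illustration; the real glue lives in the new fs/ocfs2/locks.c, which is not shown in this hunk:

    static int example_flock(struct file *file, int cmd, struct file_lock *fl)
    {
            struct ocfs2_file_private *fp = file->private_data;
            int ex = (fl->fl_type == F_WRLCK);
            int trylock = !IS_SETLKW(cmd);    /* LOCK_NB callers must not block */
            int ret;

            mutex_lock(&fp->fp_mutex);        /* dlmglue wants flock calls serialized */

            if (fl->fl_type == F_UNLCK) {
                    ocfs2_file_unlock(file);
                    ret = flock_lock_file_wait(file, fl);
            } else {
                    ret = ocfs2_file_lock(file, ex, trylock);
                    if (ret == 0)
                            ret = flock_lock_file_wait(file, fl);
            }

            mutex_unlock(&fp->fp_mutex);
            return ret;
    }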
1375static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 1566static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1376 struct ocfs2_lock_res *lockres) 1567 struct ocfs2_lock_res *lockres)
1377{ 1568{
1378 int kick = 0; 1569 int kick = 0;
1379 1570
1380 mlog_entry_void(); 1571 mlog_entry_void();
1381 1572
1382 /* If we know that another node is waiting on our lock, kick 1573 /* If we know that another node is waiting on our lock, kick
1383 * the vote thread * pre-emptively when we reach a release 1574 * the downconvert thread * pre-emptively when we reach a release
1384 * condition. */ 1575 * condition. */
1385 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1576 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1386 switch(lockres->l_blocking) { 1577 switch(lockres->l_blocking) {
@@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1398 } 1589 }
1399 1590
1400 if (kick) 1591 if (kick)
1401 ocfs2_kick_vote_thread(osb); 1592 ocfs2_wake_downconvert_thread(osb);
1402
1403 mlog_exit_void();
1404}
1405
1406void ocfs2_data_unlock(struct inode *inode,
1407 int write)
1408{
1409 int level = write ? LKM_EXMODE : LKM_PRMODE;
1410 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1411 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1412
1413 mlog_entry_void();
1414
1415 mlog(0, "inode %llu drop %s DATA lock\n",
1416 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1417 write ? "EXMODE" : "PRMODE");
1418
1419 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1420 !ocfs2_mount_local(osb))
1421 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1422 1593
1423 mlog_exit_void(); 1594 mlog_exit_void();
1424} 1595}
@@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
1442 1613
1443/* Call this with the lockres locked. I am reasonably sure we don't 1614/* Call this with the lockres locked. I am reasonably sure we don't
1444 * need ip_lock in this function as anyone who would be changing those 1615 * need ip_lock in this function as anyone who would be changing those
1445 * values is supposed to be blocked in ocfs2_meta_lock right now. */ 1616 * values is supposed to be blocked in ocfs2_inode_lock right now. */
1446static void __ocfs2_stuff_meta_lvb(struct inode *inode) 1617static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1447{ 1618{
1448 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1619 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1449 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1620 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1450 struct ocfs2_meta_lvb *lvb; 1621 struct ocfs2_meta_lvb *lvb;
1451 1622
1452 mlog_entry_void(); 1623 mlog_entry_void();
@@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
1496static void ocfs2_refresh_inode_from_lvb(struct inode *inode) 1667static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1497{ 1668{
1498 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1669 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1499 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1670 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1500 struct ocfs2_meta_lvb *lvb; 1671 struct ocfs2_meta_lvb *lvb;
1501 1672
1502 mlog_entry_void(); 1673 mlog_entry_void();
@@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
1604} 1775}
1605 1776
1606/* may or may not return a bh if it went to disk. */ 1777/* may or may not return a bh if it went to disk. */
1607static int ocfs2_meta_lock_update(struct inode *inode, 1778static int ocfs2_inode_lock_update(struct inode *inode,
1608 struct buffer_head **bh) 1779 struct buffer_head **bh)
1609{ 1780{
1610 int status = 0; 1781 int status = 0;
1611 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1782 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1612 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1783 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1613 struct ocfs2_dinode *fe; 1784 struct ocfs2_dinode *fe;
1614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1615 1786
@@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
1721 * returns < 0 error if the callback will never be called, otherwise 1892 * returns < 0 error if the callback will never be called, otherwise
1722 * the result of the lock will be communicated via the callback. 1893 * the result of the lock will be communicated via the callback.
1723 */ 1894 */
1724int ocfs2_meta_lock_full(struct inode *inode, 1895int ocfs2_inode_lock_full(struct inode *inode,
1725 struct buffer_head **ret_bh, 1896 struct buffer_head **ret_bh,
1726 int ex, 1897 int ex,
1727 int arg_flags) 1898 int arg_flags)
@@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
1756 wait_event(osb->recovery_event, 1927 wait_event(osb->recovery_event,
1757 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1928 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1758 1929
1759 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1930 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1760 level = ex ? LKM_EXMODE : LKM_PRMODE; 1931 level = ex ? LKM_EXMODE : LKM_PRMODE;
1761 dlm_flags = 0; 1932 dlm_flags = 0;
1762 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 1933 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1795,11 +1966,11 @@ local:
1795 } 1966 }
1796 1967
1797 /* This is fun. The caller may want a bh back, or it may 1968 /* This is fun. The caller may want a bh back, or it may
1798 * not. ocfs2_meta_lock_update definitely wants one in, but 1969 * not. ocfs2_inode_lock_update definitely wants one in, but
1799 * may or may not read one, depending on what's in the 1970 * may or may not read one, depending on what's in the
1800 * LVB. The result of all of this is that we've *only* gone to 1971 * LVB. The result of all of this is that we've *only* gone to
1801 * disk if we have to, so the complexity is worthwhile. */ 1972 * disk if we have to, so the complexity is worthwhile. */
1802 status = ocfs2_meta_lock_update(inode, &local_bh); 1973 status = ocfs2_inode_lock_update(inode, &local_bh);
1803 if (status < 0) { 1974 if (status < 0) {
1804 if (status != -ENOENT) 1975 if (status != -ENOENT)
1805 mlog_errno(status); 1976 mlog_errno(status);
@@ -1821,7 +1992,7 @@ bail:
1821 *ret_bh = NULL; 1992 *ret_bh = NULL;
1822 } 1993 }
1823 if (acquired) 1994 if (acquired)
1824 ocfs2_meta_unlock(inode, ex); 1995 ocfs2_inode_unlock(inode, ex);
1825 } 1996 }
1826 1997
1827 if (local_bh) 1998 if (local_bh)
@@ -1832,19 +2003,20 @@ bail:
1832} 2003}
1833 2004
1834/* 2005/*
1835 * This is working around a lock inversion between tasks acquiring DLM locks 2006 * This is working around a lock inversion between tasks acquiring DLM
1836 * while holding a page lock and the vote thread which blocks dlm lock acquiry 2007 * locks while holding a page lock and the downconvert thread which
1837 * while acquiring page locks. 2008 * blocks dlm lock acquiry while acquiring page locks.
1838 * 2009 *
 1839 * ** These _with_page variants are only intended to be called from aop 2010 * ** These _with_page variants are only intended to be called from aop
1840 * methods that hold page locks and return a very specific *positive* error 2011 * methods that hold page locks and return a very specific *positive* error
1841 * code that aop methods pass up to the VFS -- test for errors with != 0. ** 2012 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1842 * 2013 *
1843 * The DLM is called such that it returns -EAGAIN if it would have blocked 2014 * The DLM is called such that it returns -EAGAIN if it would have
1844 * waiting for the vote thread. In that case we unlock our page so the vote 2015 * blocked waiting for the downconvert thread. In that case we unlock
1845 * thread can make progress. Once we've done this we have to return 2016 * our page so the downconvert thread can make progress. Once we've
1846 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up 2017 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
1847 * into the VFS who will then immediately retry the aop call. 2018 * that called us can bubble that back up into the VFS who will then
2019 * immediately retry the aop call.
1848 * 2020 *
1849 * We do a blocking lock and immediate unlock before returning, though, so that 2021 * We do a blocking lock and immediate unlock before returning, though, so that
1850 * the lock has a great chance of being cached on this node by the time the VFS 2022 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1852,32 +2024,32 @@ bail:
1852 * ping locks back and forth, but that's a risk we're willing to take to avoid 2024 * ping locks back and forth, but that's a risk we're willing to take to avoid
1853 * the lock inversion simply. 2025 * the lock inversion simply.
1854 */ 2026 */
1855int ocfs2_meta_lock_with_page(struct inode *inode, 2027int ocfs2_inode_lock_with_page(struct inode *inode,
1856 struct buffer_head **ret_bh, 2028 struct buffer_head **ret_bh,
1857 int ex, 2029 int ex,
1858 struct page *page) 2030 struct page *page)
1859{ 2031{
1860 int ret; 2032 int ret;
1861 2033
1862 ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); 2034 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
1863 if (ret == -EAGAIN) { 2035 if (ret == -EAGAIN) {
1864 unlock_page(page); 2036 unlock_page(page);
1865 if (ocfs2_meta_lock(inode, ret_bh, ex) == 0) 2037 if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
1866 ocfs2_meta_unlock(inode, ex); 2038 ocfs2_inode_unlock(inode, ex);
1867 ret = AOP_TRUNCATED_PAGE; 2039 ret = AOP_TRUNCATED_PAGE;
1868 } 2040 }
1869 2041
1870 return ret; 2042 return ret;
1871} 2043}
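A hedged sketch of the caller side, not taken from this patch: an address_space operation (the name example_readpage and the read step are illustrative) only needs to pass any nonzero return straight back to the VFS, because in the AOP_TRUNCATED_PAGE case the helper above has already unlocked the page for it.

static int example_readpage(struct file *file, struct page *page)
{
        struct inode *inode = page->mapping->host;
        struct buffer_head *di_bh = NULL;
        int ret;

        ret = ocfs2_inode_lock_with_page(inode, &di_bh, 0, page);
        if (ret != 0) {
                if (ret == AOP_TRUNCATED_PAGE)
                        return ret;     /* page already unlocked; VFS retries */
                unlock_page(page);      /* real error: drop the page lock ourselves */
                return ret;
        }

        /* page lock and inode lock (PR) both held: fill the page here;
         * whatever performs the read is also responsible for unlocking it */

        ocfs2_inode_unlock(inode, 0);
        brelse(di_bh);
        return 0;
}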
1872 2044
1873int ocfs2_meta_lock_atime(struct inode *inode, 2045int ocfs2_inode_lock_atime(struct inode *inode,
1874 struct vfsmount *vfsmnt, 2046 struct vfsmount *vfsmnt,
1875 int *level) 2047 int *level)
1876{ 2048{
1877 int ret; 2049 int ret;
1878 2050
1879 mlog_entry_void(); 2051 mlog_entry_void();
1880 ret = ocfs2_meta_lock(inode, NULL, 0); 2052 ret = ocfs2_inode_lock(inode, NULL, 0);
1881 if (ret < 0) { 2053 if (ret < 0) {
1882 mlog_errno(ret); 2054 mlog_errno(ret);
1883 return ret; 2055 return ret;
@@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1890 if (ocfs2_should_update_atime(inode, vfsmnt)) { 2062 if (ocfs2_should_update_atime(inode, vfsmnt)) {
1891 struct buffer_head *bh = NULL; 2063 struct buffer_head *bh = NULL;
1892 2064
1893 ocfs2_meta_unlock(inode, 0); 2065 ocfs2_inode_unlock(inode, 0);
1894 ret = ocfs2_meta_lock(inode, &bh, 1); 2066 ret = ocfs2_inode_lock(inode, &bh, 1);
1895 if (ret < 0) { 2067 if (ret < 0) {
1896 mlog_errno(ret); 2068 mlog_errno(ret);
1897 return ret; 2069 return ret;
@@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1908 return ret; 2080 return ret;
1909} 2081}
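For reference, a hedged sketch of the caller contract this helper sets up (it mirrors the ocfs2_file_aio_read() hunk later in this patch; the wrapper name is illustrative): the level actually held comes back through *level, and the unlock must use exactly that level.

static int example_read_prep(struct inode *inode, struct file *filp)
{
        int lock_level = 0;
        int ret;

        ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
        if (ret < 0)
                return ret;

        /* inode lock held at lock_level: 0 = PR, 1 = EX (atime was updated) */
        ocfs2_inode_unlock(inode, lock_level);
        return 0;
}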
1910 2082
1911void ocfs2_meta_unlock(struct inode *inode, 2083void ocfs2_inode_unlock(struct inode *inode,
1912 int ex) 2084 int ex)
1913{ 2085{
1914 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2086 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1915 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; 2087 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
1916 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1917 2089
1918 mlog_entry_void(); 2090 mlog_entry_void();
@@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2320 goto bail; 2492 goto bail;
2321 } 2493 }
2322 2494
2323 /* launch vote thread */ 2495 /* launch downconvert thread */
2324 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote"); 2496 osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
2325 if (IS_ERR(osb->vote_task)) { 2497 if (IS_ERR(osb->dc_task)) {
2326 status = PTR_ERR(osb->vote_task); 2498 status = PTR_ERR(osb->dc_task);
2327 osb->vote_task = NULL; 2499 osb->dc_task = NULL;
2328 mlog_errno(status); 2500 mlog_errno(status);
2329 goto bail; 2501 goto bail;
2330 } 2502 }
@@ -2353,8 +2525,8 @@ local:
2353bail: 2525bail:
2354 if (status < 0) { 2526 if (status < 0) {
2355 ocfs2_dlm_shutdown_debug(osb); 2527 ocfs2_dlm_shutdown_debug(osb);
2356 if (osb->vote_task) 2528 if (osb->dc_task)
2357 kthread_stop(osb->vote_task); 2529 kthread_stop(osb->dc_task);
2358 } 2530 }
2359 2531
2360 mlog_exit(status); 2532 mlog_exit(status);
@@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2369 2541
2370 ocfs2_drop_osb_locks(osb); 2542 ocfs2_drop_osb_locks(osb);
2371 2543
2372 if (osb->vote_task) { 2544 if (osb->dc_task) {
2373 kthread_stop(osb->vote_task); 2545 kthread_stop(osb->dc_task);
2374 osb->vote_task = NULL; 2546 osb->dc_task = NULL;
2375 } 2547 }
2376 2548
2377 ocfs2_lock_res_free(&osb->osb_super_lockres); 2549 ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2527,7 +2699,7 @@ out:
2527 2699
2528/* Mark the lockres as being dropped. It will no longer be 2700/* Mark the lockres as being dropped. It will no longer be
2529 * queued if blocking, but we still may have to wait on it 2701 * queued if blocking, but we still may have to wait on it
2530 * being dequeued from the vote thread before we can consider 2702 * being dequeued from the downconvert thread before we can consider
2531 * it safe to drop. 2703 * it safe to drop.
2532 * 2704 *
2533 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 2705 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2590 status = err; 2762 status = err;
2591 2763
2592 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2764 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2593 &OCFS2_I(inode)->ip_data_lockres); 2765 &OCFS2_I(inode)->ip_inode_lockres);
2594 if (err < 0)
2595 mlog_errno(err);
2596 if (err < 0 && !status)
2597 status = err;
2598
2599 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2600 &OCFS2_I(inode)->ip_meta_lockres);
2601 if (err < 0) 2766 if (err < 0)
2602 mlog_errno(err); 2767 mlog_errno(err);
2603 if (err < 0 && !status) 2768 if (err < 0 && !status)
@@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2850 inode = ocfs2_lock_res_inode(lockres); 3015 inode = ocfs2_lock_res_inode(lockres);
2851 mapping = inode->i_mapping; 3016 mapping = inode->i_mapping;
2852 3017
 3018 if (!S_ISREG(inode->i_mode))
3019 goto out;
3020
2853 /* 3021 /*
2854 * We need this before the filemap_fdatawrite() so that it can 3022 * We need this before the filemap_fdatawrite() so that it can
2855 * transfer the dirty bit from the PTE to the 3023 * transfer the dirty bit from the PTE to the
@@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2875 filemap_fdatawait(mapping); 3043 filemap_fdatawait(mapping);
2876 } 3044 }
2877 3045
3046out:
2878 return UNBLOCK_CONTINUE; 3047 return UNBLOCK_CONTINUE;
2879} 3048}
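The ordering the comment in this hunk insists on, pulled out as a hedged sketch (the helper name is illustrative, and unmap_mapping_range() is assumed to be the call the "transfer the dirty bit from the PTE" comment refers to): dirty bits have to migrate from the PTEs onto the pages before writeback starts, and the wait makes the data stable before the lock level drops.

static void example_flush_before_downconvert(struct address_space *mapping)
{
        /* transfer dirty bits from user PTEs onto the pages... */
        unmap_mapping_range(mapping, 0, 0, 0);
        /* ...so writeback actually sees and writes them... */
        filemap_fdatawrite(mapping);
        /* ...then wait so another node reads stable data */
        filemap_fdatawait(mapping);
}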
2880 3049
@@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2903 3072
2904/* 3073/*
2905 * Does the final reference drop on our dentry lock. Right now this 3074 * Does the final reference drop on our dentry lock. Right now this
2906 * happens in the vote thread, but we could choose to simplify the 3075 * happens in the downconvert thread, but we could choose to simplify the
2907 * dlmglue API and push these off to the ocfs2_wq in the future. 3076 * dlmglue API and push these off to the ocfs2_wq in the future.
2908 */ 3077 */
2909static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 3078static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3042 mlog(0, "lockres %s blocked.\n", lockres->l_name); 3211 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3043 3212
3044 /* Detect whether a lock has been marked as going away while 3213 /* Detect whether a lock has been marked as going away while
3045 * the vote thread was processing other things. A lock can 3214 * the downconvert thread was processing other things. A lock can
3046 * still be marked with OCFS2_LOCK_FREEING after this check, 3215 * still be marked with OCFS2_LOCK_FREEING after this check,
3047 * but short circuiting here will still save us some 3216 * but short circuiting here will still save us some
3048 * performance. */ 3217 * performance. */
@@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3091 3260
3092 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); 3261 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3093 3262
3094 spin_lock(&osb->vote_task_lock); 3263 spin_lock(&osb->dc_task_lock);
3095 if (list_empty(&lockres->l_blocked_list)) { 3264 if (list_empty(&lockres->l_blocked_list)) {
3096 list_add_tail(&lockres->l_blocked_list, 3265 list_add_tail(&lockres->l_blocked_list,
3097 &osb->blocked_lock_list); 3266 &osb->blocked_lock_list);
3098 osb->blocked_lock_count++; 3267 osb->blocked_lock_count++;
3099 } 3268 }
3100 spin_unlock(&osb->vote_task_lock); 3269 spin_unlock(&osb->dc_task_lock);
3270
3271 mlog_exit_void();
3272}
3273
3274static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
3275{
3276 unsigned long processed;
3277 struct ocfs2_lock_res *lockres;
3278
3279 mlog_entry_void();
3280
3281 spin_lock(&osb->dc_task_lock);
3282 /* grab this early so we know to try again if a state change and
3283 * wake happens part-way through our work */
3284 osb->dc_work_sequence = osb->dc_wake_sequence;
3285
3286 processed = osb->blocked_lock_count;
3287 while (processed) {
3288 BUG_ON(list_empty(&osb->blocked_lock_list));
3289
3290 lockres = list_entry(osb->blocked_lock_list.next,
3291 struct ocfs2_lock_res, l_blocked_list);
3292 list_del_init(&lockres->l_blocked_list);
3293 osb->blocked_lock_count--;
3294 spin_unlock(&osb->dc_task_lock);
3295
3296 BUG_ON(!processed);
3297 processed--;
3298
3299 ocfs2_process_blocked_lock(osb, lockres);
3300
3301 spin_lock(&osb->dc_task_lock);
3302 }
3303 spin_unlock(&osb->dc_task_lock);
3101 3304
3102 mlog_exit_void(); 3305 mlog_exit_void();
3103} 3306}
3307
3308static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
3309{
3310 int empty = 0;
3311
3312 spin_lock(&osb->dc_task_lock);
3313 if (list_empty(&osb->blocked_lock_list))
3314 empty = 1;
3315
3316 spin_unlock(&osb->dc_task_lock);
3317 return empty;
3318}
3319
3320static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
3321{
3322 int should_wake = 0;
3323
3324 spin_lock(&osb->dc_task_lock);
3325 if (osb->dc_work_sequence != osb->dc_wake_sequence)
3326 should_wake = 1;
3327 spin_unlock(&osb->dc_task_lock);
3328
3329 return should_wake;
3330}
3331
3332int ocfs2_downconvert_thread(void *arg)
3333{
3334 int status = 0;
3335 struct ocfs2_super *osb = arg;
3336
3337 /* only quit once we've been asked to stop and there is no more
3338 * work available */
3339 while (!(kthread_should_stop() &&
3340 ocfs2_downconvert_thread_lists_empty(osb))) {
3341
3342 wait_event_interruptible(osb->dc_event,
3343 ocfs2_downconvert_thread_should_wake(osb) ||
3344 kthread_should_stop());
3345
3346 mlog(0, "downconvert_thread: awoken\n");
3347
3348 ocfs2_downconvert_thread_do_work(osb);
3349 }
3350
3351 osb->dc_task = NULL;
3352 return status;
3353}
3354
3355void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
3356{
3357 spin_lock(&osb->dc_task_lock);
 3358	/* make sure the downconvert thread gets a swipe at whatever changes
 3359	 * the caller may have made to the lock state */
3360 osb->dc_wake_sequence++;
3361 spin_unlock(&osb->dc_task_lock);
3362 wake_up(&osb->dc_event);
3363}
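A hedged sketch of the producer side these helpers expect (the wrapper name is illustrative; the two callees are the ones defined above): a path that finds a lock needing downconvert, such as a blocking callback, queues the lockres and kicks the thread. Because the wake bumps dc_wake_sequence under dc_task_lock, a wake that races with a pass already in progress is not lost; the worker's dc_work_sequence snapshot no longer matches, so it loops again instead of sleeping.

static void example_queue_for_downconvert(struct ocfs2_super *osb,
                                          struct ocfs2_lock_res *lockres)
{
        /* put the lockres on osb->blocked_lock_list (no-op if already queued) */
        ocfs2_schedule_blocked_lock(osb, lockres);
        /* bump dc_wake_sequence and wake osb->dc_event */
        ocfs2_wake_downconvert_thread(osb);
}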
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 87a785e4120..5f17243ba50 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,12 +49,12 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ 52/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 53/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 54#define OCFS2_META_LOCK_RECOVERY (0x01)
55/* Instruct the dlm not to queue ourselves on the other node. */ 55/* Instruct the dlm not to queue ourselves on the other node. */
56#define OCFS2_META_LOCK_NOQUEUE (0x02) 56#define OCFS2_META_LOCK_NOQUEUE (0x02)
57/* don't block waiting for the vote thread, instead return -EAGAIN */ 57/* don't block waiting for the downconvert thread, instead return -EAGAIN */
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -66,38 +66,32 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
66 struct inode *inode); 66 struct inode *inode);
67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, 67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
68 u64 parent, struct inode *inode); 68 u64 parent, struct inode *inode);
69struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp);
69void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 72void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
70int ocfs2_create_new_inode_locks(struct inode *inode); 73int ocfs2_create_new_inode_locks(struct inode *inode);
71int ocfs2_drop_inode_locks(struct inode *inode); 74int ocfs2_drop_inode_locks(struct inode *inode);
72int ocfs2_data_lock_full(struct inode *inode,
73 int write,
74 int arg_flags);
75#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
76int ocfs2_data_lock_with_page(struct inode *inode,
77 int write,
78 struct page *page);
79void ocfs2_data_unlock(struct inode *inode,
80 int write);
81int ocfs2_rw_lock(struct inode *inode, int write); 75int ocfs2_rw_lock(struct inode *inode, int write);
82void ocfs2_rw_unlock(struct inode *inode, int write); 76void ocfs2_rw_unlock(struct inode *inode, int write);
83int ocfs2_open_lock(struct inode *inode); 77int ocfs2_open_lock(struct inode *inode);
84int ocfs2_try_open_lock(struct inode *inode, int write); 78int ocfs2_try_open_lock(struct inode *inode, int write);
85void ocfs2_open_unlock(struct inode *inode); 79void ocfs2_open_unlock(struct inode *inode);
86int ocfs2_meta_lock_atime(struct inode *inode, 80int ocfs2_inode_lock_atime(struct inode *inode,
87 struct vfsmount *vfsmnt, 81 struct vfsmount *vfsmnt,
88 int *level); 82 int *level);
89int ocfs2_meta_lock_full(struct inode *inode, 83int ocfs2_inode_lock_full(struct inode *inode,
90 struct buffer_head **ret_bh, 84 struct buffer_head **ret_bh,
91 int ex, 85 int ex,
92 int arg_flags); 86 int arg_flags);
93int ocfs2_meta_lock_with_page(struct inode *inode, 87int ocfs2_inode_lock_with_page(struct inode *inode,
94 struct buffer_head **ret_bh, 88 struct buffer_head **ret_bh,
95 int ex, 89 int ex,
96 struct page *page); 90 struct page *page);
97/* 99% of the time we don't want to supply any additional flags -- 91/* 99% of the time we don't want to supply any additional flags --
98 * those are for very specific cases only. */ 92 * those are for very specific cases only. */
99#define ocfs2_meta_lock(i, b, e) ocfs2_meta_lock_full(i, b, e, 0) 93#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
100void ocfs2_meta_unlock(struct inode *inode, 94void ocfs2_inode_unlock(struct inode *inode,
101 int ex); 95 int ex);
102int ocfs2_super_lock(struct ocfs2_super *osb, 96int ocfs2_super_lock(struct ocfs2_super *osb,
103 int ex); 97 int ex);
@@ -107,14 +101,17 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
107void ocfs2_rename_unlock(struct ocfs2_super *osb); 101void ocfs2_rename_unlock(struct ocfs2_super *osb);
108int ocfs2_dentry_lock(struct dentry *dentry, int ex); 102int ocfs2_dentry_lock(struct dentry *dentry, int ex);
109void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 103void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file);
110 106
111void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
112void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
113 struct ocfs2_lock_res *lockres); 109 struct ocfs2_lock_res *lockres);
114 110
115/* for the vote thread */ 111/* for the downconvert thread */
116void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 112void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
117 struct ocfs2_lock_res *lockres); 113 struct ocfs2_lock_res *lockres);
114void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
118 115
119struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 116struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
120void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 117void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index ff257628af1..1942e09f6ee 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -37,11 +37,6 @@ static inline void le64_add_cpu(__le64 *var, u64 val)
37 *var = cpu_to_le64(le64_to_cpu(*var) + val); 37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38} 38}
39 39
40static inline void le32_and_cpu(__le32 *var, u32 val)
41{
42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
43}
44
45static inline void be32_add_cpu(__be32 *var, u32 val) 40static inline void be32_add_cpu(__be32 *var, u32 val)
46{ 41{
47 *var = cpu_to_be32(be32_to_cpu(*var) + val); 42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 535bfa9568a..67527cebf21 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -58,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
58 return ERR_PTR(-ESTALE); 58 return ERR_PTR(-ESTALE);
59 } 59 }
60 60
61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); 61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
62 62
63 if (IS_ERR(inode)) 63 if (IS_ERR(inode))
64 return (void *)inode; 64 return (void *)inode;
@@ -95,7 +95,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
95 mlog(0, "find parent of directory %llu\n", 95 mlog(0, "find parent of directory %llu\n",
96 (unsigned long long)OCFS2_I(dir)->ip_blkno); 96 (unsigned long long)OCFS2_I(dir)->ip_blkno);
97 97
98 status = ocfs2_meta_lock(dir, NULL, 0); 98 status = ocfs2_inode_lock(dir, NULL, 0);
99 if (status < 0) { 99 if (status < 0) {
100 if (status != -ENOENT) 100 if (status != -ENOENT)
101 mlog_errno(status); 101 mlog_errno(status);
@@ -109,7 +109,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
109 goto bail_unlock; 109 goto bail_unlock;
110 } 110 }
111 111
112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 112 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
113 if (IS_ERR(inode)) { 113 if (IS_ERR(inode)) {
114 mlog(ML_ERROR, "Unable to create inode %llu\n", 114 mlog(ML_ERROR, "Unable to create inode %llu\n",
115 (unsigned long long)blkno); 115 (unsigned long long)blkno);
@@ -126,7 +126,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
126 parent->d_op = &ocfs2_dentry_ops; 126 parent->d_op = &ocfs2_dentry_ops;
127 127
128bail_unlock: 128bail_unlock:
129 ocfs2_meta_unlock(dir, 0); 129 ocfs2_inode_unlock(dir, 0);
130 130
131bail: 131bail:
132 mlog_exit_ptr(parent); 132 mlog_exit_ptr(parent);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b75b2e1f0e4..ed5d5232e85 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -51,6 +51,7 @@
51#include "inode.h" 51#include "inode.h"
52#include "ioctl.h" 52#include "ioctl.h"
53#include "journal.h" 53#include "journal.h"
54#include "locks.h"
54#include "mmap.h" 55#include "mmap.h"
55#include "suballoc.h" 56#include "suballoc.h"
56#include "super.h" 57#include "super.h"
@@ -63,6 +64,35 @@ static int ocfs2_sync_inode(struct inode *inode)
63 return sync_mapping_buffers(inode->i_mapping); 64 return sync_mapping_buffers(inode->i_mapping);
64} 65}
65 66
67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
68{
69 struct ocfs2_file_private *fp;
70
71 fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
72 if (!fp)
73 return -ENOMEM;
74
75 fp->fp_file = file;
76 mutex_init(&fp->fp_mutex);
77 ocfs2_file_lock_res_init(&fp->fp_flock, fp);
78 file->private_data = fp;
79
80 return 0;
81}
82
83static void ocfs2_free_file_private(struct inode *inode, struct file *file)
84{
85 struct ocfs2_file_private *fp = file->private_data;
86 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
87
88 if (fp) {
89 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
90 ocfs2_lock_res_free(&fp->fp_flock);
91 kfree(fp);
92 file->private_data = NULL;
93 }
94}
95
66static int ocfs2_file_open(struct inode *inode, struct file *file) 96static int ocfs2_file_open(struct inode *inode, struct file *file)
67{ 97{
68 int status; 98 int status;
@@ -89,7 +119,18 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
89 119
90 oi->ip_open_count++; 120 oi->ip_open_count++;
91 spin_unlock(&oi->ip_lock); 121 spin_unlock(&oi->ip_lock);
92 status = 0; 122
123 status = ocfs2_init_file_private(inode, file);
124 if (status) {
125 /*
126 * We want to set open count back if we're failing the
127 * open.
128 */
129 spin_lock(&oi->ip_lock);
130 oi->ip_open_count--;
131 spin_unlock(&oi->ip_lock);
132 }
133
93leave: 134leave:
94 mlog_exit(status); 135 mlog_exit(status);
95 return status; 136 return status;
@@ -108,11 +149,24 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
108 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
109 spin_unlock(&oi->ip_lock); 150 spin_unlock(&oi->ip_lock);
110 151
152 ocfs2_free_file_private(inode, file);
153
111 mlog_exit(0); 154 mlog_exit(0);
112 155
113 return 0; 156 return 0;
114} 157}
115 158
159static int ocfs2_dir_open(struct inode *inode, struct file *file)
160{
161 return ocfs2_init_file_private(inode, file);
162}
163
164static int ocfs2_dir_release(struct inode *inode, struct file *file)
165{
166 ocfs2_free_file_private(inode, file);
167 return 0;
168}
169
116static int ocfs2_sync_file(struct file *file, 170static int ocfs2_sync_file(struct file *file,
117 struct dentry *dentry, 171 struct dentry *dentry,
118 int datasync) 172 int datasync)
@@ -382,18 +436,13 @@ static int ocfs2_truncate_file(struct inode *inode,
382 436
383 down_write(&OCFS2_I(inode)->ip_alloc_sem); 437 down_write(&OCFS2_I(inode)->ip_alloc_sem);
384 438
385 /* This forces other nodes to sync and drop their pages. Do 439 /*
386 * this even if we have a truncate without allocation change - 440 * The inode lock forced other nodes to sync and drop their
387 * ocfs2 cluster sizes can be much greater than page size, so 441 * pages, which (correctly) happens even if we have a truncate
388 * we have to truncate them anyway. */ 442 * without allocation change - ocfs2 cluster sizes can be much
389 status = ocfs2_data_lock(inode, 1); 443 * greater than page size, so we have to truncate them
390 if (status < 0) { 444 * anyway.
391 up_write(&OCFS2_I(inode)->ip_alloc_sem); 445 */
392
393 mlog_errno(status);
394 goto bail;
395 }
396
397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 446 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
398 truncate_inode_pages(inode->i_mapping, new_i_size); 447 truncate_inode_pages(inode->i_mapping, new_i_size);
399 448
@@ -403,7 +452,7 @@ static int ocfs2_truncate_file(struct inode *inode,
403 if (status) 452 if (status)
404 mlog_errno(status); 453 mlog_errno(status);
405 454
406 goto bail_unlock_data; 455 goto bail_unlock_sem;
407 } 456 }
408 457
409 /* alright, we're going to need to do a full blown alloc size 458 /* alright, we're going to need to do a full blown alloc size
@@ -413,25 +462,23 @@ static int ocfs2_truncate_file(struct inode *inode,
413 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 462 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
414 if (status < 0) { 463 if (status < 0) {
415 mlog_errno(status); 464 mlog_errno(status);
416 goto bail_unlock_data; 465 goto bail_unlock_sem;
417 } 466 }
418 467
419 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 468 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
420 if (status < 0) { 469 if (status < 0) {
421 mlog_errno(status); 470 mlog_errno(status);
422 goto bail_unlock_data; 471 goto bail_unlock_sem;
423 } 472 }
424 473
425 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 474 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
426 if (status < 0) { 475 if (status < 0) {
427 mlog_errno(status); 476 mlog_errno(status);
428 goto bail_unlock_data; 477 goto bail_unlock_sem;
429 } 478 }
430 479
431 /* TODO: orphan dir cleanup here. */ 480 /* TODO: orphan dir cleanup here. */
432bail_unlock_data: 481bail_unlock_sem:
433 ocfs2_data_unlock(inode, 1);
434
435 up_write(&OCFS2_I(inode)->ip_alloc_sem); 482 up_write(&OCFS2_I(inode)->ip_alloc_sem);
436 483
437bail: 484bail:
@@ -579,7 +626,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
579 626
580 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
581 "clusters_to_add = %u, extents_to_split = %u\n", 628 "clusters_to_add = %u, extents_to_split = %u\n",
582 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
583 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); 630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
584 631
585 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
@@ -760,7 +807,7 @@ restarted_transaction:
760 le32_to_cpu(fe->i_clusters), 807 le32_to_cpu(fe->i_clusters),
761 (unsigned long long)le64_to_cpu(fe->i_size)); 808 (unsigned long long)le64_to_cpu(fe->i_size));
762 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 809 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
763 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 810 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
764 811
765leave: 812leave:
766 if (handle) { 813 if (handle) {
@@ -917,7 +964,7 @@ static int ocfs2_extend_file(struct inode *inode,
917 struct buffer_head *di_bh, 964 struct buffer_head *di_bh,
918 u64 new_i_size) 965 u64 new_i_size)
919{ 966{
920 int ret = 0, data_locked = 0; 967 int ret = 0;
921 struct ocfs2_inode_info *oi = OCFS2_I(inode); 968 struct ocfs2_inode_info *oi = OCFS2_I(inode);
922 969
923 BUG_ON(!di_bh); 970 BUG_ON(!di_bh);
@@ -943,20 +990,6 @@ static int ocfs2_extend_file(struct inode *inode,
943 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 990 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
944 goto out_update_size; 991 goto out_update_size;
945 992
946 /*
947 * protect the pages that ocfs2_zero_extend is going to be
948 * pulling into the page cache.. we do this before the
949 * metadata extend so that we don't get into the situation
950 * where we've extended the metadata but can't get the data
951 * lock to zero.
952 */
953 ret = ocfs2_data_lock(inode, 1);
954 if (ret < 0) {
955 mlog_errno(ret);
956 goto out;
957 }
958 data_locked = 1;
959
960 /* 993 /*
961 * The alloc sem blocks people in read/write from reading our 994 * The alloc sem blocks people in read/write from reading our
962 * allocation until we're done changing it. We depend on 995 * allocation until we're done changing it. We depend on
@@ -980,7 +1013,7 @@ static int ocfs2_extend_file(struct inode *inode,
980 up_write(&oi->ip_alloc_sem); 1013 up_write(&oi->ip_alloc_sem);
981 1014
982 mlog_errno(ret); 1015 mlog_errno(ret);
983 goto out_unlock; 1016 goto out;
984 } 1017 }
985 } 1018 }
986 1019
@@ -991,7 +1024,7 @@ static int ocfs2_extend_file(struct inode *inode,
991 1024
992 if (ret < 0) { 1025 if (ret < 0) {
993 mlog_errno(ret); 1026 mlog_errno(ret);
994 goto out_unlock; 1027 goto out;
995 } 1028 }
996 1029
997out_update_size: 1030out_update_size:
@@ -999,10 +1032,6 @@ out_update_size:
999 if (ret < 0) 1032 if (ret < 0)
1000 mlog_errno(ret); 1033 mlog_errno(ret);
1001 1034
1002out_unlock:
1003 if (data_locked)
1004 ocfs2_data_unlock(inode, 1);
1005
1006out: 1035out:
1007 return ret; 1036 return ret;
1008} 1037}
@@ -1050,7 +1079,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1050 } 1079 }
1051 } 1080 }
1052 1081
1053 status = ocfs2_meta_lock(inode, &bh, 1); 1082 status = ocfs2_inode_lock(inode, &bh, 1);
1054 if (status < 0) { 1083 if (status < 0) {
1055 if (status != -ENOENT) 1084 if (status != -ENOENT)
1056 mlog_errno(status); 1085 mlog_errno(status);
@@ -1102,7 +1131,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1102bail_commit: 1131bail_commit:
1103 ocfs2_commit_trans(osb, handle); 1132 ocfs2_commit_trans(osb, handle);
1104bail_unlock: 1133bail_unlock:
1105 ocfs2_meta_unlock(inode, 1); 1134 ocfs2_inode_unlock(inode, 1);
1106bail_unlock_rw: 1135bail_unlock_rw:
1107 if (size_change) 1136 if (size_change)
1108 ocfs2_rw_unlock(inode, 1); 1137 ocfs2_rw_unlock(inode, 1);
@@ -1149,7 +1178,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1149 1178
1150 mlog_entry_void(); 1179 mlog_entry_void();
1151 1180
1152 ret = ocfs2_meta_lock(inode, NULL, 0); 1181 ret = ocfs2_inode_lock(inode, NULL, 0);
1153 if (ret) { 1182 if (ret) {
1154 if (ret != -ENOENT) 1183 if (ret != -ENOENT)
1155 mlog_errno(ret); 1184 mlog_errno(ret);
@@ -1158,7 +1187,7 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1158 1187
1159 ret = generic_permission(inode, mask, NULL); 1188 ret = generic_permission(inode, mask, NULL);
1160 1189
1161 ocfs2_meta_unlock(inode, 0); 1190 ocfs2_inode_unlock(inode, 0);
1162out: 1191out:
1163 mlog_exit(ret); 1192 mlog_exit(ret);
1164 return ret; 1193 return ret;
@@ -1630,7 +1659,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1630 goto out; 1659 goto out;
1631 } 1660 }
1632 1661
1633 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1662 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1634 if (ret) { 1663 if (ret) {
1635 mlog_errno(ret); 1664 mlog_errno(ret);
1636 goto out_rw_unlock; 1665 goto out_rw_unlock;
@@ -1638,7 +1667,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1638 1667
1639 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1668 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1640 ret = -EPERM; 1669 ret = -EPERM;
1641 goto out_meta_unlock; 1670 goto out_inode_unlock;
1642 } 1671 }
1643 1672
1644 switch (sr->l_whence) { 1673 switch (sr->l_whence) {
@@ -1652,7 +1681,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1652 break; 1681 break;
1653 default: 1682 default:
1654 ret = -EINVAL; 1683 ret = -EINVAL;
1655 goto out_meta_unlock; 1684 goto out_inode_unlock;
1656 } 1685 }
1657 sr->l_whence = 0; 1686 sr->l_whence = 0;
1658 1687
@@ -1663,14 +1692,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1663 || (sr->l_start + llen) < 0 1692 || (sr->l_start + llen) < 0
1664 || (sr->l_start + llen) > max_off) { 1693 || (sr->l_start + llen) > max_off) {
1665 ret = -EINVAL; 1694 ret = -EINVAL;
1666 goto out_meta_unlock; 1695 goto out_inode_unlock;
1667 } 1696 }
1668 size = sr->l_start + sr->l_len; 1697 size = sr->l_start + sr->l_len;
1669 1698
1670 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1699 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1671 if (sr->l_len <= 0) { 1700 if (sr->l_len <= 0) {
1672 ret = -EINVAL; 1701 ret = -EINVAL;
1673 goto out_meta_unlock; 1702 goto out_inode_unlock;
1674 } 1703 }
1675 } 1704 }
1676 1705
@@ -1678,7 +1707,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1678 ret = __ocfs2_write_remove_suid(inode, di_bh); 1707 ret = __ocfs2_write_remove_suid(inode, di_bh);
1679 if (ret) { 1708 if (ret) {
1680 mlog_errno(ret); 1709 mlog_errno(ret);
1681 goto out_meta_unlock; 1710 goto out_inode_unlock;
1682 } 1711 }
1683 } 1712 }
1684 1713
@@ -1704,7 +1733,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1704 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1733 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1705 if (ret) { 1734 if (ret) {
1706 mlog_errno(ret); 1735 mlog_errno(ret);
1707 goto out_meta_unlock; 1736 goto out_inode_unlock;
1708 } 1737 }
1709 1738
1710 /* 1739 /*
@@ -1714,7 +1743,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1714 if (IS_ERR(handle)) { 1743 if (IS_ERR(handle)) {
1715 ret = PTR_ERR(handle); 1744 ret = PTR_ERR(handle);
1716 mlog_errno(ret); 1745 mlog_errno(ret);
1717 goto out_meta_unlock; 1746 goto out_inode_unlock;
1718 } 1747 }
1719 1748
1720 if (change_size && i_size_read(inode) < size) 1749 if (change_size && i_size_read(inode) < size)
@@ -1727,9 +1756,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1727 1756
1728 ocfs2_commit_trans(osb, handle); 1757 ocfs2_commit_trans(osb, handle);
1729 1758
1730out_meta_unlock: 1759out_inode_unlock:
1731 brelse(di_bh); 1760 brelse(di_bh);
1732 ocfs2_meta_unlock(inode, 1); 1761 ocfs2_inode_unlock(inode, 1);
1733out_rw_unlock: 1762out_rw_unlock:
1734 ocfs2_rw_unlock(inode, 1); 1763 ocfs2_rw_unlock(inode, 1);
1735 1764
@@ -1799,7 +1828,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1799 * if we need to make modifications here. 1828 * if we need to make modifications here.
1800 */ 1829 */
1801 for(;;) { 1830 for(;;) {
1802 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1831 ret = ocfs2_inode_lock(inode, NULL, meta_level);
1803 if (ret < 0) { 1832 if (ret < 0) {
1804 meta_level = -1; 1833 meta_level = -1;
1805 mlog_errno(ret); 1834 mlog_errno(ret);
@@ -1817,7 +1846,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1817 * set inode->i_size at the end of a write. */ 1846 * set inode->i_size at the end of a write. */
1818 if (should_remove_suid(dentry)) { 1847 if (should_remove_suid(dentry)) {
1819 if (meta_level == 0) { 1848 if (meta_level == 0) {
1820 ocfs2_meta_unlock(inode, meta_level); 1849 ocfs2_inode_unlock(inode, meta_level);
1821 meta_level = 1; 1850 meta_level = 1;
1822 continue; 1851 continue;
1823 } 1852 }
@@ -1886,7 +1915,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1886 *ppos = saved_pos; 1915 *ppos = saved_pos;
1887 1916
1888out_unlock: 1917out_unlock:
1889 ocfs2_meta_unlock(inode, meta_level); 1918 ocfs2_inode_unlock(inode, meta_level);
1890 1919
1891out: 1920out:
1892 return ret; 1921 return ret;
@@ -2099,12 +2128,12 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2099 /* 2128 /*
2100 * See the comment in ocfs2_file_aio_read() 2129 * See the comment in ocfs2_file_aio_read()
2101 */ 2130 */
2102 ret = ocfs2_meta_lock(inode, NULL, 0); 2131 ret = ocfs2_inode_lock(inode, NULL, 0);
2103 if (ret < 0) { 2132 if (ret < 0) {
2104 mlog_errno(ret); 2133 mlog_errno(ret);
2105 goto bail; 2134 goto bail;
2106 } 2135 }
2107 ocfs2_meta_unlock(inode, 0); 2136 ocfs2_inode_unlock(inode, 0);
2108 2137
2109 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2138 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2110 2139
@@ -2160,12 +2189,12 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2160 * like i_size. This allows the checks down below 2189 * like i_size. This allows the checks down below
2161 * generic_file_aio_read() a chance of actually working. 2190 * generic_file_aio_read() a chance of actually working.
2162 */ 2191 */
2163 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2192 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2164 if (ret < 0) { 2193 if (ret < 0) {
2165 mlog_errno(ret); 2194 mlog_errno(ret);
2166 goto bail; 2195 goto bail;
2167 } 2196 }
2168 ocfs2_meta_unlock(inode, lock_level); 2197 ocfs2_inode_unlock(inode, lock_level);
2169 2198
2170 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2199 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2171 if (ret == -EINVAL) 2200 if (ret == -EINVAL)
@@ -2204,6 +2233,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2204}; 2233};
2205 2234
2206const struct file_operations ocfs2_fops = { 2235const struct file_operations ocfs2_fops = {
2236 .llseek = generic_file_llseek,
2207 .read = do_sync_read, 2237 .read = do_sync_read,
2208 .write = do_sync_write, 2238 .write = do_sync_write,
2209 .mmap = ocfs2_mmap, 2239 .mmap = ocfs2_mmap,
@@ -2216,16 +2246,21 @@ const struct file_operations ocfs2_fops = {
2216#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2217 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2218#endif 2248#endif
2249 .flock = ocfs2_flock,
2219 .splice_read = ocfs2_file_splice_read, 2250 .splice_read = ocfs2_file_splice_read,
2220 .splice_write = ocfs2_file_splice_write, 2251 .splice_write = ocfs2_file_splice_write,
2221}; 2252};
2222 2253
2223const struct file_operations ocfs2_dops = { 2254const struct file_operations ocfs2_dops = {
2255 .llseek = generic_file_llseek,
2224 .read = generic_read_dir, 2256 .read = generic_read_dir,
2225 .readdir = ocfs2_readdir, 2257 .readdir = ocfs2_readdir,
2226 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open,
2227 .ioctl = ocfs2_ioctl, 2261 .ioctl = ocfs2_ioctl,
2228#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2229 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2230#endif 2264#endif
2265 .flock = ocfs2_flock,
2231}; 2266};
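The new .flock hook points at ocfs2_flock(), which this diff does not show (it lives in the new locks.c, backed by the per-open fp_flock lock resource set up in ocfs2_init_file_private() above). As a rough, hedged sketch only of how the ocfs2_file_lock()/ocfs2_file_unlock() calls declared in dlmglue.h could service it, leaving out the local flock bookkeeping the real code must also do:

static int example_flock(struct file *file, int cmd, struct file_lock *fl)
{
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;         /* only flock(2)-style locks here */

        if (fl->fl_type == F_UNLCK) {
                ocfs2_file_unlock(file);        /* drop the cluster-wide lock */
                return 0;
        }

        /* exclusive for LOCK_EX; trylock unless the caller is willing to wait */
        return ocfs2_file_lock(file, fl->fl_type == F_WRLCK, !IS_SETLKW(cmd));
}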
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 066f14add3a..048ddcaf5c8 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 32extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 33struct ocfs2_alloc_context;
34 34
35struct ocfs2_file_private {
36 struct file *fp_file;
37 struct mutex fp_mutex;
38 struct ocfs2_lock_res fp_flock;
39};
40
35enum ocfs2_alloc_restarted { 41enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0, 42 RESTART_NONE = 0,
37 RESTART_TRANS, 43 RESTART_TRANS,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c4c36171240..c0efd9489fe 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -30,9 +30,6 @@
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h> 31#include <linux/kmod.h>
32 32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h> 33#include <dlm/dlmapi.h>
37 34
38#define MLOG_MASK_PREFIX ML_SUPER 35#define MLOG_MASK_PREFIX ML_SUPER
@@ -44,13 +41,9 @@
44#include "heartbeat.h" 41#include "heartbeat.h"
45#include "inode.h" 42#include "inode.h"
46#include "journal.h" 43#include "journal.h"
47#include "vote.h"
48 44
49#include "buffer_head_io.h" 45#include "buffer_head_io.h"
50 46
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 47static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit); 48 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
@@ -64,9 +57,7 @@ static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
64void ocfs2_init_node_maps(struct ocfs2_super *osb) 57void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{ 58{
66 spin_lock_init(&osb->node_map_lock); 59 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map); 60 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
71} 62}
72 63
@@ -87,24 +78,7 @@ static void ocfs2_do_node_down(int node_num,
87 return; 78 return;
88 } 79 }
89 80
90 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
91 /* If a node is in the umount map, then we've been
92 * expecting him to go down and we know ahead of time
93 * that recovery is not necessary. */
94 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
95 return;
96 }
97
98 ocfs2_recovery_thread(osb, node_num); 81 ocfs2_recovery_thread(osb, node_num);
99
100 ocfs2_remove_node_from_vote_queues(osb, node_num);
101}
102
103static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
104 int node_num,
105 void *data)
106{
107 ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
108} 82}
109 83
110/* Called from the dlm when it's about to evict a node. We may also 84/* Called from the dlm when it's about to evict a node. We may also
@@ -121,27 +95,8 @@ static void ocfs2_dlm_eviction_cb(int node_num,
121 ocfs2_do_node_down(node_num, osb); 95 ocfs2_do_node_down(node_num, osb);
122} 96}
123 97
124static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
125 int node_num,
126 void *data)
127{
128 struct ocfs2_super *osb = data;
129
130 BUG_ON(osb->node_num == node_num);
131
132 mlog(0, "node up event for %d\n", node_num);
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) 98void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
137{ 99{
138 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
139 ocfs2_hb_node_down_cb, osb,
140 OCFS2_HB_NODE_DOWN_PRI);
141
142 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
143 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
144
145 /* Not exactly a heartbeat callback, but leads to essentially 100 /* Not exactly a heartbeat callback, but leads to essentially
146 * the same path so we set it up here. */ 101 * the same path so we set it up here. */
147 dlm_setup_eviction_cb(&osb->osb_eviction_cb, 102 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
@@ -149,39 +104,6 @@ void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
149 osb); 104 osb);
150} 105}
151 106
152/* Most functions here are just stubs for now... */
153int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
154{
155 int status;
156
157 if (ocfs2_mount_local(osb))
158 return 0;
159
160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) {
162 mlog_errno(status);
163 goto bail;
164 }
165
166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) {
168 mlog_errno(status);
169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 }
171
172bail:
173 return status;
174}
175
176void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
177{
178 if (ocfs2_mount_local(osb))
179 return;
180
181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183}
184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 107void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
186{ 108{
187 int ret; 109 int ret;
@@ -341,8 +263,6 @@ int ocfs2_recovery_map_set(struct ocfs2_super *osb,
341 263
342 spin_lock(&osb->node_map_lock); 264 spin_lock(&osb->node_map_lock);
343 265
344 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
345
346 if (!test_bit(num, osb->recovery_map.map)) { 266 if (!test_bit(num, osb->recovery_map.map)) {
347 __ocfs2_node_map_set_bit(&osb->recovery_map, num); 267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
348 set = 1; 268 set = 1;
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e..56859211888 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -29,8 +29,6 @@
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb); 32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35 33
36/* node map functions - used to keep track of mounted and in-recovery 34/* node map functions - used to keep track of mounted and in-recovery
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ebb2bbe30f3..7e9e4c79aec 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,7 +49,6 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "vote.h"
53 52
54#include "buffer_head_io.h" 53#include "buffer_head_io.h"
55 54
@@ -58,8 +57,11 @@ struct ocfs2_find_inode_args
58 u64 fi_blkno; 57 u64 fi_blkno;
59 unsigned long fi_ino; 58 unsigned long fi_ino;
60 unsigned int fi_flags; 59 unsigned int fi_flags;
60 unsigned int fi_sysfile_type;
61}; 61};
62 62
63static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
64
63static int ocfs2_read_locked_inode(struct inode *inode, 65static int ocfs2_read_locked_inode(struct inode *inode,
64 struct ocfs2_find_inode_args *args); 66 struct ocfs2_find_inode_args *args);
65static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -107,7 +109,8 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
107 oi->ip_attr |= OCFS2_DIRSYNC_FL; 109 oi->ip_attr |= OCFS2_DIRSYNC_FL;
108} 110}
109 111
110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) 112struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
113 int sysfile_type)
111{ 114{
112 struct inode *inode = NULL; 115 struct inode *inode = NULL;
113 struct super_block *sb = osb->sb; 116 struct super_block *sb = osb->sb;
@@ -127,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
127 args.fi_blkno = blkno; 130 args.fi_blkno = blkno;
128 args.fi_flags = flags; 131 args.fi_flags = flags;
129 args.fi_ino = ino_from_blkno(sb, blkno); 132 args.fi_ino = ino_from_blkno(sb, blkno);
133 args.fi_sysfile_type = sysfile_type;
130 134
131 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 135 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
132 ocfs2_init_locked_inode, &args); 136 ocfs2_init_locked_inode, &args);
@@ -201,6 +205,9 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
201 205
202 inode->i_ino = args->fi_ino; 206 inode->i_ino = args->fi_ino;
203 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 207 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
208 if (args->fi_sysfile_type != 0)
209 lockdep_set_class(&inode->i_mutex,
210 &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
204 211
205 mlog_exit(0); 212 mlog_exit(0);
206 return 0; 213 return 0;
@@ -322,7 +329,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
322 */ 329 */
323 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); 330 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
324 331
325 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 332 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
326 OCFS2_LOCK_TYPE_META, 0, inode); 333 OCFS2_LOCK_TYPE_META, 0, inode);
327 334
328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 335 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
@@ -333,10 +340,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
333 OCFS2_LOCK_TYPE_RW, inode->i_generation, 340 OCFS2_LOCK_TYPE_RW, inode->i_generation,
334 inode); 341 inode);
335 342
336 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
337 OCFS2_LOCK_TYPE_DATA, inode->i_generation,
338 inode);
339
340 ocfs2_set_inode_flags(inode); 343 ocfs2_set_inode_flags(inode);
341 344
342 status = 0; 345 status = 0;
@@ -414,7 +417,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
414 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 417 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
415 generation = osb->fs_generation; 418 generation = osb->fs_generation;
416 419
417 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 420 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
418 OCFS2_LOCK_TYPE_META, 421 OCFS2_LOCK_TYPE_META,
419 generation, inode); 422 generation, inode);
420 423
@@ -429,7 +432,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
429 mlog_errno(status); 432 mlog_errno(status);
430 return status; 433 return status;
431 } 434 }
432 status = ocfs2_meta_lock(inode, NULL, 0); 435 status = ocfs2_inode_lock(inode, NULL, 0);
433 if (status) { 436 if (status) {
434 make_bad_inode(inode); 437 make_bad_inode(inode);
435 mlog_errno(status); 438 mlog_errno(status);
@@ -484,7 +487,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
484 487
485bail: 488bail:
486 if (can_lock) 489 if (can_lock)
487 ocfs2_meta_unlock(inode, 0); 490 ocfs2_inode_unlock(inode, 0);
488 491
489 if (status < 0) 492 if (status < 0)
490 make_bad_inode(inode); 493 make_bad_inode(inode);
@@ -586,7 +589,7 @@ static int ocfs2_remove_inode(struct inode *inode,
586 } 589 }
587 590
588 mutex_lock(&inode_alloc_inode->i_mutex); 591 mutex_lock(&inode_alloc_inode->i_mutex);
589 status = ocfs2_meta_lock(inode_alloc_inode, &inode_alloc_bh, 1); 592 status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
590 if (status < 0) { 593 if (status < 0) {
591 mutex_unlock(&inode_alloc_inode->i_mutex); 594 mutex_unlock(&inode_alloc_inode->i_mutex);
592 595
@@ -617,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
617 } 620 }
618 621
619 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 622 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
620 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 623 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
621 624
622 status = ocfs2_journal_dirty(handle, di_bh); 625 status = ocfs2_journal_dirty(handle, di_bh);
623 if (status < 0) { 626 if (status < 0) {
@@ -635,7 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
635bail_commit: 638bail_commit:
636 ocfs2_commit_trans(osb, handle); 639 ocfs2_commit_trans(osb, handle);
637bail_unlock: 640bail_unlock:
638 ocfs2_meta_unlock(inode_alloc_inode, 1); 641 ocfs2_inode_unlock(inode_alloc_inode, 1);
639 mutex_unlock(&inode_alloc_inode->i_mutex); 642 mutex_unlock(&inode_alloc_inode->i_mutex);
640 brelse(inode_alloc_bh); 643 brelse(inode_alloc_bh);
641bail: 644bail:
@@ -709,7 +712,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
709 * delete_inode operation. We do this now to avoid races with 712 * delete_inode operation. We do this now to avoid races with
710 * recovery completion on other nodes. */ 713 * recovery completion on other nodes. */
711 mutex_lock(&orphan_dir_inode->i_mutex); 714 mutex_lock(&orphan_dir_inode->i_mutex);
712 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 715 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
713 if (status < 0) { 716 if (status < 0) {
714 mutex_unlock(&orphan_dir_inode->i_mutex); 717 mutex_unlock(&orphan_dir_inode->i_mutex);
715 718
@@ -718,8 +721,8 @@ static int ocfs2_wipe_inode(struct inode *inode,
718 } 721 }
719 722
720 /* we do this while holding the orphan dir lock because we 723 /* we do this while holding the orphan dir lock because we
721 * don't want recovery being run from another node to vote for 724 * don't want recovery being run from another node to try an
722 * an inode delete on us -- this will result in two nodes 725 * inode delete underneath us -- this will result in two nodes
723 * truncating the same file! */ 726 * truncating the same file! */
724 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 727 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
725 if (status < 0) { 728 if (status < 0) {
@@ -733,7 +736,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
733 mlog_errno(status); 736 mlog_errno(status);
734 737
735bail_unlock_dir: 738bail_unlock_dir:
736 ocfs2_meta_unlock(orphan_dir_inode, 1); 739 ocfs2_inode_unlock(orphan_dir_inode, 1);
737 mutex_unlock(&orphan_dir_inode->i_mutex); 740 mutex_unlock(&orphan_dir_inode->i_mutex);
738 brelse(orphan_dir_bh); 741 brelse(orphan_dir_bh);
739bail: 742bail:
@@ -744,7 +747,7 @@ bail:
744} 747}
745 748
746/* There is a series of simple checks that should be done before a 749/* There is a series of simple checks that should be done before a
747 * vote is even considered. Encapsulate those in this function. */ 750 * trylock is even considered. Encapsulate those in this function. */
748static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 751static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
749{ 752{
750 int ret = 0; 753 int ret = 0;
@@ -758,14 +761,14 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
758 goto bail; 761 goto bail;
759 } 762 }
760 763
761 /* If we're coming from process_vote we can't go into our own 764 /* If we're coming from downconvert_thread we can't go into our own
 762 * voting [hello, deadlock city!], so unfortunately we just 765 * voting [hello, deadlock city!], so unfortunately we just
763 * have to skip deleting this guy. That's OK though because 766 * have to skip deleting this guy. That's OK though because
764 * the node who's doing the actual deleting should handle it 767 * the node who's doing the actual deleting should handle it
765 * anyway. */ 768 * anyway. */
766 if (current == osb->vote_task) { 769 if (current == osb->dc_task) {
767 mlog(0, "Skipping delete of %lu because we're currently " 770 mlog(0, "Skipping delete of %lu because we're currently "
768 "in process_vote\n", inode->i_ino); 771 "in downconvert\n", inode->i_ino);
769 goto bail; 772 goto bail;
770 } 773 }
771 774
@@ -779,10 +782,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
779 goto bail_unlock; 782 goto bail_unlock;
780 } 783 }
781 784
 782 /* If we have voted "yes" on the wipe of this inode for 785 /* If we have allowed a wipe of this inode for another node, it
783 * another node, it will be marked here so we can safely skip 786 * will be marked here so we can safely skip it. Recovery will
 784 * it. Recovery will cleanup any inodes we might inadvertently 787 * cleanup any inodes we might inadvertently skip here. */
785 * skip here. */
786 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 788 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
787 mlog(0, "Skipping delete of %lu because another node " 789 mlog(0, "Skipping delete of %lu because another node "
788 "has done this for us.\n", inode->i_ino); 790 "has done this for us.\n", inode->i_ino);
@@ -929,13 +931,13 @@ void ocfs2_delete_inode(struct inode *inode)
929 931
930 /* Lock down the inode. This gives us an up to date view of 932 /* Lock down the inode. This gives us an up to date view of
 931 * its metadata (for verification), and allows us to 933 * its metadata (for verification), and allows us to
932 * serialize delete_inode votes. 934 * serialize delete_inode on multiple nodes.
933 * 935 *
934 * Even though we might be doing a truncate, we don't take the 936 * Even though we might be doing a truncate, we don't take the
935 * allocation lock here as it won't be needed - nobody will 937 * allocation lock here as it won't be needed - nobody will
936 * have the file open. 938 * have the file open.
937 */ 939 */
938 status = ocfs2_meta_lock(inode, &di_bh, 1); 940 status = ocfs2_inode_lock(inode, &di_bh, 1);
939 if (status < 0) { 941 if (status < 0) {
940 if (status != -ENOENT) 942 if (status != -ENOENT)
941 mlog_errno(status); 943 mlog_errno(status);
@@ -947,15 +949,15 @@ void ocfs2_delete_inode(struct inode *inode)
947 * before we go ahead and wipe the inode. */ 949 * before we go ahead and wipe the inode. */
948 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 950 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
949 if (!wipe || status < 0) { 951 if (!wipe || status < 0) {
950 /* Error and inode busy vote both mean we won't be 952 /* Error and remote inode busy both mean we won't be
951 * removing the inode, so they take almost the same 953 * removing the inode, so they take almost the same
952 * path. */ 954 * path. */
953 if (status < 0) 955 if (status < 0)
954 mlog_errno(status); 956 mlog_errno(status);
955 957
956 /* Someone in the cluster has voted to not wipe this 958 /* Someone in the cluster has disallowed a wipe of
957 * inode, or it was never completely orphaned. Write 959 * this inode, or it was never completely
958 * out the pages and exit now. */ 960 * orphaned. Write out the pages and exit now. */
959 ocfs2_cleanup_delete_inode(inode, 1); 961 ocfs2_cleanup_delete_inode(inode, 1);
960 goto bail_unlock_inode; 962 goto bail_unlock_inode;
961 } 963 }
@@ -981,7 +983,7 @@ void ocfs2_delete_inode(struct inode *inode)
981 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 983 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
982 984
983bail_unlock_inode: 985bail_unlock_inode:
984 ocfs2_meta_unlock(inode, 1); 986 ocfs2_inode_unlock(inode, 1);
985 brelse(di_bh); 987 brelse(di_bh);
986bail_unblock: 988bail_unblock:
987 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 989 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
@@ -1008,15 +1010,14 @@ void ocfs2_clear_inode(struct inode *inode)
1008 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1010 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1009 "Inode=%lu\n", inode->i_ino); 1011 "Inode=%lu\n", inode->i_ino);
1010 1012
 1011 /* For remove delete_inode vote, we hold open lock before, 1013 /* To prevent remote deletes we hold the open lock before; now it
1012 * now it is time to unlock PR and EX open locks. */ 1014 * is time to unlock PR and EX open locks. */
1013 ocfs2_open_unlock(inode); 1015 ocfs2_open_unlock(inode);
1014 1016
1015 /* Do these before all the other work so that we don't bounce 1017 /* Do these before all the other work so that we don't bounce
1016 * the vote thread while waiting to destroy the locks. */ 1018 * the downconvert thread while waiting to destroy the locks. */
1017 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 1019 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
1018 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 1020 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1019 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
1020 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1021 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1021 1022
1022 /* We very well may get a clear_inode before all of an inode's 1023 /* We very well may get a clear_inode before all of an inode's
@@ -1039,8 +1040,7 @@ void ocfs2_clear_inode(struct inode *inode)
1039 mlog_errno(status); 1040 mlog_errno(status);
1040 1041
1041 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1042 ocfs2_lock_res_free(&oi->ip_rw_lockres);
1042 ocfs2_lock_res_free(&oi->ip_meta_lockres); 1043 ocfs2_lock_res_free(&oi->ip_inode_lockres);
1043 ocfs2_lock_res_free(&oi->ip_data_lockres);
1044 ocfs2_lock_res_free(&oi->ip_open_lockres); 1044 ocfs2_lock_res_free(&oi->ip_open_lockres);
1045 1045
1046 ocfs2_metadata_cache_purge(inode); 1046 ocfs2_metadata_cache_purge(inode);
@@ -1184,15 +1184,15 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1184 } 1184 }
1185 spin_unlock(&OCFS2_I(inode)->ip_lock); 1185 spin_unlock(&OCFS2_I(inode)->ip_lock);
1186 1186
1187 /* Let ocfs2_meta_lock do the work of updating our struct 1187 /* Let ocfs2_inode_lock do the work of updating our struct
1188 * inode for us. */ 1188 * inode for us. */
1189 status = ocfs2_meta_lock(inode, NULL, 0); 1189 status = ocfs2_inode_lock(inode, NULL, 0);
1190 if (status < 0) { 1190 if (status < 0) {
1191 if (status != -ENOENT) 1191 if (status != -ENOENT)
1192 mlog_errno(status); 1192 mlog_errno(status);
1193 goto bail; 1193 goto bail;
1194 } 1194 }
1195 ocfs2_meta_unlock(inode, 0); 1195 ocfs2_inode_unlock(inode, 0);
1196bail: 1196bail:
1197 mlog_exit(status); 1197 mlog_exit(status);
1198 1198
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 70e881c5553..390a85596aa 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -34,8 +34,7 @@ struct ocfs2_inode_info
34 u64 ip_blkno; 34 u64 ip_blkno;
35 35
36 struct ocfs2_lock_res ip_rw_lockres; 36 struct ocfs2_lock_res ip_rw_lockres;
37 struct ocfs2_lock_res ip_meta_lockres; 37 struct ocfs2_lock_res ip_inode_lockres;
38 struct ocfs2_lock_res ip_data_lockres;
39 struct ocfs2_lock_res ip_open_lockres; 38 struct ocfs2_lock_res ip_open_lockres;
40 39
41 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
@@ -121,9 +120,10 @@ void ocfs2_delete_inode(struct inode *inode);
121void ocfs2_drop_inode(struct inode *inode); 120void ocfs2_drop_inode(struct inode *inode);
122 121
123/* Flags for ocfs2_iget() */ 122/* Flags for ocfs2_iget() */
124#define OCFS2_FI_FLAG_SYSFILE 0x4 123#define OCFS2_FI_FLAG_SYSFILE 0x1
125#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 124#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
126struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); 125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
126 int sysfile_type);
127int ocfs2_inode_init_private(struct inode *inode); 127int ocfs2_inode_init_private(struct inode *inode);
128int ocfs2_inode_revalidate(struct dentry *dentry); 128int ocfs2_inode_revalidate(struct dentry *dentry);
129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 129int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 87dcece7e1b..5177fba5162 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -20,6 +20,7 @@
20 20
21#include "ocfs2_fs.h" 21#include "ocfs2_fs.h"
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h"
23 24
24#include <linux/ext2_fs.h> 25#include <linux/ext2_fs.h>
25 26
@@ -27,14 +28,14 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
27{ 28{
28 int status; 29 int status;
29 30
30 status = ocfs2_meta_lock(inode, NULL, 0); 31 status = ocfs2_inode_lock(inode, NULL, 0);
31 if (status < 0) { 32 if (status < 0) {
32 mlog_errno(status); 33 mlog_errno(status);
33 return status; 34 return status;
34 } 35 }
35 ocfs2_get_inode_flags(OCFS2_I(inode)); 36 ocfs2_get_inode_flags(OCFS2_I(inode));
36 *flags = OCFS2_I(inode)->ip_attr; 37 *flags = OCFS2_I(inode)->ip_attr;
37 ocfs2_meta_unlock(inode, 0); 38 ocfs2_inode_unlock(inode, 0);
38 39
39 mlog_exit(status); 40 mlog_exit(status);
40 return status; 41 return status;
@@ -52,7 +53,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
52 53
53 mutex_lock(&inode->i_mutex); 54 mutex_lock(&inode->i_mutex);
54 55
55 status = ocfs2_meta_lock(inode, &bh, 1); 56 status = ocfs2_inode_lock(inode, &bh, 1);
56 if (status < 0) { 57 if (status < 0) {
57 mlog_errno(status); 58 mlog_errno(status);
58 goto bail; 59 goto bail;
@@ -100,7 +101,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
100 101
101 ocfs2_commit_trans(osb, handle); 102 ocfs2_commit_trans(osb, handle);
102bail_unlock: 103bail_unlock:
103 ocfs2_meta_unlock(inode, 1); 104 ocfs2_inode_unlock(inode, 1);
104bail: 105bail:
105 mutex_unlock(&inode->i_mutex); 106 mutex_unlock(&inode->i_mutex);
106 107
@@ -115,8 +116,10 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115 unsigned int cmd, unsigned long arg) 116 unsigned int cmd, unsigned long arg)
116{ 117{
117 unsigned int flags; 118 unsigned int flags;
119 int new_clusters;
118 int status; 120 int status;
119 struct ocfs2_space_resv sr; 121 struct ocfs2_space_resv sr;
122 struct ocfs2_new_group_input input;
120 123
121 switch (cmd) { 124 switch (cmd) {
122 case OCFS2_IOC_GETFLAGS: 125 case OCFS2_IOC_GETFLAGS:
@@ -140,6 +143,23 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
140 return -EFAULT; 143 return -EFAULT;
141 144
142 return ocfs2_change_file_space(filp, cmd, &sr); 145 return ocfs2_change_file_space(filp, cmd, &sr);
146 case OCFS2_IOC_GROUP_EXTEND:
147 if (!capable(CAP_SYS_RESOURCE))
148 return -EPERM;
149
150 if (get_user(new_clusters, (int __user *)arg))
151 return -EFAULT;
152
153 return ocfs2_group_extend(inode, new_clusters);
154 case OCFS2_IOC_GROUP_ADD:
155 case OCFS2_IOC_GROUP_ADD64:
156 if (!capable(CAP_SYS_RESOURCE))
157 return -EPERM;
158
159 if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
160 return -EFAULT;
161
162 return ocfs2_group_add(inode, &input);
143 default: 163 default:
144 return -ENOTTY; 164 return -ENOTTY;
145 } 165 }
@@ -162,6 +182,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
162 case OCFS2_IOC_RESVSP64: 182 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP: 183 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64: 184 case OCFS2_IOC_UNRESVSP64:
185 case OCFS2_IOC_GROUP_EXTEND:
186 case OCFS2_IOC_GROUP_ADD:
187 case OCFS2_IOC_GROUP_ADD64:
165 break; 188 break;
166 default: 189 default:
167 return -ENOIOCTLCMD; 190 return -ENOIOCTLCMD;
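
The new GROUP_EXTEND/GROUP_ADD cases above take their argument straight from userspace (get_user for the int, copy_from_user for the struct) and require CAP_SYS_RESOURCE. As a hedged illustration only -- the mount point path is an example and a real resize is normally driven by tunefs.ocfs2 -- a userspace call to the simpler of the two ioctls would look roughly like this, with the ioctl number restated from the ocfs2_fs.h hunk later in this patch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)	/* restated from ocfs2_fs.h */

int main(void)
{
	int new_clusters = 1024;			/* clusters to add to the last group */
	int fd = open("/mnt/ocfs2", O_RDONLY);		/* any inode on the mounted volume */

	if (fd < 0 || ioctl(fd, OCFS2_IOC_GROUP_EXTEND, &new_clusters) < 0)
		perror("OCFS2_IOC_GROUP_EXTEND");
	close(fd);
	return 0;
}
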
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8d81f6c1b87..f31c7e8c19c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -44,7 +44,6 @@
44#include "localalloc.h" 44#include "localalloc.h"
45#include "slot_map.h" 45#include "slot_map.h"
46#include "super.h" 46#include "super.h"
47#include "vote.h"
48#include "sysfile.h" 47#include "sysfile.h"
49 48
50#include "buffer_head_io.h" 49#include "buffer_head_io.h"
@@ -103,7 +102,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
103 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 102 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
104 journal->j_trans_id, flushed); 103 journal->j_trans_id, flushed);
105 104
106 ocfs2_kick_vote_thread(osb); 105 ocfs2_wake_downconvert_thread(osb);
107 wake_up(&journal->j_checkpointed); 106 wake_up(&journal->j_checkpointed);
108finally: 107finally:
109 mlog_exit(status); 108 mlog_exit(status);
@@ -314,14 +313,18 @@ int ocfs2_journal_dirty_data(handle_t *handle,
314 return err; 313 return err;
315} 314}
316 315
317#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) 316#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
318 317
319void ocfs2_set_journal_params(struct ocfs2_super *osb) 318void ocfs2_set_journal_params(struct ocfs2_super *osb)
320{ 319{
321 journal_t *journal = osb->journal->j_journal; 320 journal_t *journal = osb->journal->j_journal;
321 unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
322
323 if (osb->osb_commit_interval)
324 commit_interval = osb->osb_commit_interval;
322 325
323 spin_lock(&journal->j_state_lock); 326 spin_lock(&journal->j_state_lock);
324 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; 327 journal->j_commit_interval = commit_interval;
325 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 328 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
326 journal->j_flags |= JFS_BARRIER; 329 journal->j_flags |= JFS_BARRIER;
327 else 330 else
@@ -337,7 +340,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
337 struct ocfs2_dinode *di = NULL; 340 struct ocfs2_dinode *di = NULL;
338 struct buffer_head *bh = NULL; 341 struct buffer_head *bh = NULL;
339 struct ocfs2_super *osb; 342 struct ocfs2_super *osb;
340 int meta_lock = 0; 343 int inode_lock = 0;
341 344
342 mlog_entry_void(); 345 mlog_entry_void();
343 346
@@ -367,14 +370,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
367 /* Skip recovery waits here - journal inode metadata never 370 /* Skip recovery waits here - journal inode metadata never
368 * changes in a live cluster so it can be considered an 371 * changes in a live cluster so it can be considered an
369 * exception to the rule. */ 372 * exception to the rule. */
370 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 373 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
371 if (status < 0) { 374 if (status < 0) {
372 if (status != -ERESTARTSYS) 375 if (status != -ERESTARTSYS)
373 mlog(ML_ERROR, "Could not get lock on journal!\n"); 376 mlog(ML_ERROR, "Could not get lock on journal!\n");
374 goto done; 377 goto done;
375 } 378 }
376 379
377 meta_lock = 1; 380 inode_lock = 1;
378 di = (struct ocfs2_dinode *)bh->b_data; 381 di = (struct ocfs2_dinode *)bh->b_data;
379 382
380 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { 383 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
@@ -414,8 +417,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
414 status = 0; 417 status = 0;
415done: 418done:
416 if (status < 0) { 419 if (status < 0) {
417 if (meta_lock) 420 if (inode_lock)
418 ocfs2_meta_unlock(inode, 1); 421 ocfs2_inode_unlock(inode, 1);
419 if (bh != NULL) 422 if (bh != NULL)
420 brelse(bh); 423 brelse(bh);
421 if (inode) { 424 if (inode) {
@@ -544,7 +547,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
544 OCFS2_I(inode)->ip_open_count--; 547 OCFS2_I(inode)->ip_open_count--;
545 548
546 /* unlock our journal */ 549 /* unlock our journal */
547 ocfs2_meta_unlock(inode, 1); 550 ocfs2_inode_unlock(inode, 1);
548 551
549 brelse(journal->j_bh); 552 brelse(journal->j_bh);
550 journal->j_bh = NULL; 553 journal->j_bh = NULL;
@@ -883,8 +886,8 @@ restart:
883 ocfs2_super_unlock(osb, 1); 886 ocfs2_super_unlock(osb, 1);
884 887
885 /* We always run recovery on our own orphan dir - the dead 888 /* We always run recovery on our own orphan dir - the dead
886 * node(s) may have voted "no" on an inode delete earlier. A 889 * node(s) may have disallowed a previous inode delete. Re-processing
887 * revote is therefore required. */ 890 * is therefore required. */
888 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 891 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
889 NULL); 892 NULL);
890 893
@@ -973,9 +976,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
973 } 976 }
974 SET_INODE_JOURNAL(inode); 977 SET_INODE_JOURNAL(inode);
975 978
976 status = ocfs2_meta_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 979 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
977 if (status < 0) { 980 if (status < 0) {
978 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); 981 mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
979 if (status != -ERESTARTSYS) 982 if (status != -ERESTARTSYS)
980 mlog(ML_ERROR, "Could not lock journal!\n"); 983 mlog(ML_ERROR, "Could not lock journal!\n");
981 goto done; 984 goto done;
@@ -1047,7 +1050,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1047done: 1050done:
1048 /* drop the lock on this node's journal */ 1051 /* drop the lock on this node's journal */
1049 if (got_lock) 1052 if (got_lock)
1050 ocfs2_meta_unlock(inode, 1); 1053 ocfs2_inode_unlock(inode, 1);
1051 1054
1052 if (inode) 1055 if (inode)
1053 iput(inode); 1056 iput(inode);
@@ -1162,14 +1165,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1162 SET_INODE_JOURNAL(inode); 1165 SET_INODE_JOURNAL(inode);
1163 1166
1164 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; 1167 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1165 status = ocfs2_meta_lock_full(inode, NULL, 1, flags); 1168 status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
1166 if (status < 0) { 1169 if (status < 0) {
1167 if (status != -EAGAIN) 1170 if (status != -EAGAIN)
1168 mlog_errno(status); 1171 mlog_errno(status);
1169 goto bail; 1172 goto bail;
1170 } 1173 }
1171 1174
1172 ocfs2_meta_unlock(inode, 1); 1175 ocfs2_inode_unlock(inode, 1);
1173bail: 1176bail:
1174 if (inode) 1177 if (inode)
1175 iput(inode); 1178 iput(inode);
@@ -1241,7 +1244,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
1241 1244
1242 /* Skip bad inodes so that recovery can continue */ 1245 /* Skip bad inodes so that recovery can continue */
1243 iter = ocfs2_iget(p->osb, ino, 1246 iter = ocfs2_iget(p->osb, ino,
1244 OCFS2_FI_FLAG_ORPHAN_RECOVERY); 1247 OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
1245 if (IS_ERR(iter)) 1248 if (IS_ERR(iter))
1246 return 0; 1249 return 0;
1247 1250
@@ -1277,7 +1280,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1277 } 1280 }
1278 1281
1279 mutex_lock(&orphan_dir_inode->i_mutex); 1282 mutex_lock(&orphan_dir_inode->i_mutex);
1280 status = ocfs2_meta_lock(orphan_dir_inode, NULL, 0); 1283 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
1281 if (status < 0) { 1284 if (status < 0) {
1282 mlog_errno(status); 1285 mlog_errno(status);
1283 goto out; 1286 goto out;
@@ -1293,7 +1296,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1293 *head = priv.head; 1296 *head = priv.head;
1294 1297
1295out_cluster: 1298out_cluster:
1296 ocfs2_meta_unlock(orphan_dir_inode, 0); 1299 ocfs2_inode_unlock(orphan_dir_inode, 0);
1297out: 1300out:
1298 mutex_unlock(&orphan_dir_inode->i_mutex); 1301 mutex_unlock(&orphan_dir_inode->i_mutex);
1299 iput(orphan_dir_inode); 1302 iput(orphan_dir_inode);
@@ -1380,10 +1383,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1380 iter = oi->ip_next_orphan; 1383 iter = oi->ip_next_orphan;
1381 1384
1382 spin_lock(&oi->ip_lock); 1385 spin_lock(&oi->ip_lock);
1383 /* Delete voting may have set these on the assumption 1386 /* The remote delete code may have set these on the
1384 * that the other node would wipe them successfully. 1387 * assumption that the other node would wipe them
1385 * If they are still in the node's orphan dir, we need 1388 * successfully. If they are still in the node's
1386 * to reset that state. */ 1389 * orphan dir, we need to reset that state. */
1387 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); 1390 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1388 1391
1389 /* Set the proper information to get us going into 1392 /* Set the proper information to get us going into
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 4b32e096156..220f3e818e7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -278,6 +278,12 @@ int ocfs2_journal_dirty_data(handle_t *handle,
278/* simple file updates like chmod, etc. */ 278/* simple file updates like chmod, etc. */
279#define OCFS2_INODE_UPDATE_CREDITS 1 279#define OCFS2_INODE_UPDATE_CREDITS 1
280 280
281/* group extend. inode update and last group update. */
282#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
283
284/* group add. inode update and the new group update. */
285#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
286
281/* get one bit out of a suballocator: dinode + group descriptor + 287/* get one bit out of a suballocator: dinode + group descriptor +
282 * prev. group desc. if we relink. */ 288 * prev. group desc. if we relink. */
283#define OCFS2_SUBALLOC_ALLOC (3) 289#define OCFS2_SUBALLOC_ALLOC (3)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 58ea88b5af3..add1ffdc5c6 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,18 +75,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
77 77
78/*
79 * Determine how large our local alloc window should be, in bits.
80 *
81 * These values (and the behavior in ocfs2_alloc_should_use_local) have
82 * been chosen so that most allocations, including new block groups go
83 * through local alloc.
84 */
85static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
86{ 79{
87 BUG_ON(osb->s_clustersize_bits < 12); 80 BUG_ON(osb->s_clustersize_bits > 20);
88 81
89 return 2048 >> (osb->s_clustersize_bits - 12); 82 /* Size local alloc windows by the megabyte */
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
90} 84}
91 85
92/* 86/*
@@ -96,18 +90,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
96int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
97{ 91{
98 int la_bits = ocfs2_local_alloc_window_bits(osb); 92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0;
99 94
100 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 95 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
101 return 0; 96 goto bail;
102 97
103 /* la_bits should be at least twice the size (in clusters) of 98 /* la_bits should be at least twice the size (in clusters) of
104 * a new block group. We want to be sure block group 99 * a new block group. We want to be sure block group
105 * allocations go through the local alloc, so allow an 100 * allocations go through the local alloc, so allow an
106 * allocation to take up to half the bitmap. */ 101 * allocation to take up to half the bitmap. */
107 if (bits > (la_bits / 2)) 102 if (bits > (la_bits / 2))
108 return 0; 103 goto bail;
109 104
110 return 1; 105 ret = 1;
106bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
109 return ret;
111} 110}
112 111
113int ocfs2_load_local_alloc(struct ocfs2_super *osb) 112int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -121,6 +120,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
121 120
122 mlog_entry_void(); 121 mlog_entry_void();
123 122
123 if (ocfs2_mount_local(osb))
124 goto bail;
125
126 if (osb->local_alloc_size == 0)
127 goto bail;
128
129 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
130 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
131 "than max possible %u. Using defaults.\n",
132 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
133 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
134 }
135
124 /* read the alloc off disk */ 136 /* read the alloc off disk */
125 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, 137 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
126 osb->slot_num); 138 osb->slot_num);
@@ -181,6 +193,9 @@ bail:
181 if (inode) 193 if (inode)
182 iput(inode); 194 iput(inode);
183 195
196 mlog(0, "Local alloc window bits = %d\n",
197 ocfs2_local_alloc_window_bits(osb));
198
184 mlog_exit(status); 199 mlog_exit(status);
185 return status; 200 return status;
186} 201}
@@ -231,7 +246,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
231 246
232 mutex_lock(&main_bm_inode->i_mutex); 247 mutex_lock(&main_bm_inode->i_mutex);
233 248
234 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 249 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
235 if (status < 0) { 250 if (status < 0) {
236 mlog_errno(status); 251 mlog_errno(status);
237 goto out_mutex; 252 goto out_mutex;
@@ -286,7 +301,7 @@ out_unlock:
286 if (main_bm_bh) 301 if (main_bm_bh)
287 brelse(main_bm_bh); 302 brelse(main_bm_bh);
288 303
289 ocfs2_meta_unlock(main_bm_inode, 1); 304 ocfs2_inode_unlock(main_bm_inode, 1);
290 305
291out_mutex: 306out_mutex:
292 mutex_unlock(&main_bm_inode->i_mutex); 307 mutex_unlock(&main_bm_inode->i_mutex);
@@ -399,7 +414,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
399 414
400 mutex_lock(&main_bm_inode->i_mutex); 415 mutex_lock(&main_bm_inode->i_mutex);
401 416
402 status = ocfs2_meta_lock(main_bm_inode, &main_bm_bh, 1); 417 status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
403 if (status < 0) { 418 if (status < 0) {
404 mlog_errno(status); 419 mlog_errno(status);
405 goto out_mutex; 420 goto out_mutex;
@@ -424,7 +439,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
424 ocfs2_commit_trans(osb, handle); 439 ocfs2_commit_trans(osb, handle);
425 440
426out_unlock: 441out_unlock:
427 ocfs2_meta_unlock(main_bm_inode, 1); 442 ocfs2_inode_unlock(main_bm_inode, 1);
428 443
429out_mutex: 444out_mutex:
430 mutex_unlock(&main_bm_inode->i_mutex); 445 mutex_unlock(&main_bm_inode->i_mutex);
@@ -521,6 +536,9 @@ bail:
521 iput(local_alloc_inode); 536 iput(local_alloc_inode);
522 } 537 }
523 538
539 mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
540 status);
541
524 mlog_exit(status); 542 mlog_exit(status);
525 return status; 543 return status;
526} 544}
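
The first localalloc.c hunk above replaces the fixed 2048-bit window with one sized from local_alloc_size in megabytes. A standalone sketch of that arithmetic (values are illustrative): the default 8MB window with 4KB clusters (s_clustersize_bits == 12) gives 8 << (20 - 12) = 2048 bits, i.e. the old default, while 1MB clusters give only 8 bits.

#include <stdio.h>

/* Reproduces the window-bits math from ocfs2_local_alloc_window_bits()
 * outside the kernel; local_alloc_size is in megabytes. */
static int window_bits(int local_alloc_size_mb, int clustersize_bits)
{
	return local_alloc_size_mb << (20 - clustersize_bits);
}

int main(void)
{
	printf("%d\n", window_bits(8, 12));	/* 4KB clusters -> 2048 bits */
	printf("%d\n", window_bits(8, 20));	/* 1MB clusters -> 8 bits */
	return 0;
}
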
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 00000000000..203f8714387
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,125 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * locks.c
5 *
6 * Userspace file locking support
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27
28#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "dlmglue.h"
34#include "file.h"
35#include "locks.h"
36
37static int ocfs2_do_flock(struct file *file, struct inode *inode,
38 int cmd, struct file_lock *fl)
39{
40 int ret = 0, level = 0, trylock = 0;
41 struct ocfs2_file_private *fp = file->private_data;
42 struct ocfs2_lock_res *lockres = &fp->fp_flock;
43
44 if (fl->fl_type == F_WRLCK)
45 level = 1;
46 if (!IS_SETLKW(cmd))
47 trylock = 1;
48
49 mutex_lock(&fp->fp_mutex);
50
51 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
52 lockres->l_level > LKM_NLMODE) {
53 int old_level = 0;
54
55 if (lockres->l_level == LKM_EXMODE)
56 old_level = 1;
57
58 if (level == old_level)
59 goto out;
60
61 /*
62 * Converting an existing lock is not guaranteed to be
63 * atomic, so we can get away with simply unlocking
64 * here and allowing the lock code to try at the new
65 * level.
66 */
67
68 flock_lock_file_wait(file,
69 &(struct file_lock){.fl_type = F_UNLCK});
70
71 ocfs2_file_unlock(file);
72 }
73
74 ret = ocfs2_file_lock(file, level, trylock);
75 if (ret) {
76 if (ret == -EAGAIN && trylock)
77 ret = -EWOULDBLOCK;
78 else
79 mlog_errno(ret);
80 goto out;
81 }
82
83 ret = flock_lock_file_wait(file, fl);
84
85out:
86 mutex_unlock(&fp->fp_mutex);
87
88 return ret;
89}
90
91static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
92{
93 int ret;
94 struct ocfs2_file_private *fp = file->private_data;
95
96 mutex_lock(&fp->fp_mutex);
97 ocfs2_file_unlock(file);
98 ret = flock_lock_file_wait(file, fl);
99 mutex_unlock(&fp->fp_mutex);
100
101 return ret;
102}
103
104/*
105 * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
106 */
107int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
108{
109 struct inode *inode = file->f_mapping->host;
110 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
111
112 if (!(fl->fl_flags & FL_FLOCK))
113 return -ENOLCK;
114 if (__mandatory_lock(inode))
115 return -ENOLCK;
116
117 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
118 ocfs2_mount_local(osb))
119 return flock_lock_file_wait(file, fl);
120
121 if (fl->fl_type == F_UNLCK)
122 return ocfs2_do_funlock(file, cmd, fl);
123 else
124 return ocfs2_do_flock(file, inode, cmd, fl);
125}
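
Seen from userspace, the code above simply makes flock(2) cluster-aware unless the localflocks mount option is set or the volume is mounted locally. A minimal sketch, with an example path: LOCK_EX maps to the exclusive level in ocfs2_do_flock(), LOCK_NB to the trylock path that returns EWOULDBLOCK, and LOCK_UN to ocfs2_do_funlock().

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/file.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/shared.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (flock(fd, LOCK_EX | LOCK_NB) < 0)	/* trylock -> ocfs2_do_flock(trylock=1) */
		perror("flock");
	else {
		/* ... exclusive across the cluster here ... */
		flock(fd, LOCK_UN);		/* -> ocfs2_do_funlock() */
	}
	close(fd);
	return 0;
}
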
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/locks.h
index 9ea46f62de3..9743ef2324e 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
1/* -*- mode: c; c-basic-offset: 8; -*- 1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * vote.h 4 * locks.h
5 * 5 *
6 * description here 6 * Function prototypes for Userspace file locking support
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
@@ -23,26 +23,9 @@
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26#ifndef OCFS2_LOCKS_H
27#define OCFS2_LOCKS_H
26 28
27#ifndef VOTE_H 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
28#define VOTE_H
29 30
30int ocfs2_vote_thread(void *arg); 31#endif /* OCFS2_LOCKS_H */
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_mount_vote(struct ocfs2_super *osb);
42int ocfs2_request_umount_vote(struct ocfs2_super *osb);
43int ocfs2_register_net_handlers(struct ocfs2_super *osb);
44void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
45
46void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
47 int node_num);
48#endif
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 98756156d29..3dc18d67557 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -168,7 +168,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
168 * node. Taking the data lock will also ensure that we don't 168 * node. Taking the data lock will also ensure that we don't
169 * attempt page truncation as part of a downconvert. 169 * attempt page truncation as part of a downconvert.
170 */ 170 */
171 ret = ocfs2_meta_lock(inode, &di_bh, 1); 171 ret = ocfs2_inode_lock(inode, &di_bh, 1);
172 if (ret < 0) { 172 if (ret < 0) {
173 mlog_errno(ret); 173 mlog_errno(ret);
174 goto out; 174 goto out;
@@ -181,21 +181,12 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
181 */ 181 */
182 down_write(&OCFS2_I(inode)->ip_alloc_sem); 182 down_write(&OCFS2_I(inode)->ip_alloc_sem);
183 183
184 ret = ocfs2_data_lock(inode, 1);
185 if (ret < 0) {
186 mlog_errno(ret);
187 goto out_meta_unlock;
188 }
189
190 ret = __ocfs2_page_mkwrite(inode, di_bh, page); 184 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
191 185
192 ocfs2_data_unlock(inode, 1);
193
194out_meta_unlock:
195 up_write(&OCFS2_I(inode)->ip_alloc_sem); 186 up_write(&OCFS2_I(inode)->ip_alloc_sem);
196 187
197 brelse(di_bh); 188 brelse(di_bh);
198 ocfs2_meta_unlock(inode, 1); 189 ocfs2_inode_unlock(inode, 1);
199 190
200out: 191out:
201 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 192 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
@@ -214,13 +205,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
214{ 205{
215 int ret = 0, lock_level = 0; 206 int ret = 0, lock_level = 0;
216 207
217 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 208 ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
218 file->f_vfsmnt, &lock_level); 209 file->f_vfsmnt, &lock_level);
219 if (ret < 0) { 210 if (ret < 0) {
220 mlog_errno(ret); 211 mlog_errno(ret);
221 goto out; 212 goto out;
222 } 213 }
223 ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level); 214 ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
224out: 215out:
225 vma->vm_ops = &ocfs2_file_vm_ops; 216 vma->vm_ops = &ocfs2_file_vm_ops;
226 vma->vm_flags |= VM_CAN_NONLINEAR; 217 vma->vm_flags |= VM_CAN_NONLINEAR;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 989ac271858..ae9ad958751 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,7 +60,6 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "vote.h"
64 63
65#include "buffer_head_io.h" 64#include "buffer_head_io.h"
66 65
@@ -116,7 +115,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
116 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len, 115 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
117 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); 116 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
118 117
119 status = ocfs2_meta_lock(dir, NULL, 0); 118 status = ocfs2_inode_lock(dir, NULL, 0);
120 if (status < 0) { 119 if (status < 0) {
121 if (status != -ENOENT) 120 if (status != -ENOENT)
122 mlog_errno(status); 121 mlog_errno(status);
@@ -129,7 +128,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
129 if (status < 0) 128 if (status < 0)
130 goto bail_add; 129 goto bail_add;
131 130
132 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); 131 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
133 if (IS_ERR(inode)) { 132 if (IS_ERR(inode)) {
134 ret = ERR_PTR(-EACCES); 133 ret = ERR_PTR(-EACCES);
135 goto bail_unlock; 134 goto bail_unlock;
@@ -176,8 +175,8 @@ bail_unlock:
176 /* Don't drop the cluster lock until *after* the d_add -- 175 /* Don't drop the cluster lock until *after* the d_add --
177 * unlink on another node will message us to remove that 176 * unlink on another node will message us to remove that
178 * dentry under this lock, otherwise we can race this with 177 * dentry under this lock, otherwise we can race this with
179 * the vote thread and have a stale dentry. */ 178 * the downconvert thread and have a stale dentry. */
180 ocfs2_meta_unlock(dir, 0); 179 ocfs2_inode_unlock(dir, 0);
181 180
182bail: 181bail:
183 182
@@ -209,7 +208,7 @@ static int ocfs2_mknod(struct inode *dir,
209 /* get our super block */ 208 /* get our super block */
210 osb = OCFS2_SB(dir->i_sb); 209 osb = OCFS2_SB(dir->i_sb);
211 210
212 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 211 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
213 if (status < 0) { 212 if (status < 0) {
214 if (status != -ENOENT) 213 if (status != -ENOENT)
215 mlog_errno(status); 214 mlog_errno(status);
@@ -323,7 +322,7 @@ leave:
323 if (handle) 322 if (handle)
324 ocfs2_commit_trans(osb, handle); 323 ocfs2_commit_trans(osb, handle);
325 324
326 ocfs2_meta_unlock(dir, 1); 325 ocfs2_inode_unlock(dir, 1);
327 326
328 if (status == -ENOSPC) 327 if (status == -ENOSPC)
329 mlog(0, "Disk is full\n"); 328 mlog(0, "Disk is full\n");
@@ -553,7 +552,7 @@ static int ocfs2_link(struct dentry *old_dentry,
553 if (S_ISDIR(inode->i_mode)) 552 if (S_ISDIR(inode->i_mode))
554 return -EPERM; 553 return -EPERM;
555 554
556 err = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 555 err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
557 if (err < 0) { 556 if (err < 0) {
558 if (err != -ENOENT) 557 if (err != -ENOENT)
559 mlog_errno(err); 558 mlog_errno(err);
@@ -578,7 +577,7 @@ static int ocfs2_link(struct dentry *old_dentry,
578 goto out; 577 goto out;
579 } 578 }
580 579
581 err = ocfs2_meta_lock(inode, &fe_bh, 1); 580 err = ocfs2_inode_lock(inode, &fe_bh, 1);
582 if (err < 0) { 581 if (err < 0) {
583 if (err != -ENOENT) 582 if (err != -ENOENT)
584 mlog_errno(err); 583 mlog_errno(err);
@@ -643,10 +642,10 @@ static int ocfs2_link(struct dentry *old_dentry,
643out_commit: 642out_commit:
644 ocfs2_commit_trans(osb, handle); 643 ocfs2_commit_trans(osb, handle);
645out_unlock_inode: 644out_unlock_inode:
646 ocfs2_meta_unlock(inode, 1); 645 ocfs2_inode_unlock(inode, 1);
647 646
648out: 647out:
649 ocfs2_meta_unlock(dir, 1); 648 ocfs2_inode_unlock(dir, 1);
650 649
651 if (de_bh) 650 if (de_bh)
652 brelse(de_bh); 651 brelse(de_bh);
@@ -720,7 +719,7 @@ static int ocfs2_unlink(struct inode *dir,
720 return -EPERM; 719 return -EPERM;
721 } 720 }
722 721
723 status = ocfs2_meta_lock(dir, &parent_node_bh, 1); 722 status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
724 if (status < 0) { 723 if (status < 0) {
725 if (status != -ENOENT) 724 if (status != -ENOENT)
726 mlog_errno(status); 725 mlog_errno(status);
@@ -745,7 +744,7 @@ static int ocfs2_unlink(struct inode *dir,
745 goto leave; 744 goto leave;
746 } 745 }
747 746
748 status = ocfs2_meta_lock(inode, &fe_bh, 1); 747 status = ocfs2_inode_lock(inode, &fe_bh, 1);
749 if (status < 0) { 748 if (status < 0) {
750 if (status != -ENOENT) 749 if (status != -ENOENT)
751 mlog_errno(status); 750 mlog_errno(status);
@@ -765,7 +764,7 @@ static int ocfs2_unlink(struct inode *dir,
765 764
766 status = ocfs2_remote_dentry_delete(dentry); 765 status = ocfs2_remote_dentry_delete(dentry);
767 if (status < 0) { 766 if (status < 0) {
768 /* This vote should succeed under all normal 767 /* This remote delete should succeed under all normal
769 * circumstances. */ 768 * circumstances. */
770 mlog_errno(status); 769 mlog_errno(status);
771 goto leave; 770 goto leave;
@@ -841,13 +840,13 @@ leave:
841 ocfs2_commit_trans(osb, handle); 840 ocfs2_commit_trans(osb, handle);
842 841
843 if (child_locked) 842 if (child_locked)
844 ocfs2_meta_unlock(inode, 1); 843 ocfs2_inode_unlock(inode, 1);
845 844
846 ocfs2_meta_unlock(dir, 1); 845 ocfs2_inode_unlock(dir, 1);
847 846
848 if (orphan_dir) { 847 if (orphan_dir) {
849 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 848 /* This was locked for us in ocfs2_prepare_orphan_dir() */
850 ocfs2_meta_unlock(orphan_dir, 1); 849 ocfs2_inode_unlock(orphan_dir, 1);
851 mutex_unlock(&orphan_dir->i_mutex); 850 mutex_unlock(&orphan_dir->i_mutex);
852 iput(orphan_dir); 851 iput(orphan_dir);
853 } 852 }
@@ -908,7 +907,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
908 inode1 = tmpinode; 907 inode1 = tmpinode;
909 } 908 }
910 /* lock id2 */ 909 /* lock id2 */
911 status = ocfs2_meta_lock(inode2, bh2, 1); 910 status = ocfs2_inode_lock(inode2, bh2, 1);
912 if (status < 0) { 911 if (status < 0) {
913 if (status != -ENOENT) 912 if (status != -ENOENT)
914 mlog_errno(status); 913 mlog_errno(status);
@@ -917,14 +916,14 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
917 } 916 }
918 917
919 /* lock id1 */ 918 /* lock id1 */
920 status = ocfs2_meta_lock(inode1, bh1, 1); 919 status = ocfs2_inode_lock(inode1, bh1, 1);
921 if (status < 0) { 920 if (status < 0) {
922 /* 921 /*
923 * An error return must mean that no cluster locks 922 * An error return must mean that no cluster locks
924 * were held on function exit. 923 * were held on function exit.
925 */ 924 */
926 if (oi1->ip_blkno != oi2->ip_blkno) 925 if (oi1->ip_blkno != oi2->ip_blkno)
927 ocfs2_meta_unlock(inode2, 1); 926 ocfs2_inode_unlock(inode2, 1);
928 927
929 if (status != -ENOENT) 928 if (status != -ENOENT)
930 mlog_errno(status); 929 mlog_errno(status);
@@ -937,10 +936,10 @@ bail:
937 936
938static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) 937static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
939{ 938{
940 ocfs2_meta_unlock(inode1, 1); 939 ocfs2_inode_unlock(inode1, 1);
941 940
942 if (inode1 != inode2) 941 if (inode1 != inode2)
943 ocfs2_meta_unlock(inode2, 1); 942 ocfs2_inode_unlock(inode2, 1);
944} 943}
945 944
946static int ocfs2_rename(struct inode *old_dir, 945static int ocfs2_rename(struct inode *old_dir,
@@ -1031,10 +1030,11 @@ static int ocfs2_rename(struct inode *old_dir,
1031 1030
1032 /* 1031 /*
1033 * Aside from allowing a meta data update, the locking here 1032 * Aside from allowing a meta data update, the locking here
1034 * also ensures that the vote thread on other nodes won't have 1033 * also ensures that the downconvert thread on other nodes
1035 * to concurrently downconvert the inode and the dentry locks. 1034 * won't have to concurrently downconvert the inode and the
1035 * dentry locks.
1036 */ 1036 */
1037 status = ocfs2_meta_lock(old_inode, &old_inode_bh, 1); 1037 status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
1038 if (status < 0) { 1038 if (status < 0) {
1039 if (status != -ENOENT) 1039 if (status != -ENOENT)
1040 mlog_errno(status); 1040 mlog_errno(status);
@@ -1143,7 +1143,7 @@ static int ocfs2_rename(struct inode *old_dir,
1143 goto bail; 1143 goto bail;
1144 } 1144 }
1145 1145
1146 status = ocfs2_meta_lock(new_inode, &newfe_bh, 1); 1146 status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
1147 if (status < 0) { 1147 if (status < 0) {
1148 if (status != -ENOENT) 1148 if (status != -ENOENT)
1149 mlog_errno(status); 1149 mlog_errno(status);
@@ -1355,14 +1355,14 @@ bail:
1355 ocfs2_double_unlock(old_dir, new_dir); 1355 ocfs2_double_unlock(old_dir, new_dir);
1356 1356
1357 if (old_child_locked) 1357 if (old_child_locked)
1358 ocfs2_meta_unlock(old_inode, 1); 1358 ocfs2_inode_unlock(old_inode, 1);
1359 1359
1360 if (new_child_locked) 1360 if (new_child_locked)
1361 ocfs2_meta_unlock(new_inode, 1); 1361 ocfs2_inode_unlock(new_inode, 1);
1362 1362
1363 if (orphan_dir) { 1363 if (orphan_dir) {
1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */ 1364 /* This was locked for us in ocfs2_prepare_orphan_dir() */
1365 ocfs2_meta_unlock(orphan_dir, 1); 1365 ocfs2_inode_unlock(orphan_dir, 1);
1366 mutex_unlock(&orphan_dir->i_mutex); 1366 mutex_unlock(&orphan_dir->i_mutex);
1367 iput(orphan_dir); 1367 iput(orphan_dir);
1368 } 1368 }
@@ -1530,7 +1530,7 @@ static int ocfs2_symlink(struct inode *dir,
1530 credits = ocfs2_calc_symlink_credits(sb); 1530 credits = ocfs2_calc_symlink_credits(sb);
1531 1531
1532 /* lock the parent directory */ 1532 /* lock the parent directory */
1533 status = ocfs2_meta_lock(dir, &parent_fe_bh, 1); 1533 status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
1534 if (status < 0) { 1534 if (status < 0) {
1535 if (status != -ENOENT) 1535 if (status != -ENOENT)
1536 mlog_errno(status); 1536 mlog_errno(status);
@@ -1657,7 +1657,7 @@ bail:
1657 if (handle) 1657 if (handle)
1658 ocfs2_commit_trans(osb, handle); 1658 ocfs2_commit_trans(osb, handle);
1659 1659
1660 ocfs2_meta_unlock(dir, 1); 1660 ocfs2_inode_unlock(dir, 1);
1661 1661
1662 if (new_fe_bh) 1662 if (new_fe_bh)
1663 brelse(new_fe_bh); 1663 brelse(new_fe_bh);
@@ -1735,7 +1735,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1735 1735
1736 mutex_lock(&orphan_dir_inode->i_mutex); 1736 mutex_lock(&orphan_dir_inode->i_mutex);
1737 1737
1738 status = ocfs2_meta_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1738 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1739 if (status < 0) { 1739 if (status < 0) {
1740 mlog_errno(status); 1740 mlog_errno(status);
1741 goto leave; 1741 goto leave;
@@ -1745,7 +1745,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1745 orphan_dir_bh, name, 1745 orphan_dir_bh, name,
1746 OCFS2_ORPHAN_NAMELEN, de_bh); 1746 OCFS2_ORPHAN_NAMELEN, de_bh);
1747 if (status < 0) { 1747 if (status < 0) {
1748 ocfs2_meta_unlock(orphan_dir_inode, 1); 1748 ocfs2_inode_unlock(orphan_dir_inode, 1);
1749 1749
1750 mlog_errno(status); 1750 mlog_errno(status);
1751 goto leave; 1751 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 60a23e1906b..d0848058047 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
101 * about to be 101 * about to be
102 * dropped. */ 102 * dropped. */
103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 103#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
104#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
104 105
105struct ocfs2_lock_res_ops; 106struct ocfs2_lock_res_ops;
106 107
@@ -170,6 +171,7 @@ enum ocfs2_mount_options
170 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ 171 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
171 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 172 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
172 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 173 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
174 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
173}; 175};
174 176
175#define OCFS2_OSB_SOFT_RO 0x0001 177#define OCFS2_OSB_SOFT_RO 0x0001
@@ -189,9 +191,7 @@ struct ocfs2_super
189 struct ocfs2_slot_info *slot_info; 191 struct ocfs2_slot_info *slot_info;
190 192
191 spinlock_t node_map_lock; 193 spinlock_t node_map_lock;
192 struct ocfs2_node_map mounted_map;
193 struct ocfs2_node_map recovery_map; 194 struct ocfs2_node_map recovery_map;
194 struct ocfs2_node_map umount_map;
195 195
196 u64 root_blkno; 196 u64 root_blkno;
197 u64 system_dir_blkno; 197 u64 system_dir_blkno;
@@ -231,7 +231,9 @@ struct ocfs2_super
231 wait_queue_head_t checkpoint_event; 231 wait_queue_head_t checkpoint_event;
232 atomic_t needs_checkpoint; 232 atomic_t needs_checkpoint;
233 struct ocfs2_journal *journal; 233 struct ocfs2_journal *journal;
234 unsigned long osb_commit_interval;
234 235
236 int local_alloc_size;
235 enum ocfs2_local_alloc_state local_alloc_state; 237 enum ocfs2_local_alloc_state local_alloc_state;
236 struct buffer_head *local_alloc_bh; 238 struct buffer_head *local_alloc_bh;
237 u64 la_last_gd; 239 u64 la_last_gd;
@@ -254,28 +256,21 @@ struct ocfs2_super
254 256
255 wait_queue_head_t recovery_event; 257 wait_queue_head_t recovery_event;
256 258
257 spinlock_t vote_task_lock; 259 spinlock_t dc_task_lock;
258 struct task_struct *vote_task; 260 struct task_struct *dc_task;
259 wait_queue_head_t vote_event; 261 wait_queue_head_t dc_event;
260 unsigned long vote_wake_sequence; 262 unsigned long dc_wake_sequence;
261 unsigned long vote_work_sequence; 263 unsigned long dc_work_sequence;
262 264
265 /*
266 * Any thread can add locks to the list, but the downconvert
267 * thread is the only one allowed to remove locks. Any change
268 * to this rule requires updating
269 * ocfs2_downconvert_thread_do_work().
270 */
263 struct list_head blocked_lock_list; 271 struct list_head blocked_lock_list;
264 unsigned long blocked_lock_count; 272 unsigned long blocked_lock_count;
265 273
266 struct list_head vote_list;
267 int vote_count;
268
269 u32 net_key;
270 spinlock_t net_response_lock;
271 unsigned int net_response_ids;
272 struct list_head net_response_list;
273
274 struct o2hb_callback_func osb_hb_up;
275 struct o2hb_callback_func osb_hb_down;
276
277 struct list_head osb_net_handlers;
278
279 wait_queue_head_t osb_mount_event; 274 wait_queue_head_t osb_mount_event;
280 275
281 /* Truncate log info */ 276 /* Truncate log info */
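
The dc_* fields above are the renamed vote_* fields, and ocfs2_wake_downconvert_thread() (called from journal.c earlier in this diff) is defined in dlmglue.c, outside the hunks shown here. Modeled directly on the removed ocfs2_kick_vote_thread() from vote.h, it presumably amounts to:

void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
{
	spin_lock(&osb->dc_task_lock);
	/* make sure the downconvert thread gets a swipe at whatever
	 * changes the caller may have made to the blocked lock state */
	osb->dc_wake_sequence++;
	spin_unlock(&osb->dc_task_lock);
	wake_up(&osb->dc_event);
}
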
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 6ef876759a7..3633edd3982 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -231,6 +231,20 @@ struct ocfs2_space_resv {
231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) 231#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) 232#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
233 233
234/* Used to pass group descriptor data when online resize is done */
235struct ocfs2_new_group_input {
236 __u64 group; /* Group descriptor's blkno. */
237 __u32 clusters; /* Total number of clusters in this group */
238 __u32 frees; /* Total free clusters in this group */
239 __u16 chain; /* Chain for this group */
240 __u16 reserved1;
241 __u32 reserved2;
242};
243
244#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
245#define OCFS2_IOC_GROUP_ADD _IOW('o', 2, struct ocfs2_new_group_input) 
246#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3, struct ocfs2_new_group_input) 
247
234/* 248/*
235 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 249 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
236 */ 250 */
@@ -256,6 +270,14 @@ struct ocfs2_space_resv {
256/* Journal limits (in bytes) */ 270/* Journal limits (in bytes) */
257#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
258 272
273/*
274 * Default local alloc size (in megabytes)
275 *
276 * The value chosen should be such that most allocations, including new
277 * block groups, use local alloc.
278 */
279#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
280
259struct ocfs2_system_inode_info { 281struct ocfs2_system_inode_info {
260 char *si_name; 282 char *si_name;
261 int si_iflags; 283 int si_iflags;
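
For completeness, a hedged sketch of how a resize tool would hand a pre-built group descriptor to OCFS2_IOC_GROUP_ADD. All the numeric values are placeholders -- in practice tunefs.ocfs2 computes the group block number, cluster counts and target chain from the on-disk chain allocator before issuing the call.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/ioctl.h>

struct ocfs2_new_group_input {		/* restated from the hunk above */
	__u64 group;
	__u32 clusters;
	__u32 frees;
	__u16 chain;
	__u16 reserved1;
	__u32 reserved2;
};

#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2, struct ocfs2_new_group_input)

int main(void)
{
	struct ocfs2_new_group_input input = {
		.group    = 0x100000,	/* placeholder block number */
		.clusters = 32256,	/* placeholder counts */
		.frees    = 32256,
		.chain    = 3,
	};
	int fd = open("/mnt/ocfs2", O_RDONLY);	/* example mount point */

	if (fd < 0 || ioctl(fd, OCFS2_IOC_GROUP_ADD, &input) < 0)
		perror("OCFS2_IOC_GROUP_ADD");
	close(fd);
	return 0;
}
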
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4ca02b1c38a..86f3e3799c2 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
45 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK,
48 OCFS2_NUM_LOCK_TYPES 49 OCFS2_NUM_LOCK_TYPES
49}; 50};
50 51
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
73 case OCFS2_LOCK_TYPE_OPEN: 74 case OCFS2_LOCK_TYPE_OPEN:
74 c = 'O'; 75 c = 'O';
75 break; 76 break;
77 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F';
79 break;
76 default: 80 default:
77 c = '\0'; 81 c = '\0';
78 } 82 }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
90 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 94 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
91 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
92 [OCFS2_LOCK_TYPE_OPEN] = "Open", 96 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
93}; 98};
94 99
95static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 00000000000..37835ffcb03
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,634 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.c
5 *
6 * volume resize.
7 * Inspired by ext3/resize.c.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29
30#define MLOG_MASK_PREFIX ML_DISK_ALLOC
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "dlmglue.h"
37#include "inode.h"
38#include "journal.h"
39#include "super.h"
40#include "sysfile.h"
41#include "uptodate.h"
42
43#include "buffer_head_io.h"
44#include "suballoc.h"
45#include "resize.h"
46
47/*
 48 * Check whether any new backup superblocks exist
 49 * in the last group. If so, mark them or clear
50 * them in the bitmap.
51 *
52 * Return how many backups we find in the last group.
53 */
54static u16 ocfs2_calc_new_backup_super(struct inode *inode,
55 struct ocfs2_group_desc *gd,
56 int new_clusters,
57 u32 first_new_cluster,
58 u16 cl_cpg,
59 int set)
60{
61 int i;
62 u16 backups = 0;
63 u32 cluster;
64 u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
65
66 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
67 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
68 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
69
70 gd_blkno = ocfs2_which_cluster_group(inode, cluster);
71 if (gd_blkno < lgd_blkno)
72 continue;
73 else if (gd_blkno > lgd_blkno)
74 break;
75
76 if (set)
77 ocfs2_set_bit(cluster % cl_cpg,
78 (unsigned long *)gd->bg_bitmap);
79 else
80 ocfs2_clear_bit(cluster % cl_cpg,
81 (unsigned long *)gd->bg_bitmap);
82 backups++;
83 }
84
85 mlog_exit_void();
86 return backups;
87}
88
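
ocfs2_calc_new_backup_super() above walks the fixed backup-superblock locations to see which of them fall inside the group being grown. As background (the constants are restated here as assumptions, since they are not shown in this diff): ocfs2 keeps up to six backups whose byte offsets start at 1GB and quadruple each time -- 1G, 4G, 16G, 64G, 256G, 1T -- which is the progression ocfs2_backup_super_blkno() encodes. A standalone sketch of that offset math:

#include <stdio.h>

int main(void)
{
	unsigned long long start = 1ULL << 30;	/* assumed 1GB first backup offset */
	int i;

	for (i = 0; i < 6; i++)			/* assumed OCFS2_MAX_BACKUP_SUPERBLOCKS */
		printf("backup %d at byte offset %llu\n", i, start << (2 * i));
	return 0;
}
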
89static int ocfs2_update_last_group_and_inode(handle_t *handle,
90 struct inode *bm_inode,
91 struct buffer_head *bm_bh,
92 struct buffer_head *group_bh,
93 u32 first_new_cluster,
94 int new_clusters)
95{
96 int ret = 0;
97 struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
98 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
99 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
100 struct ocfs2_chain_rec *cr;
101 struct ocfs2_group_desc *group;
102 u16 chain, num_bits, backups = 0;
103 u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
104 u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
105
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster);
108
109 ret = ocfs2_journal_access(handle, bm_inode, group_bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) {
112 mlog_errno(ret);
113 goto out;
114 }
115
116 group = (struct ocfs2_group_desc *)group_bh->b_data;
117
118 /* update the group first. */
119 num_bits = new_clusters * cl_bpc;
120 le16_add_cpu(&group->bg_bits, num_bits);
121 le16_add_cpu(&group->bg_free_bits_count, num_bits);
122
123 /*
124 * check whether any new backup superblocks exist in
125 * this group and update the group bitmap accordingly.
126 */
127 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
128 OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
129 backups = ocfs2_calc_new_backup_super(bm_inode,
130 group,
131 new_clusters,
132 first_new_cluster,
133 cl_cpg, 1);
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 }
136
137 ret = ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142
143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out_rollback;
149 }
150
151 chain = le16_to_cpu(group->bg_chain);
152 cr = (&cl->cl_recs[chain]);
153 le32_add_cpu(&cr->c_total, num_bits);
154 le32_add_cpu(&cr->c_free, num_bits);
155 le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
156 le32_add_cpu(&fe->i_clusters, new_clusters);
157
158 if (backups) {
159 le32_add_cpu(&cr->c_free, -1 * backups);
160 le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
161 }
162
163 spin_lock(&OCFS2_I(bm_inode)->ip_lock);
164 OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
165 le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
166 spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
167 i_size_write(bm_inode, le64_to_cpu(fe->i_size));
168
169 ocfs2_journal_dirty(handle, bm_bh);
170
171out_rollback:
172 if (ret < 0) {
173 ocfs2_calc_new_backup_super(bm_inode,
174 group,
175 new_clusters,
176 first_new_cluster,
177 cl_cpg, 0);
178 le16_add_cpu(&group->bg_free_bits_count, backups);
179 le16_add_cpu(&group->bg_bits, -1 * num_bits);
180 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
181 }
182out:
183 mlog_exit(ret);
184 return ret;
185}
186
187static int update_backups(struct inode * inode, u32 clusters, char *data)
188{
189 int i, ret = 0;
190 u32 cluster;
191 u64 blkno;
192 struct buffer_head *backup = NULL;
193 struct ocfs2_dinode *backup_di = NULL;
194 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
195
196 /* calculate the real backups we need to update. */
197 for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
198 blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
199 cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
200 if (cluster > clusters)
201 break;
202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
204 if (ret < 0) {
205 mlog_errno(ret);
206 break;
207 }
208
209 memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
210
211 backup_di = (struct ocfs2_dinode *)backup->b_data;
212 backup_di->i_blkno = cpu_to_le64(blkno);
213
214 ret = ocfs2_write_super_or_backup(osb, backup);
215 brelse(backup);
216 backup = NULL;
217 if (ret < 0) {
218 mlog_errno(ret);
219 break;
220 }
221 }
222
223 return ret;
224}
225
226static void ocfs2_update_super_and_backups(struct inode *inode,
227 int new_clusters)
228{
229 int ret;
230 u32 clusters = 0;
231 struct buffer_head *super_bh = NULL;
232 struct ocfs2_dinode *super_di = NULL;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234
235 /*
236 * update the superblock last.
237 * It doesn't matter if the write failed.
238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
240 &super_bh, 0, NULL);
241 if (ret < 0) {
242 mlog_errno(ret);
243 goto out;
244 }
245
246 super_di = (struct ocfs2_dinode *)super_bh->b_data;
247 le32_add_cpu(&super_di->i_clusters, new_clusters);
248 clusters = le32_to_cpu(super_di->i_clusters);
249
250 ret = ocfs2_write_super_or_backup(osb, super_bh);
251 if (ret < 0) {
252 mlog_errno(ret);
253 goto out;
254 }
255
256 if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
257 ret = update_backups(inode, clusters, super_bh->b_data);
258
259out:
260 brelse(super_bh);
261 if (ret)
262 printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
263 " during fs resize. This condition is not fatal,"
264 " but fsck.ocfs2 should be run to fix it\n",
265 osb->dev_str);
266 return;
267}
268
269/*
270 * Extend the filesystem to the new number of clusters specified. This entry
271 * point is only used to extend the current filesystem to the end of the last
272 * existing group.
273 */
274int ocfs2_group_extend(struct inode * inode, int new_clusters)
275{
276 int ret;
277 handle_t *handle;
278 struct buffer_head *main_bm_bh = NULL;
279 struct buffer_head *group_bh = NULL;
280 struct inode *main_bm_inode = NULL;
281 struct ocfs2_dinode *fe = NULL;
282 struct ocfs2_group_desc *group = NULL;
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 u16 cl_bpc;
285 u32 first_new_cluster;
286 u64 lgd_blkno;
287
288 mlog_entry_void();
289
290 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
291 return -EROFS;
292
293 if (new_clusters < 0)
294 return -EINVAL;
295 else if (new_clusters == 0)
296 return 0;
297
298 main_bm_inode = ocfs2_get_system_file_inode(osb,
299 GLOBAL_BITMAP_SYSTEM_INODE,
300 OCFS2_INVALID_SLOT);
301 if (!main_bm_inode) {
302 ret = -EINVAL;
303 mlog_errno(ret);
304 goto out;
305 }
306
307 mutex_lock(&main_bm_inode->i_mutex);
308
309 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
310 if (ret < 0) {
311 mlog_errno(ret);
312 goto out_mutex;
313 }
314
315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
316
317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
318 ocfs2_group_bitmap_size(osb->sb) * 8) {
319 mlog(ML_ERROR, "The disk is too old and small. "
320 "Force to do offline resize.");
321 ret = -EINVAL;
322 goto out_unlock;
323 }
324
325 if (!OCFS2_IS_VALID_DINODE(fe)) {
326 OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
327 ret = -EIO;
328 goto out_unlock;
329 }
330
331 first_new_cluster = le32_to_cpu(fe->i_clusters);
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1);
334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
336 main_bm_inode);
337 if (ret < 0) {
338 mlog_errno(ret);
339 goto out_unlock;
340 }
341
342 group = (struct ocfs2_group_desc *)group_bh->b_data;
343
344 ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
345 if (ret) {
346 mlog_errno(ret);
347 goto out_unlock;
348 }
349
350 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
351 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
352 le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
353 ret = -EINVAL;
354 goto out_unlock;
355 }
356
357 mlog(0, "extend the last group at %llu, new clusters = %d\n",
358 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
359
360 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
361 if (IS_ERR(handle)) {
362 mlog_errno(PTR_ERR(handle));
363 ret = -EINVAL;
364 goto out_unlock;
365 }
366
367 /* update the last group descriptor and inode. */
368 ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
369 main_bm_bh, group_bh,
370 first_new_cluster,
371 new_clusters);
372 if (ret) {
373 mlog_errno(ret);
374 goto out_commit;
375 }
376
377 ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
378
379out_commit:
380 ocfs2_commit_trans(osb, handle);
381out_unlock:
382 brelse(group_bh);
383 brelse(main_bm_bh);
384
385 ocfs2_inode_unlock(main_bm_inode, 1);
386
387out_mutex:
388 mutex_unlock(&main_bm_inode->i_mutex);
389 iput(main_bm_inode);
390
391out:
392 mlog_exit_void();
393 return ret;
394}
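This entry point is driven from userspace; the companion ocfs2_fs.h/ioctl.c hunks of this patch (not shown in this section) are assumed to expose it as OCFS2_IOC_GROUP_EXTEND, taking a pointer to the number of clusters to grow by. A minimal hedged sketch of the caller, under that assumption:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#ifndef OCFS2_IOC_GROUP_EXTEND
#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)	/* assumed request code */
#endif

static int request_group_extend(const char *path_on_volume, int new_clusters)
{
	int ret, fd = open(path_on_volume, O_RDONLY);

	if (fd < 0)
		return -1;
	/* The kernel side above rejects negative counts and no-ops on zero. */
	ret = ioctl(fd, OCFS2_IOC_GROUP_EXTEND, &new_clusters);
	close(fd);
	return ret;
}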
395
396static int ocfs2_check_new_group(struct inode *inode,
397 struct ocfs2_dinode *di,
398 struct ocfs2_new_group_input *input,
399 struct buffer_head *group_bh)
400{
401 int ret;
402 struct ocfs2_group_desc *gd;
403 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
404 unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
405 le16_to_cpu(di->id2.i_chain.cl_bpc);
406
407
408 gd = (struct ocfs2_group_desc *)group_bh->b_data;
409
410 ret = -EIO;
411 if (!OCFS2_IS_VALID_GROUP_DESC(gd))
412 mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
413 (unsigned long long)le64_to_cpu(gd->bg_blkno));
414 else if (di->i_blkno != gd->bg_parent_dinode)
415 mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
416 "pointer (%llu, expected %llu)\n",
417 (unsigned long long)le64_to_cpu(gd->bg_blkno),
418 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
419 (unsigned long long)le64_to_cpu(di->i_blkno));
420 else if (le16_to_cpu(gd->bg_bits) > max_bits)
421 mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
422 (unsigned long long)le64_to_cpu(gd->bg_blkno),
423 le16_to_cpu(gd->bg_bits));
424 else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
425 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
426 "claims that %u are free\n",
427 (unsigned long long)le64_to_cpu(gd->bg_blkno),
428 le16_to_cpu(gd->bg_bits),
429 le16_to_cpu(gd->bg_free_bits_count));
430 else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
431 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
432 "max bitmap bits of %u\n",
433 (unsigned long long)le64_to_cpu(gd->bg_blkno),
434 le16_to_cpu(gd->bg_bits),
435 8 * le16_to_cpu(gd->bg_size));
436 else if (le16_to_cpu(gd->bg_chain) != input->chain)
437 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
438 "while input has %u set.\n",
439 (unsigned long long)le64_to_cpu(gd->bg_blkno),
440 le16_to_cpu(gd->bg_chain), input->chain);
441 else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
442 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
443 "input has %u clusters set\n",
444 (unsigned long long)le64_to_cpu(gd->bg_blkno),
445 le16_to_cpu(gd->bg_bits), input->clusters);
446 else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
447 mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
448 "but it should have %u set\n",
449 (unsigned long long)le64_to_cpu(gd->bg_blkno),
450 le16_to_cpu(gd->bg_free_bits_count),
451 input->frees * cl_bpc);
452 else
453 ret = 0;
454
455 return ret;
456}
457
458static int ocfs2_verify_group_and_input(struct inode *inode,
459 struct ocfs2_dinode *di,
460 struct ocfs2_new_group_input *input,
461 struct buffer_head *group_bh)
462{
463 u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
464 u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
465 u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
466 u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
467 u32 total_clusters = le32_to_cpu(di->i_clusters);
468 int ret = -EINVAL;
469
470 if (cluster < total_clusters)
471 mlog(ML_ERROR, "add a group which is in the current volume.\n");
472 else if (input->chain >= cl_count)
473 mlog(ML_ERROR, "input chain exceeds the limit.\n");
474 else if (next_free != cl_count && next_free != input->chain)
475 mlog(ML_ERROR,
476 "the add group should be in chain %u\n", next_free);
477 else if (total_clusters + input->clusters < total_clusters)
478 mlog(ML_ERROR, "add group's clusters overflow.\n");
479 else if (input->clusters > cl_cpg)
480 mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
481 else if (input->frees > input->clusters)
482 mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
483 else if (total_clusters % cl_cpg != 0)
484 mlog(ML_ERROR,
485 "the last group isn't full. Use group extend first.\n");
486 else if (input->group != ocfs2_which_cluster_group(inode, cluster))
487 mlog(ML_ERROR, "group blkno is invalid\n");
488 else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
489 mlog(ML_ERROR, "group descriptor check failed.\n");
490 else
491 ret = 0;
492
493 return ret;
494}
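The checks above reduce to a little bitmap arithmetic: cl_cpg comes from ocfs2_group_bitmap_size(sb) * 8 (with 4K blocks and an assumed 64-byte group descriptor header that is (4096 - 64) * 8 = 32256 clusters per group), and a group covering N clusters must carry exactly N * cl_bpc bits. A compact hedged restatement, with hypothetical names:

#include <linux/types.h>

/* Illustration only; names and example values are not taken from this patch. */
static int new_group_geometry_ok(u32 total_clusters, u16 cl_cpg, u16 cl_bpc,
				 u32 clusters, u32 frees, u16 bg_bits)
{
	return (total_clusters % cl_cpg) == 0 &&  /* last group must already be full */
	       clusters <= cl_cpg &&              /* a group never exceeds cl_cpg clusters */
	       frees <= clusters &&               /* free count bounded by group size */
	       bg_bits == clusters * cl_bpc;      /* descriptor matches the request */
}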
495
496/* Add a new group descriptor to global_bitmap. */
497int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
498{
499 int ret;
500 handle_t *handle;
501 struct buffer_head *main_bm_bh = NULL;
502 struct inode *main_bm_inode = NULL;
503 struct ocfs2_dinode *fe = NULL;
504 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
505 struct buffer_head *group_bh = NULL;
506 struct ocfs2_group_desc *group = NULL;
507 struct ocfs2_chain_list *cl;
508 struct ocfs2_chain_rec *cr;
509 u16 cl_bpc;
510
511 mlog_entry_void();
512
513 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
514 return -EROFS;
515
516 main_bm_inode = ocfs2_get_system_file_inode(osb,
517 GLOBAL_BITMAP_SYSTEM_INODE,
518 OCFS2_INVALID_SLOT);
519 if (!main_bm_inode) {
520 ret = -EINVAL;
521 mlog_errno(ret);
522 goto out;
523 }
524
525 mutex_lock(&main_bm_inode->i_mutex);
526
527 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
528 if (ret < 0) {
529 mlog_errno(ret);
530 goto out_mutex;
531 }
532
533 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
534
535 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
536 ocfs2_group_bitmap_size(osb->sb) * 8) {
537 mlog(ML_ERROR, "The disk is too old and small."
538 " Please perform an offline resize.");
539 ret = -EINVAL;
540 goto out_unlock;
541 }
542
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
544 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group);
547 goto out_unlock;
548 }
549
550 ocfs2_set_new_buffer_uptodate(inode, group_bh);
551
552 ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
553 if (ret) {
554 mlog_errno(ret);
555 goto out_unlock;
556 }
557
558 mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
559 (unsigned long long)input->group, input->chain, input->clusters);
560
561 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
562 if (IS_ERR(handle)) {
563 mlog_errno(PTR_ERR(handle));
564 ret = -EINVAL;
565 goto out_unlock;
566 }
567
568 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
569 cl = &fe->id2.i_chain;
570 cr = &cl->cl_recs[input->chain];
571
572 ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
573 OCFS2_JOURNAL_ACCESS_WRITE);
574 if (ret < 0) {
575 mlog_errno(ret);
576 goto out_commit;
577 }
578
579 group = (struct ocfs2_group_desc *)group_bh->b_data;
580 group->bg_next_group = cr->c_blkno;
581
582 ret = ocfs2_journal_dirty(handle, group_bh);
583 if (ret < 0) {
584 mlog_errno(ret);
585 goto out_commit;
586 }
587
588 ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
589 OCFS2_JOURNAL_ACCESS_WRITE);
590 if (ret < 0) {
591 mlog_errno(ret);
592 goto out_commit;
593 }
594
595 if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
596 le16_add_cpu(&cl->cl_next_free_rec, 1);
597 memset(cr, 0, sizeof(struct ocfs2_chain_rec));
598 }
599
600 cr->c_blkno = le64_to_cpu(input->group);
601 le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
602 le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
603
604 le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters * cl_bpc);
605 le32_add_cpu(&fe->id1.bitmap1.i_used,
606 (input->clusters - input->frees) * cl_bpc);
607 le32_add_cpu(&fe->i_clusters, input->clusters);
608
609 ocfs2_journal_dirty(handle, main_bm_bh);
610
611 spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
612 OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
613 le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
614 spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
615 i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
616
617 ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
618
619out_commit:
620 ocfs2_commit_trans(osb, handle);
621out_unlock:
622 brelse(group_bh);
623 brelse(main_bm_bh);
624
625 ocfs2_inode_unlock(main_bm_inode, 1);
626
627out_mutex:
628 mutex_unlock(&main_bm_inode->i_mutex);
629 iput(main_bm_inode);
630
631out:
632 mlog_exit_void();
633 return ret;
634}
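ocfs2_group_add() expects the new group descriptor to already be written on disk by the resize tool; the kernel only validates it and links it into the chosen chain, as above. A hedged sketch of the matching userspace call follows; the struct layout and the OCFS2_IOC_GROUP_ADD request code are assumptions taken from ocfs2_fs.h, which lies outside this section.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

/* Layout assumed from ocfs2_fs.h (not shown in this hunk). */
struct ocfs2_new_group_input {
	__u64 group;		/* block number of the pre-written descriptor */
	__u32 clusters;		/* clusters covered by the new group */
	__u32 frees;		/* free clusters recorded in the new group */
	__u16 chain;		/* chain record to link the group into */
	__u16 reserved1;
	__u32 reserved2;
};

#ifndef OCFS2_IOC_GROUP_ADD
#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2, struct ocfs2_new_group_input)	/* assumed */
#endif

static int request_group_add(int fd, __u64 gd_blkno,
			     __u32 clusters, __u32 frees, __u16 chain)
{
	struct ocfs2_new_group_input input = {
		.group    = gd_blkno,	/* descriptor already written by the tool */
		.clusters = clusters,	/* must match bg_bits / cl_bpc */
		.frees    = frees,	/* must match bg_free_bits_count / cl_bpc */
		.chain    = chain,	/* usually cl_next_free_rec; checked above */
	};

	return ioctl(fd, OCFS2_IOC_GROUP_ADD, &input);
}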
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
new file mode 100644
index 00000000000..f38841abf10
--- /dev/null
+++ b/fs/ocfs2/resize.h
@@ -0,0 +1,32 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * resize.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_RESIZE_H
27#define OCFS2_RESIZE_H
28
29int ocfs2_group_extend(struct inode * inode, int new_clusters);
30int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
31
32#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index af4882b62cf..3a50ce555e6 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -48,25 +48,6 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
48 s16 slot_num, 48 s16 slot_num,
49 s16 node_num); 49 s16 node_num);
50 50
51/* Use the slot information we've collected to create a map of mounted
52 * nodes. Should be holding an EX on super block. assumes slot info is
53 * up to date. Note that we call this *after* we find a slot, so our
54 * own node should be set in the map too... */
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
56{
57 int i;
58 struct ocfs2_slot_info *si = osb->slot_info;
59
60 spin_lock(&si->si_lock);
61
62 for (i = 0; i < si->si_size; i++)
63 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
64 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
65 si->si_global_node_nums[i]);
66
67 spin_unlock(&si->si_lock);
68}
69
70/* post the slot information on disk into our slot_info struct. */ 51/* post the slot information on disk into our slot_info struct. */
71void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
72{ 53{
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031..1025872aaad 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -52,8 +52,6 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
52void ocfs2_clear_slot(struct ocfs2_slot_info *si, 52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num); 53 s16 slot_num);
54 54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num) 56 int slot_num)
59{ 57{
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8f09f5235e3..7e397e2c25d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -101,8 +101,6 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
102 u64 bg_blkno, 102 u64 bg_blkno,
103 u16 bg_bit_off); 103 u16 bg_bit_off);
104static inline u64 ocfs2_which_cluster_group(struct inode *inode,
105 u32 cluster);
106static inline void ocfs2_block_to_cluster_group(struct inode *inode, 104static inline void ocfs2_block_to_cluster_group(struct inode *inode,
107 u64 data_blkno, 105 u64 data_blkno,
108 u64 *bg_blkno, 106 u64 *bg_blkno,
@@ -114,7 +112,7 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
114 112
115 if (inode) { 113 if (inode) {
116 if (ac->ac_which != OCFS2_AC_USE_LOCAL) 114 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
117 ocfs2_meta_unlock(inode, 1); 115 ocfs2_inode_unlock(inode, 1);
118 116
119 mutex_unlock(&inode->i_mutex); 117 mutex_unlock(&inode->i_mutex);
120 118
@@ -131,9 +129,9 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
131} 129}
132 130
133/* somewhat more expensive than our other checks, so use sparingly. */ 131/* somewhat more expensive than our other checks, so use sparingly. */
134static int ocfs2_check_group_descriptor(struct super_block *sb, 132int ocfs2_check_group_descriptor(struct super_block *sb,
135 struct ocfs2_dinode *di, 133 struct ocfs2_dinode *di,
136 struct ocfs2_group_desc *gd) 134 struct ocfs2_group_desc *gd)
137{ 135{
138 unsigned int max_bits; 136 unsigned int max_bits;
139 137
@@ -412,7 +410,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
412 410
413 mutex_lock(&alloc_inode->i_mutex); 411 mutex_lock(&alloc_inode->i_mutex);
414 412
415 status = ocfs2_meta_lock(alloc_inode, &bh, 1); 413 status = ocfs2_inode_lock(alloc_inode, &bh, 1);
416 if (status < 0) { 414 if (status < 0) {
417 mutex_unlock(&alloc_inode->i_mutex); 415 mutex_unlock(&alloc_inode->i_mutex);
418 iput(alloc_inode); 416 iput(alloc_inode);
@@ -1443,8 +1441,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1443 1441
1444/* given a cluster offset, calculate which block group it belongs to 1442/* given a cluster offset, calculate which block group it belongs to
1445 * and return that block offset. */ 1443 * and return that block offset. */
1446static inline u64 ocfs2_which_cluster_group(struct inode *inode, 1444u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1447 u32 cluster)
1448{ 1445{
1449 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1446 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1450 u32 group_no; 1447 u32 group_no;
@@ -1519,8 +1516,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1519 if (min_clusters > (osb->bitmap_cpg - 1)) { 1516 if (min_clusters > (osb->bitmap_cpg - 1)) {
1520 /* The only paths asking for contiguousness 1517 /* The only paths asking for contiguousness
1521 * should know about this already. */ 1518 * should know about this already. */
1522 mlog(ML_ERROR, "minimum allocation requested exceeds " 1519 mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1523 "group bitmap size!"); 1520 "group bitmap size %u!\n", min_clusters,
1521 osb->bitmap_cpg);
1524 status = -ENOSPC; 1522 status = -ENOSPC;
1525 goto bail; 1523 goto bail;
1526 } 1524 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index cafe9370309..8799033bb45 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -147,4 +147,12 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 147int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
148 struct ocfs2_alloc_context *ac); 148 struct ocfs2_alloc_context *ac);
149 149
150/* given a cluster offset, calculate which block group it belongs to
151 * and return that block offset. */
152u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
153
154/* somewhat more expensive than our other checks, so use sparingly. */
155int ocfs2_check_group_descriptor(struct super_block *sb,
156 struct ocfs2_dinode *di,
157 struct ocfs2_group_desc *gd);
150#endif /* _CHAINALLOC_H_ */ 158#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5ee77542066..01fe40ee5ea 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,7 +65,6 @@
65#include "sysfile.h" 65#include "sysfile.h"
66#include "uptodate.h" 66#include "uptodate.h"
67#include "ver.h" 67#include "ver.h"
68#include "vote.h"
69 68
70#include "buffer_head_io.h" 69#include "buffer_head_io.h"
71 70
@@ -84,9 +83,11 @@ MODULE_LICENSE("GPL");
84 83
85struct mount_options 84struct mount_options
86{ 85{
86 unsigned long commit_interval;
87 unsigned long mount_opt; 87 unsigned long mount_opt;
88 unsigned int atime_quantum; 88 unsigned int atime_quantum;
89 signed short slot; 89 signed short slot;
90 unsigned int localalloc_opt;
90}; 91};
91 92
92static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -150,6 +151,9 @@ enum {
150 Opt_data_writeback, 151 Opt_data_writeback,
151 Opt_atime_quantum, 152 Opt_atime_quantum,
152 Opt_slot, 153 Opt_slot,
154 Opt_commit,
155 Opt_localalloc,
156 Opt_localflocks,
153 Opt_err, 157 Opt_err,
154}; 158};
155 159
@@ -165,6 +169,9 @@ static match_table_t tokens = {
165 {Opt_data_writeback, "data=writeback"}, 169 {Opt_data_writeback, "data=writeback"},
166 {Opt_atime_quantum, "atime_quantum=%u"}, 170 {Opt_atime_quantum, "atime_quantum=%u"},
167 {Opt_slot, "preferred_slot=%u"}, 171 {Opt_slot, "preferred_slot=%u"},
172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"},
168 {Opt_err, NULL} 175 {Opt_err, NULL}
169}; 176};
170 177
@@ -213,7 +220,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
213 220
214 mlog_entry_void(); 221 mlog_entry_void();
215 222
216 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); 223 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
217 if (IS_ERR(new)) { 224 if (IS_ERR(new)) {
218 status = PTR_ERR(new); 225 status = PTR_ERR(new);
219 mlog_errno(status); 226 mlog_errno(status);
@@ -221,7 +228,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
221 } 228 }
222 osb->root_inode = new; 229 osb->root_inode = new;
223 230
224 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); 231 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
225 if (IS_ERR(new)) { 232 if (IS_ERR(new)) {
226 status = PTR_ERR(new); 233 status = PTR_ERR(new);
227 mlog_errno(status); 234 mlog_errno(status);
@@ -443,6 +450,8 @@ unlock_osb:
443 osb->s_mount_opt = parsed_options.mount_opt; 450 osb->s_mount_opt = parsed_options.mount_opt;
444 osb->s_atime_quantum = parsed_options.atime_quantum; 451 osb->s_atime_quantum = parsed_options.atime_quantum;
445 osb->preferred_slot = parsed_options.slot; 452 osb->preferred_slot = parsed_options.slot;
453 if (parsed_options.commit_interval)
454 osb->osb_commit_interval = parsed_options.commit_interval;
446 455
447 if (!ocfs2_is_hard_readonly(osb)) 456 if (!ocfs2_is_hard_readonly(osb))
448 ocfs2_set_journal_params(osb); 457 ocfs2_set_journal_params(osb);
@@ -597,6 +606,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
597 osb->s_mount_opt = parsed_options.mount_opt; 606 osb->s_mount_opt = parsed_options.mount_opt;
598 osb->s_atime_quantum = parsed_options.atime_quantum; 607 osb->s_atime_quantum = parsed_options.atime_quantum;
599 osb->preferred_slot = parsed_options.slot; 608 osb->preferred_slot = parsed_options.slot;
609 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt;
600 611
601 sb->s_magic = OCFS2_SUPER_MAGIC; 612 sb->s_magic = OCFS2_SUPER_MAGIC;
602 613
@@ -747,9 +758,11 @@ static int ocfs2_parse_options(struct super_block *sb,
747 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 758 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
748 options ? options : "(none)"); 759 options ? options : "(none)");
749 760
761 mopt->commit_interval = 0;
750 mopt->mount_opt = 0; 762 mopt->mount_opt = 0;
751 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
752 mopt->slot = OCFS2_INVALID_SLOT; 764 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
753 766
754 if (!options) { 767 if (!options) {
755 status = 1; 768 status = 1;
@@ -816,6 +829,41 @@ static int ocfs2_parse_options(struct super_block *sb,
816 if (option) 829 if (option)
817 mopt->slot = (s16)option; 830 mopt->slot = (s16)option;
818 break; 831 break;
832 case Opt_commit:
833 option = 0;
834 if (match_int(&args[0], &option)) {
835 status = 0;
836 goto bail;
837 }
838 if (option < 0)
839 return 0;
840 if (option == 0)
841 option = JBD_DEFAULT_MAX_COMMIT_AGE;
842 mopt->commit_interval = HZ * option;
843 break;
844 case Opt_localalloc:
845 option = 0;
846 if (match_int(&args[0], &option)) {
847 status = 0;
848 goto bail;
849 }
850 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
851 mopt->localalloc_opt = option;
852 break;
853 case Opt_localflocks:
854 /*
855 * Changing this during remount could race
856 * flock() requests, or "unbalance" existing
857 * ones (e.g., a lock is taken in one mode but
858 * dropped in the other). If users care enough
859 * to flip locking modes during remount, we
860 * could add a "local" flag to individual
861 * flock structures for proper tracking of
862 * state.
863 */
864 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break;
819 default: 867 default:
820 mlog(ML_ERROR, 868 mlog(ML_ERROR,
821 "Unrecognized mount option \"%s\" " 869 "Unrecognized mount option \"%s\" "
@@ -864,6 +912,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
864 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 912 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
865 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 913 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
866 914
915 if (osb->osb_commit_interval)
916 seq_printf(s, ",commit=%u",
917 (unsigned) (osb->osb_commit_interval / HZ));
918
919 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
920 seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
921
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,");
924
867 return 0; 925 return 0;
868} 926}
869 927
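A short hedged summary of how the three new options behave, as implied by the parser above: commit=N (N > 0, in seconds) is stored as N * HZ jiffies, with commit=0 falling back to JBD_DEFAULT_MAX_COMMIT_AGE; localalloc=N is accepted only for 0 <= N <= ocfs2_local_alloc_size(sb) * 8; localflocks is a boolean that is deliberately ignored on remount to avoid racing existing flock() holders. The commit conversion in isolation:

/* Hedged sketch of the commit= handling performed by the parser above. */
static unsigned long commit_option_to_jiffies(int option)
{
	if (option < 0)
		return 0;				/* the parser rejects negative values */
	if (option == 0)
		option = JBD_DEFAULT_MAX_COMMIT_AGE;	/* fall back to the jbd default */
	return HZ * option;				/* e.g. commit=15 with HZ=250 -> 3750 jiffies */
}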
@@ -965,7 +1023,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
965 goto bail; 1023 goto bail;
966 } 1024 }
967 1025
968 status = ocfs2_meta_lock(inode, &bh, 0); 1026 status = ocfs2_inode_lock(inode, &bh, 0);
969 if (status < 0) { 1027 if (status < 0) {
970 mlog_errno(status); 1028 mlog_errno(status);
971 goto bail; 1029 goto bail;
@@ -989,7 +1047,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
989 1047
990 brelse(bh); 1048 brelse(bh);
991 1049
992 ocfs2_meta_unlock(inode, 0); 1050 ocfs2_inode_unlock(inode, 0);
993 status = 0; 1051 status = 0;
994bail: 1052bail:
995 if (inode) 1053 if (inode)
@@ -1020,8 +1078,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
1020 oi->ip_clusters = 0; 1078 oi->ip_clusters = 0;
1021 1079
1022 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1080 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1023 ocfs2_lock_res_init_once(&oi->ip_meta_lockres); 1081 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1024 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
1025 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1082 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1026 1083
1027 ocfs2_metadata_cache_init(&oi->vfs_inode); 1084 ocfs2_metadata_cache_init(&oi->vfs_inode);
@@ -1117,25 +1174,12 @@ static int ocfs2_mount_volume(struct super_block *sb)
1117 goto leave; 1174 goto leave;
1118 } 1175 }
1119 1176
1120 status = ocfs2_register_hb_callbacks(osb);
1121 if (status < 0) {
1122 mlog_errno(status);
1123 goto leave;
1124 }
1125
1126 status = ocfs2_dlm_init(osb); 1177 status = ocfs2_dlm_init(osb);
1127 if (status < 0) { 1178 if (status < 0) {
1128 mlog_errno(status); 1179 mlog_errno(status);
1129 goto leave; 1180 goto leave;
1130 } 1181 }
1131 1182
1132 /* requires vote_thread to be running. */
1133 status = ocfs2_register_net_handlers(osb);
1134 if (status < 0) {
1135 mlog_errno(status);
1136 goto leave;
1137 }
1138
1139 status = ocfs2_super_lock(osb, 1); 1183 status = ocfs2_super_lock(osb, 1);
1140 if (status < 0) { 1184 if (status < 0) {
1141 mlog_errno(status); 1185 mlog_errno(status);
@@ -1150,8 +1194,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1150 goto leave; 1194 goto leave;
1151 } 1195 }
1152 1196
1153 ocfs2_populate_mounted_map(osb);
1154
1155 /* load all node-local system inodes */ 1197 /* load all node-local system inodes */
1156 status = ocfs2_init_local_system_inodes(osb); 1198 status = ocfs2_init_local_system_inodes(osb);
1157 if (status < 0) { 1199 if (status < 0) {
@@ -1174,15 +1216,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1174 if (ocfs2_mount_local(osb)) 1216 if (ocfs2_mount_local(osb))
1175 goto leave; 1217 goto leave;
1176 1218
1177 /* This should be sent *after* we recovered our journal as it
1178 * will cause other nodes to unmark us as needing
1179 * recovery. However, we need to send it *before* dropping the
1180 * super block lock as otherwise their recovery threads might
1181 * try to clean us up while we're live! */
1182 status = ocfs2_request_mount_vote(osb);
1183 if (status < 0)
1184 mlog_errno(status);
1185
1186leave: 1219leave:
1187 if (unlock_super) 1220 if (unlock_super)
1188 ocfs2_super_unlock(osb, 1); 1221 ocfs2_super_unlock(osb, 1);
@@ -1240,10 +1273,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1240 mlog_errno(tmp); 1273 mlog_errno(tmp);
1241 return; 1274 return;
1242 } 1275 }
1243
1244 tmp = ocfs2_request_umount_vote(osb);
1245 if (tmp < 0)
1246 mlog_errno(tmp);
1247 } 1276 }
1248 1277
1249 if (osb->slot_num != OCFS2_INVALID_SLOT) 1278 if (osb->slot_num != OCFS2_INVALID_SLOT)
@@ -1254,13 +1283,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1254 1283
1255 ocfs2_release_system_inodes(osb); 1284 ocfs2_release_system_inodes(osb);
1256 1285
1257 if (osb->dlm) { 1286 if (osb->dlm)
1258 ocfs2_unregister_net_handlers(osb);
1259
1260 ocfs2_dlm_shutdown(osb); 1287 ocfs2_dlm_shutdown(osb);
1261 }
1262
1263 ocfs2_clear_hb_callbacks(osb);
1264 1288
1265 debugfs_remove(osb->osb_debug_root); 1289 debugfs_remove(osb->osb_debug_root);
1266 1290
@@ -1315,7 +1339,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1315 int i, cbits, bbits; 1339 int i, cbits, bbits;
1316 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1340 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1317 struct inode *inode = NULL; 1341 struct inode *inode = NULL;
1318 struct buffer_head *bitmap_bh = NULL;
1319 struct ocfs2_journal *journal; 1342 struct ocfs2_journal *journal;
1320 __le32 uuid_net_key; 1343 __le32 uuid_net_key;
1321 struct ocfs2_super *osb; 1344 struct ocfs2_super *osb;
@@ -1344,19 +1367,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
1344 osb->s_sectsize_bits = blksize_bits(sector_size); 1367 osb->s_sectsize_bits = blksize_bits(sector_size);
1345 BUG_ON(!osb->s_sectsize_bits); 1368 BUG_ON(!osb->s_sectsize_bits);
1346 1369
1347 osb->net_response_ids = 0;
1348 spin_lock_init(&osb->net_response_lock);
1349 INIT_LIST_HEAD(&osb->net_response_list);
1350
1351 INIT_LIST_HEAD(&osb->osb_net_handlers);
1352 init_waitqueue_head(&osb->recovery_event); 1370 init_waitqueue_head(&osb->recovery_event);
1353 spin_lock_init(&osb->vote_task_lock); 1371 spin_lock_init(&osb->dc_task_lock);
1354 init_waitqueue_head(&osb->vote_event); 1372 init_waitqueue_head(&osb->dc_event);
1355 osb->vote_work_sequence = 0; 1373 osb->dc_work_sequence = 0;
1356 osb->vote_wake_sequence = 0; 1374 osb->dc_wake_sequence = 0;
1357 INIT_LIST_HEAD(&osb->blocked_lock_list); 1375 INIT_LIST_HEAD(&osb->blocked_lock_list);
1358 osb->blocked_lock_count = 0; 1376 osb->blocked_lock_count = 0;
1359 INIT_LIST_HEAD(&osb->vote_list);
1360 spin_lock_init(&osb->osb_lock); 1377 spin_lock_init(&osb->osb_lock);
1361 1378
1362 atomic_set(&osb->alloc_stats.moves, 0); 1379 atomic_set(&osb->alloc_stats.moves, 0);
@@ -1496,7 +1513,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1496 } 1513 }
1497 1514
1498 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); 1515 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
1499 osb->net_key = le32_to_cpu(uuid_net_key);
1500 1516
1501 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 1517 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1502 osb->vol_label[63] = '\0'; 1518 osb->vol_label[63] = '\0';
@@ -1539,25 +1555,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1539 } 1555 }
1540 1556
1541 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 1557 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1542
1543 /* We don't have a cluster lock on the bitmap here because
1544 * we're only interested in static information and the extra
1545 * complexity at mount time isn't worth it. Don't pass the
1546 * inode in to the read function though as we don't want it to
1547 * be put in the cache. */
1548 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1549 NULL);
1550 iput(inode); 1558 iput(inode);
1551 if (status < 0) {
1552 mlog_errno(status);
1553 goto bail;
1554 }
1555 1559
1556 di = (struct ocfs2_dinode *) bitmap_bh->b_data; 1560 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
1557 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1558 brelse(bitmap_bh);
1559 mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
1560 (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
1561 1561
1562 status = ocfs2_init_slot_info(osb); 1562 status = ocfs2_init_slot_info(osb);
1563 if (status < 0) { 1563 if (status < 0) {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fd2e846e3e6..ab713ebdd54 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -112,7 +112,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
112 goto bail; 112 goto bail;
113 } 113 }
114 114
115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); 115 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
116 if (IS_ERR(inode)) { 116 if (IS_ERR(inode)) {
117 mlog_errno(PTR_ERR(inode)); 117 mlog_errno(PTR_ERR(inode));
118 inode = NULL; 118 inode = NULL;
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
index 5405ce121c9..e2488f4128a 100644
--- a/fs/ocfs2/ver.c
+++ b/fs/ocfs2/ver.c
@@ -29,7 +29,7 @@
29 29
30#include "ver.h" 30#include "ver.h"
31 31
32#define OCFS2_BUILD_VERSION "1.3.3" 32#define OCFS2_BUILD_VERSION "1.5.0"
33 33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION 34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35 35
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index c05358538f2..00000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * description here
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/kthread.h>
30
31#include <cluster/heartbeat.h>
32#include <cluster/nodemanager.h>
33#include <cluster/tcp.h>
34
35#include <dlm/dlmapi.h>
36
37#define MLOG_MASK_PREFIX ML_VOTE
38#include <cluster/masklog.h>
39
40#include "ocfs2.h"
41
42#include "alloc.h"
43#include "dlmglue.h"
44#include "extent_map.h"
45#include "heartbeat.h"
46#include "inode.h"
47#include "journal.h"
48#include "slot_map.h"
49#include "vote.h"
50
51#include "buffer_head_io.h"
52
53#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
54#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
55struct ocfs2_msg_hdr
56{
57 __be32 h_response_id; /* used to lookup message handle on sending
58 * node. */
59 __be32 h_request;
60 __be64 h_blkno;
61 __be32 h_generation;
62 __be32 h_node_num; /* node sending this particular message. */
63};
64
65struct ocfs2_vote_msg
66{
67 struct ocfs2_msg_hdr v_hdr;
68 __be32 v_reserved1;
69} __attribute__ ((packed));
70
71/* Responses are given these values to maintain backwards
72 * compatibility with older ocfs2 versions */
73#define OCFS2_RESPONSE_OK (0)
74#define OCFS2_RESPONSE_BUSY (-16)
75#define OCFS2_RESPONSE_BAD_MSG (-22)
76
77struct ocfs2_response_msg
78{
79 struct ocfs2_msg_hdr r_hdr;
80 __be32 r_response;
81} __attribute__ ((packed));
82
83struct ocfs2_vote_work {
84 struct list_head w_list;
85 struct ocfs2_vote_msg w_msg;
86};
87
88enum ocfs2_vote_request {
89 OCFS2_VOTE_REQ_INVALID = 0,
90 OCFS2_VOTE_REQ_MOUNT,
91 OCFS2_VOTE_REQ_UMOUNT,
92 OCFS2_VOTE_REQ_LAST
93};
94
95static inline int ocfs2_is_valid_vote_request(int request)
96{
97 return OCFS2_VOTE_REQ_INVALID < request &&
98 request < OCFS2_VOTE_REQ_LAST;
99}
100
101typedef void (*ocfs2_net_response_callback)(void *priv,
102 struct ocfs2_response_msg *resp);
103struct ocfs2_net_response_cb {
104 ocfs2_net_response_callback rc_cb;
105 void *rc_priv;
106};
107
108struct ocfs2_net_wait_ctxt {
109 struct list_head n_list;
110 u32 n_response_id;
111 wait_queue_head_t n_event;
112 struct ocfs2_node_map n_node_map;
113 int n_response; /* an aggregate response. 0 if
114 * all nodes are go, < 0 on any
115 * negative response from any
116 * node or network error. */
117 struct ocfs2_net_response_cb *n_callback;
118};
119
120static void ocfs2_process_mount_request(struct ocfs2_super *osb,
121 unsigned int node_num)
122{
123 mlog(0, "MOUNT vote from node %u\n", node_num);
124 /* The other node only sends us this message when he has an EX
125 * on the superblock, so our recovery threads (if having been
126 * launched) are waiting on it.*/
127 ocfs2_recovery_map_clear(osb, node_num);
128 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
129
130 /* We clear the umount map here because a node may have been
131 * previously mounted, safely unmounted but never stopped
132 * heartbeating - in which case we'd have a stale entry. */
133 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
134}
135
136static void ocfs2_process_umount_request(struct ocfs2_super *osb,
137 unsigned int node_num)
138{
139 mlog(0, "UMOUNT vote from node %u\n", node_num);
140 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
141 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
142}
143
144static void ocfs2_process_vote(struct ocfs2_super *osb,
145 struct ocfs2_vote_msg *msg)
146{
147 int net_status, vote_response;
148 unsigned int node_num;
149 u64 blkno;
150 enum ocfs2_vote_request request;
151 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
152 struct ocfs2_response_msg response;
153
154 /* decode the network mumbo jumbo into local variables. */
155 request = be32_to_cpu(hdr->h_request);
156 blkno = be64_to_cpu(hdr->h_blkno);
157 node_num = be32_to_cpu(hdr->h_node_num);
158
159 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
160 request, (unsigned long long)blkno, node_num);
161
162 if (!ocfs2_is_valid_vote_request(request)) {
163 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
164 request, node_num);
165 vote_response = OCFS2_RESPONSE_BAD_MSG;
166 goto respond;
167 }
168
169 vote_response = OCFS2_RESPONSE_OK;
170
171 switch (request) {
172 case OCFS2_VOTE_REQ_UMOUNT:
173 ocfs2_process_umount_request(osb, node_num);
174 goto respond;
175 case OCFS2_VOTE_REQ_MOUNT:
176 ocfs2_process_mount_request(osb, node_num);
177 goto respond;
178 default:
179 /* avoids a gcc warning */
180 break;
181 }
182
183respond:
184 /* Response structure is small so we just put it on the stack
185 * and stuff it inline. */
186 memset(&response, 0, sizeof(struct ocfs2_response_msg));
187 response.r_hdr.h_response_id = hdr->h_response_id;
188 response.r_hdr.h_blkno = hdr->h_blkno;
189 response.r_hdr.h_generation = hdr->h_generation;
190 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
191 response.r_response = cpu_to_be32(vote_response);
192
193 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
194 osb->net_key,
195 &response,
196 sizeof(struct ocfs2_response_msg),
197 node_num,
198 NULL);
199 /* We still want to error print for ENOPROTOOPT here. The
200 * sending node shouldn't have unregistered its net handler
201 * without sending an unmount vote first */
202 if (net_status < 0
203 && net_status != -ETIMEDOUT
204 && net_status != -ENOTCONN)
205 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
206 node_num, net_status);
207}
208
209static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
210{
211 unsigned long processed;
212 struct ocfs2_lock_res *lockres;
213 struct ocfs2_vote_work *work;
214
215 mlog_entry_void();
216
217 spin_lock(&osb->vote_task_lock);
218 /* grab this early so we know to try again if a state change and
219 * wake happens part-way through our work */
220 osb->vote_work_sequence = osb->vote_wake_sequence;
221
222 processed = osb->blocked_lock_count;
223 while (processed) {
224 BUG_ON(list_empty(&osb->blocked_lock_list));
225
226 lockres = list_entry(osb->blocked_lock_list.next,
227 struct ocfs2_lock_res, l_blocked_list);
228 list_del_init(&lockres->l_blocked_list);
229 osb->blocked_lock_count--;
230 spin_unlock(&osb->vote_task_lock);
231
232 BUG_ON(!processed);
233 processed--;
234
235 ocfs2_process_blocked_lock(osb, lockres);
236
237 spin_lock(&osb->vote_task_lock);
238 }
239
240 while (osb->vote_count) {
241 BUG_ON(list_empty(&osb->vote_list));
242 work = list_entry(osb->vote_list.next,
243 struct ocfs2_vote_work, w_list);
244 list_del(&work->w_list);
245 osb->vote_count--;
246 spin_unlock(&osb->vote_task_lock);
247
248 ocfs2_process_vote(osb, &work->w_msg);
249 kfree(work);
250
251 spin_lock(&osb->vote_task_lock);
252 }
253 spin_unlock(&osb->vote_task_lock);
254
255 mlog_exit_void();
256}
257
258static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
259{
260 int empty = 0;
261
262 spin_lock(&osb->vote_task_lock);
263 if (list_empty(&osb->blocked_lock_list) &&
264 list_empty(&osb->vote_list))
265 empty = 1;
266
267 spin_unlock(&osb->vote_task_lock);
268 return empty;
269}
270
271static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
272{
273 int should_wake = 0;
274
275 spin_lock(&osb->vote_task_lock);
276 if (osb->vote_work_sequence != osb->vote_wake_sequence)
277 should_wake = 1;
278 spin_unlock(&osb->vote_task_lock);
279
280 return should_wake;
281}
282
283int ocfs2_vote_thread(void *arg)
284{
285 int status = 0;
286 struct ocfs2_super *osb = arg;
287
288 /* only quit once we've been asked to stop and there is no more
289 * work available */
290 while (!(kthread_should_stop() &&
291 ocfs2_vote_thread_lists_empty(osb))) {
292
293 wait_event_interruptible(osb->vote_event,
294 ocfs2_vote_thread_should_wake(osb) ||
295 kthread_should_stop());
296
297 mlog(0, "vote_thread: awoken\n");
298
299 ocfs2_vote_thread_do_work(osb);
300 }
301
302 osb->vote_task = NULL;
303 return status;
304}
305
306static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
307{
308 struct ocfs2_net_wait_ctxt *w;
309
310 w = kzalloc(sizeof(*w), GFP_NOFS);
311 if (!w) {
312 mlog_errno(-ENOMEM);
313 goto bail;
314 }
315
316 INIT_LIST_HEAD(&w->n_list);
317 init_waitqueue_head(&w->n_event);
318 ocfs2_node_map_init(&w->n_node_map);
319 w->n_response_id = response_id;
320 w->n_callback = NULL;
321bail:
322 return w;
323}
324
325static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
326{
327 unsigned int ret;
328
329 spin_lock(&osb->net_response_lock);
330 ret = ++osb->net_response_ids;
331 spin_unlock(&osb->net_response_lock);
332
333 return ret;
334}
335
336static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
337 struct ocfs2_net_wait_ctxt *w)
338{
339 spin_lock(&osb->net_response_lock);
340 list_del(&w->n_list);
341 spin_unlock(&osb->net_response_lock);
342}
343
344static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
345 struct ocfs2_net_wait_ctxt *w)
346{
347 spin_lock(&osb->net_response_lock);
348 list_add_tail(&w->n_list,
349 &osb->net_response_list);
350 spin_unlock(&osb->net_response_lock);
351}
352
353static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
354 struct ocfs2_net_wait_ctxt *w,
355 int node_num)
356{
357 assert_spin_locked(&osb->net_response_lock);
358
359 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
360 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
361 wake_up(&w->n_event);
362}
363
364/* Intended to be called from the node down callback, we fake remove
365 * the node from all our response contexts */
366void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
367 int node_num)
368{
369 struct list_head *p;
370 struct ocfs2_net_wait_ctxt *w = NULL;
371
372 spin_lock(&osb->net_response_lock);
373
374 list_for_each(p, &osb->net_response_list) {
375 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
376
377 __ocfs2_mark_node_responded(osb, w, node_num);
378 }
379
380 spin_unlock(&osb->net_response_lock);
381}
382
383static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
384 struct ocfs2_vote_msg *request,
385 unsigned int response_id,
386 int *response,
387 struct ocfs2_net_response_cb *callback)
388{
389 int status, i, remote_err;
390 struct ocfs2_net_wait_ctxt *w = NULL;
391 int dequeued = 0;
392
393 mlog_entry_void();
394
395 w = ocfs2_new_net_wait_ctxt(response_id);
396 if (!w) {
397 status = -ENOMEM;
398 mlog_errno(status);
399 goto bail;
400 }
401 w->n_callback = callback;
402
403 /* we're pretty much ready to go at this point, and this fills
404 * in n_response which we need anyway... */
405 ocfs2_queue_net_wait_ctxt(osb, w);
406
407 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
408
409 while (i != O2NM_INVALID_NODE_NUM) {
410 if (i != osb->node_num) {
411 mlog(0, "trying to send request to node %i\n", i);
412 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
413
414 remote_err = 0;
415 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
416 osb->net_key,
417 request,
418 sizeof(*request),
419 i,
420 &remote_err);
421 if (status == -ETIMEDOUT) {
422 mlog(0, "remote node %d timed out!\n", i);
423 status = -EAGAIN;
424 goto bail;
425 }
426 if (remote_err < 0) {
427 status = remote_err;
428 mlog(0, "remote error %d on node %d!\n",
429 remote_err, i);
430 mlog_errno(status);
431 goto bail;
432 }
433 if (status < 0) {
434 mlog_errno(status);
435 goto bail;
436 }
437 }
438 i++;
439 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
440 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
441 }
442 mlog(0, "done sending, now waiting on responses...\n");
443
444 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
445
446 ocfs2_dequeue_net_wait_ctxt(osb, w);
447 dequeued = 1;
448
449 *response = w->n_response;
450 status = 0;
451bail:
452 if (w) {
453 if (!dequeued)
454 ocfs2_dequeue_net_wait_ctxt(osb, w);
455 kfree(w);
456 }
457
458 mlog_exit(status);
459 return status;
460}
461
462static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
463 u64 blkno,
464 unsigned int generation,
465 enum ocfs2_vote_request type)
466{
467 struct ocfs2_vote_msg *request;
468 struct ocfs2_msg_hdr *hdr;
469
470 BUG_ON(!ocfs2_is_valid_vote_request(type));
471
472 request = kzalloc(sizeof(*request), GFP_NOFS);
473 if (!request) {
474 mlog_errno(-ENOMEM);
475 } else {
476 hdr = &request->v_hdr;
477 hdr->h_node_num = cpu_to_be32(osb->node_num);
478 hdr->h_request = cpu_to_be32(type);
479 hdr->h_blkno = cpu_to_be64(blkno);
480 hdr->h_generation = cpu_to_be32(generation);
481 }
482
483 return request;
484}
485
486/* Complete the buildup of a new vote request and process the
487 * broadcast return value. */
488static int ocfs2_do_request_vote(struct ocfs2_super *osb,
489 struct ocfs2_vote_msg *request,
490 struct ocfs2_net_response_cb *callback)
491{
492 int status, response = -EBUSY;
493 unsigned int response_id;
494 struct ocfs2_msg_hdr *hdr;
495
496 response_id = ocfs2_new_response_id(osb);
497
498 hdr = &request->v_hdr;
499 hdr->h_response_id = cpu_to_be32(response_id);
500
501 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
502 callback);
503 if (status < 0) {
504 mlog_errno(status);
505 goto bail;
506 }
507
508 status = response;
509bail:
510
511 return status;
512}
513
514int ocfs2_request_mount_vote(struct ocfs2_super *osb)
515{
516 int status;
517 struct ocfs2_vote_msg *request = NULL;
518
519 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
520 if (!request) {
521 status = -ENOMEM;
522 goto bail;
523 }
524
525 status = -EAGAIN;
526 while (status == -EAGAIN) {
527 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
528 signal_pending(current)) {
529 status = -ERESTARTSYS;
530 goto bail;
531 }
532
533 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
534 osb->node_num)) {
535 status = 0;
536 goto bail;
537 }
538
539 status = ocfs2_do_request_vote(osb, request, NULL);
540 }
541
542bail:
543 kfree(request);
544 return status;
545}
546
547int ocfs2_request_umount_vote(struct ocfs2_super *osb)
548{
549 int status;
550 struct ocfs2_vote_msg *request = NULL;
551
552 request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
553 if (!request) {
554 status = -ENOMEM;
555 goto bail;
556 }
557
558 status = -EAGAIN;
559 while (status == -EAGAIN) {
560 /* Do not check signals on this vote... We really want
561 * this one to go all the way through. */
562
563 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
564 osb->node_num)) {
565 status = 0;
566 goto bail;
567 }
568
569 status = ocfs2_do_request_vote(osb, request, NULL);
570 }
571
572bail:
573 kfree(request);
574 return status;
575}
576
577/* TODO: This should eventually be a hash table! */
578static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
579 u32 response_id)
580{
581 struct list_head *p;
582 struct ocfs2_net_wait_ctxt *w = NULL;
583
584 list_for_each(p, &osb->net_response_list) {
585 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
586 if (response_id == w->n_response_id)
587 break;
588 w = NULL;
589 }
590
591 return w;
592}
593
594/* Translate response codes into local node errno values */
595static inline int ocfs2_translate_response(int response)
596{
597 int ret;
598
599 switch (response) {
600 case OCFS2_RESPONSE_OK:
601 ret = 0;
602 break;
603
604 case OCFS2_RESPONSE_BUSY:
605 ret = -EBUSY;
606 break;
607
608 default:
609 ret = -EINVAL;
610 }
611
612 return ret;
613}
614
615static int ocfs2_handle_response_message(struct o2net_msg *msg,
616 u32 len,
617 void *data, void **ret_data)
618{
619 unsigned int response_id, node_num;
620 int response_status;
621 struct ocfs2_super *osb = data;
622 struct ocfs2_response_msg *resp;
623 struct ocfs2_net_wait_ctxt * w;
624 struct ocfs2_net_response_cb *resp_cb;
625
626 resp = (struct ocfs2_response_msg *) msg->buf;
627
628 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
629 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
630 response_status =
631 ocfs2_translate_response(be32_to_cpu(resp->r_response));
632
633 mlog(0, "received response message:\n");
634 mlog(0, "h_response_id = %u\n", response_id);
635 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
636 mlog(0, "h_blkno = %llu\n",
637 (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
638 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
639 mlog(0, "h_node_num = %u\n", node_num);
640 mlog(0, "r_response = %d\n", response_status);
641
642 spin_lock(&osb->net_response_lock);
643 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
644 if (!w) {
645 mlog(0, "request not found!\n");
646 goto bail;
647 }
648 resp_cb = w->n_callback;
649
650 if (response_status && (!w->n_response)) {
651 /* we only really need one negative response so don't
652 * set it twice. */
653 w->n_response = response_status;
654 }
655
656 if (resp_cb) {
657 spin_unlock(&osb->net_response_lock);
658
659 resp_cb->rc_cb(resp_cb->rc_priv, resp);
660
661 spin_lock(&osb->net_response_lock);
662 }
663
664 __ocfs2_mark_node_responded(osb, w, node_num);
665bail:
666 spin_unlock(&osb->net_response_lock);
667
668 return 0;
669}
670
671static int ocfs2_handle_vote_message(struct o2net_msg *msg,
672 u32 len,
673 void *data, void **ret_data)
674{
675 int status;
676 struct ocfs2_super *osb = data;
677 struct ocfs2_vote_work *work;
678
679 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
680 if (!work) {
681 status = -ENOMEM;
682 mlog_errno(status);
683 goto bail;
684 }
685
686 INIT_LIST_HEAD(&work->w_list);
687 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
688
689 mlog(0, "scheduling vote request:\n");
690 mlog(0, "h_response_id = %u\n",
691 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
692 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
693 mlog(0, "h_blkno = %llu\n",
694 (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
695 mlog(0, "h_generation = %u\n",
696 be32_to_cpu(work->w_msg.v_hdr.h_generation));
697 mlog(0, "h_node_num = %u\n",
698 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
699
700 spin_lock(&osb->vote_task_lock);
701 list_add_tail(&work->w_list, &osb->vote_list);
702 osb->vote_count++;
703 spin_unlock(&osb->vote_task_lock);
704
705 ocfs2_kick_vote_thread(osb);
706
707 status = 0;
708bail:
709 return status;
710}
711
712void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
713{
714 if (!osb->net_key)
715 return;
716
717 o2net_unregister_handler_list(&osb->osb_net_handlers);
718
719 if (!list_empty(&osb->net_response_list))
720 mlog(ML_ERROR, "net response list not empty!\n");
721
722 osb->net_key = 0;
723}
724
725int ocfs2_register_net_handlers(struct ocfs2_super *osb)
726{
727 int status = 0;
728
729 if (ocfs2_mount_local(osb))
730 return 0;
731
732 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
733 osb->net_key,
734 sizeof(struct ocfs2_response_msg),
735 ocfs2_handle_response_message,
736 osb, NULL, &osb->osb_net_handlers);
737 if (status) {
738 mlog_errno(status);
739 goto bail;
740 }
741
742 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
743 osb->net_key,
744 sizeof(struct ocfs2_vote_msg),
745 ocfs2_handle_vote_message,
746 osb, NULL, &osb->osb_net_handlers);
747 if (status) {
748 mlog_errno(status);
749 goto bail;
750 }
751bail:
752 if (status < 0)
753 ocfs2_unregister_net_handlers(osb);
754
755 return status;
756}
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d8817384008..6b7ff161894 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -131,7 +131,7 @@ static void property_stop(struct seq_file *f, void *v)
131 /* Nothing to do */ 131 /* Nothing to do */
132} 132}
133 133
134static struct seq_operations property_op = { 134static const struct seq_operations property_op = {
135 .start = property_start, 135 .start = property_start,
136 .next = property_next, 136 .next = property_next,
137 .stop = property_stop, 137 .stop = property_stop,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 722e12e5acc..739da701ae7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,96 +195,45 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
195 return ERR_PTR(res); 195 return ERR_PTR(res);
196} 196}
197 197
198/* 198static ssize_t part_start_show(struct device *dev,
199 * sysfs bindings for partitions 199 struct device_attribute *attr, char *buf)
200 */
201
202struct part_attribute {
203 struct attribute attr;
204 ssize_t (*show)(struct hd_struct *,char *);
205 ssize_t (*store)(struct hd_struct *,const char *, size_t);
206};
207
208static ssize_t
209part_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
 {
-	struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
-	struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
-	ssize_t ret = 0;
-	if (part_attr->show)
-		ret = part_attr->show(p, page);
-	return ret;
-}
-static ssize_t
-part_attr_store(struct kobject * kobj, struct attribute * attr,
-		const char *page, size_t count)
-{
-	struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
-	struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
-	ssize_t ret = 0;
+	struct hd_struct *p = dev_to_part(dev);
 
-	if (part_attr->store)
-		ret = part_attr->store(p, page, count);
-	return ret;
+	return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
 }
 
-static struct sysfs_ops part_sysfs_ops = {
-	.show = part_attr_show,
-	.store = part_attr_store,
-};
-
-static ssize_t part_uevent_store(struct hd_struct * p,
-				 const char *page, size_t count)
+static ssize_t part_size_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
 {
-	kobject_uevent(&p->kobj, KOBJ_ADD);
-	return count;
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
-static ssize_t part_dev_read(struct hd_struct * p, char *page)
-{
-	struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj);
-	dev_t dev = MKDEV(disk->major, disk->first_minor + p->partno);
-	return print_dev_t(page, dev);
-}
-static ssize_t part_start_read(struct hd_struct * p, char *page)
-{
-	return sprintf(page, "%llu\n",(unsigned long long)p->start_sect);
-}
-static ssize_t part_size_read(struct hd_struct * p, char *page)
-{
-	return sprintf(page, "%llu\n",(unsigned long long)p->nr_sects);
-}
-static ssize_t part_stat_read(struct hd_struct * p, char *page)
+
+static ssize_t part_stat_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
 {
-	return sprintf(page, "%8u %8llu %8u %8llu\n",
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8llu %8u %8llu\n",
 		       p->ios[0], (unsigned long long)p->sectors[0],
 		       p->ios[1], (unsigned long long)p->sectors[1]);
 }
-static struct part_attribute part_attr_uevent = {
-	.attr = {.name = "uevent", .mode = S_IWUSR },
-	.store = part_uevent_store
-};
-static struct part_attribute part_attr_dev = {
-	.attr = {.name = "dev", .mode = S_IRUGO },
-	.show = part_dev_read
-};
-static struct part_attribute part_attr_start = {
-	.attr = {.name = "start", .mode = S_IRUGO },
-	.show = part_start_read
-};
-static struct part_attribute part_attr_size = {
-	.attr = {.name = "size", .mode = S_IRUGO },
-	.show = part_size_read
-};
-static struct part_attribute part_attr_stat = {
-	.attr = {.name = "stat", .mode = S_IRUGO },
-	.show = part_stat_read
-};
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
+static ssize_t part_fail_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
 
-static ssize_t part_fail_store(struct hd_struct * p,
+	return sprintf(buf, "%d\n", p->make_it_fail);
+}
+
+static ssize_t part_fail_store(struct device *dev,
+			       struct device_attribute *attr,
 			       const char *buf, size_t count)
 {
+	struct hd_struct *p = dev_to_part(dev);
 	int i;
 
 	if (count > 0 && sscanf(buf, "%d", &i) > 0)
@@ -292,50 +241,53 @@ static ssize_t part_fail_store(struct hd_struct * p,
 
 	return count;
 }
-static ssize_t part_fail_read(struct hd_struct * p, char *page)
-{
-	return sprintf(page, "%d\n", p->make_it_fail);
-}
-static struct part_attribute part_attr_fail = {
-	.attr = {.name = "make-it-fail", .mode = S_IRUGO | S_IWUSR },
-	.store = part_fail_store,
-	.show = part_fail_read
-};
+#endif
 
+static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
+static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+static struct device_attribute dev_attr_fail =
+	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
 #endif
 
-static struct attribute * default_attrs[] = {
-	&part_attr_uevent.attr,
-	&part_attr_dev.attr,
-	&part_attr_start.attr,
-	&part_attr_size.attr,
-	&part_attr_stat.attr,
+static struct attribute *part_attrs[] = {
+	&dev_attr_start.attr,
+	&dev_attr_size.attr,
+	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-	&part_attr_fail.attr,
+	&dev_attr_fail.attr,
 #endif
-	NULL,
+	NULL
 };
 
-extern struct kset block_subsys;
+static struct attribute_group part_attr_group = {
+	.attrs = part_attrs,
+};
 
-static void part_release(struct kobject *kobj)
+static struct attribute_group *part_attr_groups[] = {
+	&part_attr_group,
+	NULL
+};
+
+static void part_release(struct device *dev)
 {
-	struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
+	struct hd_struct *p = dev_to_part(dev);
 	kfree(p);
 }
 
-struct kobj_type ktype_part = {
+struct device_type part_type = {
+	.name		= "partition",
+	.groups		= part_attr_groups,
 	.release	= part_release,
-	.default_attrs	= default_attrs,
-	.sysfs_ops	= &part_sysfs_ops,
 };
 
 static inline void partition_sysfs_add_subdir(struct hd_struct *p)
 {
 	struct kobject *k;
 
-	k = kobject_get(&p->kobj);
-	p->holder_dir = kobject_add_dir(k, "holders");
+	k = kobject_get(&p->dev.kobj);
+	p->holder_dir = kobject_create_and_add("holders", k);
 	kobject_put(k);
 }
 
@@ -343,15 +295,16 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
 	struct kobject *k;
 
-	k = kobject_get(&disk->kobj);
-	disk->holder_dir = kobject_add_dir(k, "holders");
-	disk->slave_dir = kobject_add_dir(k, "slaves");
+	k = kobject_get(&disk->dev.kobj);
+	disk->holder_dir = kobject_create_and_add("holders", k);
+	disk->slave_dir = kobject_create_and_add("slaves", k);
 	kobject_put(k);
 }
 
 void delete_partition(struct gendisk *disk, int part)
 {
 	struct hd_struct *p = disk->part[part-1];
+
 	if (!p)
 		return;
 	if (!p->nr_sects)
@@ -361,113 +314,55 @@ void delete_partition(struct gendisk *disk, int part)
 	p->nr_sects = 0;
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
-	sysfs_remove_link(&p->kobj, "subsystem");
-	kobject_unregister(p->holder_dir);
-	kobject_uevent(&p->kobj, KOBJ_REMOVE);
-	kobject_del(&p->kobj);
-	kobject_put(&p->kobj);
+	kobject_put(p->holder_dir);
+	device_del(&p->dev);
+	put_device(&p->dev);
 }
 
 void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
+	int err;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;
 
 	p->start_sect = start;
 	p->nr_sects = len;
 	p->partno = part;
 	p->policy = disk->policy;
 
-	if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1]))
-		kobject_set_name(&p->kobj, "%sp%d",
-				 kobject_name(&disk->kobj), part);
+	if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
+		snprintf(p->dev.bus_id, BUS_ID_SIZE,
+			"%sp%d", disk->dev.bus_id, part);
 	else
-		kobject_set_name(&p->kobj, "%s%d",
-				 kobject_name(&disk->kobj),part);
-	p->kobj.parent = &disk->kobj;
-	p->kobj.ktype = &ktype_part;
-	kobject_init(&p->kobj);
-	kobject_add(&p->kobj);
-	if (!disk->part_uevent_suppress)
-		kobject_uevent(&p->kobj, KOBJ_ADD);
-	sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");
+		snprintf(p->dev.bus_id, BUS_ID_SIZE,
+			"%s%d", disk->dev.bus_id, part);
+
+	device_initialize(&p->dev);
+	p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
+	p->dev.class = &block_class;
+	p->dev.type = &part_type;
+	p->dev.parent = &disk->dev;
+	disk->part[part-1] = p;
+
+	/* delay uevent until 'holders' subdir is created */
+	p->dev.uevent_suppress = 1;
+	device_add(&p->dev);
+	partition_sysfs_add_subdir(p);
+	p->dev.uevent_suppress = 0;
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		static struct attribute addpartattr = {
 			.name = "whole_disk",
 			.mode = S_IRUSR | S_IRGRP | S_IROTH,
 		};
-
-		sysfs_create_file(&p->kobj, &addpartattr);
+		err = sysfs_create_file(&p->dev.kobj, &addpartattr);
 	}
-	partition_sysfs_add_subdir(p);
-	disk->part[part-1] = p;
-}
 
-static char *make_block_name(struct gendisk *disk)
-{
-	char *name;
-	static char *block_str = "block:";
-	int size;
-	char *s;
-
-	size = strlen(block_str) + strlen(disk->disk_name) + 1;
-	name = kmalloc(size, GFP_KERNEL);
-	if (!name)
-		return NULL;
-	strcpy(name, block_str);
-	strcat(name, disk->disk_name);
-	/* ewww... some of these buggers have / in name... */
-	s = strchr(name, '/');
-	if (s)
-		*s = '!';
-	return name;
-}
-
-static int disk_sysfs_symlinks(struct gendisk *disk)
-{
-	struct device *target = get_device(disk->driverfs_dev);
-	int err;
-	char *disk_name = NULL;
-
-	if (target) {
-		disk_name = make_block_name(disk);
-		if (!disk_name) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-
-		err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
-		if (err)
-			goto err_out_disk_name;
-
-		err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
-		if (err)
-			goto err_out_dev_link;
-	}
-
-	err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,
-				"subsystem");
-	if (err)
-		goto err_out_disk_name_lnk;
-
-	kfree(disk_name);
-
-	return 0;
-
-err_out_disk_name_lnk:
-	if (target) {
-		sysfs_remove_link(&target->kobj, disk_name);
-err_out_dev_link:
-		sysfs_remove_link(&disk->kobj, "device");
-err_out_disk_name:
-		kfree(disk_name);
-err_out:
-		put_device(target);
-	}
-	return err;
+	/* suppress uevent if the disk supresses it */
+	if (!disk->dev.uevent_suppress)
+		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
 }
 
 /* Not exported, helper to add_disk(). */
@@ -479,19 +374,29 @@ void register_disk(struct gendisk *disk)
 	struct hd_struct *p;
 	int err;
 
-	kobject_set_name(&disk->kobj, "%s", disk->disk_name);
-	/* ewww... some of these buggers have / in name... */
-	s = strchr(disk->kobj.k_name, '/');
+	disk->dev.parent = disk->driverfs_dev;
+	disk->dev.devt = MKDEV(disk->major, disk->first_minor);
+
+	strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN);
+	/* ewww... some of these buggers have / in the name... */
+	s = strchr(disk->dev.bus_id, '/');
 	if (s)
 		*s = '!';
-	if ((err = kobject_add(&disk->kobj)))
+
+	/* delay uevents, until we scanned partition table */
+	disk->dev.uevent_suppress = 1;
+
+	if (device_add(&disk->dev))
 		return;
-	err = disk_sysfs_symlinks(disk);
+#ifndef CONFIG_SYSFS_DEPRECATED
+	err = sysfs_create_link(block_depr, &disk->dev.kobj,
+				kobject_name(&disk->dev.kobj));
 	if (err) {
-		kobject_del(&disk->kobj);
+		device_del(&disk->dev);
 		return;
 	}
-	disk_sysfs_add_subdirs(disk);
+#endif
+	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
 	if (disk->minors == 1)
@@ -505,25 +410,23 @@ void register_disk(struct gendisk *disk)
 	if (!bdev)
 		goto exit;
 
-	/* scan partition table, but suppress uevents */
 	bdev->bd_invalidated = 1;
-	disk->part_uevent_suppress = 1;
 	err = blkdev_get(bdev, FMODE_READ, 0);
-	disk->part_uevent_suppress = 0;
 	if (err < 0)
 		goto exit;
 	blkdev_put(bdev);
 
 exit:
-	/* announce disk after possible partitions are already created */
-	kobject_uevent(&disk->kobj, KOBJ_ADD);
+	/* announce disk after possible partitions are created */
+	disk->dev.uevent_suppress = 0;
+	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
 	for (i = 1; i < disk->minors; i++) {
 		p = disk->part[i-1];
 		if (!p || !p->nr_sects)
 			continue;
-		kobject_uevent(&p->kobj, KOBJ_ADD);
+		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
 	}
 }
 
@@ -602,19 +505,11 @@ void del_gendisk(struct gendisk *disk)
 	disk_stat_set_all(disk, 0);
 	disk->stamp = 0;
 
-	kobject_uevent(&disk->kobj, KOBJ_REMOVE);
-	kobject_unregister(disk->holder_dir);
-	kobject_unregister(disk->slave_dir);
-	if (disk->driverfs_dev) {
-		char *disk_name = make_block_name(disk);
-		sysfs_remove_link(&disk->kobj, "device");
-		if (disk_name) {
-			sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name);
-			kfree(disk_name);
-		}
-		put_device(disk->driverfs_dev);
-		disk->driverfs_dev = NULL;
-	}
-	sysfs_remove_link(&disk->kobj, "subsystem");
-	kobject_del(&disk->kobj);
+	kobject_put(disk->holder_dir);
+	kobject_put(disk->slave_dir);
+	disk->driverfs_dev = NULL;
+#ifndef CONFIG_SYSFS_DEPRECATED
+	sysfs_remove_link(block_depr, disk->dev.bus_id);
+#endif
+	device_del(&disk->dev);
 }
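
The block-layer hunks above replace the hand-rolled part_attribute/sysfs_ops machinery with the driver core's struct device_attribute callbacks. A rough sketch of that pattern in isolation follows; the "demo" device and attribute names are invented for illustration and are not part of this patch.

	#include <linux/device.h>

	/* same shape as part_size_show() above: dev_to_part() maps the
	 * struct device back to its hd_struct; here we just print a value */
	static ssize_t demo_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
	{
		return sprintf(buf, "%d\n", 42);
	}

	/* declares struct device_attribute dev_attr_demo, mode 0444 */
	static DEVICE_ATTR(demo, S_IRUGO, demo_show, NULL);

	/* after device_add()/device_register(), typically:
	 *	err = device_create_file(dev, &dev_attr_demo);
	 */
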
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65c62e1bfd6..eb97f2897e2 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -169,7 +169,7 @@ static inline char *task_state(struct task_struct *p, char *buffer)
 	ppid = pid_alive(p) ?
 		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
 	tpid = pid_alive(p) && p->ptrace ?
-		task_ppid_nr_ns(rcu_dereference(p->parent), ns) : 0;
+		task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
@@ -464,8 +464,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 		}
 
 		sid = task_session_nr_ns(task, ns);
+		ppid = task_tgid_nr_ns(task->real_parent, ns);
 		pgid = task_pgrp_nr_ns(task, ns);
-		ppid = task_ppid_nr_ns(task, ns);
 
 		unlock_task_sighand(task, &flags);
 	}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7411bfb0b7c..91fa8e6ce8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 }
 #endif
 
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+	int i;
+	struct task_struct *task = m->private;
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < 32; i++) {
+		if (task->latency_record[i].backtrace[0]) {
+			int q;
+			seq_printf(m, "%i %li %li ",
+				task->latency_record[i].count,
+				task->latency_record[i].time,
+				task->latency_record[i].max);
+			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+				char sym[KSYM_NAME_LEN];
+				char *c;
+				if (!task->latency_record[i].backtrace[q])
+					break;
+				if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+					break;
+				sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+				c = strchr(sym, '+');
+				if (c)
+					*c = 0;
+				seq_printf(m, "%s ", sym);
+			}
+			seq_printf(m, "\n");
+		}
+
+	}
+	return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	struct task_struct *task = get_proc_task(inode);
+
+	ret = single_open(file, lstats_show_proc, NULL);
+	if (!ret) {
+		m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	struct seq_file *m;
+	struct task_struct *task;
+
+	m = file->private_data;
+	task = m->private;
+	clear_all_latency_tracing(task);
+
+	return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+	.open		= lstats_open,
+	.read		= seq_read,
+	.write		= lstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
 /* The badness from the OOM killer */
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
 };
 #endif
 
+
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * Print out various scheduling related per-task fields:
@@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency", S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset", S_IRUGO, cpuset),
 #endif
@@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency", S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset", S_IRUGO, cpuset),
 #endif
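
With CONFIG_LATENCYTOP enabled, the records built by the code above become readable through the new per-task "latency" file, and writing to it clears them (see lstats_write). A minimal userspace consumer, shown here only as a hedged illustration of the interface, could be:

	#include <stdio.h>

	int main(void)
	{
		char line[512];
		FILE *f = fopen("/proc/self/latency", "r");	/* same data as /proc/<pid>/latency */

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}
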
diff --git a/fs/read_write.c b/fs/read_write.c
index ea1f94cc722..c4d3d17923f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -197,25 +197,27 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 {
 	struct inode *inode;
 	loff_t pos;
+	int retval = -EINVAL;
 
 	inode = file->f_path.dentry->d_inode;
 	if (unlikely((ssize_t) count < 0))
-		goto Einval;
+		return retval;
 	pos = *ppos;
 	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
-		goto Einval;
+		return retval;
 
 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
-		int retval = locks_mandatory_area(
+		retval = locks_mandatory_area(
 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 			inode, file, pos, count);
 		if (retval < 0)
 			return retval;
 	}
+	retval = security_file_permission(file,
+				read_write == READ ? MAY_READ : MAY_WRITE);
+	if (retval)
+		return retval;
 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
-
-Einval:
-	return -EINVAL;
 }
 
 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
@@ -267,18 +269,15 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 	ret = rw_verify_area(READ, file, pos, count);
 	if (ret >= 0) {
 		count = ret;
-		ret = security_file_permission (file, MAY_READ);
-		if (!ret) {
-			if (file->f_op->read)
-				ret = file->f_op->read(file, buf, count, pos);
-			else
-				ret = do_sync_read(file, buf, count, pos);
-			if (ret > 0) {
-				fsnotify_access(file->f_path.dentry);
-				add_rchar(current, ret);
-			}
-			inc_syscr(current);
+		if (file->f_op->read)
+			ret = file->f_op->read(file, buf, count, pos);
+		else
+			ret = do_sync_read(file, buf, count, pos);
+		if (ret > 0) {
+			fsnotify_access(file->f_path.dentry);
+			add_rchar(current, ret);
 		}
+		inc_syscr(current);
 	}
 
 	return ret;
@@ -325,18 +324,15 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 	ret = rw_verify_area(WRITE, file, pos, count);
 	if (ret >= 0) {
 		count = ret;
-		ret = security_file_permission (file, MAY_WRITE);
-		if (!ret) {
-			if (file->f_op->write)
-				ret = file->f_op->write(file, buf, count, pos);
-			else
-				ret = do_sync_write(file, buf, count, pos);
-			if (ret > 0) {
-				fsnotify_modify(file->f_path.dentry);
-				add_wchar(current, ret);
-			}
-			inc_syscw(current);
+		if (file->f_op->write)
+			ret = file->f_op->write(file, buf, count, pos);
+		else
+			ret = do_sync_write(file, buf, count, pos);
+		if (ret > 0) {
+			fsnotify_modify(file->f_path.dentry);
+			add_wchar(current, ret);
 		}
+		inc_syscw(current);
 	}
 
 	return ret;
@@ -603,9 +599,6 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	ret = rw_verify_area(type, file, pos, tot_len);
 	if (ret < 0)
 		goto out;
-	ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
-	if (ret)
-		goto out;
 
 	fnv = NULL;
 	if (type == READ) {
@@ -737,10 +730,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		goto fput_in;
 	count = retval;
 
-	retval = security_file_permission (in_file, MAY_READ);
-	if (retval)
-		goto fput_in;
-
 	/*
 	 * Get output file, and verify that it is ok..
 	 */
@@ -759,10 +748,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		goto fput_out;
 	count = retval;
 
-	retval = security_file_permission (out_file, MAY_WRITE);
-	if (retval)
-		goto fput_out;
-
 	if (!max)
 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 
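
The read_write.c hunks move the LSM hook into rw_verify_area() so that read, write, readv/writev, sendfile and (below) splice all inherit the check from one place. The resulting calling convention, sketched here with a made-up helper rather than actual kernel code, is: call rw_verify_area() first, treat a negative return as the error, and use the returned value as the clamped byte count.

	/* illustrative only: mirrors the shape of vfs_read() after this patch */
	static ssize_t checked_read(struct file *file, char __user *buf,
				    size_t count, loff_t *pos)
	{
		ssize_t ret = rw_verify_area(READ, file, pos, count);

		if (ret < 0)
			return ret;	/* -EINVAL or the security module's verdict */
		count = ret;		/* possibly clamped to MAX_RW_COUNT */

		if (file->f_op->read)
			return file->f_op->read(file, buf, count, pos);
		return do_sync_read(file, buf, count, pos);
	}
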
diff --git a/fs/splice.c b/fs/splice.c
index 6bdcb6107bc..56b802bfbfa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -908,10 +908,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	ret = security_file_permission(out, MAY_WRITE);
-	if (unlikely(ret < 0))
-		return ret;
-
 	return out->f_op->splice_write(pipe, out, ppos, len, flags);
 }
 
@@ -934,10 +930,6 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	ret = security_file_permission(in, MAY_READ);
-	if (unlikely(ret < 0))
-		return ret;
-
 	return in->f_op->splice_read(in, ppos, pipe, len, flags);
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 337162935d2..4948d9bc405 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -440,7 +440,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 /**
  * sysfs_remove_one - remove sysfs_dirent from parent
  * @acxt: addrm context to use
- * @sd: sysfs_dirent to be added
+ * @sd: sysfs_dirent to be removed
  *
  * Mark @sd removed and drop nlink of parent inode if @sd is a
  * directory. @sd is unlinked from the children list.
@@ -678,8 +678,10 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
 
 	/* no such entry */
-	if (!sd)
+	if (!sd) {
+		ret = ERR_PTR(-ENOENT);
 		goto out_unlock;
+	}
 
 	/* attach dentry and inode */
 	inode = sysfs_get_inode(sd);
@@ -781,6 +783,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
 		error = PTR_ERR(old_dentry);
+		old_dentry = NULL;
 		goto out;
 	}
 
@@ -848,6 +851,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
 		error = PTR_ERR(old_dentry);
+		old_dentry = NULL;
 		goto out;
 	}
 	old_parent = old_dentry->d_parent;
@@ -855,6 +859,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 	new_parent = sysfs_get_dentry(new_parent_sd);
 	if (IS_ERR(new_parent)) {
 		error = PTR_ERR(new_parent);
+		new_parent = NULL;
 		goto out;
 	}
 
@@ -878,7 +883,6 @@ again:
 	error = 0;
 	d_add(new_dentry, NULL);
 	d_move(old_dentry, new_dentry);
-	dput(new_dentry);
 
 	/* Remove from old parent's list and insert into new parent's list. */
 	sysfs_unlink_sibling(sd);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b834f1709f9..a271c87c447 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -20,43 +20,6 @@
 
 #include "sysfs.h"
 
-#define to_sattr(a) container_of(a,struct subsys_attribute, attr)
-
-/*
- * Subsystem file operations.
- * These operations allow subsystems to have files that can be
- * read/written.
- */
-static ssize_t
-subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
-{
-	struct kset *kset = to_kset(kobj);
-	struct subsys_attribute * sattr = to_sattr(attr);
-	ssize_t ret = -EIO;
-
-	if (sattr->show)
-		ret = sattr->show(kset, page);
-	return ret;
-}
-
-static ssize_t
-subsys_attr_store(struct kobject * kobj, struct attribute * attr,
-		  const char * page, size_t count)
-{
-	struct kset *kset = to_kset(kobj);
-	struct subsys_attribute * sattr = to_sattr(attr);
-	ssize_t ret = -EIO;
-
-	if (sattr->store)
-		ret = sattr->store(kset, page, count);
-	return ret;
-}
-
-static struct sysfs_ops subsys_sysfs_ops = {
-	.show	= subsys_attr_show,
-	.store	= subsys_attr_store,
-};
-
 /*
  * There's one sysfs_buffer for each open file and one
  * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -66,7 +29,7 @@ static struct sysfs_ops subsys_sysfs_ops = {
  * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open
  * is protected by sysfs_open_dirent_lock.
  */
-static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
 
 struct sysfs_open_dirent {
 	atomic_t refcnt;
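
The SPIN_LOCK_UNLOCKED initializer removed above was being phased out at this point; DEFINE_SPINLOCK() declares and statically initializes the lock in one step. A generic sketch of the replacement pattern, with an invented lock name:

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(demo_lock);	/* instead of: spinlock_t demo_lock = SPIN_LOCK_UNLOCKED; */

	static void demo_touch_shared_state(void)
	{
		unsigned long flags;

		spin_lock_irqsave(&demo_lock, flags);
		/* ... manipulate the data protected by demo_lock ... */
		spin_unlock_irqrestore(&demo_lock, flags);
	}
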
@@ -354,31 +317,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_buffer * buffer;
-	struct sysfs_ops * ops = NULL;
-	int error;
+	struct sysfs_buffer *buffer;
+	struct sysfs_ops *ops;
+	int error = -EACCES;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active_two(attr_sd))
 		return -ENODEV;
 
-	/* if the kobject has no ktype, then we assume that it is a subsystem
-	 * itself, and use ops for it.
-	 */
-	if (kobj->kset && kobj->kset->ktype)
-		ops = kobj->kset->ktype->sysfs_ops;
-	else if (kobj->ktype)
+	/* every kobject with an attribute needs a ktype assigned */
+	if (kobj->ktype && kobj->ktype->sysfs_ops)
 		ops = kobj->ktype->sysfs_ops;
-	else
-		ops = &subsys_sysfs_ops;
-
-	error = -EACCES;
-
-	/* No sysfs operations, either from having no subsystem,
-	 * or the subsystem have no operations.
-	 */
-	if (!ops)
+	else {
+		printk(KERN_ERR "missing sysfs attribute operations for "
+		       "kobject: %s\n", kobject_name(kobj));
+		WARN_ON(1);
 		goto err_out;
+	}
 
 	/* File needs write support.
 	 * The inode's perms must say it's ok,
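
After the sysfs_open_file() change, a kobject that exposes attributes must have a ktype providing sysfs_ops; the old implicit fallback to subsystem operations is gone, and a missing ktype now hits the printk/WARN_ON path above. A skeletal ktype of the kind the code now insists on might look like the following sketch, with all names invented for illustration:

	static ssize_t demo_attr_show(struct kobject *kobj, struct attribute *attr,
				      char *buf)
	{
		/* a real implementation would container_of() back to its own types */
		return sprintf(buf, "hello\n");
	}

	static struct sysfs_ops demo_sysfs_ops = {
		.show	= demo_attr_show,
		/* .store = demo_attr_store, for writable attributes */
	};

	static struct kobj_type demo_ktype = {
		.sysfs_ops	= &demo_sysfs_ops,
		/* .release and .default_attrs would also be filled in for real use */
	};
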
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3eac20c63c4..5f66c446615 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,39 +19,6 @@
 
 #include "sysfs.h"
 
-static int object_depth(struct sysfs_dirent *sd)
-{
-	int depth = 0;
-
-	for (; sd->s_parent; sd = sd->s_parent)
-		depth++;
-
-	return depth;
-}
-
-static int object_path_length(struct sysfs_dirent * sd)
-{
-	int length = 1;
-
-	for (; sd->s_parent; sd = sd->s_parent)
-		length += strlen(sd->s_name) + 1;
-
-	return length;
-}
-
-static void fill_object_path(struct sysfs_dirent *sd, char *buffer, int length)
-{
-	--length;
-	for (; sd->s_parent; sd = sd->s_parent) {
-		int cur = strlen(sd->s_name);
-
-		/* back up enough to print this bus id with '/' */
-		length -= cur;
-		strncpy(buffer + length, sd->s_name, cur);
-		*(buffer + --length) = '/';
-	}
-}
-
 /**
  * sysfs_create_link - create symlink between two objects.
  * @kobj: object whose directory we're creating the link in.
@@ -112,7 +79,6 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 	return error;
 }
 
-
 /**
  * sysfs_remove_link - remove symlink in object's directory.
  * @kobj: object we're acting for.
@@ -124,24 +90,54 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
 	sysfs_hash_and_remove(kobj->sd, name);
 }
 
-static int sysfs_get_target_path(struct sysfs_dirent * parent_sd,
-				 struct sysfs_dirent * target_sd, char *path)
+static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
+				 struct sysfs_dirent *target_sd, char *path)
 {
-	char * s;
-	int depth, size;
+	struct sysfs_dirent *base, *sd;
+	char *s = path;
+	int len = 0;
+
+	/* go up to the root, stop at the base */
+	base = parent_sd;
+	while (base->s_parent) {
+		sd = target_sd->s_parent;
+		while (sd->s_parent && base != sd)
+			sd = sd->s_parent;
+
+		if (base == sd)
+			break;
+
+		strcpy(s, "../");
+		s += 3;
+		base = base->s_parent;
+	}
+
+	/* determine end of target string for reverse fillup */
+	sd = target_sd;
+	while (sd->s_parent && sd != base) {
+		len += strlen(sd->s_name) + 1;
+		sd = sd->s_parent;
+	}
 
-	depth = object_depth(parent_sd);
-	size = object_path_length(target_sd) + depth * 3 - 1;
-	if (size > PATH_MAX)
+	/* check limits */
+	if (len < 2)
+		return -EINVAL;
+	len--;
+	if ((s - path) + len > PATH_MAX)
 		return -ENAMETOOLONG;
 
-	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+	/* reverse fillup of target string from target to base */
+	sd = target_sd;
+	while (sd->s_parent && sd != base) {
+		int slen = strlen(sd->s_name);
 
-	for (s = path; depth--; s += 3)
-		strcpy(s,"../");
+		len -= slen;
+		strncpy(s + len, sd->s_name, slen);
+		if (len)
+			s[--len] = '/';
 
-	fill_object_path(target_sd, path, size);
-	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+		sd = sd->s_parent;
+	}
 
 	return 0;
 }
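
The rewritten sysfs_get_target_path() builds a relative link target by walking both sysfs_dirent chains: one "../" for every level between the link's directory and the common ancestor, then the remaining components of the target. The same idea on plain path strings, as a small userspace toy; the paths are examples and assumed absolute, normalized and without trailing slashes.

	#include <stdio.h>
	#include <string.h>

	/* toy analogue: 'from' is the directory containing the link, 'to' its target */
	static void relative_path(const char *from, const char *to, char *out)
	{
		size_t i = 0;
		const char *p;

		/* skip the common prefix, then back up to a component boundary */
		while (from[i] && from[i] == to[i])
			i++;
		while (i > 0 && from[i - 1] != '/')
			i--;

		/* one "../" for every component of 'from' below the common base */
		out[0] = '\0';
		for (p = from + i; *p; p++)
			if (*p == '/')
				strcat(out, "../");
		if (from[i])
			strcat(out, "../");

		strcat(out, to + i);	/* then what is left of the target */
	}

	int main(void)
	{
		char buf[256];

		relative_path("/sys/block/sda",
			      "/sys/devices/pci0000:00/0000:00:1f.2/host0/block/sda", buf);
		printf("%s\n", buf);	/* ../../devices/pci0000:00/0000:00:1f.2/host0/block/sda */
		return 0;
	}
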