diff options
Diffstat (limited to 'fs')
175 files changed, 4574 insertions, 15044 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 3d185308ec88..65781de44fc0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
@@ -50,6 +50,7 @@ endif # BLOCK | |||
50 | config FILE_LOCKING | 50 | config FILE_LOCKING |
51 | bool "Enable POSIX file locking API" if EMBEDDED | 51 | bool "Enable POSIX file locking API" if EMBEDDED |
52 | default y | 52 | default y |
53 | select BKL # while lockd still uses it. | ||
53 | help | 54 | help |
54 | This option enables standard file locking support, required | 55 | This option enables standard file locking support, required |
55 | for filesystems like NFS and for the flock() system | 56 | for filesystems like NFS and for the flock() system |
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index e55182a74605..1dd5f34b3cf2 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig | |||
@@ -1,6 +1,7 @@ | |||
1 | config ADFS_FS | 1 | config ADFS_FS |
2 | tristate "ADFS file system support (EXPERIMENTAL)" | 2 | tristate "ADFS file system support (EXPERIMENTAL)" |
3 | depends on BLOCK && EXPERIMENTAL | 3 | depends on BLOCK && EXPERIMENTAL |
4 | depends on BKL # need to fix | ||
4 | help | 5 | help |
5 | The Acorn Disc Filing System is the standard file system of the | 6 | The Acorn Disc Filing System is the standard file system of the |
6 | RiscOS operating system which runs on Acorn's ARM-based Risc PC | 7 | RiscOS operating system which runs on Acorn's ARM-based Risc PC |
diff --git a/fs/affs/super.c b/fs/affs/super.c index a167f96d79f7..fa4fbe1e238a 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c | |||
@@ -104,8 +104,8 @@ static void init_once(void *foo) | |||
104 | { | 104 | { |
105 | struct affs_inode_info *ei = (struct affs_inode_info *) foo; | 105 | struct affs_inode_info *ei = (struct affs_inode_info *) foo; |
106 | 106 | ||
107 | init_MUTEX(&ei->i_link_lock); | 107 | sema_init(&ei->i_link_lock, 1); |
108 | init_MUTEX(&ei->i_ext_lock); | 108 | sema_init(&ei->i_ext_lock, 1); |
109 | inode_init_once(&ei->vfs_inode); | 109 | inode_init_once(&ei->vfs_inode); |
110 | } | 110 | } |
111 | 111 | ||
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig index 5f3bea90911e..480e210c83ab 100644 --- a/fs/autofs/Kconfig +++ b/fs/autofs/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | config AUTOFS_FS | 1 | config AUTOFS_FS |
2 | tristate "Kernel automounter support" | 2 | tristate "Kernel automounter support" |
3 | depends on BKL # unfixable, just use autofs4 | ||
3 | help | 4 | help |
4 | The automounter is a tool to automatically mount remote file systems | 5 | The automounter is a tool to automatically mount remote file systems |
5 | on demand. This implementation is partially kernel-based to reduce | 6 | on demand. This implementation is partially kernel-based to reduce |
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index f96eff04e11a..a6395bdb26ae 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c | |||
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm) | |||
134 | if (!dump_write(file, dump_start, dump_size)) | 134 | if (!dump_write(file, dump_start, dump_size)) |
135 | goto end_coredump; | 135 | goto end_coredump; |
136 | } | 136 | } |
137 | /* Finally dump the task struct. Not be used by gdb, but could be useful */ | ||
138 | set_fs(KERNEL_DS); | ||
139 | if (!dump_write(file, current, sizeof(*current))) | ||
140 | goto end_coredump; | ||
141 | end_coredump: | 137 | end_coredump: |
142 | set_fs(fs); | 138 | set_fs(fs); |
143 | return has_dumped; | 139 | return has_dumped; |
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 535e763ab1a6..6884e198e0c7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
@@ -800,7 +800,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) | |||
800 | * default mmap base, as well as whatever program they | 800 | * default mmap base, as well as whatever program they |
801 | * might try to exec. This is because the brk will | 801 | * might try to exec. This is because the brk will |
802 | * follow the loader, and is not movable. */ | 802 | * follow the loader, and is not movable. */ |
803 | #ifdef CONFIG_X86 | 803 | #if defined(CONFIG_X86) || defined(CONFIG_ARM) |
804 | load_bias = 0; | 804 | load_bias = 0; |
805 | #else | 805 | #else |
806 | load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); | 806 | load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); |
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 0fcd2640c23f..9eb134ea6eb2 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig | |||
@@ -1,9 +1,11 @@ | |||
1 | config CEPH_FS | 1 | config CEPH_FS |
2 | tristate "Ceph distributed file system (EXPERIMENTAL)" | 2 | tristate "Ceph distributed file system (EXPERIMENTAL)" |
3 | depends on INET && EXPERIMENTAL | 3 | depends on INET && EXPERIMENTAL |
4 | select CEPH_LIB | ||
4 | select LIBCRC32C | 5 | select LIBCRC32C |
5 | select CRYPTO_AES | 6 | select CRYPTO_AES |
6 | select CRYPTO | 7 | select CRYPTO |
8 | default n | ||
7 | help | 9 | help |
8 | Choose Y or M here to include support for mounting the | 10 | Choose Y or M here to include support for mounting the |
9 | experimental Ceph distributed file system. Ceph is an extremely | 11 | experimental Ceph distributed file system. Ceph is an extremely |
@@ -14,15 +16,3 @@ config CEPH_FS | |||
14 | 16 | ||
15 | If unsure, say N. | 17 | If unsure, say N. |
16 | 18 | ||
17 | config CEPH_FS_PRETTYDEBUG | ||
18 | bool "Include file:line in ceph debug output" | ||
19 | depends on CEPH_FS | ||
20 | default n | ||
21 | help | ||
22 | If you say Y here, debug output will include a filename and | ||
23 | line to aid debugging. This icnreases kernel size and slows | ||
24 | execution slightly when debug call sites are enabled (e.g., | ||
25 | via CONFIG_DYNAMIC_DEBUG). | ||
26 | |||
27 | If unsure, say N. | ||
28 | |||
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 278e1172600d..9e6c4f2e8ff1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile | |||
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o | |||
8 | 8 | ||
9 | ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ | 9 | ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ |
10 | export.o caps.o snap.o xattr.o \ | 10 | export.o caps.o snap.o xattr.o \ |
11 | messenger.o msgpool.o buffer.o pagelist.o \ | 11 | mds_client.o mdsmap.o strings.o ceph_frag.o \ |
12 | mds_client.o mdsmap.o \ | 12 | debugfs.o |
13 | mon_client.o \ | ||
14 | osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ | ||
15 | debugfs.o \ | ||
16 | auth.o auth_none.o \ | ||
17 | crypto.o armor.o \ | ||
18 | auth_x.o \ | ||
19 | ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o | ||
20 | 13 | ||
21 | else | 14 | else |
22 | #Otherwise we were called directly from the command | 15 | #Otherwise we were called directly from the command |
diff --git a/fs/ceph/README b/fs/ceph/README deleted file mode 100644 index 18352fab37c0..000000000000 --- a/fs/ceph/README +++ /dev/null | |||
@@ -1,20 +0,0 @@ | |||
1 | # | ||
2 | # The following files are shared by (and manually synchronized | ||
3 | # between) the Ceph userland and kernel client. | ||
4 | # | ||
5 | # userland kernel | ||
6 | src/include/ceph_fs.h fs/ceph/ceph_fs.h | ||
7 | src/include/ceph_fs.cc fs/ceph/ceph_fs.c | ||
8 | src/include/msgr.h fs/ceph/msgr.h | ||
9 | src/include/rados.h fs/ceph/rados.h | ||
10 | src/include/ceph_strings.cc fs/ceph/ceph_strings.c | ||
11 | src/include/ceph_frag.h fs/ceph/ceph_frag.h | ||
12 | src/include/ceph_frag.cc fs/ceph/ceph_frag.c | ||
13 | src/include/ceph_hash.h fs/ceph/ceph_hash.h | ||
14 | src/include/ceph_hash.cc fs/ceph/ceph_hash.c | ||
15 | src/crush/crush.c fs/ceph/crush/crush.c | ||
16 | src/crush/crush.h fs/ceph/crush/crush.h | ||
17 | src/crush/mapper.c fs/ceph/crush/mapper.c | ||
18 | src/crush/mapper.h fs/ceph/crush/mapper.h | ||
19 | src/crush/hash.h fs/ceph/crush/hash.h | ||
20 | src/crush/hash.c fs/ceph/crush/hash.c | ||
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index efbc604001c8..51bcc5ce3230 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -1,4 +1,4 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/backing-dev.h> | 3 | #include <linux/backing-dev.h> |
4 | #include <linux/fs.h> | 4 | #include <linux/fs.h> |
@@ -10,7 +10,8 @@ | |||
10 | #include <linux/task_io_accounting_ops.h> | 10 | #include <linux/task_io_accounting_ops.h> |
11 | 11 | ||
12 | #include "super.h" | 12 | #include "super.h" |
13 | #include "osd_client.h" | 13 | #include "mds_client.h" |
14 | #include <linux/ceph/osd_client.h> | ||
14 | 15 | ||
15 | /* | 16 | /* |
16 | * Ceph address space ops. | 17 | * Ceph address space ops. |
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page) | |||
193 | { | 194 | { |
194 | struct inode *inode = filp->f_dentry->d_inode; | 195 | struct inode *inode = filp->f_dentry->d_inode; |
195 | struct ceph_inode_info *ci = ceph_inode(inode); | 196 | struct ceph_inode_info *ci = ceph_inode(inode); |
196 | struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; | 197 | struct ceph_osd_client *osdc = |
198 | &ceph_inode_to_client(inode)->client->osdc; | ||
197 | int err = 0; | 199 | int err = 0; |
198 | u64 len = PAGE_CACHE_SIZE; | 200 | u64 len = PAGE_CACHE_SIZE; |
199 | 201 | ||
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, | |||
265 | { | 267 | { |
266 | struct inode *inode = file->f_dentry->d_inode; | 268 | struct inode *inode = file->f_dentry->d_inode; |
267 | struct ceph_inode_info *ci = ceph_inode(inode); | 269 | struct ceph_inode_info *ci = ceph_inode(inode); |
268 | struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; | 270 | struct ceph_osd_client *osdc = |
271 | &ceph_inode_to_client(inode)->client->osdc; | ||
269 | int rc = 0; | 272 | int rc = 0; |
270 | struct page **pages; | 273 | struct page **pages; |
271 | loff_t offset; | 274 | loff_t offset; |
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
365 | { | 368 | { |
366 | struct inode *inode; | 369 | struct inode *inode; |
367 | struct ceph_inode_info *ci; | 370 | struct ceph_inode_info *ci; |
368 | struct ceph_client *client; | 371 | struct ceph_fs_client *fsc; |
369 | struct ceph_osd_client *osdc; | 372 | struct ceph_osd_client *osdc; |
370 | loff_t page_off = page->index << PAGE_CACHE_SHIFT; | 373 | loff_t page_off = page->index << PAGE_CACHE_SHIFT; |
371 | int len = PAGE_CACHE_SIZE; | 374 | int len = PAGE_CACHE_SIZE; |
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
383 | } | 386 | } |
384 | inode = page->mapping->host; | 387 | inode = page->mapping->host; |
385 | ci = ceph_inode(inode); | 388 | ci = ceph_inode(inode); |
386 | client = ceph_inode_to_client(inode); | 389 | fsc = ceph_inode_to_client(inode); |
387 | osdc = &client->osdc; | 390 | osdc = &fsc->client->osdc; |
388 | 391 | ||
389 | /* verify this is a writeable snap context */ | 392 | /* verify this is a writeable snap context */ |
390 | snapc = (void *)page->private; | 393 | snapc = (void *)page->private; |
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
414 | dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", | 417 | dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", |
415 | inode, page, page->index, page_off, len, snapc); | 418 | inode, page, page->index, page_off, len, snapc); |
416 | 419 | ||
417 | writeback_stat = atomic_long_inc_return(&client->writeback_count); | 420 | writeback_stat = atomic_long_inc_return(&fsc->writeback_count); |
418 | if (writeback_stat > | 421 | if (writeback_stat > |
419 | CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) | 422 | CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) |
420 | set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); | 423 | set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); |
421 | 424 | ||
422 | set_page_writeback(page); | 425 | set_page_writeback(page); |
423 | err = ceph_osdc_writepages(osdc, ceph_vino(inode), | 426 | err = ceph_osdc_writepages(osdc, ceph_vino(inode), |
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
496 | struct address_space *mapping = inode->i_mapping; | 499 | struct address_space *mapping = inode->i_mapping; |
497 | __s32 rc = -EIO; | 500 | __s32 rc = -EIO; |
498 | u64 bytes = 0; | 501 | u64 bytes = 0; |
499 | struct ceph_client *client = ceph_inode_to_client(inode); | 502 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
500 | long writeback_stat; | 503 | long writeback_stat; |
501 | unsigned issued = ceph_caps_issued(ci); | 504 | unsigned issued = ceph_caps_issued(ci); |
502 | 505 | ||
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
529 | WARN_ON(!PageUptodate(page)); | 532 | WARN_ON(!PageUptodate(page)); |
530 | 533 | ||
531 | writeback_stat = | 534 | writeback_stat = |
532 | atomic_long_dec_return(&client->writeback_count); | 535 | atomic_long_dec_return(&fsc->writeback_count); |
533 | if (writeback_stat < | 536 | if (writeback_stat < |
534 | CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) | 537 | CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) |
535 | clear_bdi_congested(&client->backing_dev_info, | 538 | clear_bdi_congested(&fsc->backing_dev_info, |
536 | BLK_RW_ASYNC); | 539 | BLK_RW_ASYNC); |
537 | 540 | ||
538 | ceph_put_snap_context((void *)page->private); | 541 | ceph_put_snap_context((void *)page->private); |
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
569 | * mempool. we avoid the mempool if we can because req->r_num_pages | 572 | * mempool. we avoid the mempool if we can because req->r_num_pages |
570 | * may be less than the maximum write size. | 573 | * may be less than the maximum write size. |
571 | */ | 574 | */ |
572 | static void alloc_page_vec(struct ceph_client *client, | 575 | static void alloc_page_vec(struct ceph_fs_client *fsc, |
573 | struct ceph_osd_request *req) | 576 | struct ceph_osd_request *req) |
574 | { | 577 | { |
575 | req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, | 578 | req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, |
576 | GFP_NOFS); | 579 | GFP_NOFS); |
577 | if (!req->r_pages) { | 580 | if (!req->r_pages) { |
578 | req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); | 581 | req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); |
579 | req->r_pages_from_pool = 1; | 582 | req->r_pages_from_pool = 1; |
580 | WARN_ON(!req->r_pages); | 583 | WARN_ON(!req->r_pages); |
581 | } | 584 | } |
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
590 | struct inode *inode = mapping->host; | 593 | struct inode *inode = mapping->host; |
591 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 594 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
592 | struct ceph_inode_info *ci = ceph_inode(inode); | 595 | struct ceph_inode_info *ci = ceph_inode(inode); |
593 | struct ceph_client *client; | 596 | struct ceph_fs_client *fsc; |
594 | pgoff_t index, start, end; | 597 | pgoff_t index, start, end; |
595 | int range_whole = 0; | 598 | int range_whole = 0; |
596 | int should_loop = 1; | 599 | int should_loop = 1; |
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
617 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : | 620 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : |
618 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); | 621 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); |
619 | 622 | ||
620 | client = ceph_inode_to_client(inode); | 623 | fsc = ceph_inode_to_client(inode); |
621 | if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { | 624 | if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { |
622 | pr_warning("writepage_start %p on forced umount\n", inode); | 625 | pr_warning("writepage_start %p on forced umount\n", inode); |
623 | return -EIO; /* we're in a forced umount, don't write! */ | 626 | return -EIO; /* we're in a forced umount, don't write! */ |
624 | } | 627 | } |
625 | if (client->mount_args->wsize && client->mount_args->wsize < wsize) | 628 | if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) |
626 | wsize = client->mount_args->wsize; | 629 | wsize = fsc->mount_options->wsize; |
627 | if (wsize < PAGE_CACHE_SIZE) | 630 | if (wsize < PAGE_CACHE_SIZE) |
628 | wsize = PAGE_CACHE_SIZE; | 631 | wsize = PAGE_CACHE_SIZE; |
629 | max_pages_ever = wsize >> PAGE_CACHE_SHIFT; | 632 | max_pages_ever = wsize >> PAGE_CACHE_SHIFT; |
@@ -769,7 +772,7 @@ get_more_pages: | |||
769 | offset = (unsigned long long)page->index | 772 | offset = (unsigned long long)page->index |
770 | << PAGE_CACHE_SHIFT; | 773 | << PAGE_CACHE_SHIFT; |
771 | len = wsize; | 774 | len = wsize; |
772 | req = ceph_osdc_new_request(&client->osdc, | 775 | req = ceph_osdc_new_request(&fsc->client->osdc, |
773 | &ci->i_layout, | 776 | &ci->i_layout, |
774 | ceph_vino(inode), | 777 | ceph_vino(inode), |
775 | offset, &len, | 778 | offset, &len, |
@@ -782,7 +785,7 @@ get_more_pages: | |||
782 | &inode->i_mtime, true, 1); | 785 | &inode->i_mtime, true, 1); |
783 | max_pages = req->r_num_pages; | 786 | max_pages = req->r_num_pages; |
784 | 787 | ||
785 | alloc_page_vec(client, req); | 788 | alloc_page_vec(fsc, req); |
786 | req->r_callback = writepages_finish; | 789 | req->r_callback = writepages_finish; |
787 | req->r_inode = inode; | 790 | req->r_inode = inode; |
788 | } | 791 | } |
@@ -794,10 +797,10 @@ get_more_pages: | |||
794 | inode, page, page->index); | 797 | inode, page, page->index); |
795 | 798 | ||
796 | writeback_stat = | 799 | writeback_stat = |
797 | atomic_long_inc_return(&client->writeback_count); | 800 | atomic_long_inc_return(&fsc->writeback_count); |
798 | if (writeback_stat > CONGESTION_ON_THRESH( | 801 | if (writeback_stat > CONGESTION_ON_THRESH( |
799 | client->mount_args->congestion_kb)) { | 802 | fsc->mount_options->congestion_kb)) { |
800 | set_bdi_congested(&client->backing_dev_info, | 803 | set_bdi_congested(&fsc->backing_dev_info, |
801 | BLK_RW_ASYNC); | 804 | BLK_RW_ASYNC); |
802 | } | 805 | } |
803 | 806 | ||
@@ -846,7 +849,7 @@ get_more_pages: | |||
846 | op->payload_len = cpu_to_le32(len); | 849 | op->payload_len = cpu_to_le32(len); |
847 | req->r_request->hdr.data_len = cpu_to_le32(len); | 850 | req->r_request->hdr.data_len = cpu_to_le32(len); |
848 | 851 | ||
849 | ceph_osdc_start_request(&client->osdc, req, true); | 852 | ceph_osdc_start_request(&fsc->client->osdc, req, true); |
850 | req = NULL; | 853 | req = NULL; |
851 | 854 | ||
852 | /* continue? */ | 855 | /* continue? */ |
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file, | |||
915 | { | 918 | { |
916 | struct inode *inode = file->f_dentry->d_inode; | 919 | struct inode *inode = file->f_dentry->d_inode; |
917 | struct ceph_inode_info *ci = ceph_inode(inode); | 920 | struct ceph_inode_info *ci = ceph_inode(inode); |
918 | struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; | 921 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
919 | loff_t page_off = pos & PAGE_CACHE_MASK; | 922 | loff_t page_off = pos & PAGE_CACHE_MASK; |
920 | int pos_in_page = pos & ~PAGE_CACHE_MASK; | 923 | int pos_in_page = pos & ~PAGE_CACHE_MASK; |
921 | int end_in_page = pos_in_page + len; | 924 | int end_in_page = pos_in_page + len; |
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | |||
1053 | struct page *page, void *fsdata) | 1056 | struct page *page, void *fsdata) |
1054 | { | 1057 | { |
1055 | struct inode *inode = file->f_dentry->d_inode; | 1058 | struct inode *inode = file->f_dentry->d_inode; |
1056 | struct ceph_client *client = ceph_inode_to_client(inode); | 1059 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
1057 | struct ceph_mds_client *mdsc = &client->mdsc; | 1060 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1058 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 1061 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
1059 | int check_cap = 0; | 1062 | int check_cap = 0; |
1060 | 1063 | ||
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1123 | { | 1126 | { |
1124 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1127 | struct inode *inode = vma->vm_file->f_dentry->d_inode; |
1125 | struct page *page = vmf->page; | 1128 | struct page *page = vmf->page; |
1126 | struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; | 1129 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
1127 | loff_t off = page->index << PAGE_CACHE_SHIFT; | 1130 | loff_t off = page->index << PAGE_CACHE_SHIFT; |
1128 | loff_t size, len; | 1131 | loff_t size, len; |
1129 | int ret; | 1132 | int ret; |
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c deleted file mode 100644 index eb2a666b0be7..000000000000 --- a/fs/ceph/armor.c +++ /dev/null | |||
@@ -1,103 +0,0 @@ | |||
1 | |||
2 | #include <linux/errno.h> | ||
3 | |||
4 | int ceph_armor(char *dst, const char *src, const char *end); | ||
5 | int ceph_unarmor(char *dst, const char *src, const char *end); | ||
6 | |||
7 | /* | ||
8 | * base64 encode/decode. | ||
9 | */ | ||
10 | |||
11 | static const char *pem_key = | ||
12 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; | ||
13 | |||
14 | static int encode_bits(int c) | ||
15 | { | ||
16 | return pem_key[c]; | ||
17 | } | ||
18 | |||
19 | static int decode_bits(char c) | ||
20 | { | ||
21 | if (c >= 'A' && c <= 'Z') | ||
22 | return c - 'A'; | ||
23 | if (c >= 'a' && c <= 'z') | ||
24 | return c - 'a' + 26; | ||
25 | if (c >= '0' && c <= '9') | ||
26 | return c - '0' + 52; | ||
27 | if (c == '+') | ||
28 | return 62; | ||
29 | if (c == '/') | ||
30 | return 63; | ||
31 | if (c == '=') | ||
32 | return 0; /* just non-negative, please */ | ||
33 | return -EINVAL; | ||
34 | } | ||
35 | |||
36 | int ceph_armor(char *dst, const char *src, const char *end) | ||
37 | { | ||
38 | int olen = 0; | ||
39 | int line = 0; | ||
40 | |||
41 | while (src < end) { | ||
42 | unsigned char a, b, c; | ||
43 | |||
44 | a = *src++; | ||
45 | *dst++ = encode_bits(a >> 2); | ||
46 | if (src < end) { | ||
47 | b = *src++; | ||
48 | *dst++ = encode_bits(((a & 3) << 4) | (b >> 4)); | ||
49 | if (src < end) { | ||
50 | c = *src++; | ||
51 | *dst++ = encode_bits(((b & 15) << 2) | | ||
52 | (c >> 6)); | ||
53 | *dst++ = encode_bits(c & 63); | ||
54 | } else { | ||
55 | *dst++ = encode_bits((b & 15) << 2); | ||
56 | *dst++ = '='; | ||
57 | } | ||
58 | } else { | ||
59 | *dst++ = encode_bits(((a & 3) << 4)); | ||
60 | *dst++ = '='; | ||
61 | *dst++ = '='; | ||
62 | } | ||
63 | olen += 4; | ||
64 | line += 4; | ||
65 | if (line == 64) { | ||
66 | line = 0; | ||
67 | *(dst++) = '\n'; | ||
68 | olen++; | ||
69 | } | ||
70 | } | ||
71 | return olen; | ||
72 | } | ||
73 | |||
74 | int ceph_unarmor(char *dst, const char *src, const char *end) | ||
75 | { | ||
76 | int olen = 0; | ||
77 | |||
78 | while (src < end) { | ||
79 | int a, b, c, d; | ||
80 | |||
81 | if (src < end && src[0] == '\n') | ||
82 | src++; | ||
83 | if (src + 4 > end) | ||
84 | return -EINVAL; | ||
85 | a = decode_bits(src[0]); | ||
86 | b = decode_bits(src[1]); | ||
87 | c = decode_bits(src[2]); | ||
88 | d = decode_bits(src[3]); | ||
89 | if (a < 0 || b < 0 || c < 0 || d < 0) | ||
90 | return -EINVAL; | ||
91 | |||
92 | *dst++ = (a << 2) | (b >> 4); | ||
93 | if (src[2] == '=') | ||
94 | return olen + 1; | ||
95 | *dst++ = ((b & 15) << 4) | (c >> 2); | ||
96 | if (src[3] == '=') | ||
97 | return olen + 2; | ||
98 | *dst++ = ((c & 3) << 6) | d; | ||
99 | olen += 3; | ||
100 | src += 4; | ||
101 | } | ||
102 | return olen; | ||
103 | } | ||
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c deleted file mode 100644 index 6d2e30600627..000000000000 --- a/fs/ceph/auth.c +++ /dev/null | |||
@@ -1,259 +0,0 @@ | |||
1 | #include "ceph_debug.h" | ||
2 | |||
3 | #include <linux/module.h> | ||
4 | #include <linux/err.h> | ||
5 | #include <linux/slab.h> | ||
6 | |||
7 | #include "types.h" | ||
8 | #include "auth_none.h" | ||
9 | #include "auth_x.h" | ||
10 | #include "decode.h" | ||
11 | #include "super.h" | ||
12 | |||
13 | #include "messenger.h" | ||
14 | |||
15 | /* | ||
16 | * get protocol handler | ||
17 | */ | ||
18 | static u32 supported_protocols[] = { | ||
19 | CEPH_AUTH_NONE, | ||
20 | CEPH_AUTH_CEPHX | ||
21 | }; | ||
22 | |||
23 | static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) | ||
24 | { | ||
25 | switch (protocol) { | ||
26 | case CEPH_AUTH_NONE: | ||
27 | return ceph_auth_none_init(ac); | ||
28 | case CEPH_AUTH_CEPHX: | ||
29 | return ceph_x_init(ac); | ||
30 | default: | ||
31 | return -ENOENT; | ||
32 | } | ||
33 | } | ||
34 | |||
35 | /* | ||
36 | * setup, teardown. | ||
37 | */ | ||
38 | struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret) | ||
39 | { | ||
40 | struct ceph_auth_client *ac; | ||
41 | int ret; | ||
42 | |||
43 | dout("auth_init name '%s' secret '%s'\n", name, secret); | ||
44 | |||
45 | ret = -ENOMEM; | ||
46 | ac = kzalloc(sizeof(*ac), GFP_NOFS); | ||
47 | if (!ac) | ||
48 | goto out; | ||
49 | |||
50 | ac->negotiating = true; | ||
51 | if (name) | ||
52 | ac->name = name; | ||
53 | else | ||
54 | ac->name = CEPH_AUTH_NAME_DEFAULT; | ||
55 | dout("auth_init name %s secret %s\n", ac->name, secret); | ||
56 | ac->secret = secret; | ||
57 | return ac; | ||
58 | |||
59 | out: | ||
60 | return ERR_PTR(ret); | ||
61 | } | ||
62 | |||
63 | void ceph_auth_destroy(struct ceph_auth_client *ac) | ||
64 | { | ||
65 | dout("auth_destroy %p\n", ac); | ||
66 | if (ac->ops) | ||
67 | ac->ops->destroy(ac); | ||
68 | kfree(ac); | ||
69 | } | ||
70 | |||
71 | /* | ||
72 | * Reset occurs when reconnecting to the monitor. | ||
73 | */ | ||
74 | void ceph_auth_reset(struct ceph_auth_client *ac) | ||
75 | { | ||
76 | dout("auth_reset %p\n", ac); | ||
77 | if (ac->ops && !ac->negotiating) | ||
78 | ac->ops->reset(ac); | ||
79 | ac->negotiating = true; | ||
80 | } | ||
81 | |||
82 | int ceph_entity_name_encode(const char *name, void **p, void *end) | ||
83 | { | ||
84 | int len = strlen(name); | ||
85 | |||
86 | if (*p + 2*sizeof(u32) + len > end) | ||
87 | return -ERANGE; | ||
88 | ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT); | ||
89 | ceph_encode_32(p, len); | ||
90 | ceph_encode_copy(p, name, len); | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Initiate protocol negotiation with monitor. Include entity name | ||
96 | * and list supported protocols. | ||
97 | */ | ||
98 | int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) | ||
99 | { | ||
100 | struct ceph_mon_request_header *monhdr = buf; | ||
101 | void *p = monhdr + 1, *end = buf + len, *lenp; | ||
102 | int i, num; | ||
103 | int ret; | ||
104 | |||
105 | dout("auth_build_hello\n"); | ||
106 | monhdr->have_version = 0; | ||
107 | monhdr->session_mon = cpu_to_le16(-1); | ||
108 | monhdr->session_mon_tid = 0; | ||
109 | |||
110 | ceph_encode_32(&p, 0); /* no protocol, yet */ | ||
111 | |||
112 | lenp = p; | ||
113 | p += sizeof(u32); | ||
114 | |||
115 | ceph_decode_need(&p, end, 1 + sizeof(u32), bad); | ||
116 | ceph_encode_8(&p, 1); | ||
117 | num = ARRAY_SIZE(supported_protocols); | ||
118 | ceph_encode_32(&p, num); | ||
119 | ceph_decode_need(&p, end, num * sizeof(u32), bad); | ||
120 | for (i = 0; i < num; i++) | ||
121 | ceph_encode_32(&p, supported_protocols[i]); | ||
122 | |||
123 | ret = ceph_entity_name_encode(ac->name, &p, end); | ||
124 | if (ret < 0) | ||
125 | return ret; | ||
126 | ceph_decode_need(&p, end, sizeof(u64), bad); | ||
127 | ceph_encode_64(&p, ac->global_id); | ||
128 | |||
129 | ceph_encode_32(&lenp, p - lenp - sizeof(u32)); | ||
130 | return p - buf; | ||
131 | |||
132 | bad: | ||
133 | return -ERANGE; | ||
134 | } | ||
135 | |||
136 | static int ceph_build_auth_request(struct ceph_auth_client *ac, | ||
137 | void *msg_buf, size_t msg_len) | ||
138 | { | ||
139 | struct ceph_mon_request_header *monhdr = msg_buf; | ||
140 | void *p = monhdr + 1; | ||
141 | void *end = msg_buf + msg_len; | ||
142 | int ret; | ||
143 | |||
144 | monhdr->have_version = 0; | ||
145 | monhdr->session_mon = cpu_to_le16(-1); | ||
146 | monhdr->session_mon_tid = 0; | ||
147 | |||
148 | ceph_encode_32(&p, ac->protocol); | ||
149 | |||
150 | ret = ac->ops->build_request(ac, p + sizeof(u32), end); | ||
151 | if (ret < 0) { | ||
152 | pr_err("error %d building auth method %s request\n", ret, | ||
153 | ac->ops->name); | ||
154 | return ret; | ||
155 | } | ||
156 | dout(" built request %d bytes\n", ret); | ||
157 | ceph_encode_32(&p, ret); | ||
158 | return p + ret - msg_buf; | ||
159 | } | ||
160 | |||
161 | /* | ||
162 | * Handle auth message from monitor. | ||
163 | */ | ||
164 | int ceph_handle_auth_reply(struct ceph_auth_client *ac, | ||
165 | void *buf, size_t len, | ||
166 | void *reply_buf, size_t reply_len) | ||
167 | { | ||
168 | void *p = buf; | ||
169 | void *end = buf + len; | ||
170 | int protocol; | ||
171 | s32 result; | ||
172 | u64 global_id; | ||
173 | void *payload, *payload_end; | ||
174 | int payload_len; | ||
175 | char *result_msg; | ||
176 | int result_msg_len; | ||
177 | int ret = -EINVAL; | ||
178 | |||
179 | dout("handle_auth_reply %p %p\n", p, end); | ||
180 | ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); | ||
181 | protocol = ceph_decode_32(&p); | ||
182 | result = ceph_decode_32(&p); | ||
183 | global_id = ceph_decode_64(&p); | ||
184 | payload_len = ceph_decode_32(&p); | ||
185 | payload = p; | ||
186 | p += payload_len; | ||
187 | ceph_decode_need(&p, end, sizeof(u32), bad); | ||
188 | result_msg_len = ceph_decode_32(&p); | ||
189 | result_msg = p; | ||
190 | p += result_msg_len; | ||
191 | if (p != end) | ||
192 | goto bad; | ||
193 | |||
194 | dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len, | ||
195 | result_msg, global_id, payload_len); | ||
196 | |||
197 | payload_end = payload + payload_len; | ||
198 | |||
199 | if (global_id && ac->global_id != global_id) { | ||
200 | dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); | ||
201 | ac->global_id = global_id; | ||
202 | } | ||
203 | |||
204 | if (ac->negotiating) { | ||
205 | /* server does not support our protocols? */ | ||
206 | if (!protocol && result < 0) { | ||
207 | ret = result; | ||
208 | goto out; | ||
209 | } | ||
210 | /* set up (new) protocol handler? */ | ||
211 | if (ac->protocol && ac->protocol != protocol) { | ||
212 | ac->ops->destroy(ac); | ||
213 | ac->protocol = 0; | ||
214 | ac->ops = NULL; | ||
215 | } | ||
216 | if (ac->protocol != protocol) { | ||
217 | ret = ceph_auth_init_protocol(ac, protocol); | ||
218 | if (ret) { | ||
219 | pr_err("error %d on auth protocol %d init\n", | ||
220 | ret, protocol); | ||
221 | goto out; | ||
222 | } | ||
223 | } | ||
224 | |||
225 | ac->negotiating = false; | ||
226 | } | ||
227 | |||
228 | ret = ac->ops->handle_reply(ac, result, payload, payload_end); | ||
229 | if (ret == -EAGAIN) { | ||
230 | return ceph_build_auth_request(ac, reply_buf, reply_len); | ||
231 | } else if (ret) { | ||
232 | pr_err("auth method '%s' error %d\n", ac->ops->name, ret); | ||
233 | return ret; | ||
234 | } | ||
235 | return 0; | ||
236 | |||
237 | bad: | ||
238 | pr_err("failed to decode auth msg\n"); | ||
239 | out: | ||
240 | return ret; | ||
241 | } | ||
242 | |||
243 | int ceph_build_auth(struct ceph_auth_client *ac, | ||
244 | void *msg_buf, size_t msg_len) | ||
245 | { | ||
246 | if (!ac->protocol) | ||
247 | return ceph_auth_build_hello(ac, msg_buf, msg_len); | ||
248 | BUG_ON(!ac->ops); | ||
249 | if (ac->ops->should_authenticate(ac)) | ||
250 | return ceph_build_auth_request(ac, msg_buf, msg_len); | ||
251 | return 0; | ||
252 | } | ||
253 | |||
254 | int ceph_auth_is_authenticated(struct ceph_auth_client *ac) | ||
255 | { | ||
256 | if (!ac->ops) | ||
257 | return 0; | ||
258 | return ac->ops->is_authenticated(ac); | ||
259 | } | ||
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h deleted file mode 100644 index d38a2fb4a137..000000000000 --- a/fs/ceph/auth.h +++ /dev/null | |||
@@ -1,92 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_AUTH_H | ||
2 | #define _FS_CEPH_AUTH_H | ||
3 | |||
4 | #include "types.h" | ||
5 | #include "buffer.h" | ||
6 | |||
7 | /* | ||
8 | * Abstract interface for communicating with the authenticate module. | ||
9 | * There is some handshake that takes place between us and the monitor | ||
10 | * to acquire the necessary keys. These are used to generate an | ||
11 | * 'authorizer' that we use when connecting to a service (mds, osd). | ||
12 | */ | ||
13 | |||
14 | struct ceph_auth_client; | ||
15 | struct ceph_authorizer; | ||
16 | |||
17 | struct ceph_auth_client_ops { | ||
18 | const char *name; | ||
19 | |||
20 | /* | ||
21 | * true if we are authenticated and can connect to | ||
22 | * services. | ||
23 | */ | ||
24 | int (*is_authenticated)(struct ceph_auth_client *ac); | ||
25 | |||
26 | /* | ||
27 | * true if we should (re)authenticate, e.g., when our tickets | ||
28 | * are getting old and crusty. | ||
29 | */ | ||
30 | int (*should_authenticate)(struct ceph_auth_client *ac); | ||
31 | |||
32 | /* | ||
33 | * build requests and process replies during monitor | ||
34 | * handshake. if handle_reply returns -EAGAIN, we build | ||
35 | * another request. | ||
36 | */ | ||
37 | int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); | ||
38 | int (*handle_reply)(struct ceph_auth_client *ac, int result, | ||
39 | void *buf, void *end); | ||
40 | |||
41 | /* | ||
42 | * Create authorizer for connecting to a service, and verify | ||
43 | * the response to authenticate the service. | ||
44 | */ | ||
45 | int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, | ||
46 | struct ceph_authorizer **a, | ||
47 | void **buf, size_t *len, | ||
48 | void **reply_buf, size_t *reply_len); | ||
49 | int (*verify_authorizer_reply)(struct ceph_auth_client *ac, | ||
50 | struct ceph_authorizer *a, size_t len); | ||
51 | void (*destroy_authorizer)(struct ceph_auth_client *ac, | ||
52 | struct ceph_authorizer *a); | ||
53 | void (*invalidate_authorizer)(struct ceph_auth_client *ac, | ||
54 | int peer_type); | ||
55 | |||
56 | /* reset when we (re)connect to a monitor */ | ||
57 | void (*reset)(struct ceph_auth_client *ac); | ||
58 | |||
59 | void (*destroy)(struct ceph_auth_client *ac); | ||
60 | }; | ||
61 | |||
62 | struct ceph_auth_client { | ||
63 | u32 protocol; /* CEPH_AUTH_* */ | ||
64 | void *private; /* for use by protocol implementation */ | ||
65 | const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */ | ||
66 | |||
67 | bool negotiating; /* true if negotiating protocol */ | ||
68 | const char *name; /* entity name */ | ||
69 | u64 global_id; /* our unique id in system */ | ||
70 | const char *secret; /* our secret key */ | ||
71 | unsigned want_keys; /* which services we want */ | ||
72 | }; | ||
73 | |||
74 | extern struct ceph_auth_client *ceph_auth_init(const char *name, | ||
75 | const char *secret); | ||
76 | extern void ceph_auth_destroy(struct ceph_auth_client *ac); | ||
77 | |||
78 | extern void ceph_auth_reset(struct ceph_auth_client *ac); | ||
79 | |||
80 | extern int ceph_auth_build_hello(struct ceph_auth_client *ac, | ||
81 | void *buf, size_t len); | ||
82 | extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, | ||
83 | void *buf, size_t len, | ||
84 | void *reply_buf, size_t reply_len); | ||
85 | extern int ceph_entity_name_encode(const char *name, void **p, void *end); | ||
86 | |||
87 | extern int ceph_build_auth(struct ceph_auth_client *ac, | ||
88 | void *msg_buf, size_t msg_len); | ||
89 | |||
90 | extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); | ||
91 | |||
92 | #endif | ||
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c deleted file mode 100644 index ad1dc21286c7..000000000000 --- a/fs/ceph/auth_none.c +++ /dev/null | |||
@@ -1,131 +0,0 @@ | |||
1 | |||
2 | #include "ceph_debug.h" | ||
3 | |||
4 | #include <linux/err.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/random.h> | ||
7 | #include <linux/slab.h> | ||
8 | |||
9 | #include "auth_none.h" | ||
10 | #include "auth.h" | ||
11 | #include "decode.h" | ||
12 | |||
13 | static void reset(struct ceph_auth_client *ac) | ||
14 | { | ||
15 | struct ceph_auth_none_info *xi = ac->private; | ||
16 | |||
17 | xi->starting = true; | ||
18 | xi->built_authorizer = false; | ||
19 | } | ||
20 | |||
21 | static void destroy(struct ceph_auth_client *ac) | ||
22 | { | ||
23 | kfree(ac->private); | ||
24 | ac->private = NULL; | ||
25 | } | ||
26 | |||
27 | static int is_authenticated(struct ceph_auth_client *ac) | ||
28 | { | ||
29 | struct ceph_auth_none_info *xi = ac->private; | ||
30 | |||
31 | return !xi->starting; | ||
32 | } | ||
33 | |||
34 | static int should_authenticate(struct ceph_auth_client *ac) | ||
35 | { | ||
36 | struct ceph_auth_none_info *xi = ac->private; | ||
37 | |||
38 | return xi->starting; | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * the generic auth code decode the global_id, and we carry no actual | ||
43 | * authenticate state, so nothing happens here. | ||
44 | */ | ||
45 | static int handle_reply(struct ceph_auth_client *ac, int result, | ||
46 | void *buf, void *end) | ||
47 | { | ||
48 | struct ceph_auth_none_info *xi = ac->private; | ||
49 | |||
50 | xi->starting = false; | ||
51 | return result; | ||
52 | } | ||
53 | |||
54 | /* | ||
55 | * build an 'authorizer' with our entity_name and global_id. we can | ||
56 | * reuse a single static copy since it is identical for all services | ||
57 | * we connect to. | ||
58 | */ | ||
59 | static int ceph_auth_none_create_authorizer( | ||
60 | struct ceph_auth_client *ac, int peer_type, | ||
61 | struct ceph_authorizer **a, | ||
62 | void **buf, size_t *len, | ||
63 | void **reply_buf, size_t *reply_len) | ||
64 | { | ||
65 | struct ceph_auth_none_info *ai = ac->private; | ||
66 | struct ceph_none_authorizer *au = &ai->au; | ||
67 | void *p, *end; | ||
68 | int ret; | ||
69 | |||
70 | if (!ai->built_authorizer) { | ||
71 | p = au->buf; | ||
72 | end = p + sizeof(au->buf); | ||
73 | ceph_encode_8(&p, 1); | ||
74 | ret = ceph_entity_name_encode(ac->name, &p, end - 8); | ||
75 | if (ret < 0) | ||
76 | goto bad; | ||
77 | ceph_decode_need(&p, end, sizeof(u64), bad2); | ||
78 | ceph_encode_64(&p, ac->global_id); | ||
79 | au->buf_len = p - (void *)au->buf; | ||
80 | ai->built_authorizer = true; | ||
81 | dout("built authorizer len %d\n", au->buf_len); | ||
82 | } | ||
83 | |||
84 | *a = (struct ceph_authorizer *)au; | ||
85 | *buf = au->buf; | ||
86 | *len = au->buf_len; | ||
87 | *reply_buf = au->reply_buf; | ||
88 | *reply_len = sizeof(au->reply_buf); | ||
89 | return 0; | ||
90 | |||
91 | bad2: | ||
92 | ret = -ERANGE; | ||
93 | bad: | ||
94 | return ret; | ||
95 | } | ||
96 | |||
/*
 * The authorizer handed out by create_authorizer lives inside the
 * private info structure, so there is nothing to release here.
 */
static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
					      struct ceph_authorizer *a)
{
}
102 | |||
103 | static const struct ceph_auth_client_ops ceph_auth_none_ops = { | ||
104 | .name = "none", | ||
105 | .reset = reset, | ||
106 | .destroy = destroy, | ||
107 | .is_authenticated = is_authenticated, | ||
108 | .should_authenticate = should_authenticate, | ||
109 | .handle_reply = handle_reply, | ||
110 | .create_authorizer = ceph_auth_none_create_authorizer, | ||
111 | .destroy_authorizer = ceph_auth_none_destroy_authorizer, | ||
112 | }; | ||
113 | |||
114 | int ceph_auth_none_init(struct ceph_auth_client *ac) | ||
115 | { | ||
116 | struct ceph_auth_none_info *xi; | ||
117 | |||
118 | dout("ceph_auth_none_init %p\n", ac); | ||
119 | xi = kzalloc(sizeof(*xi), GFP_NOFS); | ||
120 | if (!xi) | ||
121 | return -ENOMEM; | ||
122 | |||
123 | xi->starting = true; | ||
124 | xi->built_authorizer = false; | ||
125 | |||
126 | ac->protocol = CEPH_AUTH_NONE; | ||
127 | ac->private = xi; | ||
128 | ac->ops = &ceph_auth_none_ops; | ||
129 | return 0; | ||
130 | } | ||
131 | |||
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h deleted file mode 100644 index 8164df1a08be..000000000000 --- a/fs/ceph/auth_none.h +++ /dev/null | |||
@@ -1,30 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_AUTH_NONE_H | ||
2 | #define _FS_CEPH_AUTH_NONE_H | ||
3 | |||
4 | #include <linux/slab.h> | ||
5 | |||
6 | #include "auth.h" | ||
7 | |||
8 | /* | ||
9 | * null security mode. | ||
10 | * | ||
11 | * we use a single static authorizer that simply encodes our entity name | ||
12 | * and global id. | ||
13 | */ | ||
14 | |||
/*
 * Authorizer for the null security mode: an encoded blob plus its
 * length.  No reply payload is stored (reply_buf has zero size).
 */
struct ceph_none_authorizer {
	/* encoded authorizer */
	char buf[128];
	/* number of valid bytes in buf */
	int buf_len;
	/* zero-sized reply area */
	char reply_buf[0];
};
20 | |||
21 | struct ceph_auth_none_info { | ||
22 | bool starting; | ||
23 | bool built_authorizer; | ||
24 | struct ceph_none_authorizer au; /* we only need one; it's static */ | ||
25 | }; | ||
26 | |||
/* set up the "none" protocol on ac; returns 0 or a negative errno */
int ceph_auth_none_init(struct ceph_auth_client *ac);
28 | |||
29 | #endif | ||
30 | |||
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c deleted file mode 100644 index a2d002cbdec2..000000000000 --- a/fs/ceph/auth_x.c +++ /dev/null | |||
@@ -1,687 +0,0 @@ | |||
1 | |||
2 | #include "ceph_debug.h" | ||
3 | |||
4 | #include <linux/err.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/random.h> | ||
7 | #include <linux/slab.h> | ||
8 | |||
9 | #include "auth_x.h" | ||
10 | #include "auth_x_protocol.h" | ||
11 | #include "crypto.h" | ||
12 | #include "auth.h" | ||
13 | #include "decode.h" | ||
14 | |||
/* size of the scratch buffers used while decoding a ticket reply */
#define TEMP_TICKET_BUF_LEN	256

/* defined further down; needed by the authentication predicates */
static void ceph_x_validate_tickets(struct ceph_auth_client *ac,
				    int *pneed);
18 | |||
19 | static int ceph_x_is_authenticated(struct ceph_auth_client *ac) | ||
20 | { | ||
21 | struct ceph_x_info *xi = ac->private; | ||
22 | int need; | ||
23 | |||
24 | ceph_x_validate_tickets(ac, &need); | ||
25 | dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", | ||
26 | ac->want_keys, need, xi->have_keys); | ||
27 | return (ac->want_keys & xi->have_keys) == ac->want_keys; | ||
28 | } | ||
29 | |||
30 | static int ceph_x_should_authenticate(struct ceph_auth_client *ac) | ||
31 | { | ||
32 | struct ceph_x_info *xi = ac->private; | ||
33 | int need; | ||
34 | |||
35 | ceph_x_validate_tickets(ac, &need); | ||
36 | dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", | ||
37 | ac->want_keys, need, xi->have_keys); | ||
38 | return need != 0; | ||
39 | } | ||
40 | |||
41 | static int ceph_x_encrypt_buflen(int ilen) | ||
42 | { | ||
43 | return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + | ||
44 | sizeof(u32); | ||
45 | } | ||
46 | |||
47 | static int ceph_x_encrypt(struct ceph_crypto_key *secret, | ||
48 | void *ibuf, int ilen, void *obuf, size_t olen) | ||
49 | { | ||
50 | struct ceph_x_encrypt_header head = { | ||
51 | .struct_v = 1, | ||
52 | .magic = cpu_to_le64(CEPHX_ENC_MAGIC) | ||
53 | }; | ||
54 | size_t len = olen - sizeof(u32); | ||
55 | int ret; | ||
56 | |||
57 | ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, | ||
58 | &head, sizeof(head), ibuf, ilen); | ||
59 | if (ret) | ||
60 | return ret; | ||
61 | ceph_encode_32(&obuf, len); | ||
62 | return len + sizeof(u32); | ||
63 | } | ||
64 | |||
65 | static int ceph_x_decrypt(struct ceph_crypto_key *secret, | ||
66 | void **p, void *end, void *obuf, size_t olen) | ||
67 | { | ||
68 | struct ceph_x_encrypt_header head; | ||
69 | size_t head_len = sizeof(head); | ||
70 | int len, ret; | ||
71 | |||
72 | len = ceph_decode_32(p); | ||
73 | if (*p + len > end) | ||
74 | return -EINVAL; | ||
75 | |||
76 | dout("ceph_x_decrypt len %d\n", len); | ||
77 | ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen, | ||
78 | *p, len); | ||
79 | if (ret) | ||
80 | return ret; | ||
81 | if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) | ||
82 | return -EPERM; | ||
83 | *p += len; | ||
84 | return olen; | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * get existing (or insert new) ticket handler | ||
89 | */ | ||
90 | static struct ceph_x_ticket_handler * | ||
91 | get_ticket_handler(struct ceph_auth_client *ac, int service) | ||
92 | { | ||
93 | struct ceph_x_ticket_handler *th; | ||
94 | struct ceph_x_info *xi = ac->private; | ||
95 | struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; | ||
96 | |||
97 | while (*p) { | ||
98 | parent = *p; | ||
99 | th = rb_entry(parent, struct ceph_x_ticket_handler, node); | ||
100 | if (service < th->service) | ||
101 | p = &(*p)->rb_left; | ||
102 | else if (service > th->service) | ||
103 | p = &(*p)->rb_right; | ||
104 | else | ||
105 | return th; | ||
106 | } | ||
107 | |||
108 | /* add it */ | ||
109 | th = kzalloc(sizeof(*th), GFP_NOFS); | ||
110 | if (!th) | ||
111 | return ERR_PTR(-ENOMEM); | ||
112 | th->service = service; | ||
113 | rb_link_node(&th->node, parent, p); | ||
114 | rb_insert_color(&th->node, &xi->ticket_handlers); | ||
115 | return th; | ||
116 | } | ||
117 | |||
118 | static void remove_ticket_handler(struct ceph_auth_client *ac, | ||
119 | struct ceph_x_ticket_handler *th) | ||
120 | { | ||
121 | struct ceph_x_info *xi = ac->private; | ||
122 | |||
123 | dout("remove_ticket_handler %p %d\n", th, th->service); | ||
124 | rb_erase(&th->node, &xi->ticket_handlers); | ||
125 | ceph_crypto_key_destroy(&th->session_key); | ||
126 | if (th->ticket_blob) | ||
127 | ceph_buffer_put(th->ticket_blob); | ||
128 | kfree(th); | ||
129 | } | ||
130 | |||
131 | static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, | ||
132 | struct ceph_crypto_key *secret, | ||
133 | void *buf, void *end) | ||
134 | { | ||
135 | struct ceph_x_info *xi = ac->private; | ||
136 | int num; | ||
137 | void *p = buf; | ||
138 | int ret; | ||
139 | char *dbuf; | ||
140 | char *ticket_buf; | ||
141 | u8 reply_struct_v; | ||
142 | |||
143 | dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); | ||
144 | if (!dbuf) | ||
145 | return -ENOMEM; | ||
146 | |||
147 | ret = -ENOMEM; | ||
148 | ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); | ||
149 | if (!ticket_buf) | ||
150 | goto out_dbuf; | ||
151 | |||
152 | ceph_decode_need(&p, end, 1 + sizeof(u32), bad); | ||
153 | reply_struct_v = ceph_decode_8(&p); | ||
154 | if (reply_struct_v != 1) | ||
155 | goto bad; | ||
156 | num = ceph_decode_32(&p); | ||
157 | dout("%d tickets\n", num); | ||
158 | while (num--) { | ||
159 | int type; | ||
160 | u8 tkt_struct_v, blob_struct_v; | ||
161 | struct ceph_x_ticket_handler *th; | ||
162 | void *dp, *dend; | ||
163 | int dlen; | ||
164 | char is_enc; | ||
165 | struct timespec validity; | ||
166 | struct ceph_crypto_key old_key; | ||
167 | void *tp, *tpend; | ||
168 | struct ceph_timespec new_validity; | ||
169 | struct ceph_crypto_key new_session_key; | ||
170 | struct ceph_buffer *new_ticket_blob; | ||
171 | unsigned long new_expires, new_renew_after; | ||
172 | u64 new_secret_id; | ||
173 | |||
174 | ceph_decode_need(&p, end, sizeof(u32) + 1, bad); | ||
175 | |||
176 | type = ceph_decode_32(&p); | ||
177 | dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); | ||
178 | |||
179 | tkt_struct_v = ceph_decode_8(&p); | ||
180 | if (tkt_struct_v != 1) | ||
181 | goto bad; | ||
182 | |||
183 | th = get_ticket_handler(ac, type); | ||
184 | if (IS_ERR(th)) { | ||
185 | ret = PTR_ERR(th); | ||
186 | goto out; | ||
187 | } | ||
188 | |||
189 | /* blob for me */ | ||
190 | dlen = ceph_x_decrypt(secret, &p, end, dbuf, | ||
191 | TEMP_TICKET_BUF_LEN); | ||
192 | if (dlen <= 0) { | ||
193 | ret = dlen; | ||
194 | goto out; | ||
195 | } | ||
196 | dout(" decrypted %d bytes\n", dlen); | ||
197 | dend = dbuf + dlen; | ||
198 | dp = dbuf; | ||
199 | |||
200 | tkt_struct_v = ceph_decode_8(&dp); | ||
201 | if (tkt_struct_v != 1) | ||
202 | goto bad; | ||
203 | |||
204 | memcpy(&old_key, &th->session_key, sizeof(old_key)); | ||
205 | ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); | ||
206 | if (ret) | ||
207 | goto out; | ||
208 | |||
209 | ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); | ||
210 | ceph_decode_timespec(&validity, &new_validity); | ||
211 | new_expires = get_seconds() + validity.tv_sec; | ||
212 | new_renew_after = new_expires - (validity.tv_sec / 4); | ||
213 | dout(" expires=%lu renew_after=%lu\n", new_expires, | ||
214 | new_renew_after); | ||
215 | |||
216 | /* ticket blob for service */ | ||
217 | ceph_decode_8_safe(&p, end, is_enc, bad); | ||
218 | tp = ticket_buf; | ||
219 | if (is_enc) { | ||
220 | /* encrypted */ | ||
221 | dout(" encrypted ticket\n"); | ||
222 | dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf, | ||
223 | TEMP_TICKET_BUF_LEN); | ||
224 | if (dlen < 0) { | ||
225 | ret = dlen; | ||
226 | goto out; | ||
227 | } | ||
228 | dlen = ceph_decode_32(&tp); | ||
229 | } else { | ||
230 | /* unencrypted */ | ||
231 | ceph_decode_32_safe(&p, end, dlen, bad); | ||
232 | ceph_decode_need(&p, end, dlen, bad); | ||
233 | ceph_decode_copy(&p, ticket_buf, dlen); | ||
234 | } | ||
235 | tpend = tp + dlen; | ||
236 | dout(" ticket blob is %d bytes\n", dlen); | ||
237 | ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); | ||
238 | blob_struct_v = ceph_decode_8(&tp); | ||
239 | new_secret_id = ceph_decode_64(&tp); | ||
240 | ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); | ||
241 | if (ret) | ||
242 | goto out; | ||
243 | |||
244 | /* all is well, update our ticket */ | ||
245 | ceph_crypto_key_destroy(&th->session_key); | ||
246 | if (th->ticket_blob) | ||
247 | ceph_buffer_put(th->ticket_blob); | ||
248 | th->session_key = new_session_key; | ||
249 | th->ticket_blob = new_ticket_blob; | ||
250 | th->validity = new_validity; | ||
251 | th->secret_id = new_secret_id; | ||
252 | th->expires = new_expires; | ||
253 | th->renew_after = new_renew_after; | ||
254 | dout(" got ticket service %d (%s) secret_id %lld len %d\n", | ||
255 | type, ceph_entity_type_name(type), th->secret_id, | ||
256 | (int)th->ticket_blob->vec.iov_len); | ||
257 | xi->have_keys |= th->service; | ||
258 | } | ||
259 | |||
260 | ret = 0; | ||
261 | out: | ||
262 | kfree(ticket_buf); | ||
263 | out_dbuf: | ||
264 | kfree(dbuf); | ||
265 | return ret; | ||
266 | |||
267 | bad: | ||
268 | ret = -EINVAL; | ||
269 | goto out; | ||
270 | } | ||
271 | |||
272 | static int ceph_x_build_authorizer(struct ceph_auth_client *ac, | ||
273 | struct ceph_x_ticket_handler *th, | ||
274 | struct ceph_x_authorizer *au) | ||
275 | { | ||
276 | int maxlen; | ||
277 | struct ceph_x_authorize_a *msg_a; | ||
278 | struct ceph_x_authorize_b msg_b; | ||
279 | void *p, *end; | ||
280 | int ret; | ||
281 | int ticket_blob_len = | ||
282 | (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0); | ||
283 | |||
284 | dout("build_authorizer for %s %p\n", | ||
285 | ceph_entity_type_name(th->service), au); | ||
286 | |||
287 | maxlen = sizeof(*msg_a) + sizeof(msg_b) + | ||
288 | ceph_x_encrypt_buflen(ticket_blob_len); | ||
289 | dout(" need len %d\n", maxlen); | ||
290 | if (au->buf && au->buf->alloc_len < maxlen) { | ||
291 | ceph_buffer_put(au->buf); | ||
292 | au->buf = NULL; | ||
293 | } | ||
294 | if (!au->buf) { | ||
295 | au->buf = ceph_buffer_new(maxlen, GFP_NOFS); | ||
296 | if (!au->buf) | ||
297 | return -ENOMEM; | ||
298 | } | ||
299 | au->service = th->service; | ||
300 | |||
301 | msg_a = au->buf->vec.iov_base; | ||
302 | msg_a->struct_v = 1; | ||
303 | msg_a->global_id = cpu_to_le64(ac->global_id); | ||
304 | msg_a->service_id = cpu_to_le32(th->service); | ||
305 | msg_a->ticket_blob.struct_v = 1; | ||
306 | msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id); | ||
307 | msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len); | ||
308 | if (ticket_blob_len) { | ||
309 | memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base, | ||
310 | th->ticket_blob->vec.iov_len); | ||
311 | } | ||
312 | dout(" th %p secret_id %lld %lld\n", th, th->secret_id, | ||
313 | le64_to_cpu(msg_a->ticket_blob.secret_id)); | ||
314 | |||
315 | p = msg_a + 1; | ||
316 | p += ticket_blob_len; | ||
317 | end = au->buf->vec.iov_base + au->buf->vec.iov_len; | ||
318 | |||
319 | get_random_bytes(&au->nonce, sizeof(au->nonce)); | ||
320 | msg_b.struct_v = 1; | ||
321 | msg_b.nonce = cpu_to_le64(au->nonce); | ||
322 | ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b), | ||
323 | p, end - p); | ||
324 | if (ret < 0) | ||
325 | goto out_buf; | ||
326 | p += ret; | ||
327 | au->buf->vec.iov_len = p - au->buf->vec.iov_base; | ||
328 | dout(" built authorizer nonce %llx len %d\n", au->nonce, | ||
329 | (int)au->buf->vec.iov_len); | ||
330 | BUG_ON(au->buf->vec.iov_len > maxlen); | ||
331 | return 0; | ||
332 | |||
333 | out_buf: | ||
334 | ceph_buffer_put(au->buf); | ||
335 | au->buf = NULL; | ||
336 | return ret; | ||
337 | } | ||
338 | |||
339 | static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th, | ||
340 | void **p, void *end) | ||
341 | { | ||
342 | ceph_decode_need(p, end, 1 + sizeof(u64), bad); | ||
343 | ceph_encode_8(p, 1); | ||
344 | ceph_encode_64(p, th->secret_id); | ||
345 | if (th->ticket_blob) { | ||
346 | const char *buf = th->ticket_blob->vec.iov_base; | ||
347 | u32 len = th->ticket_blob->vec.iov_len; | ||
348 | |||
349 | ceph_encode_32_safe(p, end, len, bad); | ||
350 | ceph_encode_copy_safe(p, end, buf, len, bad); | ||
351 | } else { | ||
352 | ceph_encode_32_safe(p, end, 0, bad); | ||
353 | } | ||
354 | |||
355 | return 0; | ||
356 | bad: | ||
357 | return -ERANGE; | ||
358 | } | ||
359 | |||
360 | static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) | ||
361 | { | ||
362 | int want = ac->want_keys; | ||
363 | struct ceph_x_info *xi = ac->private; | ||
364 | int service; | ||
365 | |||
366 | *pneed = ac->want_keys & ~(xi->have_keys); | ||
367 | |||
368 | for (service = 1; service <= want; service <<= 1) { | ||
369 | struct ceph_x_ticket_handler *th; | ||
370 | |||
371 | if (!(ac->want_keys & service)) | ||
372 | continue; | ||
373 | |||
374 | if (*pneed & service) | ||
375 | continue; | ||
376 | |||
377 | th = get_ticket_handler(ac, service); | ||
378 | |||
379 | if (IS_ERR(th)) { | ||
380 | *pneed |= service; | ||
381 | continue; | ||
382 | } | ||
383 | |||
384 | if (get_seconds() >= th->renew_after) | ||
385 | *pneed |= service; | ||
386 | if (get_seconds() >= th->expires) | ||
387 | xi->have_keys &= ~service; | ||
388 | } | ||
389 | } | ||
390 | |||
391 | |||
392 | static int ceph_x_build_request(struct ceph_auth_client *ac, | ||
393 | void *buf, void *end) | ||
394 | { | ||
395 | struct ceph_x_info *xi = ac->private; | ||
396 | int need; | ||
397 | struct ceph_x_request_header *head = buf; | ||
398 | int ret; | ||
399 | struct ceph_x_ticket_handler *th = | ||
400 | get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); | ||
401 | |||
402 | if (IS_ERR(th)) | ||
403 | return PTR_ERR(th); | ||
404 | |||
405 | ceph_x_validate_tickets(ac, &need); | ||
406 | |||
407 | dout("build_request want %x have %x need %x\n", | ||
408 | ac->want_keys, xi->have_keys, need); | ||
409 | |||
410 | if (need & CEPH_ENTITY_TYPE_AUTH) { | ||
411 | struct ceph_x_authenticate *auth = (void *)(head + 1); | ||
412 | void *p = auth + 1; | ||
413 | struct ceph_x_challenge_blob tmp; | ||
414 | char tmp_enc[40]; | ||
415 | u64 *u; | ||
416 | |||
417 | if (p > end) | ||
418 | return -ERANGE; | ||
419 | |||
420 | dout(" get_auth_session_key\n"); | ||
421 | head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); | ||
422 | |||
423 | /* encrypt and hash */ | ||
424 | get_random_bytes(&auth->client_challenge, sizeof(u64)); | ||
425 | tmp.client_challenge = auth->client_challenge; | ||
426 | tmp.server_challenge = cpu_to_le64(xi->server_challenge); | ||
427 | ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp), | ||
428 | tmp_enc, sizeof(tmp_enc)); | ||
429 | if (ret < 0) | ||
430 | return ret; | ||
431 | |||
432 | auth->struct_v = 1; | ||
433 | auth->key = 0; | ||
434 | for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) | ||
435 | auth->key ^= *(__le64 *)u; | ||
436 | dout(" server_challenge %llx client_challenge %llx key %llx\n", | ||
437 | xi->server_challenge, le64_to_cpu(auth->client_challenge), | ||
438 | le64_to_cpu(auth->key)); | ||
439 | |||
440 | /* now encode the old ticket if exists */ | ||
441 | ret = ceph_x_encode_ticket(th, &p, end); | ||
442 | if (ret < 0) | ||
443 | return ret; | ||
444 | |||
445 | return p - buf; | ||
446 | } | ||
447 | |||
448 | if (need) { | ||
449 | void *p = head + 1; | ||
450 | struct ceph_x_service_ticket_request *req; | ||
451 | |||
452 | if (p > end) | ||
453 | return -ERANGE; | ||
454 | head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); | ||
455 | |||
456 | ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); | ||
457 | if (ret) | ||
458 | return ret; | ||
459 | ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, | ||
460 | xi->auth_authorizer.buf->vec.iov_len); | ||
461 | |||
462 | req = p; | ||
463 | req->keys = cpu_to_le32(need); | ||
464 | p += sizeof(*req); | ||
465 | return p - buf; | ||
466 | } | ||
467 | |||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, | ||
472 | void *buf, void *end) | ||
473 | { | ||
474 | struct ceph_x_info *xi = ac->private; | ||
475 | struct ceph_x_reply_header *head = buf; | ||
476 | struct ceph_x_ticket_handler *th; | ||
477 | int len = end - buf; | ||
478 | int op; | ||
479 | int ret; | ||
480 | |||
481 | if (result) | ||
482 | return result; /* XXX hmm? */ | ||
483 | |||
484 | if (xi->starting) { | ||
485 | /* it's a hello */ | ||
486 | struct ceph_x_server_challenge *sc = buf; | ||
487 | |||
488 | if (len != sizeof(*sc)) | ||
489 | return -EINVAL; | ||
490 | xi->server_challenge = le64_to_cpu(sc->server_challenge); | ||
491 | dout("handle_reply got server challenge %llx\n", | ||
492 | xi->server_challenge); | ||
493 | xi->starting = false; | ||
494 | xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH; | ||
495 | return -EAGAIN; | ||
496 | } | ||
497 | |||
498 | op = le16_to_cpu(head->op); | ||
499 | result = le32_to_cpu(head->result); | ||
500 | dout("handle_reply op %d result %d\n", op, result); | ||
501 | switch (op) { | ||
502 | case CEPHX_GET_AUTH_SESSION_KEY: | ||
503 | /* verify auth key */ | ||
504 | ret = ceph_x_proc_ticket_reply(ac, &xi->secret, | ||
505 | buf + sizeof(*head), end); | ||
506 | break; | ||
507 | |||
508 | case CEPHX_GET_PRINCIPAL_SESSION_KEY: | ||
509 | th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); | ||
510 | if (IS_ERR(th)) | ||
511 | return PTR_ERR(th); | ||
512 | ret = ceph_x_proc_ticket_reply(ac, &th->session_key, | ||
513 | buf + sizeof(*head), end); | ||
514 | break; | ||
515 | |||
516 | default: | ||
517 | return -EINVAL; | ||
518 | } | ||
519 | if (ret) | ||
520 | return ret; | ||
521 | if (ac->want_keys == xi->have_keys) | ||
522 | return 0; | ||
523 | return -EAGAIN; | ||
524 | } | ||
525 | |||
526 | static int ceph_x_create_authorizer( | ||
527 | struct ceph_auth_client *ac, int peer_type, | ||
528 | struct ceph_authorizer **a, | ||
529 | void **buf, size_t *len, | ||
530 | void **reply_buf, size_t *reply_len) | ||
531 | { | ||
532 | struct ceph_x_authorizer *au; | ||
533 | struct ceph_x_ticket_handler *th; | ||
534 | int ret; | ||
535 | |||
536 | th = get_ticket_handler(ac, peer_type); | ||
537 | if (IS_ERR(th)) | ||
538 | return PTR_ERR(th); | ||
539 | |||
540 | au = kzalloc(sizeof(*au), GFP_NOFS); | ||
541 | if (!au) | ||
542 | return -ENOMEM; | ||
543 | |||
544 | ret = ceph_x_build_authorizer(ac, th, au); | ||
545 | if (ret) { | ||
546 | kfree(au); | ||
547 | return ret; | ||
548 | } | ||
549 | |||
550 | *a = (struct ceph_authorizer *)au; | ||
551 | *buf = au->buf->vec.iov_base; | ||
552 | *len = au->buf->vec.iov_len; | ||
553 | *reply_buf = au->reply_buf; | ||
554 | *reply_len = sizeof(au->reply_buf); | ||
555 | return 0; | ||
556 | } | ||
557 | |||
558 | static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, | ||
559 | struct ceph_authorizer *a, size_t len) | ||
560 | { | ||
561 | struct ceph_x_authorizer *au = (void *)a; | ||
562 | struct ceph_x_ticket_handler *th; | ||
563 | int ret = 0; | ||
564 | struct ceph_x_authorize_reply reply; | ||
565 | void *p = au->reply_buf; | ||
566 | void *end = p + sizeof(au->reply_buf); | ||
567 | |||
568 | th = get_ticket_handler(ac, au->service); | ||
569 | if (IS_ERR(th)) | ||
570 | return PTR_ERR(th); | ||
571 | ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); | ||
572 | if (ret < 0) | ||
573 | return ret; | ||
574 | if (ret != sizeof(reply)) | ||
575 | return -EPERM; | ||
576 | |||
577 | if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) | ||
578 | ret = -EPERM; | ||
579 | else | ||
580 | ret = 0; | ||
581 | dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", | ||
582 | au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); | ||
583 | return ret; | ||
584 | } | ||
585 | |||
586 | static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, | ||
587 | struct ceph_authorizer *a) | ||
588 | { | ||
589 | struct ceph_x_authorizer *au = (void *)a; | ||
590 | |||
591 | ceph_buffer_put(au->buf); | ||
592 | kfree(au); | ||
593 | } | ||
594 | |||
595 | |||
596 | static void ceph_x_reset(struct ceph_auth_client *ac) | ||
597 | { | ||
598 | struct ceph_x_info *xi = ac->private; | ||
599 | |||
600 | dout("reset\n"); | ||
601 | xi->starting = true; | ||
602 | xi->server_challenge = 0; | ||
603 | } | ||
604 | |||
605 | static void ceph_x_destroy(struct ceph_auth_client *ac) | ||
606 | { | ||
607 | struct ceph_x_info *xi = ac->private; | ||
608 | struct rb_node *p; | ||
609 | |||
610 | dout("ceph_x_destroy %p\n", ac); | ||
611 | ceph_crypto_key_destroy(&xi->secret); | ||
612 | |||
613 | while ((p = rb_first(&xi->ticket_handlers)) != NULL) { | ||
614 | struct ceph_x_ticket_handler *th = | ||
615 | rb_entry(p, struct ceph_x_ticket_handler, node); | ||
616 | remove_ticket_handler(ac, th); | ||
617 | } | ||
618 | |||
619 | if (xi->auth_authorizer.buf) | ||
620 | ceph_buffer_put(xi->auth_authorizer.buf); | ||
621 | |||
622 | kfree(ac->private); | ||
623 | ac->private = NULL; | ||
624 | } | ||
625 | |||
/*
 * Throw away the ticket handler for @peer_type, if one can be
 * obtained; an allocation failure in the lookup is ignored.
 */
static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
					 int peer_type)
{
	struct ceph_x_ticket_handler *handler;

	handler = get_ticket_handler(ac, peer_type);
	if (IS_ERR(handler))
		return;
	remove_ticket_handler(ac, handler);
}
635 | |||
636 | |||
637 | static const struct ceph_auth_client_ops ceph_x_ops = { | ||
638 | .name = "x", | ||
639 | .is_authenticated = ceph_x_is_authenticated, | ||
640 | .should_authenticate = ceph_x_should_authenticate, | ||
641 | .build_request = ceph_x_build_request, | ||
642 | .handle_reply = ceph_x_handle_reply, | ||
643 | .create_authorizer = ceph_x_create_authorizer, | ||
644 | .verify_authorizer_reply = ceph_x_verify_authorizer_reply, | ||
645 | .destroy_authorizer = ceph_x_destroy_authorizer, | ||
646 | .invalidate_authorizer = ceph_x_invalidate_authorizer, | ||
647 | .reset = ceph_x_reset, | ||
648 | .destroy = ceph_x_destroy, | ||
649 | }; | ||
650 | |||
651 | |||
652 | int ceph_x_init(struct ceph_auth_client *ac) | ||
653 | { | ||
654 | struct ceph_x_info *xi; | ||
655 | int ret; | ||
656 | |||
657 | dout("ceph_x_init %p\n", ac); | ||
658 | ret = -ENOMEM; | ||
659 | xi = kzalloc(sizeof(*xi), GFP_NOFS); | ||
660 | if (!xi) | ||
661 | goto out; | ||
662 | |||
663 | ret = -EINVAL; | ||
664 | if (!ac->secret) { | ||
665 | pr_err("no secret set (for auth_x protocol)\n"); | ||
666 | goto out_nomem; | ||
667 | } | ||
668 | |||
669 | ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); | ||
670 | if (ret) | ||
671 | goto out_nomem; | ||
672 | |||
673 | xi->starting = true; | ||
674 | xi->ticket_handlers = RB_ROOT; | ||
675 | |||
676 | ac->protocol = CEPH_AUTH_CEPHX; | ||
677 | ac->private = xi; | ||
678 | ac->ops = &ceph_x_ops; | ||
679 | return 0; | ||
680 | |||
681 | out_nomem: | ||
682 | kfree(xi); | ||
683 | out: | ||
684 | return ret; | ||
685 | } | ||
686 | |||
687 | |||
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h deleted file mode 100644 index ff6f8180e681..000000000000 --- a/fs/ceph/auth_x.h +++ /dev/null | |||
@@ -1,49 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_AUTH_X_H | ||
2 | #define _FS_CEPH_AUTH_X_H | ||
3 | |||
4 | #include <linux/rbtree.h> | ||
5 | |||
6 | #include "crypto.h" | ||
7 | #include "auth.h" | ||
8 | #include "auth_x_protocol.h" | ||
9 | |||
10 | /* | ||
11 | * Handle ticket for a single service. | ||
12 | */ | ||
13 | struct ceph_x_ticket_handler { | ||
14 | struct rb_node node; | ||
15 | unsigned service; | ||
16 | |||
17 | struct ceph_crypto_key session_key; | ||
18 | struct ceph_timespec validity; | ||
19 | |||
20 | u64 secret_id; | ||
21 | struct ceph_buffer *ticket_blob; | ||
22 | |||
23 | unsigned long renew_after, expires; | ||
24 | }; | ||
25 | |||
26 | |||
27 | struct ceph_x_authorizer { | ||
28 | struct ceph_buffer *buf; | ||
29 | unsigned service; | ||
30 | u64 nonce; | ||
31 | char reply_buf[128]; /* big enough for encrypted blob */ | ||
32 | }; | ||
33 | |||
34 | struct ceph_x_info { | ||
35 | struct ceph_crypto_key secret; | ||
36 | |||
37 | bool starting; | ||
38 | u64 server_challenge; | ||
39 | |||
40 | unsigned have_keys; | ||
41 | struct rb_root ticket_handlers; | ||
42 | |||
43 | struct ceph_x_authorizer auth_authorizer; | ||
44 | }; | ||
45 | |||
46 | extern int ceph_x_init(struct ceph_auth_client *ac); | ||
47 | |||
48 | #endif | ||
49 | |||
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h deleted file mode 100644 index 671d30576c4f..000000000000 --- a/fs/ceph/auth_x_protocol.h +++ /dev/null | |||
@@ -1,90 +0,0 @@ | |||
1 | #ifndef __FS_CEPH_AUTH_X_PROTOCOL | ||
2 | #define __FS_CEPH_AUTH_X_PROTOCOL | ||
3 | |||
4 | #define CEPHX_GET_AUTH_SESSION_KEY 0x0100 | ||
5 | #define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200 | ||
6 | #define CEPHX_GET_ROTATING_KEY 0x0400 | ||
7 | |||
8 | /* common bits */ | ||
9 | struct ceph_x_ticket_blob { | ||
10 | __u8 struct_v; | ||
11 | __le64 secret_id; | ||
12 | __le32 blob_len; | ||
13 | char blob[]; | ||
14 | } __attribute__ ((packed)); | ||
15 | |||
16 | |||
17 | /* common request/reply headers */ | ||
18 | struct ceph_x_request_header { | ||
19 | __le16 op; | ||
20 | } __attribute__ ((packed)); | ||
21 | |||
22 | struct ceph_x_reply_header { | ||
23 | __le16 op; | ||
24 | __le32 result; | ||
25 | } __attribute__ ((packed)); | ||
26 | |||
27 | |||
28 | /* authenticate handshake */ | ||
29 | |||
30 | /* initial hello (no reply header) */ | ||
31 | struct ceph_x_server_challenge { | ||
32 | __u8 struct_v; | ||
33 | __le64 server_challenge; | ||
34 | } __attribute__ ((packed)); | ||
35 | |||
36 | struct ceph_x_authenticate { | ||
37 | __u8 struct_v; | ||
38 | __le64 client_challenge; | ||
39 | __le64 key; | ||
40 | /* ticket blob */ | ||
41 | } __attribute__ ((packed)); | ||
42 | |||
43 | struct ceph_x_service_ticket_request { | ||
44 | __u8 struct_v; | ||
45 | __le32 keys; | ||
46 | } __attribute__ ((packed)); | ||
47 | |||
48 | struct ceph_x_challenge_blob { | ||
49 | __le64 server_challenge; | ||
50 | __le64 client_challenge; | ||
51 | } __attribute__ ((packed)); | ||
52 | |||
53 | |||
54 | |||
55 | /* authorize handshake */ | ||
56 | |||
57 | /* | ||
58 | * The authorizer consists of two pieces: | ||
59 | * a - service id, ticket blob | ||
60 | * b - encrypted with session key | ||
61 | */ | ||
62 | struct ceph_x_authorize_a { | ||
63 | __u8 struct_v; | ||
64 | __le64 global_id; | ||
65 | __le32 service_id; | ||
66 | struct ceph_x_ticket_blob ticket_blob; | ||
67 | } __attribute__ ((packed)); | ||
68 | |||
69 | struct ceph_x_authorize_b { | ||
70 | __u8 struct_v; | ||
71 | __le64 nonce; | ||
72 | } __attribute__ ((packed)); | ||
73 | |||
74 | struct ceph_x_authorize_reply { | ||
75 | __u8 struct_v; | ||
76 | __le64 nonce_plus_one; | ||
77 | } __attribute__ ((packed)); | ||
78 | |||
79 | |||
80 | /* | ||
81 | * encyption bundle | ||
82 | */ | ||
83 | #define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull | ||
84 | |||
85 | struct ceph_x_encrypt_header { | ||
86 | __u8 struct_v; | ||
87 | __le64 magic; | ||
88 | } __attribute__ ((packed)); | ||
89 | |||
90 | #endif | ||
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c deleted file mode 100644 index cd39f17021de..000000000000 --- a/fs/ceph/buffer.c +++ /dev/null | |||
@@ -1,65 +0,0 @@ | |||
1 | |||
2 | #include "ceph_debug.h" | ||
3 | |||
4 | #include <linux/slab.h> | ||
5 | |||
6 | #include "buffer.h" | ||
7 | #include "decode.h" | ||
8 | |||
9 | struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) | ||
10 | { | ||
11 | struct ceph_buffer *b; | ||
12 | |||
13 | b = kmalloc(sizeof(*b), gfp); | ||
14 | if (!b) | ||
15 | return NULL; | ||
16 | |||
17 | b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); | ||
18 | if (b->vec.iov_base) { | ||
19 | b->is_vmalloc = false; | ||
20 | } else { | ||
21 | b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); | ||
22 | if (!b->vec.iov_base) { | ||
23 | kfree(b); | ||
24 | return NULL; | ||
25 | } | ||
26 | b->is_vmalloc = true; | ||
27 | } | ||
28 | |||
29 | kref_init(&b->kref); | ||
30 | b->alloc_len = len; | ||
31 | b->vec.iov_len = len; | ||
32 | dout("buffer_new %p\n", b); | ||
33 | return b; | ||
34 | } | ||
35 | |||
36 | void ceph_buffer_release(struct kref *kref) | ||
37 | { | ||
38 | struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); | ||
39 | |||
40 | dout("buffer_release %p\n", b); | ||
41 | if (b->vec.iov_base) { | ||
42 | if (b->is_vmalloc) | ||
43 | vfree(b->vec.iov_base); | ||
44 | else | ||
45 | kfree(b->vec.iov_base); | ||
46 | } | ||
47 | kfree(b); | ||
48 | } | ||
49 | |||
50 | int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) | ||
51 | { | ||
52 | size_t len; | ||
53 | |||
54 | ceph_decode_need(p, end, sizeof(u32), bad); | ||
55 | len = ceph_decode_32(p); | ||
56 | dout("decode_buffer len %d\n", (int)len); | ||
57 | ceph_decode_need(p, end, len, bad); | ||
58 | *b = ceph_buffer_new(len, GFP_NOFS); | ||
59 | if (!*b) | ||
60 | return -ENOMEM; | ||
61 | ceph_decode_copy(p, (*b)->vec.iov_base, len); | ||
62 | return 0; | ||
63 | bad: | ||
64 | return -EINVAL; | ||
65 | } | ||
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h deleted file mode 100644 index 58d19014068f..000000000000 --- a/fs/ceph/buffer.h +++ /dev/null | |||
@@ -1,39 +0,0 @@ | |||
1 | #ifndef __FS_CEPH_BUFFER_H | ||
2 | #define __FS_CEPH_BUFFER_H | ||
3 | |||
4 | #include <linux/kref.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/vmalloc.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/uio.h> | ||
9 | |||
10 | /* | ||
11 | * a simple reference counted buffer. | ||
12 | * | ||
13 | * use kmalloc for small sizes (<= one page), vmalloc for larger | ||
14 | * sizes. | ||
15 | */ | ||
16 | struct ceph_buffer { | ||
17 | struct kref kref; | ||
18 | struct kvec vec; | ||
19 | size_t alloc_len; | ||
20 | bool is_vmalloc; | ||
21 | }; | ||
22 | |||
23 | extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); | ||
24 | extern void ceph_buffer_release(struct kref *kref); | ||
25 | |||
26 | static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) | ||
27 | { | ||
28 | kref_get(&b->kref); | ||
29 | return b; | ||
30 | } | ||
31 | |||
32 | static inline void ceph_buffer_put(struct ceph_buffer *b) | ||
33 | { | ||
34 | kref_put(&b->kref, ceph_buffer_release); | ||
35 | } | ||
36 | |||
37 | extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); | ||
38 | |||
39 | #endif | ||
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 73c153092f72..98ab13e2b71d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -1,4 +1,4 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/fs.h> | 3 | #include <linux/fs.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
@@ -9,8 +9,9 @@ | |||
9 | #include <linux/writeback.h> | 9 | #include <linux/writeback.h> |
10 | 10 | ||
11 | #include "super.h" | 11 | #include "super.h" |
12 | #include "decode.h" | 12 | #include "mds_client.h" |
13 | #include "messenger.h" | 13 | #include <linux/ceph/decode.h> |
14 | #include <linux/ceph/messenger.h> | ||
14 | 15 | ||
15 | /* | 16 | /* |
16 | * Capability management | 17 | * Capability management |
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) | |||
287 | spin_unlock(&mdsc->caps_list_lock); | 288 | spin_unlock(&mdsc->caps_list_lock); |
288 | } | 289 | } |
289 | 290 | ||
290 | void ceph_reservation_status(struct ceph_client *client, | 291 | void ceph_reservation_status(struct ceph_fs_client *fsc, |
291 | int *total, int *avail, int *used, int *reserved, | 292 | int *total, int *avail, int *used, int *reserved, |
292 | int *min) | 293 | int *min) |
293 | { | 294 | { |
294 | struct ceph_mds_client *mdsc = &client->mdsc; | 295 | struct ceph_mds_client *mdsc = fsc->mdsc; |
295 | 296 | ||
296 | if (total) | 297 | if (total) |
297 | *total = mdsc->caps_total_count; | 298 | *total = mdsc->caps_total_count; |
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci, | |||
399 | static void __cap_set_timeouts(struct ceph_mds_client *mdsc, | 400 | static void __cap_set_timeouts(struct ceph_mds_client *mdsc, |
400 | struct ceph_inode_info *ci) | 401 | struct ceph_inode_info *ci) |
401 | { | 402 | { |
402 | struct ceph_mount_args *ma = mdsc->client->mount_args; | 403 | struct ceph_mount_options *ma = mdsc->fsc->mount_options; |
403 | 404 | ||
404 | ci->i_hold_caps_min = round_jiffies(jiffies + | 405 | ci->i_hold_caps_min = round_jiffies(jiffies + |
405 | ma->caps_wanted_delay_min * HZ); | 406 | ma->caps_wanted_delay_min * HZ); |
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode, | |||
515 | unsigned seq, unsigned mseq, u64 realmino, int flags, | 516 | unsigned seq, unsigned mseq, u64 realmino, int flags, |
516 | struct ceph_cap_reservation *caps_reservation) | 517 | struct ceph_cap_reservation *caps_reservation) |
517 | { | 518 | { |
518 | struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; | 519 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
519 | struct ceph_inode_info *ci = ceph_inode(inode); | 520 | struct ceph_inode_info *ci = ceph_inode(inode); |
520 | struct ceph_cap *new_cap = NULL; | 521 | struct ceph_cap *new_cap = NULL; |
521 | struct ceph_cap *cap; | 522 | struct ceph_cap *cap; |
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap) | |||
873 | struct ceph_mds_session *session = cap->session; | 874 | struct ceph_mds_session *session = cap->session; |
874 | struct ceph_inode_info *ci = cap->ci; | 875 | struct ceph_inode_info *ci = cap->ci; |
875 | struct ceph_mds_client *mdsc = | 876 | struct ceph_mds_client *mdsc = |
876 | &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; | 877 | ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; |
877 | int removed = 0; | 878 | int removed = 0; |
878 | 879 | ||
879 | dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); | 880 | dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); |
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci, | |||
1210 | int mds; | 1211 | int mds; |
1211 | struct ceph_cap_snap *capsnap; | 1212 | struct ceph_cap_snap *capsnap; |
1212 | u32 mseq; | 1213 | u32 mseq; |
1213 | struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; | 1214 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
1214 | struct ceph_mds_session *session = NULL; /* if session != NULL, we hold | 1215 | struct ceph_mds_session *session = NULL; /* if session != NULL, we hold |
1215 | session->s_mutex */ | 1216 | session->s_mutex */ |
1216 | u64 next_follows = 0; /* keep track of how far we've gotten through the | 1217 | u64 next_follows = 0; /* keep track of how far we've gotten through the |
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) | |||
1336 | void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | 1337 | void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) |
1337 | { | 1338 | { |
1338 | struct ceph_mds_client *mdsc = | 1339 | struct ceph_mds_client *mdsc = |
1339 | &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; | 1340 | ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; |
1340 | struct inode *inode = &ci->vfs_inode; | 1341 | struct inode *inode = &ci->vfs_inode; |
1341 | int was = ci->i_dirty_caps; | 1342 | int was = ci->i_dirty_caps; |
1342 | int dirty = 0; | 1343 | int dirty = 0; |
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
1378 | static int __mark_caps_flushing(struct inode *inode, | 1379 | static int __mark_caps_flushing(struct inode *inode, |
1379 | struct ceph_mds_session *session) | 1380 | struct ceph_mds_session *session) |
1380 | { | 1381 | { |
1381 | struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; | 1382 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
1382 | struct ceph_inode_info *ci = ceph_inode(inode); | 1383 | struct ceph_inode_info *ci = ceph_inode(inode); |
1383 | int flushing; | 1384 | int flushing; |
1384 | 1385 | ||
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode, | |||
1416 | /* | 1417 | /* |
1417 | * try to invalidate mapping pages without blocking. | 1418 | * try to invalidate mapping pages without blocking. |
1418 | */ | 1419 | */ |
1419 | static int mapping_is_empty(struct address_space *mapping) | ||
1420 | { | ||
1421 | struct page *page = find_get_page(mapping, 0); | ||
1422 | |||
1423 | if (!page) | ||
1424 | return 1; | ||
1425 | |||
1426 | put_page(page); | ||
1427 | return 0; | ||
1428 | } | ||
1429 | |||
1430 | static int try_nonblocking_invalidate(struct inode *inode) | 1420 | static int try_nonblocking_invalidate(struct inode *inode) |
1431 | { | 1421 | { |
1432 | struct ceph_inode_info *ci = ceph_inode(inode); | 1422 | struct ceph_inode_info *ci = ceph_inode(inode); |
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode) | |||
1436 | invalidate_mapping_pages(&inode->i_data, 0, -1); | 1426 | invalidate_mapping_pages(&inode->i_data, 0, -1); |
1437 | spin_lock(&inode->i_lock); | 1427 | spin_lock(&inode->i_lock); |
1438 | 1428 | ||
1439 | if (mapping_is_empty(&inode->i_data) && | 1429 | if (inode->i_data.nrpages == 0 && |
1440 | invalidating_gen == ci->i_rdcache_gen) { | 1430 | invalidating_gen == ci->i_rdcache_gen) { |
1441 | /* success. */ | 1431 | /* success. */ |
1442 | dout("try_nonblocking_invalidate %p success\n", inode); | 1432 | dout("try_nonblocking_invalidate %p success\n", inode); |
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode) | |||
1462 | void ceph_check_caps(struct ceph_inode_info *ci, int flags, | 1452 | void ceph_check_caps(struct ceph_inode_info *ci, int flags, |
1463 | struct ceph_mds_session *session) | 1453 | struct ceph_mds_session *session) |
1464 | { | 1454 | { |
1465 | struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); | 1455 | struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); |
1466 | struct ceph_mds_client *mdsc = &client->mdsc; | 1456 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1467 | struct inode *inode = &ci->vfs_inode; | 1457 | struct inode *inode = &ci->vfs_inode; |
1468 | struct ceph_cap *cap; | 1458 | struct ceph_cap *cap; |
1469 | int file_wanted, used; | 1459 | int file_wanted, used; |
@@ -1533,7 +1523,7 @@ retry_locked: | |||
1533 | */ | 1523 | */ |
1534 | if ((!is_delayed || mdsc->stopping) && | 1524 | if ((!is_delayed || mdsc->stopping) && |
1535 | ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ | 1525 | ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ |
1536 | ci->i_rdcache_gen && /* may have cached pages */ | 1526 | inode->i_data.nrpages && /* have cached pages */ |
1537 | (file_wanted == 0 || /* no open files */ | 1527 | (file_wanted == 0 || /* no open files */ |
1538 | (revoking & (CEPH_CAP_FILE_CACHE| | 1528 | (revoking & (CEPH_CAP_FILE_CACHE| |
1539 | CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ | 1529 | CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ |
@@ -1706,7 +1696,7 @@ ack: | |||
1706 | static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, | 1696 | static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, |
1707 | unsigned *flush_tid) | 1697 | unsigned *flush_tid) |
1708 | { | 1698 | { |
1709 | struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; | 1699 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
1710 | struct ceph_inode_info *ci = ceph_inode(inode); | 1700 | struct ceph_inode_info *ci = ceph_inode(inode); |
1711 | int unlock_session = session ? 0 : 1; | 1701 | int unlock_session = session ? 0 : 1; |
1712 | int flushing = 0; | 1702 | int flushing = 0; |
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
1872 | caps_are_flushed(inode, flush_tid)); | 1862 | caps_are_flushed(inode, flush_tid)); |
1873 | } else { | 1863 | } else { |
1874 | struct ceph_mds_client *mdsc = | 1864 | struct ceph_mds_client *mdsc = |
1875 | &ceph_sb_to_client(inode->i_sb)->mdsc; | 1865 | ceph_sb_to_client(inode->i_sb)->mdsc; |
1876 | 1866 | ||
1877 | spin_lock(&inode->i_lock); | 1867 | spin_lock(&inode->i_lock); |
1878 | if (__ceph_caps_dirty(ci)) | 1868 | if (__ceph_caps_dirty(ci)) |
@@ -2283,7 +2273,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
2283 | { | 2273 | { |
2284 | struct ceph_inode_info *ci = ceph_inode(inode); | 2274 | struct ceph_inode_info *ci = ceph_inode(inode); |
2285 | int mds = session->s_mds; | 2275 | int mds = session->s_mds; |
2286 | int seq = le32_to_cpu(grant->seq); | 2276 | unsigned seq = le32_to_cpu(grant->seq); |
2277 | unsigned issue_seq = le32_to_cpu(grant->issue_seq); | ||
2287 | int newcaps = le32_to_cpu(grant->caps); | 2278 | int newcaps = le32_to_cpu(grant->caps); |
2288 | int issued, implemented, used, wanted, dirty; | 2279 | int issued, implemented, used, wanted, dirty; |
2289 | u64 size = le64_to_cpu(grant->size); | 2280 | u64 size = le64_to_cpu(grant->size); |
@@ -2295,8 +2286,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
2295 | int revoked_rdcache = 0; | 2286 | int revoked_rdcache = 0; |
2296 | int queue_invalidate = 0; | 2287 | int queue_invalidate = 0; |
2297 | 2288 | ||
2298 | dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", | 2289 | dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", |
2299 | inode, cap, mds, seq, ceph_cap_string(newcaps)); | 2290 | inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); |
2300 | dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, | 2291 | dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, |
2301 | inode->i_size); | 2292 | inode->i_size); |
2302 | 2293 | ||
@@ -2392,6 +2383,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, | |||
2392 | } | 2383 | } |
2393 | 2384 | ||
2394 | cap->seq = seq; | 2385 | cap->seq = seq; |
2386 | cap->issue_seq = issue_seq; | ||
2395 | 2387 | ||
2396 | /* file layout may have changed */ | 2388 | /* file layout may have changed */ |
2397 | ci->i_layout = grant->layout; | 2389 | ci->i_layout = grant->layout; |
@@ -2463,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2463 | __releases(inode->i_lock) | 2455 | __releases(inode->i_lock) |
2464 | { | 2456 | { |
2465 | struct ceph_inode_info *ci = ceph_inode(inode); | 2457 | struct ceph_inode_info *ci = ceph_inode(inode); |
2466 | struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; | 2458 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
2467 | unsigned seq = le32_to_cpu(m->seq); | 2459 | unsigned seq = le32_to_cpu(m->seq); |
2468 | int dirty = le32_to_cpu(m->dirty); | 2460 | int dirty = le32_to_cpu(m->dirty); |
2469 | int cleaned = 0; | 2461 | int cleaned = 0; |
@@ -2711,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2711 | struct ceph_msg *msg) | 2703 | struct ceph_msg *msg) |
2712 | { | 2704 | { |
2713 | struct ceph_mds_client *mdsc = session->s_mdsc; | 2705 | struct ceph_mds_client *mdsc = session->s_mdsc; |
2714 | struct super_block *sb = mdsc->client->sb; | 2706 | struct super_block *sb = mdsc->fsc->sb; |
2715 | struct inode *inode; | 2707 | struct inode *inode; |
2716 | struct ceph_cap *cap; | 2708 | struct ceph_cap *cap; |
2717 | struct ceph_mds_caps *h; | 2709 | struct ceph_mds_caps *h; |
@@ -2774,15 +2766,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2774 | if (op == CEPH_CAP_OP_IMPORT) | 2766 | if (op == CEPH_CAP_OP_IMPORT) |
2775 | __queue_cap_release(session, vino.ino, cap_id, | 2767 | __queue_cap_release(session, vino.ino, cap_id, |
2776 | mseq, seq); | 2768 | mseq, seq); |
2777 | 2769 | goto flush_cap_releases; | |
2778 | /* | ||
2779 | * send any full release message to try to move things | ||
2780 | * along for the mds (who clearly thinks we still have this | ||
2781 | * cap). | ||
2782 | */ | ||
2783 | ceph_add_cap_releases(mdsc, session); | ||
2784 | ceph_send_cap_releases(mdsc, session); | ||
2785 | goto done; | ||
2786 | } | 2770 | } |
2787 | 2771 | ||
2788 | /* these will work even if we don't have a cap yet */ | 2772 | /* these will work even if we don't have a cap yet */ |
@@ -2810,7 +2794,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2810 | dout(" no cap on %p ino %llx.%llx from mds%d\n", | 2794 | dout(" no cap on %p ino %llx.%llx from mds%d\n", |
2811 | inode, ceph_ino(inode), ceph_snap(inode), mds); | 2795 | inode, ceph_ino(inode), ceph_snap(inode), mds); |
2812 | spin_unlock(&inode->i_lock); | 2796 | spin_unlock(&inode->i_lock); |
2813 | goto done; | 2797 | goto flush_cap_releases; |
2814 | } | 2798 | } |
2815 | 2799 | ||
2816 | /* note that each of these drops i_lock for us */ | 2800 | /* note that each of these drops i_lock for us */ |
@@ -2834,6 +2818,17 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
2834 | ceph_cap_op_name(op)); | 2818 | ceph_cap_op_name(op)); |
2835 | } | 2819 | } |
2836 | 2820 | ||
2821 | goto done; | ||
2822 | |||
2823 | flush_cap_releases: | ||
2824 | /* | ||
2825 | * send any full release message to try to move things | ||
2826 | * along for the mds (who clearly thinks we still have this | ||
2827 | * cap). | ||
2828 | */ | ||
2829 | ceph_add_cap_releases(mdsc, session); | ||
2830 | ceph_send_cap_releases(mdsc, session); | ||
2831 | |||
2837 | done: | 2832 | done: |
2838 | mutex_unlock(&session->s_mutex); | 2833 | mutex_unlock(&session->s_mutex); |
2839 | done_unlocked: | 2834 | done_unlocked: |
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h deleted file mode 100644 index 1818c2305610..000000000000 --- a/fs/ceph/ceph_debug.h +++ /dev/null | |||
@@ -1,37 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_DEBUG_H | ||
2 | #define _FS_CEPH_DEBUG_H | ||
3 | |||
4 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
5 | |||
6 | #ifdef CONFIG_CEPH_FS_PRETTYDEBUG | ||
7 | |||
8 | /* | ||
9 | * wrap pr_debug to include a filename:lineno prefix on each line. | ||
10 | * this incurs some overhead (kernel size and execution time) due to | ||
11 | * the extra function call at each call site. | ||
12 | */ | ||
13 | |||
14 | # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) | ||
15 | extern const char *ceph_file_part(const char *s, int len); | ||
16 | # define dout(fmt, ...) \ | ||
17 | pr_debug(" %12.12s:%-4d : " fmt, \ | ||
18 | ceph_file_part(__FILE__, sizeof(__FILE__)), \ | ||
19 | __LINE__, ##__VA_ARGS__) | ||
20 | # else | ||
21 | /* faux printk call just to see any compiler warnings. */ | ||
22 | # define dout(fmt, ...) do { \ | ||
23 | if (0) \ | ||
24 | printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ | ||
25 | } while (0) | ||
26 | # endif | ||
27 | |||
28 | #else | ||
29 | |||
30 | /* | ||
31 | * or, just wrap pr_debug | ||
32 | */ | ||
33 | # define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__) | ||
34 | |||
35 | #endif | ||
36 | |||
37 | #endif | ||
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c index ab6cf35c4091..bdce8b1fbd06 100644 --- a/fs/ceph/ceph_frag.c +++ b/fs/ceph/ceph_frag.c | |||
@@ -1,7 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * Ceph 'frag' type | 2 | * Ceph 'frag' type |
3 | */ | 3 | */ |
4 | #include "types.h" | 4 | #include <linux/module.h> |
5 | #include <linux/ceph/types.h> | ||
5 | 6 | ||
6 | int ceph_frag_compare(__u32 a, __u32 b) | 7 | int ceph_frag_compare(__u32 a, __u32 b) |
7 | { | 8 | { |
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h deleted file mode 100644 index 5babb8e95352..000000000000 --- a/fs/ceph/ceph_frag.h +++ /dev/null | |||
@@ -1,109 +0,0 @@ | |||
1 | #ifndef FS_CEPH_FRAG_H | ||
2 | #define FS_CEPH_FRAG_H | ||
3 | |||
4 | /* | ||
5 | * "Frags" are a way to describe a subset of a 32-bit number space, | ||
6 | * using a mask and a value to match against that mask. Any given frag | ||
7 | * (subset of the number space) can be partitioned into 2^n sub-frags. | ||
8 | * | ||
9 | * Frags are encoded into a 32-bit word: | ||
10 | * 8 upper bits = "bits" | ||
11 | * 24 lower bits = "value" | ||
12 | * (We could go to 5+27 bits, but who cares.) | ||
13 | * | ||
14 | * We use the _most_ significant bits of the 24 bit value. This makes | ||
15 | * values logically sort. | ||
16 | * | ||
17 | * Unfortunately, because the "bits" field is still in the high bits, we | ||
18 | * can't sort encoded frags numerically. However, it does allow you | ||
19 | * to feed encoded frags as values into frag_contains_value. | ||
20 | */ | ||
21 | static inline __u32 ceph_frag_make(__u32 b, __u32 v) | ||
22 | { | ||
23 | return (b << 24) | | ||
24 | (v & (0xffffffu << (24-b)) & 0xffffffu); | ||
25 | } | ||
26 | static inline __u32 ceph_frag_bits(__u32 f) | ||
27 | { | ||
28 | return f >> 24; | ||
29 | } | ||
30 | static inline __u32 ceph_frag_value(__u32 f) | ||
31 | { | ||
32 | return f & 0xffffffu; | ||
33 | } | ||
34 | static inline __u32 ceph_frag_mask(__u32 f) | ||
35 | { | ||
36 | return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu; | ||
37 | } | ||
38 | static inline __u32 ceph_frag_mask_shift(__u32 f) | ||
39 | { | ||
40 | return 24 - ceph_frag_bits(f); | ||
41 | } | ||
42 | |||
43 | static inline int ceph_frag_contains_value(__u32 f, __u32 v) | ||
44 | { | ||
45 | return (v & ceph_frag_mask(f)) == ceph_frag_value(f); | ||
46 | } | ||
47 | static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) | ||
48 | { | ||
49 | /* is sub as specific as us, and contained by us? */ | ||
50 | return ceph_frag_bits(sub) >= ceph_frag_bits(f) && | ||
51 | (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); | ||
52 | } | ||
53 | |||
54 | static inline __u32 ceph_frag_parent(__u32 f) | ||
55 | { | ||
56 | return ceph_frag_make(ceph_frag_bits(f) - 1, | ||
57 | ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); | ||
58 | } | ||
59 | static inline int ceph_frag_is_left_child(__u32 f) | ||
60 | { | ||
61 | return ceph_frag_bits(f) > 0 && | ||
62 | (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; | ||
63 | } | ||
64 | static inline int ceph_frag_is_right_child(__u32 f) | ||
65 | { | ||
66 | return ceph_frag_bits(f) > 0 && | ||
67 | (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; | ||
68 | } | ||
69 | static inline __u32 ceph_frag_sibling(__u32 f) | ||
70 | { | ||
71 | return ceph_frag_make(ceph_frag_bits(f), | ||
72 | ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); | ||
73 | } | ||
74 | static inline __u32 ceph_frag_left_child(__u32 f) | ||
75 | { | ||
76 | return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); | ||
77 | } | ||
78 | static inline __u32 ceph_frag_right_child(__u32 f) | ||
79 | { | ||
80 | return ceph_frag_make(ceph_frag_bits(f)+1, | ||
81 | ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); | ||
82 | } | ||
83 | static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) | ||
84 | { | ||
85 | int newbits = ceph_frag_bits(f) + by; | ||
86 | return ceph_frag_make(newbits, | ||
87 | ceph_frag_value(f) | (i << (24 - newbits))); | ||
88 | } | ||
89 | static inline int ceph_frag_is_leftmost(__u32 f) | ||
90 | { | ||
91 | return ceph_frag_value(f) == 0; | ||
92 | } | ||
93 | static inline int ceph_frag_is_rightmost(__u32 f) | ||
94 | { | ||
95 | return ceph_frag_value(f) == ceph_frag_mask(f); | ||
96 | } | ||
97 | static inline __u32 ceph_frag_next(__u32 f) | ||
98 | { | ||
99 | return ceph_frag_make(ceph_frag_bits(f), | ||
100 | ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f))); | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * comparator to sort frags logically, as when traversing the | ||
105 | * number space in ascending order... | ||
106 | */ | ||
107 | int ceph_frag_compare(__u32 a, __u32 b); | ||
108 | |||
109 | #endif | ||
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c deleted file mode 100644 index 3ac6cc7c1156..000000000000 --- a/fs/ceph/ceph_fs.c +++ /dev/null | |||
@@ -1,72 +0,0 @@ | |||
1 | /* | ||
2 | * Some non-inline ceph helpers | ||
3 | */ | ||
4 | #include "types.h" | ||
5 | |||
6 | /* | ||
7 | * return true if @layout appears to be valid | ||
8 | */ | ||
9 | int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) | ||
10 | { | ||
11 | __u32 su = le32_to_cpu(layout->fl_stripe_unit); | ||
12 | __u32 sc = le32_to_cpu(layout->fl_stripe_count); | ||
13 | __u32 os = le32_to_cpu(layout->fl_object_size); | ||
14 | |||
15 | /* stripe unit, object size must be non-zero, 64k increment */ | ||
16 | if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) | ||
17 | return 0; | ||
18 | if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) | ||
19 | return 0; | ||
20 | /* object size must be a multiple of stripe unit */ | ||
21 | if (os < su || os % su) | ||
22 | return 0; | ||
23 | /* stripe count must be non-zero */ | ||
24 | if (!sc) | ||
25 | return 0; | ||
26 | return 1; | ||
27 | } | ||
28 | |||
29 | |||
30 | int ceph_flags_to_mode(int flags) | ||
31 | { | ||
32 | int mode; | ||
33 | |||
34 | #ifdef O_DIRECTORY /* fixme */ | ||
35 | if ((flags & O_DIRECTORY) == O_DIRECTORY) | ||
36 | return CEPH_FILE_MODE_PIN; | ||
37 | #endif | ||
38 | if ((flags & O_APPEND) == O_APPEND) | ||
39 | flags |= O_WRONLY; | ||
40 | |||
41 | if ((flags & O_ACCMODE) == O_RDWR) | ||
42 | mode = CEPH_FILE_MODE_RDWR; | ||
43 | else if ((flags & O_ACCMODE) == O_WRONLY) | ||
44 | mode = CEPH_FILE_MODE_WR; | ||
45 | else | ||
46 | mode = CEPH_FILE_MODE_RD; | ||
47 | |||
48 | #ifdef O_LAZY | ||
49 | if (flags & O_LAZY) | ||
50 | mode |= CEPH_FILE_MODE_LAZY; | ||
51 | #endif | ||
52 | |||
53 | return mode; | ||
54 | } | ||
55 | |||
56 | int ceph_caps_for_mode(int mode) | ||
57 | { | ||
58 | int caps = CEPH_CAP_PIN; | ||
59 | |||
60 | if (mode & CEPH_FILE_MODE_RD) | ||
61 | caps |= CEPH_CAP_FILE_SHARED | | ||
62 | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; | ||
63 | if (mode & CEPH_FILE_MODE_WR) | ||
64 | caps |= CEPH_CAP_FILE_EXCL | | ||
65 | CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | | ||
66 | CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | | ||
67 | CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; | ||
68 | if (mode & CEPH_FILE_MODE_LAZY) | ||
69 | caps |= CEPH_CAP_FILE_LAZYIO; | ||
70 | |||
71 | return caps; | ||
72 | } | ||
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h deleted file mode 100644 index d5619ac86711..000000000000 --- a/fs/ceph/ceph_fs.h +++ /dev/null | |||
@@ -1,728 +0,0 @@ | |||
1 | /* | ||
2 | * ceph_fs.h - Ceph constants and data types to share between kernel and | ||
3 | * user space. | ||
4 | * | ||
5 | * Most types in this file are defined as little-endian, and are | ||
6 | * primarily intended to describe data structures that pass over the | ||
7 | * wire or that are stored on disk. | ||
8 | * | ||
9 | * LGPL2 | ||
10 | */ | ||
11 | |||
12 | #ifndef CEPH_FS_H | ||
13 | #define CEPH_FS_H | ||
14 | |||
15 | #include "msgr.h" | ||
16 | #include "rados.h" | ||
17 | |||
18 | /* | ||
19 | * subprotocol versions. when specific messages types or high-level | ||
20 | * protocols change, bump the affected components. we keep rev | ||
21 | * internal cluster protocols separately from the public, | ||
22 | * client-facing protocol. | ||
23 | */ | ||
24 | #define CEPH_OSD_PROTOCOL 8 /* cluster internal */ | ||
25 | #define CEPH_MDS_PROTOCOL 12 /* cluster internal */ | ||
26 | #define CEPH_MON_PROTOCOL 5 /* cluster internal */ | ||
27 | #define CEPH_OSDC_PROTOCOL 24 /* server/client */ | ||
28 | #define CEPH_MDSC_PROTOCOL 32 /* server/client */ | ||
29 | #define CEPH_MONC_PROTOCOL 15 /* server/client */ | ||
30 | |||
31 | |||
32 | #define CEPH_INO_ROOT 1 | ||
33 | #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ | ||
34 | |||
35 | /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ | ||
36 | #define CEPH_MAX_MON 31 | ||
37 | |||
38 | |||
39 | /* | ||
40 | * feature bits | ||
41 | */ | ||
42 | #define CEPH_FEATURE_UID (1<<0) | ||
43 | #define CEPH_FEATURE_NOSRCADDR (1<<1) | ||
44 | #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) | ||
45 | #define CEPH_FEATURE_FLOCK (1<<3) | ||
46 | |||
47 | |||
48 | /* | ||
49 | * ceph_file_layout - describe data layout for a file/inode | ||
50 | */ | ||
51 | struct ceph_file_layout { | ||
52 | /* file -> object mapping */ | ||
53 | __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple | ||
54 | of page size. */ | ||
55 | __le32 fl_stripe_count; /* over this many objects */ | ||
56 | __le32 fl_object_size; /* until objects are this big, then move to | ||
57 | new objects */ | ||
58 | __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ | ||
59 | |||
60 | /* pg -> disk layout */ | ||
61 | __le32 fl_object_stripe_unit; /* for per-object parity, if any */ | ||
62 | |||
63 | /* object -> pg layout */ | ||
64 | __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ | ||
65 | __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ | ||
66 | } __attribute__ ((packed)); | ||
67 | |||
68 | #define CEPH_MIN_STRIPE_UNIT 65536 | ||
69 | |||
70 | int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); | ||
71 | |||
72 | |||
73 | /* crypto algorithms */ | ||
74 | #define CEPH_CRYPTO_NONE 0x0 | ||
75 | #define CEPH_CRYPTO_AES 0x1 | ||
76 | |||
77 | #define CEPH_AES_IV "cephsageyudagreg" | ||
78 | |||
79 | /* security/authentication protocols */ | ||
80 | #define CEPH_AUTH_UNKNOWN 0x0 | ||
81 | #define CEPH_AUTH_NONE 0x1 | ||
82 | #define CEPH_AUTH_CEPHX 0x2 | ||
83 | |||
84 | #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) | ||
85 | |||
86 | |||
87 | /********************************************* | ||
88 | * message layer | ||
89 | */ | ||
90 | |||
91 | /* | ||
92 | * message types | ||
93 | */ | ||
94 | |||
95 | /* misc */ | ||
96 | #define CEPH_MSG_SHUTDOWN 1 | ||
97 | #define CEPH_MSG_PING 2 | ||
98 | |||
99 | /* client <-> monitor */ | ||
100 | #define CEPH_MSG_MON_MAP 4 | ||
101 | #define CEPH_MSG_MON_GET_MAP 5 | ||
102 | #define CEPH_MSG_STATFS 13 | ||
103 | #define CEPH_MSG_STATFS_REPLY 14 | ||
104 | #define CEPH_MSG_MON_SUBSCRIBE 15 | ||
105 | #define CEPH_MSG_MON_SUBSCRIBE_ACK 16 | ||
106 | #define CEPH_MSG_AUTH 17 | ||
107 | #define CEPH_MSG_AUTH_REPLY 18 | ||
108 | |||
109 | /* client <-> mds */ | ||
110 | #define CEPH_MSG_MDS_MAP 21 | ||
111 | |||
112 | #define CEPH_MSG_CLIENT_SESSION 22 | ||
113 | #define CEPH_MSG_CLIENT_RECONNECT 23 | ||
114 | |||
115 | #define CEPH_MSG_CLIENT_REQUEST 24 | ||
116 | #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 | ||
117 | #define CEPH_MSG_CLIENT_REPLY 26 | ||
118 | #define CEPH_MSG_CLIENT_CAPS 0x310 | ||
119 | #define CEPH_MSG_CLIENT_LEASE 0x311 | ||
120 | #define CEPH_MSG_CLIENT_SNAP 0x312 | ||
121 | #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 | ||
122 | |||
123 | /* pool ops */ | ||
124 | #define CEPH_MSG_POOLOP_REPLY 48 | ||
125 | #define CEPH_MSG_POOLOP 49 | ||
126 | |||
127 | |||
128 | /* osd */ | ||
129 | #define CEPH_MSG_OSD_MAP 41 | ||
130 | #define CEPH_MSG_OSD_OP 42 | ||
131 | #define CEPH_MSG_OSD_OPREPLY 43 | ||
132 | |||
133 | /* pool operations */ | ||
134 | enum { | ||
135 | POOL_OP_CREATE = 0x01, | ||
136 | POOL_OP_DELETE = 0x02, | ||
137 | POOL_OP_AUID_CHANGE = 0x03, | ||
138 | POOL_OP_CREATE_SNAP = 0x11, | ||
139 | POOL_OP_DELETE_SNAP = 0x12, | ||
140 | POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, | ||
141 | POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, | ||
142 | }; | ||
143 | |||
144 | struct ceph_mon_request_header { | ||
145 | __le64 have_version; | ||
146 | __le16 session_mon; | ||
147 | __le64 session_mon_tid; | ||
148 | } __attribute__ ((packed)); | ||
149 | |||
150 | struct ceph_mon_statfs { | ||
151 | struct ceph_mon_request_header monhdr; | ||
152 | struct ceph_fsid fsid; | ||
153 | } __attribute__ ((packed)); | ||
154 | |||
155 | struct ceph_statfs { | ||
156 | __le64 kb, kb_used, kb_avail; | ||
157 | __le64 num_objects; | ||
158 | } __attribute__ ((packed)); | ||
159 | |||
160 | struct ceph_mon_statfs_reply { | ||
161 | struct ceph_fsid fsid; | ||
162 | __le64 version; | ||
163 | struct ceph_statfs st; | ||
164 | } __attribute__ ((packed)); | ||
165 | |||
166 | const char *ceph_pool_op_name(int op); | ||
167 | |||
168 | struct ceph_mon_poolop { | ||
169 | struct ceph_mon_request_header monhdr; | ||
170 | struct ceph_fsid fsid; | ||
171 | __le32 pool; | ||
172 | __le32 op; | ||
173 | __le64 auid; | ||
174 | __le64 snapid; | ||
175 | __le32 name_len; | ||
176 | } __attribute__ ((packed)); | ||
177 | |||
178 | struct ceph_mon_poolop_reply { | ||
179 | struct ceph_mon_request_header monhdr; | ||
180 | struct ceph_fsid fsid; | ||
181 | __le32 reply_code; | ||
182 | __le32 epoch; | ||
183 | char has_data; | ||
184 | char data[0]; | ||
185 | } __attribute__ ((packed)); | ||
186 | |||
187 | struct ceph_mon_unmanaged_snap { | ||
188 | __le64 snapid; | ||
189 | } __attribute__ ((packed)); | ||
190 | |||
191 | struct ceph_osd_getmap { | ||
192 | struct ceph_mon_request_header monhdr; | ||
193 | struct ceph_fsid fsid; | ||
194 | __le32 start; | ||
195 | } __attribute__ ((packed)); | ||
196 | |||
197 | struct ceph_mds_getmap { | ||
198 | struct ceph_mon_request_header monhdr; | ||
199 | struct ceph_fsid fsid; | ||
200 | } __attribute__ ((packed)); | ||
201 | |||
202 | struct ceph_client_mount { | ||
203 | struct ceph_mon_request_header monhdr; | ||
204 | } __attribute__ ((packed)); | ||
205 | |||
206 | struct ceph_mon_subscribe_item { | ||
207 | __le64 have_version; __le64 have; | ||
208 | __u8 onetime; | ||
209 | } __attribute__ ((packed)); | ||
210 | |||
211 | struct ceph_mon_subscribe_ack { | ||
212 | __le32 duration; /* seconds */ | ||
213 | struct ceph_fsid fsid; | ||
214 | } __attribute__ ((packed)); | ||
215 | |||
216 | /* | ||
217 | * mds states | ||
218 | * > 0 -> in | ||
219 | * <= 0 -> out | ||
220 | */ | ||
221 | #define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ | ||
222 | #define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. | ||
223 | empty log. */ | ||
224 | #define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ | ||
225 | #define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ | ||
226 | #define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ | ||
227 | #define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ | ||
228 | #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ | ||
229 | |||
230 | #define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ | ||
231 | #define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed | ||
232 | operations (import, rename, etc.) */ | ||
233 | #define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ | ||
234 | #define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ | ||
235 | #define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */ | ||
236 | #define CEPH_MDS_STATE_ACTIVE 13 /* up, active */ | ||
237 | #define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */ | ||
238 | |||
239 | extern const char *ceph_mds_state_name(int s); | ||
240 | |||
241 | |||
242 | /* | ||
243 | * metadata lock types. | ||
244 | * - these are bitmasks.. we can compose them | ||
245 | * - they also define the lock ordering by the MDS | ||
246 | * - a few of these are internal to the mds | ||
247 | */ | ||
248 | #define CEPH_LOCK_DVERSION 1 | ||
249 | #define CEPH_LOCK_DN 2 | ||
250 | #define CEPH_LOCK_ISNAP 16 | ||
251 | #define CEPH_LOCK_IVERSION 32 /* mds internal */ | ||
252 | #define CEPH_LOCK_IFILE 64 | ||
253 | #define CEPH_LOCK_IAUTH 128 | ||
254 | #define CEPH_LOCK_ILINK 256 | ||
255 | #define CEPH_LOCK_IDFT 512 /* dir frag tree */ | ||
256 | #define CEPH_LOCK_INEST 1024 /* mds internal */ | ||
257 | #define CEPH_LOCK_IXATTR 2048 | ||
258 | #define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ | ||
259 | #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ | ||
260 | |||
261 | /* client_session ops */ | ||
262 | enum { | ||
263 | CEPH_SESSION_REQUEST_OPEN, | ||
264 | CEPH_SESSION_OPEN, | ||
265 | CEPH_SESSION_REQUEST_CLOSE, | ||
266 | CEPH_SESSION_CLOSE, | ||
267 | CEPH_SESSION_REQUEST_RENEWCAPS, | ||
268 | CEPH_SESSION_RENEWCAPS, | ||
269 | CEPH_SESSION_STALE, | ||
270 | CEPH_SESSION_RECALL_STATE, | ||
271 | }; | ||
272 | |||
273 | extern const char *ceph_session_op_name(int op); | ||
274 | |||
275 | struct ceph_mds_session_head { | ||
276 | __le32 op; | ||
277 | __le64 seq; | ||
278 | struct ceph_timespec stamp; | ||
279 | __le32 max_caps, max_leases; | ||
280 | } __attribute__ ((packed)); | ||
281 | |||
282 | /* client_request */ | ||
283 | /* | ||
284 | * metadata ops. | ||
285 | * & 0x001000 -> write op | ||
286 | * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). | ||
287 | * & 0x100000 -> use weird ino/path trace | ||
288 | */ | ||
289 | #define CEPH_MDS_OP_WRITE 0x001000 | ||
290 | enum { | ||
291 | CEPH_MDS_OP_LOOKUP = 0x00100, | ||
292 | CEPH_MDS_OP_GETATTR = 0x00101, | ||
293 | CEPH_MDS_OP_LOOKUPHASH = 0x00102, | ||
294 | CEPH_MDS_OP_LOOKUPPARENT = 0x00103, | ||
295 | |||
296 | CEPH_MDS_OP_SETXATTR = 0x01105, | ||
297 | CEPH_MDS_OP_RMXATTR = 0x01106, | ||
298 | CEPH_MDS_OP_SETLAYOUT = 0x01107, | ||
299 | CEPH_MDS_OP_SETATTR = 0x01108, | ||
300 | CEPH_MDS_OP_SETFILELOCK= 0x01109, | ||
301 | CEPH_MDS_OP_GETFILELOCK= 0x00110, | ||
302 | |||
303 | CEPH_MDS_OP_MKNOD = 0x01201, | ||
304 | CEPH_MDS_OP_LINK = 0x01202, | ||
305 | CEPH_MDS_OP_UNLINK = 0x01203, | ||
306 | CEPH_MDS_OP_RENAME = 0x01204, | ||
307 | CEPH_MDS_OP_MKDIR = 0x01220, | ||
308 | CEPH_MDS_OP_RMDIR = 0x01221, | ||
309 | CEPH_MDS_OP_SYMLINK = 0x01222, | ||
310 | |||
311 | CEPH_MDS_OP_CREATE = 0x01301, | ||
312 | CEPH_MDS_OP_OPEN = 0x00302, | ||
313 | CEPH_MDS_OP_READDIR = 0x00305, | ||
314 | |||
315 | CEPH_MDS_OP_LOOKUPSNAP = 0x00400, | ||
316 | CEPH_MDS_OP_MKSNAP = 0x01400, | ||
317 | CEPH_MDS_OP_RMSNAP = 0x01401, | ||
318 | CEPH_MDS_OP_LSSNAP = 0x00402, | ||
319 | }; | ||
320 | |||
321 | extern const char *ceph_mds_op_name(int op); | ||
322 | |||
323 | |||
324 | #define CEPH_SETATTR_MODE 1 | ||
325 | #define CEPH_SETATTR_UID 2 | ||
326 | #define CEPH_SETATTR_GID 4 | ||
327 | #define CEPH_SETATTR_MTIME 8 | ||
328 | #define CEPH_SETATTR_ATIME 16 | ||
329 | #define CEPH_SETATTR_SIZE 32 | ||
330 | #define CEPH_SETATTR_CTIME 64 | ||
331 | |||
332 | union ceph_mds_request_args { | ||
333 | struct { | ||
334 | __le32 mask; /* CEPH_CAP_* */ | ||
335 | } __attribute__ ((packed)) getattr; | ||
336 | struct { | ||
337 | __le32 mode; | ||
338 | __le32 uid; | ||
339 | __le32 gid; | ||
340 | struct ceph_timespec mtime; | ||
341 | struct ceph_timespec atime; | ||
342 | __le64 size, old_size; /* old_size needed by truncate */ | ||
343 | __le32 mask; /* CEPH_SETATTR_* */ | ||
344 | } __attribute__ ((packed)) setattr; | ||
345 | struct { | ||
346 | __le32 frag; /* which dir fragment */ | ||
347 | __le32 max_entries; /* how many dentries to grab */ | ||
348 | __le32 max_bytes; | ||
349 | } __attribute__ ((packed)) readdir; | ||
350 | struct { | ||
351 | __le32 mode; | ||
352 | __le32 rdev; | ||
353 | } __attribute__ ((packed)) mknod; | ||
354 | struct { | ||
355 | __le32 mode; | ||
356 | } __attribute__ ((packed)) mkdir; | ||
357 | struct { | ||
358 | __le32 flags; | ||
359 | __le32 mode; | ||
360 | __le32 stripe_unit; /* layout for newly created file */ | ||
361 | __le32 stripe_count; /* ... */ | ||
362 | __le32 object_size; | ||
363 | __le32 file_replication; | ||
364 | __le32 preferred; | ||
365 | } __attribute__ ((packed)) open; | ||
366 | struct { | ||
367 | __le32 flags; | ||
368 | } __attribute__ ((packed)) setxattr; | ||
369 | struct { | ||
370 | struct ceph_file_layout layout; | ||
371 | } __attribute__ ((packed)) setlayout; | ||
372 | struct { | ||
373 | __u8 rule; /* currently fcntl or flock */ | ||
374 | __u8 type; /* shared, exclusive, remove*/ | ||
375 | __le64 pid; /* process id requesting the lock */ | ||
376 | __le64 pid_namespace; | ||
377 | __le64 start; /* initial location to lock */ | ||
378 | __le64 length; /* num bytes to lock from start */ | ||
379 | __u8 wait; /* will caller wait for lock to become available? */ | ||
380 | } __attribute__ ((packed)) filelock_change; | ||
381 | } __attribute__ ((packed)); | ||
382 | |||
383 | #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ | ||
384 | #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ | ||
385 | |||
386 | struct ceph_mds_request_head { | ||
387 | __le64 oldest_client_tid; | ||
388 | __le32 mdsmap_epoch; /* on client */ | ||
389 | __le32 flags; /* CEPH_MDS_FLAG_* */ | ||
390 | __u8 num_retry, num_fwd; /* count retry, fwd attempts */ | ||
391 | __le16 num_releases; /* # of cap/lease release records included */ | ||
392 | __le32 op; /* mds op code */ | ||
393 | __le32 caller_uid, caller_gid; | ||
394 | __le64 ino; /* use this ino for openc, mkdir, mknod, | ||
395 | etc. (if replaying) */ | ||
396 | union ceph_mds_request_args args; | ||
397 | } __attribute__ ((packed)); | ||
398 | |||
399 | /* cap/lease release record */ | ||
400 | struct ceph_mds_request_release { | ||
401 | __le64 ino, cap_id; /* ino and unique cap id */ | ||
402 | __le32 caps, wanted; /* new issued, wanted */ | ||
403 | __le32 seq, issue_seq, mseq; | ||
404 | __le32 dname_seq; /* if releasing a dentry lease, a */ | ||
405 | __le32 dname_len; /* string follows. */ | ||
406 | } __attribute__ ((packed)); | ||
407 | |||
408 | /* client reply */ | ||
409 | struct ceph_mds_reply_head { | ||
410 | __le32 op; | ||
411 | __le32 result; | ||
412 | __le32 mdsmap_epoch; | ||
413 | __u8 safe; /* true if committed to disk */ | ||
414 | __u8 is_dentry, is_target; /* true if dentry, target inode records | ||
415 | are included with reply */ | ||
416 | } __attribute__ ((packed)); | ||
417 | |||
418 | /* one for each node split */ | ||
419 | struct ceph_frag_tree_split { | ||
420 | __le32 frag; /* this frag splits... */ | ||
421 | __le32 by; /* ...by this many bits */ | ||
422 | } __attribute__ ((packed)); | ||
423 | |||
424 | struct ceph_frag_tree_head { | ||
425 | __le32 nsplits; /* num ceph_frag_tree_split records */ | ||
426 | struct ceph_frag_tree_split splits[]; | ||
427 | } __attribute__ ((packed)); | ||
428 | |||
429 | /* capability issue, for bundling with mds reply */ | ||
430 | struct ceph_mds_reply_cap { | ||
431 | __le32 caps, wanted; /* caps issued, wanted */ | ||
432 | __le64 cap_id; | ||
433 | __le32 seq, mseq; | ||
434 | __le64 realm; /* snap realm */ | ||
435 | __u8 flags; /* CEPH_CAP_FLAG_* */ | ||
436 | } __attribute__ ((packed)); | ||
437 | |||
438 | #define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ | ||
439 | |||
440 | /* inode record, for bundling with mds reply */ | ||
441 | struct ceph_mds_reply_inode { | ||
442 | __le64 ino; | ||
443 | __le64 snapid; | ||
444 | __le32 rdev; | ||
445 | __le64 version; /* inode version */ | ||
446 | __le64 xattr_version; /* version for xattr blob */ | ||
447 | struct ceph_mds_reply_cap cap; /* caps issued for this inode */ | ||
448 | struct ceph_file_layout layout; | ||
449 | struct ceph_timespec ctime, mtime, atime; | ||
450 | __le32 time_warp_seq; | ||
451 | __le64 size, max_size, truncate_size; | ||
452 | __le32 truncate_seq; | ||
453 | __le32 mode, uid, gid; | ||
454 | __le32 nlink; | ||
455 | __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */ | ||
456 | struct ceph_timespec rctime; | ||
457 | struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ | ||
458 | } __attribute__ ((packed)); | ||
459 | /* followed by frag array, then symlink string, then xattr blob */ | ||
460 | |||
461 | /* reply_lease follows dname, and reply_inode */ | ||
462 | struct ceph_mds_reply_lease { | ||
463 | __le16 mask; /* lease type(s) */ | ||
464 | __le32 duration_ms; /* lease duration */ | ||
465 | __le32 seq; | ||
466 | } __attribute__ ((packed)); | ||
467 | |||
468 | struct ceph_mds_reply_dirfrag { | ||
469 | __le32 frag; /* fragment */ | ||
470 | __le32 auth; /* auth mds, if this is a delegation point */ | ||
471 | __le32 ndist; /* number of mds' this is replicated on */ | ||
472 | __le32 dist[]; | ||
473 | } __attribute__ ((packed)); | ||
474 | |||
475 | #define CEPH_LOCK_FCNTL 1 | ||
476 | #define CEPH_LOCK_FLOCK 2 | ||
477 | |||
478 | #define CEPH_LOCK_SHARED 1 | ||
479 | #define CEPH_LOCK_EXCL 2 | ||
480 | #define CEPH_LOCK_UNLOCK 4 | ||
481 | |||
482 | struct ceph_filelock { | ||
483 | __le64 start;/* file offset to start lock at */ | ||
484 | __le64 length; /* num bytes to lock; 0 for all following start */ | ||
485 | __le64 client; /* which client holds the lock */ | ||
486 | __le64 pid; /* process id holding the lock on the client */ | ||
487 | __le64 pid_namespace; | ||
488 | __u8 type; /* shared lock, exclusive lock, or unlock */ | ||
489 | } __attribute__ ((packed)); | ||
490 | |||
491 | |||
492 | /* file access modes */ | ||
493 | #define CEPH_FILE_MODE_PIN 0 | ||
494 | #define CEPH_FILE_MODE_RD 1 | ||
495 | #define CEPH_FILE_MODE_WR 2 | ||
496 | #define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ | ||
497 | #define CEPH_FILE_MODE_LAZY 4 /* lazy io */ | ||
498 | #define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ | ||
499 | |||
500 | int ceph_flags_to_mode(int flags); | ||
501 | |||
502 | |||
503 | /* capability bits */ | ||
504 | #define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ | ||
505 | |||
506 | /* generic cap bits */ | ||
507 | #define CEPH_CAP_GSHARED 1 /* client can read */ | ||
508 | #define CEPH_CAP_GEXCL 2 /* client can read and update */ | ||
509 | #define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ | ||
510 | #define CEPH_CAP_GRD 8 /* (file) client can read */ | ||
511 | #define CEPH_CAP_GWR 16 /* (file) client can write */ | ||
512 | #define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ | ||
513 | #define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ | ||
514 | #define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ | ||
515 | |||
516 | /* per-lock shift */ | ||
517 | #define CEPH_CAP_SAUTH 2 | ||
518 | #define CEPH_CAP_SLINK 4 | ||
519 | #define CEPH_CAP_SXATTR 6 | ||
520 | #define CEPH_CAP_SFILE 8 | ||
521 | #define CEPH_CAP_SFLOCK 20 | ||
522 | |||
523 | #define CEPH_CAP_BITS 22 | ||
524 | |||
525 | /* composed values */ | ||
526 | #define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) | ||
527 | #define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) | ||
528 | #define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK) | ||
529 | #define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK) | ||
530 | #define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR) | ||
531 | #define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR) | ||
532 | #define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE) | ||
533 | #define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE) | ||
534 | #define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE) | ||
535 | #define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE) | ||
536 | #define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE) | ||
537 | #define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE) | ||
538 | #define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) | ||
539 | #define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) | ||
540 | #define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) | ||
541 | #define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK) | ||
542 | #define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK) | ||
543 | |||
544 | |||
545 | /* cap masks (for getattr) */ | ||
546 | #define CEPH_STAT_CAP_INODE CEPH_CAP_PIN | ||
547 | #define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */ | ||
548 | #define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN | ||
549 | #define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED | ||
550 | #define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED | ||
551 | #define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED | ||
552 | #define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED | ||
553 | #define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED | ||
554 | #define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED | ||
555 | #define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED | ||
556 | #define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */ | ||
557 | #define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED | ||
558 | #define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \ | ||
559 | CEPH_CAP_AUTH_SHARED | \ | ||
560 | CEPH_CAP_LINK_SHARED | \ | ||
561 | CEPH_CAP_FILE_SHARED | \ | ||
562 | CEPH_CAP_XATTR_SHARED) | ||
563 | |||
564 | #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ | ||
565 | CEPH_CAP_LINK_SHARED | \ | ||
566 | CEPH_CAP_XATTR_SHARED | \ | ||
567 | CEPH_CAP_FILE_SHARED) | ||
568 | #define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \ | ||
569 | CEPH_CAP_FILE_CACHE) | ||
570 | |||
571 | #define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ | ||
572 | CEPH_CAP_LINK_EXCL | \ | ||
573 | CEPH_CAP_XATTR_EXCL | \ | ||
574 | CEPH_CAP_FILE_EXCL) | ||
575 | #define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ | ||
576 | CEPH_CAP_FILE_EXCL) | ||
577 | #define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) | ||
578 | #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ | ||
579 | CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ | ||
580 | CEPH_CAP_PIN) | ||
581 | |||
582 | #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ | ||
583 | CEPH_LOCK_IXATTR) | ||
584 | |||
585 | int ceph_caps_for_mode(int mode); | ||
586 | |||
587 | enum { | ||
588 | CEPH_CAP_OP_GRANT, /* mds->client grant */ | ||
589 | CEPH_CAP_OP_REVOKE, /* mds->client revoke */ | ||
590 | CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ | ||
591 | CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ | ||
592 | CEPH_CAP_OP_IMPORT, /* mds has imported the cap */ | ||
593 | CEPH_CAP_OP_UPDATE, /* client->mds update */ | ||
594 | CEPH_CAP_OP_DROP, /* client->mds drop cap bits */ | ||
595 | CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */ | ||
596 | CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */ | ||
597 | CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ | ||
598 | CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ | ||
599 | CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ | ||
600 | CEPH_CAP_OP_RENEW, /* client->mds renewal request */ | ||
601 | }; | ||
602 | |||
603 | extern const char *ceph_cap_op_name(int op); | ||
604 | |||
605 | /* | ||
606 | * caps message, used for capability callbacks, acks, requests, etc. | ||
607 | */ | ||
608 | struct ceph_mds_caps { | ||
609 | __le32 op; /* CEPH_CAP_OP_* */ | ||
610 | __le64 ino, realm; | ||
611 | __le64 cap_id; | ||
612 | __le32 seq, issue_seq; | ||
613 | __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */ | ||
614 | __le32 migrate_seq; | ||
615 | __le64 snap_follows; | ||
616 | __le32 snap_trace_len; | ||
617 | |||
618 | /* authlock */ | ||
619 | __le32 uid, gid, mode; | ||
620 | |||
621 | /* linklock */ | ||
622 | __le32 nlink; | ||
623 | |||
624 | /* xattrlock */ | ||
625 | __le32 xattr_len; | ||
626 | __le64 xattr_version; | ||
627 | |||
628 | /* filelock */ | ||
629 | __le64 size, max_size, truncate_size; | ||
630 | __le32 truncate_seq; | ||
631 | struct ceph_timespec mtime, atime, ctime; | ||
632 | struct ceph_file_layout layout; | ||
633 | __le32 time_warp_seq; | ||
634 | } __attribute__ ((packed)); | ||
635 | |||
636 | /* cap release msg head */ | ||
637 | struct ceph_mds_cap_release { | ||
638 | __le32 num; /* number of cap_items that follow */ | ||
639 | } __attribute__ ((packed)); | ||
640 | |||
641 | struct ceph_mds_cap_item { | ||
642 | __le64 ino; | ||
643 | __le64 cap_id; | ||
644 | __le32 migrate_seq, seq; | ||
645 | } __attribute__ ((packed)); | ||
646 | |||
647 | #define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ | ||
648 | #define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */ | ||
649 | #define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ | ||
650 | #define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ | ||
651 | |||
652 | extern const char *ceph_lease_op_name(int o); | ||
653 | |||
654 | /* lease msg header */ | ||
655 | struct ceph_mds_lease { | ||
656 | __u8 action; /* CEPH_MDS_LEASE_* */ | ||
657 | __le16 mask; /* which lease */ | ||
658 | __le64 ino; | ||
659 | __le64 first, last; /* snap range */ | ||
660 | __le32 seq; | ||
661 | __le32 duration_ms; /* duration of renewal */ | ||
662 | } __attribute__ ((packed)); | ||
663 | /* followed by a __le32+string for dname */ | ||
664 | |||
665 | /* client reconnect */ | ||
666 | struct ceph_mds_cap_reconnect { | ||
667 | __le64 cap_id; | ||
668 | __le32 wanted; | ||
669 | __le32 issued; | ||
670 | __le64 snaprealm; | ||
671 | __le64 pathbase; /* base ino for our path to this ino */ | ||
672 | __le32 flock_len; /* size of flock state blob, if any */ | ||
673 | } __attribute__ ((packed)); | ||
674 | /* followed by flock blob */ | ||
675 | |||
676 | struct ceph_mds_cap_reconnect_v1 { | ||
677 | __le64 cap_id; | ||
678 | __le32 wanted; | ||
679 | __le32 issued; | ||
680 | __le64 size; | ||
681 | struct ceph_timespec mtime, atime; | ||
682 | __le64 snaprealm; | ||
683 | __le64 pathbase; /* base ino for our path to this ino */ | ||
684 | } __attribute__ ((packed)); | ||
685 | |||
686 | struct ceph_mds_snaprealm_reconnect { | ||
687 | __le64 ino; /* snap realm base */ | ||
688 | __le64 seq; /* snap seq for this snap realm */ | ||
689 | __le64 parent; /* parent realm */ | ||
690 | } __attribute__ ((packed)); | ||
691 | |||
692 | /* | ||
693 | * snaps | ||
694 | */ | ||
695 | enum { | ||
696 | CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ | ||
697 | CEPH_SNAP_OP_CREATE, | ||
698 | CEPH_SNAP_OP_DESTROY, | ||
699 | CEPH_SNAP_OP_SPLIT, | ||
700 | }; | ||
701 | |||
702 | extern const char *ceph_snap_op_name(int o); | ||
703 | |||
704 | /* snap msg header */ | ||
705 | struct ceph_mds_snap_head { | ||
706 | __le32 op; /* CEPH_SNAP_OP_* */ | ||
707 | __le64 split; /* ino to split off, if any */ | ||
708 | __le32 num_split_inos; /* # inos belonging to new child realm */ | ||
709 | __le32 num_split_realms; /* # child realms under new child realm */ | ||
710 | __le32 trace_len; /* size of snap trace blob */ | ||
711 | } __attribute__ ((packed)); | ||
712 | /* followed by split ino list, then split realms, then the trace blob */ | ||
713 | |||
714 | /* | ||
715 | * encode info about a snaprealm, as viewed by a client | ||
716 | */ | ||
717 | struct ceph_mds_snap_realm { | ||
718 | __le64 ino; /* ino */ | ||
719 | __le64 created; /* snap: when created */ | ||
720 | __le64 parent; /* ino: parent realm */ | ||
721 | __le64 parent_since; /* snap: same parent since */ | ||
722 | __le64 seq; /* snap: version */ | ||
723 | __le32 num_snaps; | ||
724 | __le32 num_prior_parent_snaps; | ||
725 | } __attribute__ ((packed)); | ||
726 | /* followed by my snap list, then prior parent snap list */ | ||
727 | |||
728 | #endif | ||
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c deleted file mode 100644 index bd570015d147..000000000000 --- a/fs/ceph/ceph_hash.c +++ /dev/null | |||
@@ -1,118 +0,0 @@ | |||
1 | |||
2 | #include "types.h" | ||
3 | |||
4 | /* | ||
5 | * Robert Jenkins' hash function. | ||
6 | * http://burtleburtle.net/bob/hash/evahash.html | ||
7 | * This is in the public domain. | ||
8 | */ | ||
/*
 * Three-word mixing step used by ceph_str_hash_rjenkins().  Each of a, b
 * and c is read and overwritten several times, so callers must pass plain
 * variables with no side effects.
 */
#define mix(a, b, c)						\
	do {							\
		a -= b; a -= c; a ^= (c >> 13);			\
		b -= c; b -= a; b ^= (a << 8);			\
		c -= a; c -= b; c ^= (b >> 13);			\
		a -= b; a -= c; a ^= (c >> 12);			\
		b -= c; b -= a; b ^= (a << 16);			\
		c -= a; c -= b; c ^= (b >> 5);			\
		a -= b; a -= c; a ^= (c >> 3);			\
		b -= c; b -= a; b ^= (a << 10);			\
		c -= a; c -= b; c ^= (b >> 15);			\
	} while (0)
21 | |||
22 | unsigned ceph_str_hash_rjenkins(const char *str, unsigned length) | ||
23 | { | ||
24 | const unsigned char *k = (const unsigned char *)str; | ||
25 | __u32 a, b, c; /* the internal state */ | ||
26 | __u32 len; /* how many key bytes still need mixing */ | ||
27 | |||
28 | /* Set up the internal state */ | ||
29 | len = length; | ||
30 | a = 0x9e3779b9; /* the golden ratio; an arbitrary value */ | ||
31 | b = a; | ||
32 | c = 0; /* variable initialization of internal state */ | ||
33 | |||
34 | /* handle most of the key */ | ||
35 | while (len >= 12) { | ||
36 | a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) + | ||
37 | ((__u32)k[3] << 24)); | ||
38 | b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) + | ||
39 | ((__u32)k[7] << 24)); | ||
40 | c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) + | ||
41 | ((__u32)k[11] << 24)); | ||
42 | mix(a, b, c); | ||
43 | k = k + 12; | ||
44 | len = len - 12; | ||
45 | } | ||
46 | |||
47 | /* handle the last 11 bytes */ | ||
48 | c = c + length; | ||
49 | switch (len) { /* all the case statements fall through */ | ||
50 | case 11: | ||
51 | c = c + ((__u32)k[10] << 24); | ||
52 | case 10: | ||
53 | c = c + ((__u32)k[9] << 16); | ||
54 | case 9: | ||
55 | c = c + ((__u32)k[8] << 8); | ||
56 | /* the first byte of c is reserved for the length */ | ||
57 | case 8: | ||
58 | b = b + ((__u32)k[7] << 24); | ||
59 | case 7: | ||
60 | b = b + ((__u32)k[6] << 16); | ||
61 | case 6: | ||
62 | b = b + ((__u32)k[5] << 8); | ||
63 | case 5: | ||
64 | b = b + k[4]; | ||
65 | case 4: | ||
66 | a = a + ((__u32)k[3] << 24); | ||
67 | case 3: | ||
68 | a = a + ((__u32)k[2] << 16); | ||
69 | case 2: | ||
70 | a = a + ((__u32)k[1] << 8); | ||
71 | case 1: | ||
72 | a = a + k[0]; | ||
73 | /* case 0: nothing left to add */ | ||
74 | } | ||
75 | mix(a, b, c); | ||
76 | |||
77 | return c; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * linux dcache hash | ||
82 | */ | ||
/*
 * Hash @length bytes starting at @str the way the Linux dcache does:
 * for every byte, add (byte << 4) + (byte >> 4) to the running value and
 * multiply the sum by 11.  The accumulator is an unsigned long and is
 * converted to unsigned on return.
 */
unsigned ceph_str_hash_linux(const char *str, unsigned length)
{
	unsigned long acc = 0;
	unsigned i;

	for (i = 0; i < length; i++) {
		/* treat each byte as unsigned regardless of char signedness */
		unsigned char byte = (unsigned char)str[i];

		acc = acc + (byte << 4) + (byte >> 4);
		acc *= 11;
	}

	return acc;
}
94 | |||
95 | |||
96 | unsigned ceph_str_hash(int type, const char *s, unsigned len) | ||
97 | { | ||
98 | switch (type) { | ||
99 | case CEPH_STR_HASH_LINUX: | ||
100 | return ceph_str_hash_linux(s, len); | ||
101 | case CEPH_STR_HASH_RJENKINS: | ||
102 | return ceph_str_hash_rjenkins(s, len); | ||
103 | default: | ||
104 | return -1; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | const char *ceph_str_hash_name(int type) | ||
109 | { | ||
110 | switch (type) { | ||
111 | case CEPH_STR_HASH_LINUX: | ||
112 | return "linux"; | ||
113 | case CEPH_STR_HASH_RJENKINS: | ||
114 | return "rjenkins"; | ||
115 | default: | ||
116 | return "unknown"; | ||
117 | } | ||
118 | } | ||
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h deleted file mode 100644 index d099c3f90236..000000000000 --- a/fs/ceph/ceph_hash.h +++ /dev/null | |||
@@ -1,13 +0,0 @@ | |||
1 | #ifndef FS_CEPH_HASH_H | ||
2 | #define FS_CEPH_HASH_H | ||
3 | |||
4 | #define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ | ||
5 | #define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ | ||
6 | |||
7 | extern unsigned ceph_str_hash_linux(const char *s, unsigned len); | ||
8 | extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len); | ||
9 | |||
10 | extern unsigned ceph_str_hash(int type, const char *s, unsigned len); | ||
11 | extern const char *ceph_str_hash_name(int type); | ||
12 | |||
13 | #endif | ||
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c deleted file mode 100644 index fabd302e5779..000000000000 --- a/fs/ceph/crush/crush.c +++ /dev/null | |||
@@ -1,151 +0,0 @@ | |||
1 | |||
2 | #ifdef __KERNEL__ | ||
3 | # include <linux/slab.h> | ||
4 | #else | ||
5 | # include <stdlib.h> | ||
6 | # include <assert.h> | ||
7 | # define kfree(x) do { if (x) free(x); } while (0) | ||
8 | # define BUG_ON(x) assert(!(x)) | ||
9 | #endif | ||
10 | |||
11 | #include "crush.h" | ||
12 | |||
13 | const char *crush_bucket_alg_name(int alg) | ||
14 | { | ||
15 | switch (alg) { | ||
16 | case CRUSH_BUCKET_UNIFORM: return "uniform"; | ||
17 | case CRUSH_BUCKET_LIST: return "list"; | ||
18 | case CRUSH_BUCKET_TREE: return "tree"; | ||
19 | case CRUSH_BUCKET_STRAW: return "straw"; | ||
20 | default: return "unknown"; | ||
21 | } | ||
22 | } | ||
23 | |||
24 | /** | ||
25 | * crush_get_bucket_item_weight - Get weight of an item in given bucket | ||
26 | * @b: bucket pointer | ||
27 | * @p: item index in bucket | ||
28 | */ | ||
29 | int crush_get_bucket_item_weight(struct crush_bucket *b, int p) | ||
30 | { | ||
31 | if (p >= b->size) | ||
32 | return 0; | ||
33 | |||
34 | switch (b->alg) { | ||
35 | case CRUSH_BUCKET_UNIFORM: | ||
36 | return ((struct crush_bucket_uniform *)b)->item_weight; | ||
37 | case CRUSH_BUCKET_LIST: | ||
38 | return ((struct crush_bucket_list *)b)->item_weights[p]; | ||
39 | case CRUSH_BUCKET_TREE: | ||
40 | if (p & 1) | ||
41 | return ((struct crush_bucket_tree *)b)->node_weights[p]; | ||
42 | return 0; | ||
43 | case CRUSH_BUCKET_STRAW: | ||
44 | return ((struct crush_bucket_straw *)b)->item_weights[p]; | ||
45 | } | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | /** | ||
50 | * crush_calc_parents - Calculate parent vectors for the given crush map. | ||
51 | * @map: crush_map pointer | ||
52 | */ | ||
53 | void crush_calc_parents(struct crush_map *map) | ||
54 | { | ||
55 | int i, b, c; | ||
56 | |||
57 | for (b = 0; b < map->max_buckets; b++) { | ||
58 | if (map->buckets[b] == NULL) | ||
59 | continue; | ||
60 | for (i = 0; i < map->buckets[b]->size; i++) { | ||
61 | c = map->buckets[b]->items[i]; | ||
62 | BUG_ON(c >= map->max_devices || | ||
63 | c < -map->max_buckets); | ||
64 | if (c >= 0) | ||
65 | map->device_parents[c] = map->buckets[b]->id; | ||
66 | else | ||
67 | map->bucket_parents[-1-c] = map->buckets[b]->id; | ||
68 | } | ||
69 | } | ||
70 | } | ||
71 | |||
72 | void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) | ||
73 | { | ||
74 | kfree(b->h.perm); | ||
75 | kfree(b->h.items); | ||
76 | kfree(b); | ||
77 | } | ||
78 | |||
79 | void crush_destroy_bucket_list(struct crush_bucket_list *b) | ||
80 | { | ||
81 | kfree(b->item_weights); | ||
82 | kfree(b->sum_weights); | ||
83 | kfree(b->h.perm); | ||
84 | kfree(b->h.items); | ||
85 | kfree(b); | ||
86 | } | ||
87 | |||
88 | void crush_destroy_bucket_tree(struct crush_bucket_tree *b) | ||
89 | { | ||
90 | kfree(b->node_weights); | ||
91 | kfree(b); | ||
92 | } | ||
93 | |||
94 | void crush_destroy_bucket_straw(struct crush_bucket_straw *b) | ||
95 | { | ||
96 | kfree(b->straws); | ||
97 | kfree(b->item_weights); | ||
98 | kfree(b->h.perm); | ||
99 | kfree(b->h.items); | ||
100 | kfree(b); | ||
101 | } | ||
102 | |||
103 | void crush_destroy_bucket(struct crush_bucket *b) | ||
104 | { | ||
105 | switch (b->alg) { | ||
106 | case CRUSH_BUCKET_UNIFORM: | ||
107 | crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b); | ||
108 | break; | ||
109 | case CRUSH_BUCKET_LIST: | ||
110 | crush_destroy_bucket_list((struct crush_bucket_list *)b); | ||
111 | break; | ||
112 | case CRUSH_BUCKET_TREE: | ||
113 | crush_destroy_bucket_tree((struct crush_bucket_tree *)b); | ||
114 | break; | ||
115 | case CRUSH_BUCKET_STRAW: | ||
116 | crush_destroy_bucket_straw((struct crush_bucket_straw *)b); | ||
117 | break; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | /** | ||
122 | * crush_destroy - Destroy a crush_map | ||
123 | * @map: crush_map pointer | ||
124 | */ | ||
125 | void crush_destroy(struct crush_map *map) | ||
126 | { | ||
127 | int b; | ||
128 | |||
129 | /* buckets */ | ||
130 | if (map->buckets) { | ||
131 | for (b = 0; b < map->max_buckets; b++) { | ||
132 | if (map->buckets[b] == NULL) | ||
133 | continue; | ||
134 | crush_destroy_bucket(map->buckets[b]); | ||
135 | } | ||
136 | kfree(map->buckets); | ||
137 | } | ||
138 | |||
139 | /* rules */ | ||
140 | if (map->rules) { | ||
141 | for (b = 0; b < map->max_rules; b++) | ||
142 | kfree(map->rules[b]); | ||
143 | kfree(map->rules); | ||
144 | } | ||
145 | |||
146 | kfree(map->bucket_parents); | ||
147 | kfree(map->device_parents); | ||
148 | kfree(map); | ||
149 | } | ||
150 | |||
151 | |||
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h deleted file mode 100644 index 97e435b191f4..000000000000 --- a/fs/ceph/crush/crush.h +++ /dev/null | |||
@@ -1,180 +0,0 @@ | |||
1 | #ifndef CEPH_CRUSH_CRUSH_H | ||
2 | #define CEPH_CRUSH_CRUSH_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | |||
6 | /* | ||
7 | * CRUSH is a pseudo-random data distribution algorithm that | ||
8 | * efficiently distributes input values (typically, data objects) | ||
9 | * across a heterogeneous, structured storage cluster. | ||
10 | * | ||
11 | * The algorithm was originally described in detail in this paper | ||
12 | * (although the algorithm has evolved somewhat since then): | ||
13 | * | ||
14 | * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf | ||
15 | * | ||
16 | * LGPL2 | ||
17 | */ | ||
18 | |||
19 | |||
20 | #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ | ||
21 | |||
22 | |||
23 | #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ | ||
24 | #define CRUSH_MAX_SET 10 /* max size of a mapping result */ | ||
25 | |||
26 | |||
27 | /* | ||
28 | * CRUSH uses user-defined "rules" to describe how inputs should be | ||
29 | * mapped to devices. A rule consists of sequence of steps to perform | ||
30 | * to generate the set of output devices. | ||
31 | */ | ||
32 | struct crush_rule_step { | ||
33 | __u32 op; | ||
34 | __s32 arg1; | ||
35 | __s32 arg2; | ||
36 | }; | ||
37 | |||
38 | /* step op codes */ | ||
39 | enum { | ||
40 | CRUSH_RULE_NOOP = 0, | ||
41 | CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */ | ||
42 | CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */ | ||
43 | /* arg2 = type */ | ||
44 | CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ | ||
45 | CRUSH_RULE_EMIT = 4, /* no args */ | ||
46 | CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, | ||
47 | CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, | ||
48 | }; | ||
49 | |||
50 | /* | ||
51 | * for specifying choose num (arg1) relative to the max parameter | ||
52 | * passed to do_rule | ||
53 | */ | ||
54 | #define CRUSH_CHOOSE_N 0 | ||
55 | #define CRUSH_CHOOSE_N_MINUS(x) (-(x)) | ||
56 | |||
57 | /* | ||
58 | * The rule mask is used to describe what the rule is intended for. | ||
59 | * Given a ruleset and size of output set, we search through the | ||
60 | * rule list for a matching rule_mask. | ||
61 | */ | ||
62 | struct crush_rule_mask { | ||
63 | __u8 ruleset; | ||
64 | __u8 type; | ||
65 | __u8 min_size; | ||
66 | __u8 max_size; | ||
67 | }; | ||
68 | |||
69 | struct crush_rule { | ||
70 | __u32 len; | ||
71 | struct crush_rule_mask mask; | ||
72 | struct crush_rule_step steps[0]; | ||
73 | }; | ||
74 | |||
75 | #define crush_rule_size(len) (sizeof(struct crush_rule) + \ | ||
76 | (len)*sizeof(struct crush_rule_step)) | ||
77 | |||
78 | |||
79 | |||
80 | /* | ||
81 | * A bucket is a named container of other items (either devices or | ||
82 | * other buckets). Items within a bucket are chosen using one of a | ||
83 | * few different algorithms. The table summarizes how the speed of | ||
84 | * each option measures up against mapping stability when items are | ||
85 | * added or removed. | ||
86 | * | ||
87 | * Bucket Alg Speed Additions Removals | ||
88 | * ------------------------------------------------ | ||
89 | * uniform O(1) poor poor | ||
90 | * list O(n) optimal poor | ||
91 | * tree O(log n) good good | ||
92 | * straw O(n) optimal optimal | ||
93 | */ | ||
94 | enum { | ||
95 | CRUSH_BUCKET_UNIFORM = 1, | ||
96 | CRUSH_BUCKET_LIST = 2, | ||
97 | CRUSH_BUCKET_TREE = 3, | ||
98 | CRUSH_BUCKET_STRAW = 4 | ||
99 | }; | ||
100 | extern const char *crush_bucket_alg_name(int alg); | ||
101 | |||
102 | struct crush_bucket { | ||
103 | __s32 id; /* this'll be negative */ | ||
104 | __u16 type; /* non-zero; type=0 is reserved for devices */ | ||
105 | __u8 alg; /* one of CRUSH_BUCKET_* */ | ||
106 | __u8 hash; /* which hash function to use, CRUSH_HASH_* */ | ||
107 | __u32 weight; /* 16-bit fixed point */ | ||
108 | __u32 size; /* num items */ | ||
109 | __s32 *items; | ||
110 | |||
111 | /* | ||
112 | * cached random permutation: used for uniform bucket and for | ||
113 | * the linear search fallback for the other bucket types. | ||
114 | */ | ||
115 | __u32 perm_x; /* @x for which *perm is defined */ | ||
116 | __u32 perm_n; /* num elements of *perm that are permuted/defined */ | ||
117 | __u32 *perm; | ||
118 | }; | ||
119 | |||
120 | struct crush_bucket_uniform { | ||
121 | struct crush_bucket h; | ||
122 | __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ | ||
123 | }; | ||
124 | |||
125 | struct crush_bucket_list { | ||
126 | struct crush_bucket h; | ||
127 | __u32 *item_weights; /* 16-bit fixed point */ | ||
128 | __u32 *sum_weights; /* 16-bit fixed point. element i is sum | ||
129 | of weights 0..i, inclusive */ | ||
130 | }; | ||
131 | |||
132 | struct crush_bucket_tree { | ||
133 | struct crush_bucket h; /* note: h.size is _tree_ size, not number of | ||
134 | actual items */ | ||
135 | __u8 num_nodes; | ||
136 | __u32 *node_weights; | ||
137 | }; | ||
138 | |||
139 | struct crush_bucket_straw { | ||
140 | struct crush_bucket h; | ||
141 | __u32 *item_weights; /* 16-bit fixed point */ | ||
142 | __u32 *straws; /* 16-bit fixed point */ | ||
143 | }; | ||
144 | |||
145 | |||
146 | |||
147 | /* | ||
148 | * CRUSH map includes all buckets, rules, etc. | ||
149 | */ | ||
150 | struct crush_map { | ||
151 | struct crush_bucket **buckets; | ||
152 | struct crush_rule **rules; | ||
153 | |||
154 | /* | ||
155 | * Parent pointers to identify the parent bucket of a device or | ||
156 | * bucket in the hierarchy. If an item appears more than | ||
157 | * once, this is the _last_ time it appeared (where buckets | ||
158 | * are processed in bucket id order, from -1 on down to | ||
159 | * -max_buckets). | ||
160 | */ | ||
161 | __u32 *bucket_parents; | ||
162 | __u32 *device_parents; | ||
163 | |||
164 | __s32 max_buckets; | ||
165 | __u32 max_rules; | ||
166 | __s32 max_devices; | ||
167 | }; | ||
168 | |||
169 | |||
170 | /* crush.c */ | ||
171 | extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos); | ||
172 | extern void crush_calc_parents(struct crush_map *map); | ||
173 | extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); | ||
174 | extern void crush_destroy_bucket_list(struct crush_bucket_list *b); | ||
175 | extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); | ||
176 | extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); | ||
177 | extern void crush_destroy_bucket(struct crush_bucket *b); | ||
178 | extern void crush_destroy(struct crush_map *map); | ||
179 | |||
180 | #endif | ||
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c deleted file mode 100644 index 5873aed694bf..000000000000 --- a/fs/ceph/crush/hash.c +++ /dev/null | |||
@@ -1,149 +0,0 @@ | |||
1 | |||
2 | #include <linux/types.h> | ||
3 | #include "hash.h" | ||
4 | |||
5 | /* | ||
6 | * Robert Jenkins' function for mixing 32-bit values | ||
7 | * http://burtleburtle.net/bob/hash/evahash.html | ||
8 | * a, b = random bits, c = input and output | ||
9 | */ | ||
10 | #define crush_hashmix(a, b, c) do { \ | ||
11 | a = a-b; a = a-c; a = a^(c>>13); \ | ||
12 | b = b-c; b = b-a; b = b^(a<<8); \ | ||
13 | c = c-a; c = c-b; c = c^(b>>13); \ | ||
14 | a = a-b; a = a-c; a = a^(c>>12); \ | ||
15 | b = b-c; b = b-a; b = b^(a<<16); \ | ||
16 | c = c-a; c = c-b; c = c^(b>>5); \ | ||
17 | a = a-b; a = a-c; a = a^(c>>3); \ | ||
18 | b = b-c; b = b-a; b = b^(a<<10); \ | ||
19 | c = c-a; c = c-b; c = c^(b>>15); \ | ||
20 | } while (0) | ||
21 | |||
22 | #define crush_hash_seed 1315423911 | ||
23 | |||
24 | static __u32 crush_hash32_rjenkins1(__u32 a) | ||
25 | { | ||
26 | __u32 hash = crush_hash_seed ^ a; | ||
27 | __u32 b = a; | ||
28 | __u32 x = 231232; | ||
29 | __u32 y = 1232; | ||
30 | crush_hashmix(b, x, hash); | ||
31 | crush_hashmix(y, a, hash); | ||
32 | return hash; | ||
33 | } | ||
34 | |||
35 | static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b) | ||
36 | { | ||
37 | __u32 hash = crush_hash_seed ^ a ^ b; | ||
38 | __u32 x = 231232; | ||
39 | __u32 y = 1232; | ||
40 | crush_hashmix(a, b, hash); | ||
41 | crush_hashmix(x, a, hash); | ||
42 | crush_hashmix(b, y, hash); | ||
43 | return hash; | ||
44 | } | ||
45 | |||
46 | static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c) | ||
47 | { | ||
48 | __u32 hash = crush_hash_seed ^ a ^ b ^ c; | ||
49 | __u32 x = 231232; | ||
50 | __u32 y = 1232; | ||
51 | crush_hashmix(a, b, hash); | ||
52 | crush_hashmix(c, x, hash); | ||
53 | crush_hashmix(y, a, hash); | ||
54 | crush_hashmix(b, x, hash); | ||
55 | crush_hashmix(y, c, hash); | ||
56 | return hash; | ||
57 | } | ||
58 | |||
59 | static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d) | ||
60 | { | ||
61 | __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d; | ||
62 | __u32 x = 231232; | ||
63 | __u32 y = 1232; | ||
64 | crush_hashmix(a, b, hash); | ||
65 | crush_hashmix(c, d, hash); | ||
66 | crush_hashmix(a, x, hash); | ||
67 | crush_hashmix(y, b, hash); | ||
68 | crush_hashmix(c, x, hash); | ||
69 | crush_hashmix(y, d, hash); | ||
70 | return hash; | ||
71 | } | ||
72 | |||
73 | static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d, | ||
74 | __u32 e) | ||
75 | { | ||
76 | __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; | ||
77 | __u32 x = 231232; | ||
78 | __u32 y = 1232; | ||
79 | crush_hashmix(a, b, hash); | ||
80 | crush_hashmix(c, d, hash); | ||
81 | crush_hashmix(e, x, hash); | ||
82 | crush_hashmix(y, a, hash); | ||
83 | crush_hashmix(b, x, hash); | ||
84 | crush_hashmix(y, c, hash); | ||
85 | crush_hashmix(d, x, hash); | ||
86 | crush_hashmix(y, e, hash); | ||
87 | return hash; | ||
88 | } | ||
89 | |||
90 | |||
91 | __u32 crush_hash32(int type, __u32 a) | ||
92 | { | ||
93 | switch (type) { | ||
94 | case CRUSH_HASH_RJENKINS1: | ||
95 | return crush_hash32_rjenkins1(a); | ||
96 | default: | ||
97 | return 0; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | __u32 crush_hash32_2(int type, __u32 a, __u32 b) | ||
102 | { | ||
103 | switch (type) { | ||
104 | case CRUSH_HASH_RJENKINS1: | ||
105 | return crush_hash32_rjenkins1_2(a, b); | ||
106 | default: | ||
107 | return 0; | ||
108 | } | ||
109 | } | ||
110 | |||
111 | __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c) | ||
112 | { | ||
113 | switch (type) { | ||
114 | case CRUSH_HASH_RJENKINS1: | ||
115 | return crush_hash32_rjenkins1_3(a, b, c); | ||
116 | default: | ||
117 | return 0; | ||
118 | } | ||
119 | } | ||
120 | |||
121 | __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d) | ||
122 | { | ||
123 | switch (type) { | ||
124 | case CRUSH_HASH_RJENKINS1: | ||
125 | return crush_hash32_rjenkins1_4(a, b, c, d); | ||
126 | default: | ||
127 | return 0; | ||
128 | } | ||
129 | } | ||
130 | |||
131 | __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e) | ||
132 | { | ||
133 | switch (type) { | ||
134 | case CRUSH_HASH_RJENKINS1: | ||
135 | return crush_hash32_rjenkins1_5(a, b, c, d, e); | ||
136 | default: | ||
137 | return 0; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | const char *crush_hash_name(int type) | ||
142 | { | ||
143 | switch (type) { | ||
144 | case CRUSH_HASH_RJENKINS1: | ||
145 | return "rjenkins1"; | ||
146 | default: | ||
147 | return "unknown"; | ||
148 | } | ||
149 | } | ||
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h deleted file mode 100644 index 91e884230d5d..000000000000 --- a/fs/ceph/crush/hash.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | #ifndef CEPH_CRUSH_HASH_H | ||
2 | #define CEPH_CRUSH_HASH_H | ||
3 | |||
4 | #define CRUSH_HASH_RJENKINS1 0 | ||
5 | |||
6 | #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 | ||
7 | |||
8 | extern const char *crush_hash_name(int type); | ||
9 | |||
10 | extern __u32 crush_hash32(int type, __u32 a); | ||
11 | extern __u32 crush_hash32_2(int type, __u32 a, __u32 b); | ||
12 | extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c); | ||
13 | extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d); | ||
14 | extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, | ||
15 | __u32 e); | ||
16 | |||
17 | #endif | ||
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c deleted file mode 100644 index a4eec133258e..000000000000 --- a/fs/ceph/crush/mapper.c +++ /dev/null | |||
@@ -1,609 +0,0 @@ | |||
1 | |||
2 | #ifdef __KERNEL__ | ||
3 | # include <linux/string.h> | ||
4 | # include <linux/slab.h> | ||
5 | # include <linux/bug.h> | ||
6 | # include <linux/kernel.h> | ||
7 | # ifndef dprintk | ||
8 | # define dprintk(args...) | ||
9 | # endif | ||
10 | #else | ||
11 | # include <string.h> | ||
12 | # include <stdio.h> | ||
13 | # include <stdlib.h> | ||
14 | # include <assert.h> | ||
15 | # define BUG_ON(x) assert(!(x)) | ||
16 | # define dprintk(args...) /* printf(args) */ | ||
17 | # define kmalloc(x, f) malloc(x) | ||
18 | # define kfree(x) free(x) | ||
19 | #endif | ||
20 | |||
21 | #include "crush.h" | ||
22 | #include "hash.h" | ||
23 | |||
24 | /* | ||
25 | * Implement the core CRUSH mapping algorithm. | ||
26 | */ | ||
27 | |||
28 | /** | ||
29 | * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. | ||
30 | * @map: the crush_map | ||
31 | * @ruleset: the storage ruleset id (user defined) | ||
32 | * @type: storage ruleset type (user defined) | ||
33 | * @size: output set size | ||
34 | */ | ||
35 | int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) | ||
36 | { | ||
37 | int i; | ||
38 | |||
39 | for (i = 0; i < map->max_rules; i++) { | ||
40 | if (map->rules[i] && | ||
41 | map->rules[i]->mask.ruleset == ruleset && | ||
42 | map->rules[i]->mask.type == type && | ||
43 | map->rules[i]->mask.min_size <= size && | ||
44 | map->rules[i]->mask.max_size >= size) | ||
45 | return i; | ||
46 | } | ||
47 | return -1; | ||
48 | } | ||
49 | |||
50 | |||
51 | /* | ||
52 | * bucket choose methods | ||
53 | * | ||
54 | * For each bucket algorithm, we have a "choose" method that, given a | ||
55 | * crush input @x and replica position (usually, position in output set) @r, | ||
56 | * will produce an item in the bucket. | ||
57 | */ | ||
58 | |||
59 | /* | ||
60 | * Choose based on a random permutation of the bucket. | ||
61 | * | ||
62 | * We used to use some prime number arithmetic to do this, but it | ||
63 | * wasn't very random, and had some other bad behaviors. Instead, we | ||
64 | * calculate an actual random permutation of the bucket members. | ||
65 | * Since this is expensive, we optimize for the r=0 case, which | ||
66 | * captures the vast majority of calls. | ||
67 | */ | ||
68 | static int bucket_perm_choose(struct crush_bucket *bucket, | ||
69 | int x, int r) | ||
70 | { | ||
71 | unsigned pr = r % bucket->size; | ||
72 | unsigned i, s; | ||
73 | |||
74 | /* start a new permutation if @x has changed */ | ||
75 | if (bucket->perm_x != x || bucket->perm_n == 0) { | ||
76 | dprintk("bucket %d new x=%d\n", bucket->id, x); | ||
77 | bucket->perm_x = x; | ||
78 | |||
79 | /* optimize common r=0 case */ | ||
80 | if (pr == 0) { | ||
81 | s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % | ||
82 | bucket->size; | ||
83 | bucket->perm[0] = s; | ||
84 | bucket->perm_n = 0xffff; /* magic value, see below */ | ||
85 | goto out; | ||
86 | } | ||
87 | |||
88 | for (i = 0; i < bucket->size; i++) | ||
89 | bucket->perm[i] = i; | ||
90 | bucket->perm_n = 0; | ||
91 | } else if (bucket->perm_n == 0xffff) { | ||
92 | /* clean up after the r=0 case above */ | ||
93 | for (i = 1; i < bucket->size; i++) | ||
94 | bucket->perm[i] = i; | ||
95 | bucket->perm[bucket->perm[0]] = 0; | ||
96 | bucket->perm_n = 1; | ||
97 | } | ||
98 | |||
99 | /* calculate permutation up to pr */ | ||
100 | for (i = 0; i < bucket->perm_n; i++) | ||
101 | dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); | ||
102 | while (bucket->perm_n <= pr) { | ||
103 | unsigned p = bucket->perm_n; | ||
104 | /* no point in swapping the final entry */ | ||
105 | if (p < bucket->size - 1) { | ||
106 | i = crush_hash32_3(bucket->hash, x, bucket->id, p) % | ||
107 | (bucket->size - p); | ||
108 | if (i) { | ||
109 | unsigned t = bucket->perm[p + i]; | ||
110 | bucket->perm[p + i] = bucket->perm[p]; | ||
111 | bucket->perm[p] = t; | ||
112 | } | ||
113 | dprintk(" perm_choose swap %d with %d\n", p, p+i); | ||
114 | } | ||
115 | bucket->perm_n++; | ||
116 | } | ||
117 | for (i = 0; i < bucket->size; i++) | ||
118 | dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); | ||
119 | |||
120 | s = bucket->perm[pr]; | ||
121 | out: | ||
122 | dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, | ||
123 | bucket->size, x, r, pr, s); | ||
124 | return bucket->items[s]; | ||
125 | } | ||
126 | |||
127 | /* uniform */ | ||
128 | static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, | ||
129 | int x, int r) | ||
130 | { | ||
131 | return bucket_perm_choose(&bucket->h, x, r); | ||
132 | } | ||
133 | |||
134 | /* list */ | ||
135 | static int bucket_list_choose(struct crush_bucket_list *bucket, | ||
136 | int x, int r) | ||
137 | { | ||
138 | int i; | ||
139 | |||
140 | for (i = bucket->h.size-1; i >= 0; i--) { | ||
141 | __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], | ||
142 | r, bucket->h.id); | ||
143 | w &= 0xffff; | ||
144 | dprintk("list_choose i=%d x=%d r=%d item %d weight %x " | ||
145 | "sw %x rand %llx", | ||
146 | i, x, r, bucket->h.items[i], bucket->item_weights[i], | ||
147 | bucket->sum_weights[i], w); | ||
148 | w *= bucket->sum_weights[i]; | ||
149 | w = w >> 16; | ||
150 | /*dprintk(" scaled %llx\n", w);*/ | ||
151 | if (w < bucket->item_weights[i]) | ||
152 | return bucket->h.items[i]; | ||
153 | } | ||
154 | |||
155 | BUG_ON(1); | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | |||
160 | /* (binary) tree */ | ||
161 | static int height(int n) | ||
162 | { | ||
163 | int h = 0; | ||
164 | while ((n & 1) == 0) { | ||
165 | h++; | ||
166 | n = n >> 1; | ||
167 | } | ||
168 | return h; | ||
169 | } | ||
170 | |||
171 | static int left(int x) | ||
172 | { | ||
173 | int h = height(x); | ||
174 | return x - (1 << (h-1)); | ||
175 | } | ||
176 | |||
177 | static int right(int x) | ||
178 | { | ||
179 | int h = height(x); | ||
180 | return x + (1 << (h-1)); | ||
181 | } | ||
182 | |||
183 | static int terminal(int x) | ||
184 | { | ||
185 | return x & 1; | ||
186 | } | ||
187 | |||
188 | static int bucket_tree_choose(struct crush_bucket_tree *bucket, | ||
189 | int x, int r) | ||
190 | { | ||
191 | int n, l; | ||
192 | __u32 w; | ||
193 | __u64 t; | ||
194 | |||
195 | /* start at root */ | ||
196 | n = bucket->num_nodes >> 1; | ||
197 | |||
198 | while (!terminal(n)) { | ||
199 | /* pick point in [0, w) */ | ||
200 | w = bucket->node_weights[n]; | ||
201 | t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, | ||
202 | bucket->h.id) * (__u64)w; | ||
203 | t = t >> 32; | ||
204 | |||
205 | /* descend to the left or right? */ | ||
206 | l = left(n); | ||
207 | if (t < bucket->node_weights[l]) | ||
208 | n = l; | ||
209 | else | ||
210 | n = right(n); | ||
211 | } | ||
212 | |||
213 | return bucket->h.items[n >> 1]; | ||
214 | } | ||
215 | |||
216 | |||
217 | /* straw */ | ||
218 | |||
219 | static int bucket_straw_choose(struct crush_bucket_straw *bucket, | ||
220 | int x, int r) | ||
221 | { | ||
222 | int i; | ||
223 | int high = 0; | ||
224 | __u64 high_draw = 0; | ||
225 | __u64 draw; | ||
226 | |||
227 | for (i = 0; i < bucket->h.size; i++) { | ||
228 | draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); | ||
229 | draw &= 0xffff; | ||
230 | draw *= bucket->straws[i]; | ||
231 | if (i == 0 || draw > high_draw) { | ||
232 | high = i; | ||
233 | high_draw = draw; | ||
234 | } | ||
235 | } | ||
236 | return bucket->h.items[high]; | ||
237 | } | ||
238 | |||
239 | static int crush_bucket_choose(struct crush_bucket *in, int x, int r) | ||
240 | { | ||
241 | dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); | ||
242 | switch (in->alg) { | ||
243 | case CRUSH_BUCKET_UNIFORM: | ||
244 | return bucket_uniform_choose((struct crush_bucket_uniform *)in, | ||
245 | x, r); | ||
246 | case CRUSH_BUCKET_LIST: | ||
247 | return bucket_list_choose((struct crush_bucket_list *)in, | ||
248 | x, r); | ||
249 | case CRUSH_BUCKET_TREE: | ||
250 | return bucket_tree_choose((struct crush_bucket_tree *)in, | ||
251 | x, r); | ||
252 | case CRUSH_BUCKET_STRAW: | ||
253 | return bucket_straw_choose((struct crush_bucket_straw *)in, | ||
254 | x, r); | ||
255 | default: | ||
256 | BUG_ON(1); | ||
257 | return in->items[0]; | ||
258 | } | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * true if device is marked "out" (failed, fully offloaded) | ||
263 | * of the cluster | ||
264 | */ | ||
265 | static int is_out(struct crush_map *map, __u32 *weight, int item, int x) | ||
266 | { | ||
267 | if (weight[item] >= 0x10000) | ||
268 | return 0; | ||
269 | if (weight[item] == 0) | ||
270 | return 1; | ||
271 | if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) | ||
272 | < weight[item]) | ||
273 | return 0; | ||
274 | return 1; | ||
275 | } | ||
276 | |||
277 | /** | ||
278 | * crush_choose - choose numrep distinct items of given type | ||
279 | * @map: the crush_map | ||
280 | * @bucket: the bucket we are choosing an item from | ||
281 | * @x: crush input value | ||
282 | * @numrep: the number of items to choose | ||
283 | * @type: the type of item to choose | ||
284 | * @out: pointer to output vector | ||
285 | * @outpos: our position in that vector | ||
286 | * @firstn: true if choosing "first n" items, false if choosing "indep" | ||
287 | * @recurse_to_leaf: true if we want one device under each item of given type | ||
288 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | ||
289 | */ | ||
290 | static int crush_choose(struct crush_map *map, | ||
291 | struct crush_bucket *bucket, | ||
292 | __u32 *weight, | ||
293 | int x, int numrep, int type, | ||
294 | int *out, int outpos, | ||
295 | int firstn, int recurse_to_leaf, | ||
296 | int *out2) | ||
297 | { | ||
298 | int rep; | ||
299 | int ftotal, flocal; | ||
300 | int retry_descent, retry_bucket, skip_rep; | ||
301 | struct crush_bucket *in = bucket; | ||
302 | int r; | ||
303 | int i; | ||
304 | int item = 0; | ||
305 | int itemtype; | ||
306 | int collide, reject; | ||
307 | const int orig_tries = 5; /* attempts before we fall back to search */ | ||
308 | |||
309 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", | ||
310 | bucket->id, x, outpos, numrep); | ||
311 | |||
312 | for (rep = outpos; rep < numrep; rep++) { | ||
313 | /* keep trying until we get a non-out, non-colliding item */ | ||
314 | ftotal = 0; | ||
315 | skip_rep = 0; | ||
316 | do { | ||
317 | retry_descent = 0; | ||
318 | in = bucket; /* initial bucket */ | ||
319 | |||
320 | /* choose through intervening buckets */ | ||
321 | flocal = 0; | ||
322 | do { | ||
323 | collide = 0; | ||
324 | retry_bucket = 0; | ||
325 | r = rep; | ||
326 | if (in->alg == CRUSH_BUCKET_UNIFORM) { | ||
327 | /* be careful */ | ||
328 | if (firstn || numrep >= in->size) | ||
329 | /* r' = r + f_total */ | ||
330 | r += ftotal; | ||
331 | else if (in->size % numrep == 0) | ||
332 | /* r'=r+(n+1)*f_local */ | ||
333 | r += (numrep+1) * | ||
334 | (flocal+ftotal); | ||
335 | else | ||
336 | /* r' = r + n*f_local */ | ||
337 | r += numrep * (flocal+ftotal); | ||
338 | } else { | ||
339 | if (firstn) | ||
340 | /* r' = r + f_total */ | ||
341 | r += ftotal; | ||
342 | else | ||
343 | /* r' = r + n*f_local */ | ||
344 | r += numrep * (flocal+ftotal); | ||
345 | } | ||
346 | |||
347 | /* bucket choose */ | ||
348 | if (in->size == 0) { | ||
349 | reject = 1; | ||
350 | goto reject; | ||
351 | } | ||
352 | if (flocal >= (in->size>>1) && | ||
353 | flocal > orig_tries) | ||
354 | item = bucket_perm_choose(in, x, r); | ||
355 | else | ||
356 | item = crush_bucket_choose(in, x, r); | ||
357 | BUG_ON(item >= map->max_devices); | ||
358 | |||
359 | /* desired type? */ | ||
360 | if (item < 0) | ||
361 | itemtype = map->buckets[-1-item]->type; | ||
362 | else | ||
363 | itemtype = 0; | ||
364 | dprintk(" item %d type %d\n", item, itemtype); | ||
365 | |||
366 | /* keep going? */ | ||
367 | if (itemtype != type) { | ||
368 | BUG_ON(item >= 0 || | ||
369 | (-1-item) >= map->max_buckets); | ||
370 | in = map->buckets[-1-item]; | ||
371 | retry_bucket = 1; | ||
372 | continue; | ||
373 | } | ||
374 | |||
375 | /* collision? */ | ||
376 | for (i = 0; i < outpos; i++) { | ||
377 | if (out[i] == item) { | ||
378 | collide = 1; | ||
379 | break; | ||
380 | } | ||
381 | } | ||
382 | |||
383 | reject = 0; | ||
384 | if (recurse_to_leaf) { | ||
385 | if (item < 0) { | ||
386 | if (crush_choose(map, | ||
387 | map->buckets[-1-item], | ||
388 | weight, | ||
389 | x, outpos+1, 0, | ||
390 | out2, outpos, | ||
391 | firstn, 0, | ||
392 | NULL) <= outpos) | ||
393 | /* didn't get leaf */ | ||
394 | reject = 1; | ||
395 | } else { | ||
396 | /* we already have a leaf! */ | ||
397 | out2[outpos] = item; | ||
398 | } | ||
399 | } | ||
400 | |||
401 | if (!reject) { | ||
402 | /* out? */ | ||
403 | if (itemtype == 0) | ||
404 | reject = is_out(map, weight, | ||
405 | item, x); | ||
406 | else | ||
407 | reject = 0; | ||
408 | } | ||
409 | |||
410 | reject: | ||
411 | if (reject || collide) { | ||
412 | ftotal++; | ||
413 | flocal++; | ||
414 | |||
415 | if (collide && flocal < 3) | ||
416 | /* retry locally a few times */ | ||
417 | retry_bucket = 1; | ||
418 | else if (flocal < in->size + orig_tries) | ||
419 | /* exhaustive bucket search */ | ||
420 | retry_bucket = 1; | ||
421 | else if (ftotal < 20) | ||
422 | /* then retry descent */ | ||
423 | retry_descent = 1; | ||
424 | else | ||
425 | /* else give up */ | ||
426 | skip_rep = 1; | ||
427 | dprintk(" reject %d collide %d " | ||
428 | "ftotal %d flocal %d\n", | ||
429 | reject, collide, ftotal, | ||
430 | flocal); | ||
431 | } | ||
432 | } while (retry_bucket); | ||
433 | } while (retry_descent); | ||
434 | |||
435 | if (skip_rep) { | ||
436 | dprintk("skip rep\n"); | ||
437 | continue; | ||
438 | } | ||
439 | |||
440 | dprintk("CHOOSE got %d\n", item); | ||
441 | out[outpos] = item; | ||
442 | outpos++; | ||
443 | } | ||
444 | |||
445 | dprintk("CHOOSE returns %d\n", outpos); | ||
446 | return outpos; | ||
447 | } | ||
448 | |||
449 | |||
450 | /** | ||
451 | * crush_do_rule - calculate a mapping with the given input and rule | ||
452 | * @map: the crush_map | ||
453 | * @ruleno: the rule id | ||
454 | * @x: hash input | ||
455 | * @result: pointer to result vector | ||
456 | * @result_max: maximum result size | ||
457 | * @force: force initial replica choice; -1 for none | ||
458 | */ | ||
459 | int crush_do_rule(struct crush_map *map, | ||
460 | int ruleno, int x, int *result, int result_max, | ||
461 | int force, __u32 *weight) | ||
462 | { | ||
463 | int result_len; | ||
464 | int force_context[CRUSH_MAX_DEPTH]; | ||
465 | int force_pos = -1; | ||
466 | int a[CRUSH_MAX_SET]; | ||
467 | int b[CRUSH_MAX_SET]; | ||
468 | int c[CRUSH_MAX_SET]; | ||
469 | int recurse_to_leaf; | ||
470 | int *w; | ||
471 | int wsize = 0; | ||
472 | int *o; | ||
473 | int osize; | ||
474 | int *tmp; | ||
475 | struct crush_rule *rule; | ||
476 | int step; | ||
477 | int i, j; | ||
478 | int numrep; | ||
479 | int firstn; | ||
480 | int rc = -1; | ||
481 | |||
482 | BUG_ON(ruleno >= map->max_rules); | ||
483 | |||
484 | rule = map->rules[ruleno]; | ||
485 | result_len = 0; | ||
486 | w = a; | ||
487 | o = b; | ||
488 | |||
489 | /* | ||
490 | * determine hierarchical context of force, if any. note | ||
491 | * that this may or may not correspond to the specific types | ||
492 | * referenced by the crush rule. | ||
493 | */ | ||
494 | if (force >= 0) { | ||
495 | if (force >= map->max_devices || | ||
496 | map->device_parents[force] == 0) { | ||
497 | /*dprintk("CRUSH: forcefed device dne\n");*/ | ||
498 | rc = -1; /* force fed device dne */ | ||
499 | goto out; | ||
500 | } | ||
501 | if (!is_out(map, weight, force, x)) { | ||
502 | while (1) { | ||
503 | force_context[++force_pos] = force; | ||
504 | if (force >= 0) | ||
505 | force = map->device_parents[force]; | ||
506 | else | ||
507 | force = map->bucket_parents[-1-force]; | ||
508 | if (force == 0) | ||
509 | break; | ||
510 | } | ||
511 | } | ||
512 | } | ||
513 | |||
514 | for (step = 0; step < rule->len; step++) { | ||
515 | firstn = 0; | ||
516 | switch (rule->steps[step].op) { | ||
517 | case CRUSH_RULE_TAKE: | ||
518 | w[0] = rule->steps[step].arg1; | ||
519 | if (force_pos >= 0) { | ||
520 | BUG_ON(force_context[force_pos] != w[0]); | ||
521 | force_pos--; | ||
522 | } | ||
523 | wsize = 1; | ||
524 | break; | ||
525 | |||
526 | case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: | ||
527 | case CRUSH_RULE_CHOOSE_FIRSTN: | ||
528 | firstn = 1; | ||
529 | case CRUSH_RULE_CHOOSE_LEAF_INDEP: | ||
530 | case CRUSH_RULE_CHOOSE_INDEP: | ||
531 | BUG_ON(wsize == 0); | ||
532 | |||
533 | recurse_to_leaf = | ||
534 | rule->steps[step].op == | ||
535 | CRUSH_RULE_CHOOSE_LEAF_FIRSTN || | ||
536 | rule->steps[step].op == | ||
537 | CRUSH_RULE_CHOOSE_LEAF_INDEP; | ||
538 | |||
539 | /* reset output */ | ||
540 | osize = 0; | ||
541 | |||
542 | for (i = 0; i < wsize; i++) { | ||
543 | /* | ||
544 | * see CRUSH_N, CRUSH_N_MINUS macros. | ||
545 | * basically, numrep <= 0 means relative to | ||
546 | * the provided result_max | ||
547 | */ | ||
548 | numrep = rule->steps[step].arg1; | ||
549 | if (numrep <= 0) { | ||
550 | numrep += result_max; | ||
551 | if (numrep <= 0) | ||
552 | continue; | ||
553 | } | ||
554 | j = 0; | ||
555 | if (osize == 0 && force_pos >= 0) { | ||
556 | /* skip any intermediate types */ | ||
557 | while (force_pos && | ||
558 | force_context[force_pos] < 0 && | ||
559 | rule->steps[step].arg2 != | ||
560 | map->buckets[-1 - | ||
561 | force_context[force_pos]]->type) | ||
562 | force_pos--; | ||
563 | o[osize] = force_context[force_pos]; | ||
564 | if (recurse_to_leaf) | ||
565 | c[osize] = force_context[0]; | ||
566 | j++; | ||
567 | force_pos--; | ||
568 | } | ||
569 | osize += crush_choose(map, | ||
570 | map->buckets[-1-w[i]], | ||
571 | weight, | ||
572 | x, numrep, | ||
573 | rule->steps[step].arg2, | ||
574 | o+osize, j, | ||
575 | firstn, | ||
576 | recurse_to_leaf, c+osize); | ||
577 | } | ||
578 | |||
579 | if (recurse_to_leaf) | ||
580 | /* copy final _leaf_ values to output set */ | ||
581 | memcpy(o, c, osize*sizeof(*o)); | ||
582 | |||
583 | /* swap t and w arrays */ | ||
584 | tmp = o; | ||
585 | o = w; | ||
586 | w = tmp; | ||
587 | wsize = osize; | ||
588 | break; | ||
589 | |||
590 | |||
591 | case CRUSH_RULE_EMIT: | ||
592 | for (i = 0; i < wsize && result_len < result_max; i++) { | ||
593 | result[result_len] = w[i]; | ||
594 | result_len++; | ||
595 | } | ||
596 | wsize = 0; | ||
597 | break; | ||
598 | |||
599 | default: | ||
600 | BUG_ON(1); | ||
601 | } | ||
602 | } | ||
603 | rc = result_len; | ||
604 | |||
605 | out: | ||
606 | return rc; | ||
607 | } | ||
608 | |||
609 | |||
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h deleted file mode 100644 index c46b99c18bb0..000000000000 --- a/fs/ceph/crush/mapper.h +++ /dev/null | |||
@@ -1,20 +0,0 @@ | |||
1 | #ifndef CEPH_CRUSH_MAPPER_H | ||
2 | #define CEPH_CRUSH_MAPPER_H | ||
3 | |||
4 | /* | ||
5 | * CRUSH functions for finding rules and then mapping an input to an | ||
6 | * output set. | ||
7 | * | ||
8 | * LGPL2 | ||
9 | */ | ||
10 | |||
11 | #include "crush.h" | ||
12 | |||
13 | extern int crush_find_rule(struct crush_map *map, int pool, int type, int size); | ||
14 | extern int crush_do_rule(struct crush_map *map, | ||
15 | int ruleno, | ||
16 | int x, int *result, int result_max, | ||
17 | int forcefeed, /* -1 for none */ | ||
18 | __u32 *weights); | ||
19 | |||
20 | #endif | ||
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c deleted file mode 100644 index a3e627f63293..000000000000 --- a/fs/ceph/crypto.c +++ /dev/null | |||
@@ -1,412 +0,0 @@ | |||
1 | |||
2 | #include "ceph_debug.h" | ||
3 | |||
4 | #include <linux/err.h> | ||
5 | #include <linux/scatterlist.h> | ||
6 | #include <linux/slab.h> | ||
7 | #include <crypto/hash.h> | ||
8 | |||
9 | #include "crypto.h" | ||
10 | #include "decode.h" | ||
11 | |||
12 | int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) | ||
13 | { | ||
14 | if (*p + sizeof(u16) + sizeof(key->created) + | ||
15 | sizeof(u16) + key->len > end) | ||
16 | return -ERANGE; | ||
17 | ceph_encode_16(p, key->type); | ||
18 | ceph_encode_copy(p, &key->created, sizeof(key->created)); | ||
19 | ceph_encode_16(p, key->len); | ||
20 | ceph_encode_copy(p, key->key, key->len); | ||
21 | return 0; | ||
22 | } | ||
23 | |||
24 | int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) | ||
25 | { | ||
26 | ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); | ||
27 | key->type = ceph_decode_16(p); | ||
28 | ceph_decode_copy(p, &key->created, sizeof(key->created)); | ||
29 | key->len = ceph_decode_16(p); | ||
30 | ceph_decode_need(p, end, key->len, bad); | ||
31 | key->key = kmalloc(key->len, GFP_NOFS); | ||
32 | if (!key->key) | ||
33 | return -ENOMEM; | ||
34 | ceph_decode_copy(p, key->key, key->len); | ||
35 | return 0; | ||
36 | |||
37 | bad: | ||
38 | dout("failed to decode crypto key\n"); | ||
39 | return -EINVAL; | ||
40 | } | ||
41 | |||
42 | int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) | ||
43 | { | ||
44 | int inlen = strlen(inkey); | ||
45 | int blen = inlen * 3 / 4; | ||
46 | void *buf, *p; | ||
47 | int ret; | ||
48 | |||
49 | dout("crypto_key_unarmor %s\n", inkey); | ||
50 | buf = kmalloc(blen, GFP_NOFS); | ||
51 | if (!buf) | ||
52 | return -ENOMEM; | ||
53 | blen = ceph_unarmor(buf, inkey, inkey+inlen); | ||
54 | if (blen < 0) { | ||
55 | kfree(buf); | ||
56 | return blen; | ||
57 | } | ||
58 | |||
59 | p = buf; | ||
60 | ret = ceph_crypto_key_decode(key, &p, p + blen); | ||
61 | kfree(buf); | ||
62 | if (ret) | ||
63 | return ret; | ||
64 | dout("crypto_key_unarmor key %p type %d len %d\n", key, | ||
65 | key->type, key->len); | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | |||
70 | |||
71 | #define AES_KEY_SIZE 16 | ||
72 | |||
73 | static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) | ||
74 | { | ||
75 | return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); | ||
76 | } | ||
77 | |||
78 | static const u8 *aes_iv = (u8 *)CEPH_AES_IV; | ||
79 | |||
80 | static int ceph_aes_encrypt(const void *key, int key_len, | ||
81 | void *dst, size_t *dst_len, | ||
82 | const void *src, size_t src_len) | ||
83 | { | ||
84 | struct scatterlist sg_in[2], sg_out[1]; | ||
85 | struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); | ||
86 | struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; | ||
87 | int ret; | ||
88 | void *iv; | ||
89 | int ivsize; | ||
90 | size_t zero_padding = (0x10 - (src_len & 0x0f)); | ||
91 | char pad[16]; | ||
92 | |||
93 | if (IS_ERR(tfm)) | ||
94 | return PTR_ERR(tfm); | ||
95 | |||
96 | memset(pad, zero_padding, zero_padding); | ||
97 | |||
98 | *dst_len = src_len + zero_padding; | ||
99 | |||
100 | crypto_blkcipher_setkey((void *)tfm, key, key_len); | ||
101 | sg_init_table(sg_in, 2); | ||
102 | sg_set_buf(&sg_in[0], src, src_len); | ||
103 | sg_set_buf(&sg_in[1], pad, zero_padding); | ||
104 | sg_init_table(sg_out, 1); | ||
105 | sg_set_buf(sg_out, dst, *dst_len); | ||
106 | iv = crypto_blkcipher_crt(tfm)->iv; | ||
107 | ivsize = crypto_blkcipher_ivsize(tfm); | ||
108 | |||
109 | memcpy(iv, aes_iv, ivsize); | ||
110 | /* | ||
111 | print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, | ||
112 | key, key_len, 1); | ||
113 | print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1, | ||
114 | src, src_len, 1); | ||
115 | print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, | ||
116 | pad, zero_padding, 1); | ||
117 | */ | ||
118 | ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, | ||
119 | src_len + zero_padding); | ||
120 | crypto_free_blkcipher(tfm); | ||
121 | if (ret < 0) | ||
122 | pr_err("ceph_aes_crypt failed %d\n", ret); | ||
123 | /* | ||
124 | print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, | ||
125 | dst, *dst_len, 1); | ||
126 | */ | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, | ||
131 | size_t *dst_len, | ||
132 | const void *src1, size_t src1_len, | ||
133 | const void *src2, size_t src2_len) | ||
134 | { | ||
135 | struct scatterlist sg_in[3], sg_out[1]; | ||
136 | struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); | ||
137 | struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; | ||
138 | int ret; | ||
139 | void *iv; | ||
140 | int ivsize; | ||
141 | size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f)); | ||
142 | char pad[16]; | ||
143 | |||
144 | if (IS_ERR(tfm)) | ||
145 | return PTR_ERR(tfm); | ||
146 | |||
147 | memset(pad, zero_padding, zero_padding); | ||
148 | |||
149 | *dst_len = src1_len + src2_len + zero_padding; | ||
150 | |||
151 | crypto_blkcipher_setkey((void *)tfm, key, key_len); | ||
152 | sg_init_table(sg_in, 3); | ||
153 | sg_set_buf(&sg_in[0], src1, src1_len); | ||
154 | sg_set_buf(&sg_in[1], src2, src2_len); | ||
155 | sg_set_buf(&sg_in[2], pad, zero_padding); | ||
156 | sg_init_table(sg_out, 1); | ||
157 | sg_set_buf(sg_out, dst, *dst_len); | ||
158 | iv = crypto_blkcipher_crt(tfm)->iv; | ||
159 | ivsize = crypto_blkcipher_ivsize(tfm); | ||
160 | |||
161 | memcpy(iv, aes_iv, ivsize); | ||
162 | /* | ||
163 | print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, | ||
164 | key, key_len, 1); | ||
165 | print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1, | ||
166 | src1, src1_len, 1); | ||
167 | print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1, | ||
168 | src2, src2_len, 1); | ||
169 | print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, | ||
170 | pad, zero_padding, 1); | ||
171 | */ | ||
172 | ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, | ||
173 | src1_len + src2_len + zero_padding); | ||
174 | crypto_free_blkcipher(tfm); | ||
175 | if (ret < 0) | ||
176 | pr_err("ceph_aes_crypt2 failed %d\n", ret); | ||
177 | /* | ||
178 | print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, | ||
179 | dst, *dst_len, 1); | ||
180 | */ | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | static int ceph_aes_decrypt(const void *key, int key_len, | ||
185 | void *dst, size_t *dst_len, | ||
186 | const void *src, size_t src_len) | ||
187 | { | ||
188 | struct scatterlist sg_in[1], sg_out[2]; | ||
189 | struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); | ||
190 | struct blkcipher_desc desc = { .tfm = tfm }; | ||
191 | char pad[16]; | ||
192 | void *iv; | ||
193 | int ivsize; | ||
194 | int ret; | ||
195 | int last_byte; | ||
196 | |||
197 | if (IS_ERR(tfm)) | ||
198 | return PTR_ERR(tfm); | ||
199 | |||
200 | crypto_blkcipher_setkey((void *)tfm, key, key_len); | ||
201 | sg_init_table(sg_in, 1); | ||
202 | sg_init_table(sg_out, 2); | ||
203 | sg_set_buf(sg_in, src, src_len); | ||
204 | sg_set_buf(&sg_out[0], dst, *dst_len); | ||
205 | sg_set_buf(&sg_out[1], pad, sizeof(pad)); | ||
206 | |||
207 | iv = crypto_blkcipher_crt(tfm)->iv; | ||
208 | ivsize = crypto_blkcipher_ivsize(tfm); | ||
209 | |||
210 | memcpy(iv, aes_iv, ivsize); | ||
211 | |||
212 | /* | ||
213 | print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, | ||
214 | key, key_len, 1); | ||
215 | print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, | ||
216 | src, src_len, 1); | ||
217 | */ | ||
218 | |||
219 | ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); | ||
220 | crypto_free_blkcipher(tfm); | ||
221 | if (ret < 0) { | ||
222 | pr_err("ceph_aes_decrypt failed %d\n", ret); | ||
223 | return ret; | ||
224 | } | ||
225 | |||
226 | if (src_len <= *dst_len) | ||
227 | last_byte = ((char *)dst)[src_len - 1]; | ||
228 | else | ||
229 | last_byte = pad[src_len - *dst_len - 1]; | ||
230 | if (last_byte <= 16 && src_len >= last_byte) { | ||
231 | *dst_len = src_len - last_byte; | ||
232 | } else { | ||
233 | pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", | ||
234 | last_byte, (int)src_len); | ||
235 | return -EPERM; /* bad padding */ | ||
236 | } | ||
237 | /* | ||
238 | print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1, | ||
239 | dst, *dst_len, 1); | ||
240 | */ | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | static int ceph_aes_decrypt2(const void *key, int key_len, | ||
245 | void *dst1, size_t *dst1_len, | ||
246 | void *dst2, size_t *dst2_len, | ||
247 | const void *src, size_t src_len) | ||
248 | { | ||
249 | struct scatterlist sg_in[1], sg_out[3]; | ||
250 | struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); | ||
251 | struct blkcipher_desc desc = { .tfm = tfm }; | ||
252 | char pad[16]; | ||
253 | void *iv; | ||
254 | int ivsize; | ||
255 | int ret; | ||
256 | int last_byte; | ||
257 | |||
258 | if (IS_ERR(tfm)) | ||
259 | return PTR_ERR(tfm); | ||
260 | |||
261 | sg_init_table(sg_in, 1); | ||
262 | sg_set_buf(sg_in, src, src_len); | ||
263 | sg_init_table(sg_out, 3); | ||
264 | sg_set_buf(&sg_out[0], dst1, *dst1_len); | ||
265 | sg_set_buf(&sg_out[1], dst2, *dst2_len); | ||
266 | sg_set_buf(&sg_out[2], pad, sizeof(pad)); | ||
267 | |||
268 | crypto_blkcipher_setkey((void *)tfm, key, key_len); | ||
269 | iv = crypto_blkcipher_crt(tfm)->iv; | ||
270 | ivsize = crypto_blkcipher_ivsize(tfm); | ||
271 | |||
272 | memcpy(iv, aes_iv, ivsize); | ||
273 | |||
274 | /* | ||
275 | print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, | ||
276 | key, key_len, 1); | ||
277 | print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, | ||
278 | src, src_len, 1); | ||
279 | */ | ||
280 | |||
281 | ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); | ||
282 | crypto_free_blkcipher(tfm); | ||
283 | if (ret < 0) { | ||
284 | pr_err("ceph_aes_decrypt failed %d\n", ret); | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | if (src_len <= *dst1_len) | ||
289 | last_byte = ((char *)dst1)[src_len - 1]; | ||
290 | else if (src_len <= *dst1_len + *dst2_len) | ||
291 | last_byte = ((char *)dst2)[src_len - *dst1_len - 1]; | ||
292 | else | ||
293 | last_byte = pad[src_len - *dst1_len - *dst2_len - 1]; | ||
294 | if (last_byte <= 16 && src_len >= last_byte) { | ||
295 | src_len -= last_byte; | ||
296 | } else { | ||
297 | pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", | ||
298 | last_byte, (int)src_len); | ||
299 | return -EPERM; /* bad padding */ | ||
300 | } | ||
301 | |||
302 | if (src_len < *dst1_len) { | ||
303 | *dst1_len = src_len; | ||
304 | *dst2_len = 0; | ||
305 | } else { | ||
306 | *dst2_len = src_len - *dst1_len; | ||
307 | } | ||
308 | /* | ||
309 | print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1, | ||
310 | dst1, *dst1_len, 1); | ||
311 | print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1, | ||
312 | dst2, *dst2_len, 1); | ||
313 | */ | ||
314 | |||
315 | return 0; | ||
316 | } | ||
317 | |||
318 | |||
319 | int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, | ||
320 | const void *src, size_t src_len) | ||
321 | { | ||
322 | switch (secret->type) { | ||
323 | case CEPH_CRYPTO_NONE: | ||
324 | if (*dst_len < src_len) | ||
325 | return -ERANGE; | ||
326 | memcpy(dst, src, src_len); | ||
327 | *dst_len = src_len; | ||
328 | return 0; | ||
329 | |||
330 | case CEPH_CRYPTO_AES: | ||
331 | return ceph_aes_decrypt(secret->key, secret->len, dst, | ||
332 | dst_len, src, src_len); | ||
333 | |||
334 | default: | ||
335 | return -EINVAL; | ||
336 | } | ||
337 | } | ||
338 | |||
339 | int ceph_decrypt2(struct ceph_crypto_key *secret, | ||
340 | void *dst1, size_t *dst1_len, | ||
341 | void *dst2, size_t *dst2_len, | ||
342 | const void *src, size_t src_len) | ||
343 | { | ||
344 | size_t t; | ||
345 | |||
346 | switch (secret->type) { | ||
347 | case CEPH_CRYPTO_NONE: | ||
348 | if (*dst1_len + *dst2_len < src_len) | ||
349 | return -ERANGE; | ||
350 | t = min(*dst1_len, src_len); | ||
351 | memcpy(dst1, src, t); | ||
352 | *dst1_len = t; | ||
353 | src += t; | ||
354 | src_len -= t; | ||
355 | if (src_len) { | ||
356 | t = min(*dst2_len, src_len); | ||
357 | memcpy(dst2, src, t); | ||
358 | *dst2_len = t; | ||
359 | } | ||
360 | return 0; | ||
361 | |||
362 | case CEPH_CRYPTO_AES: | ||
363 | return ceph_aes_decrypt2(secret->key, secret->len, | ||
364 | dst1, dst1_len, dst2, dst2_len, | ||
365 | src, src_len); | ||
366 | |||
367 | default: | ||
368 | return -EINVAL; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, | ||
373 | const void *src, size_t src_len) | ||
374 | { | ||
375 | switch (secret->type) { | ||
376 | case CEPH_CRYPTO_NONE: | ||
377 | if (*dst_len < src_len) | ||
378 | return -ERANGE; | ||
379 | memcpy(dst, src, src_len); | ||
380 | *dst_len = src_len; | ||
381 | return 0; | ||
382 | |||
383 | case CEPH_CRYPTO_AES: | ||
384 | return ceph_aes_encrypt(secret->key, secret->len, dst, | ||
385 | dst_len, src, src_len); | ||
386 | |||
387 | default: | ||
388 | return -EINVAL; | ||
389 | } | ||
390 | } | ||
391 | |||
392 | int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, | ||
393 | const void *src1, size_t src1_len, | ||
394 | const void *src2, size_t src2_len) | ||
395 | { | ||
396 | switch (secret->type) { | ||
397 | case CEPH_CRYPTO_NONE: | ||
398 | if (*dst_len < src1_len + src2_len) | ||
399 | return -ERANGE; | ||
400 | memcpy(dst, src1, src1_len); | ||
401 | memcpy(dst + src1_len, src2, src2_len); | ||
402 | *dst_len = src1_len + src2_len; | ||
403 | return 0; | ||
404 | |||
405 | case CEPH_CRYPTO_AES: | ||
406 | return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len, | ||
407 | src1, src1_len, src2, src2_len); | ||
408 | |||
409 | default: | ||
410 | return -EINVAL; | ||
411 | } | ||
412 | } | ||
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h deleted file mode 100644 index bdf38607323c..000000000000 --- a/fs/ceph/crypto.h +++ /dev/null | |||
@@ -1,48 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_CRYPTO_H | ||
2 | #define _FS_CEPH_CRYPTO_H | ||
3 | |||
4 | #include "types.h" | ||
5 | #include "buffer.h" | ||
6 | |||
/*
 * cryptographic secret
 *
 * Filled in by ceph_crypto_key_decode() and released with
 * ceph_crypto_key_destroy().
 */
struct ceph_crypto_key {
	int type;			/* cipher selector, e.g. CEPH_CRYPTO_NONE
					   or CEPH_CRYPTO_AES; 16 bits when
					   encoded */
	struct ceph_timespec created;	/* copied verbatim by encode/decode */
	int len;			/* size of @key in bytes; 16 bits when
					   encoded */
	void *key;			/* key bytes; kmalloc'd by the decoder */
};
16 | |||
17 | static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) | ||
18 | { | ||
19 | kfree(key->key); | ||
20 | } | ||
21 | |||
22 | extern int ceph_crypto_key_encode(struct ceph_crypto_key *key, | ||
23 | void **p, void *end); | ||
24 | extern int ceph_crypto_key_decode(struct ceph_crypto_key *key, | ||
25 | void **p, void *end); | ||
26 | extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); | ||
27 | |||
28 | /* crypto.c */ | ||
29 | extern int ceph_decrypt(struct ceph_crypto_key *secret, | ||
30 | void *dst, size_t *dst_len, | ||
31 | const void *src, size_t src_len); | ||
32 | extern int ceph_encrypt(struct ceph_crypto_key *secret, | ||
33 | void *dst, size_t *dst_len, | ||
34 | const void *src, size_t src_len); | ||
35 | extern int ceph_decrypt2(struct ceph_crypto_key *secret, | ||
36 | void *dst1, size_t *dst1_len, | ||
37 | void *dst2, size_t *dst2_len, | ||
38 | const void *src, size_t src_len); | ||
39 | extern int ceph_encrypt2(struct ceph_crypto_key *secret, | ||
40 | void *dst, size_t *dst_len, | ||
41 | const void *src1, size_t src1_len, | ||
42 | const void *src2, size_t src2_len); | ||
43 | |||
44 | /* armor.c */ | ||
45 | extern int ceph_armor(char *dst, const char *src, const char *end); | ||
46 | extern int ceph_unarmor(char *dst, const char *src, const char *end); | ||
47 | |||
48 | #endif | ||
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6fd8b20a8611..7ae1b3d55b58 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c | |||
@@ -1,4 +1,4 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/device.h> | 3 | #include <linux/device.h> |
4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
@@ -7,143 +7,49 @@ | |||
7 | #include <linux/debugfs.h> | 7 | #include <linux/debugfs.h> |
8 | #include <linux/seq_file.h> | 8 | #include <linux/seq_file.h> |
9 | 9 | ||
10 | #include <linux/ceph/libceph.h> | ||
11 | #include <linux/ceph/mon_client.h> | ||
12 | #include <linux/ceph/auth.h> | ||
13 | #include <linux/ceph/debugfs.h> | ||
14 | |||
10 | #include "super.h" | 15 | #include "super.h" |
11 | #include "mds_client.h" | ||
12 | #include "mon_client.h" | ||
13 | #include "auth.h" | ||
14 | 16 | ||
15 | #ifdef CONFIG_DEBUG_FS | 17 | #ifdef CONFIG_DEBUG_FS |
16 | 18 | ||
17 | /* | 19 | #include "mds_client.h" |
18 | * Implement /sys/kernel/debug/ceph fun | ||
19 | * | ||
20 | * /sys/kernel/debug/ceph/client* - an instance of the ceph client | ||
21 | * .../osdmap - current osdmap | ||
22 | * .../mdsmap - current mdsmap | ||
23 | * .../monmap - current monmap | ||
24 | * .../osdc - active osd requests | ||
25 | * .../mdsc - active mds requests | ||
26 | * .../monc - mon client state | ||
27 | * .../dentry_lru - dump contents of dentry lru | ||
28 | * .../caps - expose cap (reservation) stats | ||
29 | * .../bdi - symlink to ../../bdi/something | ||
30 | */ | ||
31 | |||
32 | static struct dentry *ceph_debugfs_dir; | ||
33 | |||
34 | static int monmap_show(struct seq_file *s, void *p) | ||
35 | { | ||
36 | int i; | ||
37 | struct ceph_client *client = s->private; | ||
38 | |||
39 | if (client->monc.monmap == NULL) | ||
40 | return 0; | ||
41 | |||
42 | seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); | ||
43 | for (i = 0; i < client->monc.monmap->num_mon; i++) { | ||
44 | struct ceph_entity_inst *inst = | ||
45 | &client->monc.monmap->mon_inst[i]; | ||
46 | |||
47 | seq_printf(s, "\t%s%lld\t%s\n", | ||
48 | ENTITY_NAME(inst->name), | ||
49 | pr_addr(&inst->addr.in_addr)); | ||
50 | } | ||
51 | return 0; | ||
52 | } | ||
53 | 20 | ||
54 | static int mdsmap_show(struct seq_file *s, void *p) | 21 | static int mdsmap_show(struct seq_file *s, void *p) |
55 | { | 22 | { |
56 | int i; | 23 | int i; |
57 | struct ceph_client *client = s->private; | 24 | struct ceph_fs_client *fsc = s->private; |
58 | 25 | ||
59 | if (client->mdsc.mdsmap == NULL) | 26 | if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) |
60 | return 0; | 27 | return 0; |
61 | seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); | 28 | seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); |
62 | seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); | 29 | seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); |
63 | seq_printf(s, "session_timeout %d\n", | 30 | seq_printf(s, "session_timeout %d\n", |
64 | client->mdsc.mdsmap->m_session_timeout); | 31 | fsc->mdsc->mdsmap->m_session_timeout); |
65 | seq_printf(s, "session_autoclose %d\n", | 32 | seq_printf(s, "session_autoclose %d\n", |
66 | client->mdsc.mdsmap->m_session_autoclose); | 33 | fsc->mdsc->mdsmap->m_session_autoclose); |
67 | for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { | 34 | for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { |
68 | struct ceph_entity_addr *addr = | 35 | struct ceph_entity_addr *addr = |
69 | &client->mdsc.mdsmap->m_info[i].addr; | 36 | &fsc->mdsc->mdsmap->m_info[i].addr; |
70 | int state = client->mdsc.mdsmap->m_info[i].state; | 37 | int state = fsc->mdsc->mdsmap->m_info[i].state; |
71 | 38 | ||
72 | seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), | 39 | seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, |
40 | ceph_pr_addr(&addr->in_addr), | ||
73 | ceph_mds_state_name(state)); | 41 | ceph_mds_state_name(state)); |
74 | } | 42 | } |
75 | return 0; | 43 | return 0; |
76 | } | 44 | } |
77 | 45 | ||
78 | static int osdmap_show(struct seq_file *s, void *p) | 46 | /* |
79 | { | 47 | * mdsc debugfs |
80 | int i; | 48 | */ |
81 | struct ceph_client *client = s->private; | ||
82 | struct rb_node *n; | ||
83 | |||
84 | if (client->osdc.osdmap == NULL) | ||
85 | return 0; | ||
86 | seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); | ||
87 | seq_printf(s, "flags%s%s\n", | ||
88 | (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? | ||
89 | " NEARFULL" : "", | ||
90 | (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? | ||
91 | " FULL" : ""); | ||
92 | for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { | ||
93 | struct ceph_pg_pool_info *pool = | ||
94 | rb_entry(n, struct ceph_pg_pool_info, node); | ||
95 | seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", | ||
96 | pool->id, pool->v.pg_num, pool->pg_num_mask, | ||
97 | pool->v.lpg_num, pool->lpg_num_mask); | ||
98 | } | ||
99 | for (i = 0; i < client->osdc.osdmap->max_osd; i++) { | ||
100 | struct ceph_entity_addr *addr = | ||
101 | &client->osdc.osdmap->osd_addr[i]; | ||
102 | int state = client->osdc.osdmap->osd_state[i]; | ||
103 | char sb[64]; | ||
104 | |||
105 | seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", | ||
106 | i, pr_addr(&addr->in_addr), | ||
107 | ((client->osdc.osdmap->osd_weight[i]*100) >> 16), | ||
108 | ceph_osdmap_state_str(sb, sizeof(sb), state)); | ||
109 | } | ||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | static int monc_show(struct seq_file *s, void *p) | ||
114 | { | ||
115 | struct ceph_client *client = s->private; | ||
116 | struct ceph_mon_generic_request *req; | ||
117 | struct ceph_mon_client *monc = &client->monc; | ||
118 | struct rb_node *rp; | ||
119 | |||
120 | mutex_lock(&monc->mutex); | ||
121 | |||
122 | if (monc->have_mdsmap) | ||
123 | seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); | ||
124 | if (monc->have_osdmap) | ||
125 | seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); | ||
126 | if (monc->want_next_osdmap) | ||
127 | seq_printf(s, "want next osdmap\n"); | ||
128 | |||
129 | for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { | ||
130 | __u16 op; | ||
131 | req = rb_entry(rp, struct ceph_mon_generic_request, node); | ||
132 | op = le16_to_cpu(req->request->hdr.type); | ||
133 | if (op == CEPH_MSG_STATFS) | ||
134 | seq_printf(s, "%lld statfs\n", req->tid); | ||
135 | else | ||
136 | seq_printf(s, "%lld unknown\n", req->tid); | ||
137 | } | ||
138 | |||
139 | mutex_unlock(&monc->mutex); | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | static int mdsc_show(struct seq_file *s, void *p) | 49 | static int mdsc_show(struct seq_file *s, void *p) |
144 | { | 50 | { |
145 | struct ceph_client *client = s->private; | 51 | struct ceph_fs_client *fsc = s->private; |
146 | struct ceph_mds_client *mdsc = &client->mdsc; | 52 | struct ceph_mds_client *mdsc = fsc->mdsc; |
147 | struct ceph_mds_request *req; | 53 | struct ceph_mds_request *req; |
148 | struct rb_node *rp; | 54 | struct rb_node *rp; |
149 | int pathlen; | 55 | int pathlen; |
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p) | |||
214 | return 0; | 120 | return 0; |
215 | } | 121 | } |
216 | 122 | ||
217 | static int osdc_show(struct seq_file *s, void *pp) | ||
218 | { | ||
219 | struct ceph_client *client = s->private; | ||
220 | struct ceph_osd_client *osdc = &client->osdc; | ||
221 | struct rb_node *p; | ||
222 | |||
223 | mutex_lock(&osdc->request_mutex); | ||
224 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | ||
225 | struct ceph_osd_request *req; | ||
226 | struct ceph_osd_request_head *head; | ||
227 | struct ceph_osd_op *op; | ||
228 | int num_ops; | ||
229 | int opcode, olen; | ||
230 | int i; | ||
231 | |||
232 | req = rb_entry(p, struct ceph_osd_request, r_node); | ||
233 | |||
234 | seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, | ||
235 | req->r_osd ? req->r_osd->o_osd : -1, | ||
236 | le32_to_cpu(req->r_pgid.pool), | ||
237 | le16_to_cpu(req->r_pgid.ps)); | ||
238 | |||
239 | head = req->r_request->front.iov_base; | ||
240 | op = (void *)(head + 1); | ||
241 | |||
242 | num_ops = le16_to_cpu(head->num_ops); | ||
243 | olen = le32_to_cpu(head->object_len); | ||
244 | seq_printf(s, "%.*s", olen, | ||
245 | (const char *)(head->ops + num_ops)); | ||
246 | |||
247 | if (req->r_reassert_version.epoch) | ||
248 | seq_printf(s, "\t%u'%llu", | ||
249 | (unsigned)le32_to_cpu(req->r_reassert_version.epoch), | ||
250 | le64_to_cpu(req->r_reassert_version.version)); | ||
251 | else | ||
252 | seq_printf(s, "\t"); | ||
253 | |||
254 | for (i = 0; i < num_ops; i++) { | ||
255 | opcode = le16_to_cpu(op->op); | ||
256 | seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); | ||
257 | op++; | ||
258 | } | ||
259 | |||
260 | seq_printf(s, "\n"); | ||
261 | } | ||
262 | mutex_unlock(&osdc->request_mutex); | ||
263 | return 0; | ||
264 | } | ||
265 | |||
266 | static int caps_show(struct seq_file *s, void *p) | 123 | static int caps_show(struct seq_file *s, void *p) |
267 | { | 124 | { |
268 | struct ceph_client *client = s->private; | 125 | struct ceph_fs_client *fsc = s->private; |
269 | int total, avail, used, reserved, min; | 126 | int total, avail, used, reserved, min; |
270 | 127 | ||
271 | ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); | 128 | ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); |
272 | seq_printf(s, "total\t\t%d\n" | 129 | seq_printf(s, "total\t\t%d\n" |
273 | "avail\t\t%d\n" | 130 | "avail\t\t%d\n" |
274 | "used\t\t%d\n" | 131 | "used\t\t%d\n" |
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p) | |||
280 | 137 | ||
281 | static int dentry_lru_show(struct seq_file *s, void *ptr) | 138 | static int dentry_lru_show(struct seq_file *s, void *ptr) |
282 | { | 139 | { |
283 | struct ceph_client *client = s->private; | 140 | struct ceph_fs_client *fsc = s->private; |
284 | struct ceph_mds_client *mdsc = &client->mdsc; | 141 | struct ceph_mds_client *mdsc = fsc->mdsc; |
285 | struct ceph_dentry_info *di; | 142 | struct ceph_dentry_info *di; |
286 | 143 | ||
287 | spin_lock(&mdsc->dentry_lru_lock); | 144 | spin_lock(&mdsc->dentry_lru_lock); |
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) | |||
295 | return 0; | 152 | return 0; |
296 | } | 153 | } |
297 | 154 | ||
298 | #define DEFINE_SHOW_FUNC(name) \ | 155 | CEPH_DEFINE_SHOW_FUNC(mdsmap_show) |
299 | static int name##_open(struct inode *inode, struct file *file) \ | 156 | CEPH_DEFINE_SHOW_FUNC(mdsc_show) |
300 | { \ | 157 | CEPH_DEFINE_SHOW_FUNC(caps_show) |
301 | struct seq_file *sf; \ | 158 | CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) |
302 | int ret; \ | 159 | |
303 | \ | ||
304 | ret = single_open(file, name, NULL); \ | ||
305 | sf = file->private_data; \ | ||
306 | sf->private = inode->i_private; \ | ||
307 | return ret; \ | ||
308 | } \ | ||
309 | \ | ||
310 | static const struct file_operations name##_fops = { \ | ||
311 | .open = name##_open, \ | ||
312 | .read = seq_read, \ | ||
313 | .llseek = seq_lseek, \ | ||
314 | .release = single_release, \ | ||
315 | }; | ||
316 | |||
317 | DEFINE_SHOW_FUNC(monmap_show) | ||
318 | DEFINE_SHOW_FUNC(mdsmap_show) | ||
319 | DEFINE_SHOW_FUNC(osdmap_show) | ||
320 | DEFINE_SHOW_FUNC(monc_show) | ||
321 | DEFINE_SHOW_FUNC(mdsc_show) | ||
322 | DEFINE_SHOW_FUNC(osdc_show) | ||
323 | DEFINE_SHOW_FUNC(dentry_lru_show) | ||
324 | DEFINE_SHOW_FUNC(caps_show) | ||
325 | 160 | ||
161 | /* | ||
162 | * debugfs | ||
163 | */ | ||
326 | static int congestion_kb_set(void *data, u64 val) | 164 | static int congestion_kb_set(void *data, u64 val) |
327 | { | 165 | { |
328 | struct ceph_client *client = (struct ceph_client *)data; | 166 | struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; |
329 | |||
330 | if (client) | ||
331 | client->mount_args->congestion_kb = (int)val; | ||
332 | 167 | ||
168 | fsc->mount_options->congestion_kb = (int)val; | ||
333 | return 0; | 169 | return 0; |
334 | } | 170 | } |
335 | 171 | ||
336 | static int congestion_kb_get(void *data, u64 *val) | 172 | static int congestion_kb_get(void *data, u64 *val) |
337 | { | 173 | { |
338 | struct ceph_client *client = (struct ceph_client *)data; | 174 | struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; |
339 | |||
340 | if (client) | ||
341 | *val = (u64)client->mount_args->congestion_kb; | ||
342 | 175 | ||
176 | *val = (u64)fsc->mount_options->congestion_kb; | ||
343 | return 0; | 177 | return 0; |
344 | } | 178 | } |
345 | 179 | ||
346 | |||
347 | DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, | 180 | DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, |
348 | congestion_kb_set, "%llu\n"); | 181 | congestion_kb_set, "%llu\n"); |
349 | 182 | ||
350 | int __init ceph_debugfs_init(void) | ||
351 | { | ||
352 | ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); | ||
353 | if (!ceph_debugfs_dir) | ||
354 | return -ENOMEM; | ||
355 | return 0; | ||
356 | } | ||
357 | 183 | ||
358 | void ceph_debugfs_cleanup(void) | 184 | void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) |
359 | { | 185 | { |
360 | debugfs_remove(ceph_debugfs_dir); | 186 | dout("ceph_fs_debugfs_cleanup\n"); |
187 | debugfs_remove(fsc->debugfs_bdi); | ||
188 | debugfs_remove(fsc->debugfs_congestion_kb); | ||
189 | debugfs_remove(fsc->debugfs_mdsmap); | ||
190 | debugfs_remove(fsc->debugfs_caps); | ||
191 | debugfs_remove(fsc->debugfs_mdsc); | ||
192 | debugfs_remove(fsc->debugfs_dentry_lru); | ||
361 | } | 193 | } |
362 | 194 | ||
363 | int ceph_debugfs_client_init(struct ceph_client *client) | 195 | int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) |
364 | { | 196 | { |
365 | int ret = 0; | 197 | char name[100]; |
366 | char name[80]; | 198 | int err = -ENOMEM; |
367 | |||
368 | snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, | ||
369 | client->monc.auth->global_id); | ||
370 | 199 | ||
371 | client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); | 200 | dout("ceph_fs_debugfs_init\n"); |
372 | if (!client->debugfs_dir) | 201 | fsc->debugfs_congestion_kb = |
373 | goto out; | 202 | debugfs_create_file("writeback_congestion_kb", |
374 | 203 | 0600, | |
375 | client->monc.debugfs_file = debugfs_create_file("monc", | 204 | fsc->client->debugfs_dir, |
376 | 0600, | 205 | fsc, |
377 | client->debugfs_dir, | 206 | &congestion_kb_fops); |
378 | client, | 207 | if (!fsc->debugfs_congestion_kb) |
379 | &monc_show_fops); | ||
380 | if (!client->monc.debugfs_file) | ||
381 | goto out; | 208 | goto out; |
382 | 209 | ||
383 | client->mdsc.debugfs_file = debugfs_create_file("mdsc", | 210 | dout("a\n"); |
384 | 0600, | ||
385 | client->debugfs_dir, | ||
386 | client, | ||
387 | &mdsc_show_fops); | ||
388 | if (!client->mdsc.debugfs_file) | ||
389 | goto out; | ||
390 | 211 | ||
391 | client->osdc.debugfs_file = debugfs_create_file("osdc", | 212 | snprintf(name, sizeof(name), "../../bdi/%s", |
392 | 0600, | 213 | dev_name(fsc->backing_dev_info.dev)); |
393 | client->debugfs_dir, | 214 | fsc->debugfs_bdi = |
394 | client, | 215 | debugfs_create_symlink("bdi", |
395 | &osdc_show_fops); | 216 | fsc->client->debugfs_dir, |
396 | if (!client->osdc.debugfs_file) | 217 | name); |
218 | if (!fsc->debugfs_bdi) | ||
397 | goto out; | 219 | goto out; |
398 | 220 | ||
399 | client->debugfs_monmap = debugfs_create_file("monmap", | 221 | dout("b\n"); |
222 | fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", | ||
400 | 0600, | 223 | 0600, |
401 | client->debugfs_dir, | 224 | fsc->client->debugfs_dir, |
402 | client, | 225 | fsc, |
403 | &monmap_show_fops); | ||
404 | if (!client->debugfs_monmap) | ||
405 | goto out; | ||
406 | |||
407 | client->debugfs_mdsmap = debugfs_create_file("mdsmap", | ||
408 | 0600, | ||
409 | client->debugfs_dir, | ||
410 | client, | ||
411 | &mdsmap_show_fops); | 226 | &mdsmap_show_fops); |
412 | if (!client->debugfs_mdsmap) | 227 | if (!fsc->debugfs_mdsmap) |
413 | goto out; | ||
414 | |||
415 | client->debugfs_osdmap = debugfs_create_file("osdmap", | ||
416 | 0600, | ||
417 | client->debugfs_dir, | ||
418 | client, | ||
419 | &osdmap_show_fops); | ||
420 | if (!client->debugfs_osdmap) | ||
421 | goto out; | 228 | goto out; |
422 | 229 | ||
423 | client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", | 230 | dout("ca\n"); |
424 | 0600, | 231 | fsc->debugfs_mdsc = debugfs_create_file("mdsc", |
425 | client->debugfs_dir, | 232 | 0600, |
426 | client, | 233 | fsc->client->debugfs_dir, |
427 | &dentry_lru_show_fops); | 234 | fsc, |
428 | if (!client->debugfs_dentry_lru) | 235 | &mdsc_show_fops); |
236 | if (!fsc->debugfs_mdsc) | ||
429 | goto out; | 237 | goto out; |
430 | 238 | ||
431 | client->debugfs_caps = debugfs_create_file("caps", | 239 | dout("da\n"); |
240 | fsc->debugfs_caps = debugfs_create_file("caps", | ||
432 | 0400, | 241 | 0400, |
433 | client->debugfs_dir, | 242 | fsc->client->debugfs_dir, |
434 | client, | 243 | fsc, |
435 | &caps_show_fops); | 244 | &caps_show_fops); |
436 | if (!client->debugfs_caps) | 245 | if (!fsc->debugfs_caps) |
437 | goto out; | 246 | goto out; |
438 | 247 | ||
439 | client->debugfs_congestion_kb = | 248 | dout("ea\n"); |
440 | debugfs_create_file("writeback_congestion_kb", | 249 | fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", |
441 | 0600, | 250 | 0600, |
442 | client->debugfs_dir, | 251 | fsc->client->debugfs_dir, |
443 | client, | 252 | fsc, |
444 | &congestion_kb_fops); | 253 | &dentry_lru_show_fops); |
445 | if (!client->debugfs_congestion_kb) | 254 | if (!fsc->debugfs_dentry_lru) |
446 | goto out; | 255 | goto out; |
447 | 256 | ||
448 | sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); | ||
449 | client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, | ||
450 | name); | ||
451 | |||
452 | return 0; | 257 | return 0; |
453 | 258 | ||
454 | out: | 259 | out: |
455 | ceph_debugfs_client_cleanup(client); | 260 | ceph_fs_debugfs_cleanup(fsc); |
456 | return ret; | 261 | return err; |
457 | } | 262 | } |
458 | 263 | ||
459 | void ceph_debugfs_client_cleanup(struct ceph_client *client) | ||
460 | { | ||
461 | debugfs_remove(client->debugfs_bdi); | ||
462 | debugfs_remove(client->debugfs_caps); | ||
463 | debugfs_remove(client->debugfs_dentry_lru); | ||
464 | debugfs_remove(client->debugfs_osdmap); | ||
465 | debugfs_remove(client->debugfs_mdsmap); | ||
466 | debugfs_remove(client->debugfs_monmap); | ||
467 | debugfs_remove(client->osdc.debugfs_file); | ||
468 | debugfs_remove(client->mdsc.debugfs_file); | ||
469 | debugfs_remove(client->monc.debugfs_file); | ||
470 | debugfs_remove(client->debugfs_congestion_kb); | ||
471 | debugfs_remove(client->debugfs_dir); | ||
472 | } | ||
473 | 264 | ||
474 | #else /* CONFIG_DEBUG_FS */ | 265 | #else /* CONFIG_DEBUG_FS */ |
475 | 266 | ||
476 | int __init ceph_debugfs_init(void) | 267 | int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) |
477 | { | ||
478 | return 0; | ||
479 | } | ||
480 | |||
481 | void ceph_debugfs_cleanup(void) | ||
482 | { | ||
483 | } | ||
484 | |||
485 | int ceph_debugfs_client_init(struct ceph_client *client) | ||
486 | { | 268 | { |
487 | return 0; | 269 | return 0; |
488 | } | 270 | } |
489 | 271 | ||
490 | void ceph_debugfs_client_cleanup(struct ceph_client *client) | 272 | void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) |
491 | { | 273 | { |
492 | } | 274 | } |
493 | 275 | ||
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h deleted file mode 100644 index 3d25415afe63..000000000000 --- a/fs/ceph/decode.h +++ /dev/null | |||
@@ -1,196 +0,0 @@ | |||
1 | #ifndef __CEPH_DECODE_H | ||
2 | #define __CEPH_DECODE_H | ||
3 | |||
4 | #include <asm/unaligned.h> | ||
5 | #include <linux/time.h> | ||
6 | |||
7 | #include "types.h" | ||
8 | |||
9 | /* | ||
10 | * in all cases, | ||
11 | * void **p pointer to position pointer | ||
12 | * void *end pointer to end of buffer (last byte + 1) | ||
13 | */ | ||
14 | |||
15 | static inline u64 ceph_decode_64(void **p) | ||
16 | { | ||
17 | u64 v = get_unaligned_le64(*p); | ||
18 | *p += sizeof(u64); | ||
19 | return v; | ||
20 | } | ||
21 | static inline u32 ceph_decode_32(void **p) | ||
22 | { | ||
23 | u32 v = get_unaligned_le32(*p); | ||
24 | *p += sizeof(u32); | ||
25 | return v; | ||
26 | } | ||
27 | static inline u16 ceph_decode_16(void **p) | ||
28 | { | ||
29 | u16 v = get_unaligned_le16(*p); | ||
30 | *p += sizeof(u16); | ||
31 | return v; | ||
32 | } | ||
33 | static inline u8 ceph_decode_8(void **p) | ||
34 | { | ||
35 | u8 v = *(u8 *)*p; | ||
36 | (*p)++; | ||
37 | return v; | ||
38 | } | ||
39 | static inline void ceph_decode_copy(void **p, void *pv, size_t n) | ||
40 | { | ||
41 | memcpy(pv, *p, n); | ||
42 | *p += n; | ||
43 | } | ||
44 | |||
45 | /* | ||
46 | * bounds check input. | ||
47 | */ | ||
48 | #define ceph_decode_need(p, end, n, bad) \ | ||
49 | do { \ | ||
50 | if (unlikely(*(p) + (n) > (end))) \ | ||
51 | goto bad; \ | ||
52 | } while (0) | ||
53 | |||
54 | #define ceph_decode_64_safe(p, end, v, bad) \ | ||
55 | do { \ | ||
56 | ceph_decode_need(p, end, sizeof(u64), bad); \ | ||
57 | v = ceph_decode_64(p); \ | ||
58 | } while (0) | ||
59 | #define ceph_decode_32_safe(p, end, v, bad) \ | ||
60 | do { \ | ||
61 | ceph_decode_need(p, end, sizeof(u32), bad); \ | ||
62 | v = ceph_decode_32(p); \ | ||
63 | } while (0) | ||
64 | #define ceph_decode_16_safe(p, end, v, bad) \ | ||
65 | do { \ | ||
66 | ceph_decode_need(p, end, sizeof(u16), bad); \ | ||
67 | v = ceph_decode_16(p); \ | ||
68 | } while (0) | ||
69 | #define ceph_decode_8_safe(p, end, v, bad) \ | ||
70 | do { \ | ||
71 | ceph_decode_need(p, end, sizeof(u8), bad); \ | ||
72 | v = ceph_decode_8(p); \ | ||
73 | } while (0) | ||
74 | |||
75 | #define ceph_decode_copy_safe(p, end, pv, n, bad) \ | ||
76 | do { \ | ||
77 | ceph_decode_need(p, end, n, bad); \ | ||
78 | ceph_decode_copy(p, pv, n); \ | ||
79 | } while (0) | ||
80 | |||
81 | /* | ||
82 | * struct ceph_timespec <-> struct timespec | ||
83 | */ | ||
84 | static inline void ceph_decode_timespec(struct timespec *ts, | ||
85 | const struct ceph_timespec *tv) | ||
86 | { | ||
87 | ts->tv_sec = le32_to_cpu(tv->tv_sec); | ||
88 | ts->tv_nsec = le32_to_cpu(tv->tv_nsec); | ||
89 | } | ||
90 | static inline void ceph_encode_timespec(struct ceph_timespec *tv, | ||
91 | const struct timespec *ts) | ||
92 | { | ||
93 | tv->tv_sec = cpu_to_le32(ts->tv_sec); | ||
94 | tv->tv_nsec = cpu_to_le32(ts->tv_nsec); | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * sockaddr_storage <-> ceph_sockaddr | ||
99 | */ | ||
100 | static inline void ceph_encode_addr(struct ceph_entity_addr *a) | ||
101 | { | ||
102 | __be16 ss_family = htons(a->in_addr.ss_family); | ||
103 | a->in_addr.ss_family = *(__u16 *)&ss_family; | ||
104 | } | ||
105 | static inline void ceph_decode_addr(struct ceph_entity_addr *a) | ||
106 | { | ||
107 | __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; | ||
108 | a->in_addr.ss_family = ntohs(ss_family); | ||
109 | WARN_ON(a->in_addr.ss_family == 512); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * encoders | ||
114 | */ | ||
115 | static inline void ceph_encode_64(void **p, u64 v) | ||
116 | { | ||
117 | put_unaligned_le64(v, (__le64 *)*p); | ||
118 | *p += sizeof(u64); | ||
119 | } | ||
120 | static inline void ceph_encode_32(void **p, u32 v) | ||
121 | { | ||
122 | put_unaligned_le32(v, (__le32 *)*p); | ||
123 | *p += sizeof(u32); | ||
124 | } | ||
125 | static inline void ceph_encode_16(void **p, u16 v) | ||
126 | { | ||
127 | put_unaligned_le16(v, (__le16 *)*p); | ||
128 | *p += sizeof(u16); | ||
129 | } | ||
130 | static inline void ceph_encode_8(void **p, u8 v) | ||
131 | { | ||
132 | *(u8 *)*p = v; | ||
133 | (*p)++; | ||
134 | } | ||
135 | static inline void ceph_encode_copy(void **p, const void *s, int len) | ||
136 | { | ||
137 | memcpy(*p, s, len); | ||
138 | *p += len; | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * filepath, string encoders | ||
143 | */ | ||
144 | static inline void ceph_encode_filepath(void **p, void *end, | ||
145 | u64 ino, const char *path) | ||
146 | { | ||
147 | u32 len = path ? strlen(path) : 0; | ||
148 | BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end); | ||
149 | ceph_encode_8(p, 1); | ||
150 | ceph_encode_64(p, ino); | ||
151 | ceph_encode_32(p, len); | ||
152 | if (len) | ||
153 | memcpy(*p, path, len); | ||
154 | *p += len; | ||
155 | } | ||
156 | |||
157 | static inline void ceph_encode_string(void **p, void *end, | ||
158 | const char *s, u32 len) | ||
159 | { | ||
160 | BUG_ON(*p + sizeof(len) + len > end); | ||
161 | ceph_encode_32(p, len); | ||
162 | if (len) | ||
163 | memcpy(*p, s, len); | ||
164 | *p += len; | ||
165 | } | ||
166 | |||
167 | #define ceph_encode_need(p, end, n, bad) \ | ||
168 | do { \ | ||
169 | if (unlikely(*(p) + (n) > (end))) \ | ||
170 | goto bad; \ | ||
171 | } while (0) | ||
172 | |||
173 | #define ceph_encode_64_safe(p, end, v, bad) \ | ||
174 | do { \ | ||
175 | ceph_encode_need(p, end, sizeof(u64), bad); \ | ||
176 | ceph_encode_64(p, v); \ | ||
177 | } while (0) | ||
178 | #define ceph_encode_32_safe(p, end, v, bad) \ | ||
179 | do { \ | ||
180 | ceph_encode_need(p, end, sizeof(u32), bad); \ | ||
181 | ceph_encode_32(p, v); \ | ||
182 | } while (0) | ||
183 | #define ceph_encode_16_safe(p, end, v, bad) \ | ||
184 | do { \ | ||
185 | ceph_encode_need(p, end, sizeof(u16), bad); \ | ||
186 | ceph_encode_16(p, v); \ | ||
187 | } while (0) | ||
188 | |||
189 | #define ceph_encode_copy_safe(p, end, pv, n, bad) \ | ||
190 | do { \ | ||
191 | ceph_encode_need(p, end, n, bad); \ | ||
192 | ceph_encode_copy(p, pv, n); \ | ||
193 | } while (0) | ||
194 | |||
195 | |||
196 | #endif | ||
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a1986eb52045..e0a2dc6fcafc 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
@@ -1,4 +1,4 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/spinlock.h> | 3 | #include <linux/spinlock.h> |
4 | #include <linux/fs_struct.h> | 4 | #include <linux/fs_struct.h> |
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | 8 | ||
9 | #include "super.h" | 9 | #include "super.h" |
10 | #include "mds_client.h" | ||
10 | 11 | ||
11 | /* | 12 | /* |
12 | * Directory operations: readdir, lookup, create, link, unlink, | 13 | * Directory operations: readdir, lookup, create, link, unlink, |
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p) | |||
94 | */ | 95 | */ |
95 | static int __dcache_readdir(struct file *filp, | 96 | static int __dcache_readdir(struct file *filp, |
96 | void *dirent, filldir_t filldir) | 97 | void *dirent, filldir_t filldir) |
97 | __releases(inode->i_lock) | ||
98 | __acquires(inode->i_lock) | ||
99 | { | 98 | { |
100 | struct inode *inode = filp->f_dentry->d_inode; | ||
101 | struct ceph_file_info *fi = filp->private_data; | 99 | struct ceph_file_info *fi = filp->private_data; |
102 | struct dentry *parent = filp->f_dentry; | 100 | struct dentry *parent = filp->f_dentry; |
103 | struct inode *dir = parent->d_inode; | 101 | struct inode *dir = parent->d_inode; |
@@ -153,7 +151,6 @@ more: | |||
153 | 151 | ||
154 | atomic_inc(&dentry->d_count); | 152 | atomic_inc(&dentry->d_count); |
155 | spin_unlock(&dcache_lock); | 153 | spin_unlock(&dcache_lock); |
156 | spin_unlock(&inode->i_lock); | ||
157 | 154 | ||
158 | dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, | 155 | dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, |
159 | dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); | 156 | dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); |
@@ -171,35 +168,30 @@ more: | |||
171 | } else { | 168 | } else { |
172 | dput(last); | 169 | dput(last); |
173 | } | 170 | } |
174 | last = NULL; | ||
175 | } | 171 | } |
176 | |||
177 | spin_lock(&inode->i_lock); | ||
178 | spin_lock(&dcache_lock); | ||
179 | |||
180 | last = dentry; | 172 | last = dentry; |
181 | 173 | ||
182 | if (err < 0) | 174 | if (err < 0) |
183 | goto out_unlock; | 175 | goto out; |
184 | 176 | ||
185 | p = p->prev; | ||
186 | filp->f_pos++; | 177 | filp->f_pos++; |
187 | 178 | ||
188 | /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ | 179 | /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ |
189 | if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) | 180 | if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { |
190 | goto more; | 181 | dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); |
191 | dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); | 182 | err = -EAGAIN; |
192 | err = -EAGAIN; | 183 | goto out; |
184 | } | ||
185 | |||
186 | spin_lock(&dcache_lock); | ||
187 | p = p->prev; /* advance to next dentry */ | ||
188 | goto more; | ||
193 | 189 | ||
194 | out_unlock: | 190 | out_unlock: |
195 | spin_unlock(&dcache_lock); | 191 | spin_unlock(&dcache_lock); |
196 | 192 | out: | |
197 | if (last) { | 193 | if (last) |
198 | spin_unlock(&inode->i_lock); | ||
199 | dput(last); | 194 | dput(last); |
200 | spin_lock(&inode->i_lock); | ||
201 | } | ||
202 | |||
203 | return err; | 195 | return err; |
204 | } | 196 | } |
205 | 197 | ||
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
227 | struct ceph_file_info *fi = filp->private_data; | 219 | struct ceph_file_info *fi = filp->private_data; |
228 | struct inode *inode = filp->f_dentry->d_inode; | 220 | struct inode *inode = filp->f_dentry->d_inode; |
229 | struct ceph_inode_info *ci = ceph_inode(inode); | 221 | struct ceph_inode_info *ci = ceph_inode(inode); |
230 | struct ceph_client *client = ceph_inode_to_client(inode); | 222 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
231 | struct ceph_mds_client *mdsc = &client->mdsc; | 223 | struct ceph_mds_client *mdsc = fsc->mdsc; |
232 | unsigned frag = fpos_frag(filp->f_pos); | 224 | unsigned frag = fpos_frag(filp->f_pos); |
233 | int off = fpos_off(filp->f_pos); | 225 | int off = fpos_off(filp->f_pos); |
234 | int err; | 226 | int err; |
235 | u32 ftype; | 227 | u32 ftype; |
236 | struct ceph_mds_reply_info_parsed *rinfo; | 228 | struct ceph_mds_reply_info_parsed *rinfo; |
237 | const int max_entries = client->mount_args->max_readdir; | 229 | const int max_entries = fsc->mount_options->max_readdir; |
238 | const int max_bytes = client->mount_args->max_readdir_bytes; | 230 | const int max_bytes = fsc->mount_options->max_readdir_bytes; |
239 | 231 | ||
240 | dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); | 232 | dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); |
241 | if (fi->at_end) | 233 | if (fi->at_end) |
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
267 | /* can we use the dcache? */ | 259 | /* can we use the dcache? */ |
268 | spin_lock(&inode->i_lock); | 260 | spin_lock(&inode->i_lock); |
269 | if ((filp->f_pos == 2 || fi->dentry) && | 261 | if ((filp->f_pos == 2 || fi->dentry) && |
270 | !ceph_test_opt(client, NOASYNCREADDIR) && | 262 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && |
271 | ceph_snap(inode) != CEPH_SNAPDIR && | 263 | ceph_snap(inode) != CEPH_SNAPDIR && |
272 | (ci->i_ceph_flags & CEPH_I_COMPLETE) && | 264 | (ci->i_ceph_flags & CEPH_I_COMPLETE) && |
273 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { | 265 | __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { |
266 | spin_unlock(&inode->i_lock); | ||
274 | err = __dcache_readdir(filp, dirent, filldir); | 267 | err = __dcache_readdir(filp, dirent, filldir); |
275 | if (err != -EAGAIN) { | 268 | if (err != -EAGAIN) |
276 | spin_unlock(&inode->i_lock); | ||
277 | return err; | 269 | return err; |
278 | } | 270 | } else { |
271 | spin_unlock(&inode->i_lock); | ||
279 | } | 272 | } |
280 | spin_unlock(&inode->i_lock); | ||
281 | if (fi->dentry) { | 273 | if (fi->dentry) { |
282 | err = note_last_dentry(fi, fi->dentry->d_name.name, | 274 | err = note_last_dentry(fi, fi->dentry->d_name.name, |
283 | fi->dentry->d_name.len); | 275 | fi->dentry->d_name.len); |
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) | |||
487 | struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, | 479 | struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, |
488 | struct dentry *dentry, int err) | 480 | struct dentry *dentry, int err) |
489 | { | 481 | { |
490 | struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); | 482 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); |
491 | struct inode *parent = dentry->d_parent->d_inode; | 483 | struct inode *parent = dentry->d_parent->d_inode; |
492 | 484 | ||
493 | /* .snap dir? */ | 485 | /* .snap dir? */ |
494 | if (err == -ENOENT && | 486 | if (err == -ENOENT && |
495 | ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */ | ||
496 | strcmp(dentry->d_name.name, | 487 | strcmp(dentry->d_name.name, |
497 | client->mount_args->snapdir_name) == 0) { | 488 | fsc->mount_options->snapdir_name) == 0) { |
498 | struct inode *inode = ceph_get_snapdir(parent); | 489 | struct inode *inode = ceph_get_snapdir(parent); |
499 | dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", | 490 | dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", |
500 | dentry, dentry->d_name.len, dentry->d_name.name, inode); | 491 | dentry, dentry->d_name.len, dentry->d_name.name, inode); |
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) | |||
539 | static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, | 530 | static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, |
540 | struct nameidata *nd) | 531 | struct nameidata *nd) |
541 | { | 532 | { |
542 | struct ceph_client *client = ceph_sb_to_client(dir->i_sb); | 533 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
543 | struct ceph_mds_client *mdsc = &client->mdsc; | 534 | struct ceph_mds_client *mdsc = fsc->mdsc; |
544 | struct ceph_mds_request *req; | 535 | struct ceph_mds_request *req; |
545 | int op; | 536 | int op; |
546 | int err; | 537 | int err; |
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, | |||
572 | spin_lock(&dir->i_lock); | 563 | spin_lock(&dir->i_lock); |
573 | dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); | 564 | dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); |
574 | if (strncmp(dentry->d_name.name, | 565 | if (strncmp(dentry->d_name.name, |
575 | client->mount_args->snapdir_name, | 566 | fsc->mount_options->snapdir_name, |
576 | dentry->d_name.len) && | 567 | dentry->d_name.len) && |
577 | !is_root_ceph_dentry(dir, dentry) && | 568 | !is_root_ceph_dentry(dir, dentry) && |
578 | (ci->i_ceph_flags & CEPH_I_COMPLETE) && | 569 | (ci->i_ceph_flags & CEPH_I_COMPLETE) && |
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) | |||
629 | static int ceph_mknod(struct inode *dir, struct dentry *dentry, | 620 | static int ceph_mknod(struct inode *dir, struct dentry *dentry, |
630 | int mode, dev_t rdev) | 621 | int mode, dev_t rdev) |
631 | { | 622 | { |
632 | struct ceph_client *client = ceph_sb_to_client(dir->i_sb); | 623 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
633 | struct ceph_mds_client *mdsc = &client->mdsc; | 624 | struct ceph_mds_client *mdsc = fsc->mdsc; |
634 | struct ceph_mds_request *req; | 625 | struct ceph_mds_request *req; |
635 | int err; | 626 | int err; |
636 | 627 | ||
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, | |||
685 | static int ceph_symlink(struct inode *dir, struct dentry *dentry, | 676 | static int ceph_symlink(struct inode *dir, struct dentry *dentry, |
686 | const char *dest) | 677 | const char *dest) |
687 | { | 678 | { |
688 | struct ceph_client *client = ceph_sb_to_client(dir->i_sb); | 679 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
689 | struct ceph_mds_client *mdsc = &client->mdsc; | 680 | struct ceph_mds_client *mdsc = fsc->mdsc; |
690 | struct ceph_mds_request *req; | 681 | struct ceph_mds_request *req; |
691 | int err; | 682 | int err; |
692 | 683 | ||
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, | |||
716 | 707 | ||
717 | static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) | 708 | static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) |
718 | { | 709 | { |
719 | struct ceph_client *client = ceph_sb_to_client(dir->i_sb); | 710 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
720 | struct ceph_mds_client *mdsc = &client->mdsc; | 711 | struct ceph_mds_client *mdsc = fsc->mdsc; |
721 | struct ceph_mds_request *req; | 712 | struct ceph_mds_request *req; |
722 | int err = -EROFS; | 713 | int err = -EROFS; |
723 | int op; | 714 | int op; |
@@ -758,8 +749,8 @@ out: | |||
758 | static int ceph_link(struct dentry *old_dentry, struct inode *dir, | 749 | static int ceph_link(struct dentry *old_dentry, struct inode *dir, |
759 | struct dentry *dentry) | 750 | struct dentry *dentry) |
760 | { | 751 | { |
761 | struct ceph_client *client = ceph_sb_to_client(dir->i_sb); | 752 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
762 | struct ceph_mds_client *mdsc = &client->mdsc; | 753 | struct ceph_mds_client *mdsc = fsc->mdsc; |
763 | struct ceph_mds_request *req; | 754 | struct ceph_mds_request *req; |
764 | int err; | 755 | int err; |
765 | 756 | ||
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode) | |||
813 | */ | 804 | */ |
814 | static int ceph_unlink(struct inode *dir, struct dentry *dentry) | 805 | static int ceph_unlink(struct inode *dir, struct dentry *dentry) |
815 | { | 806 | { |
816 | struct ceph_client *client = ceph_sb_to_client(dir->i_sb); | 807 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
817 | struct ceph_mds_client *mdsc = &client->mdsc; | 808 | struct ceph_mds_client *mdsc = fsc->mdsc; |
818 | struct inode *inode = dentry->d_inode; | 809 | struct inode *inode = dentry->d_inode; |
819 | struct ceph_mds_request *req; | 810 | struct ceph_mds_request *req; |
820 | int err = -EROFS; | 811 | int err = -EROFS; |
@@ -854,8 +845,8 @@ out: | |||
854 | static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, | 845 | static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, |
855 | struct inode *new_dir, struct dentry *new_dentry) | 846 | struct inode *new_dir, struct dentry *new_dentry) |
856 | { | 847 | { |
857 | struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); | 848 | struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); |
858 | struct ceph_mds_client *mdsc = &client->mdsc; | 849 | struct ceph_mds_client *mdsc = fsc->mdsc; |
859 | struct ceph_mds_request *req; | 850 | struct ceph_mds_request *req; |
860 | int err; | 851 | int err; |
861 | 852 | ||
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, | |||
1076 | struct ceph_inode_info *ci = ceph_inode(inode); | 1067 | struct ceph_inode_info *ci = ceph_inode(inode); |
1077 | int left; | 1068 | int left; |
1078 | 1069 | ||
1079 | if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) | 1070 | if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) |
1080 | return -EISDIR; | 1071 | return -EISDIR; |
1081 | 1072 | ||
1082 | if (!cf->dir_info) { | 1073 | if (!cf->dir_info) { |
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn) | |||
1177 | dout("dentry_lru_add %p %p '%.*s'\n", di, dn, | 1168 | dout("dentry_lru_add %p %p '%.*s'\n", di, dn, |
1178 | dn->d_name.len, dn->d_name.name); | 1169 | dn->d_name.len, dn->d_name.name); |
1179 | if (di) { | 1170 | if (di) { |
1180 | mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; | 1171 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; |
1181 | spin_lock(&mdsc->dentry_lru_lock); | 1172 | spin_lock(&mdsc->dentry_lru_lock); |
1182 | list_add_tail(&di->lru, &mdsc->dentry_lru); | 1173 | list_add_tail(&di->lru, &mdsc->dentry_lru); |
1183 | mdsc->num_dentry++; | 1174 | mdsc->num_dentry++; |
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn) | |||
1193 | dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, | 1184 | dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, |
1194 | dn->d_name.len, dn->d_name.name, di->offset); | 1185 | dn->d_name.len, dn->d_name.name, di->offset); |
1195 | if (di) { | 1186 | if (di) { |
1196 | mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; | 1187 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; |
1197 | spin_lock(&mdsc->dentry_lru_lock); | 1188 | spin_lock(&mdsc->dentry_lru_lock); |
1198 | list_move_tail(&di->lru, &mdsc->dentry_lru); | 1189 | list_move_tail(&di->lru, &mdsc->dentry_lru); |
1199 | spin_unlock(&mdsc->dentry_lru_lock); | 1190 | spin_unlock(&mdsc->dentry_lru_lock); |
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn) | |||
1208 | dout("dentry_lru_del %p %p '%.*s'\n", di, dn, | 1199 | dout("dentry_lru_del %p %p '%.*s'\n", di, dn, |
1209 | dn->d_name.len, dn->d_name.name); | 1200 | dn->d_name.len, dn->d_name.name); |
1210 | if (di) { | 1201 | if (di) { |
1211 | mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; | 1202 | mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; |
1212 | spin_lock(&mdsc->dentry_lru_lock); | 1203 | spin_lock(&mdsc->dentry_lru_lock); |
1213 | list_del_init(&di->lru); | 1204 | list_del_init(&di->lru); |
1214 | mdsc->num_dentry--; | 1205 | mdsc->num_dentry--; |
diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 4480cb1c63e7..2297d9426992 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c | |||
@@ -1,10 +1,11 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/exportfs.h> | 3 | #include <linux/exportfs.h> |
4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
5 | #include <asm/unaligned.h> | 5 | #include <asm/unaligned.h> |
6 | 6 | ||
7 | #include "super.h" | 7 | #include "super.h" |
8 | #include "mds_client.h" | ||
8 | 9 | ||
9 | /* | 10 | /* |
10 | * NFS export support | 11 | * NFS export support |
@@ -42,32 +43,37 @@ struct ceph_nfs_confh { | |||
42 | static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, | 43 | static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, |
43 | int connectable) | 44 | int connectable) |
44 | { | 45 | { |
46 | int type; | ||
45 | struct ceph_nfs_fh *fh = (void *)rawfh; | 47 | struct ceph_nfs_fh *fh = (void *)rawfh; |
46 | struct ceph_nfs_confh *cfh = (void *)rawfh; | 48 | struct ceph_nfs_confh *cfh = (void *)rawfh; |
47 | struct dentry *parent = dentry->d_parent; | 49 | struct dentry *parent = dentry->d_parent; |
48 | struct inode *inode = dentry->d_inode; | 50 | struct inode *inode = dentry->d_inode; |
49 | int type; | 51 | int connected_handle_length = sizeof(*cfh)/4; |
52 | int handle_length = sizeof(*fh)/4; | ||
50 | 53 | ||
51 | /* don't re-export snaps */ | 54 | /* don't re-export snaps */ |
52 | if (ceph_snap(inode) != CEPH_NOSNAP) | 55 | if (ceph_snap(inode) != CEPH_NOSNAP) |
53 | return -EINVAL; | 56 | return -EINVAL; |
54 | 57 | ||
55 | if (*max_len >= sizeof(*cfh)) { | 58 | if (*max_len >= connected_handle_length) { |
56 | dout("encode_fh %p connectable\n", dentry); | 59 | dout("encode_fh %p connectable\n", dentry); |
57 | cfh->ino = ceph_ino(dentry->d_inode); | 60 | cfh->ino = ceph_ino(dentry->d_inode); |
58 | cfh->parent_ino = ceph_ino(parent->d_inode); | 61 | cfh->parent_ino = ceph_ino(parent->d_inode); |
59 | cfh->parent_name_hash = parent->d_name.hash; | 62 | cfh->parent_name_hash = parent->d_name.hash; |
60 | *max_len = sizeof(*cfh); | 63 | *max_len = connected_handle_length; |
61 | type = 2; | 64 | type = 2; |
62 | } else if (*max_len > sizeof(*fh)) { | 65 | } else if (*max_len >= handle_length) { |
63 | if (connectable) | 66 | if (connectable) { |
64 | return -ENOSPC; | 67 | *max_len = connected_handle_length; |
68 | return 255; | ||
69 | } | ||
65 | dout("encode_fh %p\n", dentry); | 70 | dout("encode_fh %p\n", dentry); |
66 | fh->ino = ceph_ino(dentry->d_inode); | 71 | fh->ino = ceph_ino(dentry->d_inode); |
67 | *max_len = sizeof(*fh); | 72 | *max_len = handle_length; |
68 | type = 1; | 73 | type = 1; |
69 | } else { | 74 | } else { |
70 | return -ENOSPC; | 75 | *max_len = handle_length; |
76 | return 255; | ||
71 | } | 77 | } |
72 | return type; | 78 | return type; |
73 | } | 79 | } |
@@ -115,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, | |||
115 | static struct dentry *__cfh_to_dentry(struct super_block *sb, | 121 | static struct dentry *__cfh_to_dentry(struct super_block *sb, |
116 | struct ceph_nfs_confh *cfh) | 122 | struct ceph_nfs_confh *cfh) |
117 | { | 123 | { |
118 | struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; | 124 | struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; |
119 | struct inode *inode; | 125 | struct inode *inode; |
120 | struct dentry *dentry; | 126 | struct dentry *dentry; |
121 | struct ceph_vino vino; | 127 | struct ceph_vino vino; |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c044a4f0457..e77c28cf3690 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -1,5 +1,6 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/module.h> | ||
3 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
4 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
5 | #include <linux/file.h> | 6 | #include <linux/file.h> |
@@ -38,8 +39,8 @@ | |||
38 | static struct ceph_mds_request * | 39 | static struct ceph_mds_request * |
39 | prepare_open_request(struct super_block *sb, int flags, int create_mode) | 40 | prepare_open_request(struct super_block *sb, int flags, int create_mode) |
40 | { | 41 | { |
41 | struct ceph_client *client = ceph_sb_to_client(sb); | 42 | struct ceph_fs_client *fsc = ceph_sb_to_client(sb); |
42 | struct ceph_mds_client *mdsc = &client->mdsc; | 43 | struct ceph_mds_client *mdsc = fsc->mdsc; |
43 | struct ceph_mds_request *req; | 44 | struct ceph_mds_request *req; |
44 | int want_auth = USE_ANY_MDS; | 45 | int want_auth = USE_ANY_MDS; |
45 | int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; | 46 | int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; |
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) | |||
117 | int ceph_open(struct inode *inode, struct file *file) | 118 | int ceph_open(struct inode *inode, struct file *file) |
118 | { | 119 | { |
119 | struct ceph_inode_info *ci = ceph_inode(inode); | 120 | struct ceph_inode_info *ci = ceph_inode(inode); |
120 | struct ceph_client *client = ceph_sb_to_client(inode->i_sb); | 121 | struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); |
121 | struct ceph_mds_client *mdsc = &client->mdsc; | 122 | struct ceph_mds_client *mdsc = fsc->mdsc; |
122 | struct ceph_mds_request *req; | 123 | struct ceph_mds_request *req; |
123 | struct ceph_file_info *cf = file->private_data; | 124 | struct ceph_file_info *cf = file->private_data; |
124 | struct inode *parent_inode = file->f_dentry->d_parent->d_inode; | 125 | struct inode *parent_inode = file->f_dentry->d_parent->d_inode; |
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, | |||
216 | struct nameidata *nd, int mode, | 217 | struct nameidata *nd, int mode, |
217 | int locked_dir) | 218 | int locked_dir) |
218 | { | 219 | { |
219 | struct ceph_client *client = ceph_sb_to_client(dir->i_sb); | 220 | struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); |
220 | struct ceph_mds_client *mdsc = &client->mdsc; | 221 | struct ceph_mds_client *mdsc = fsc->mdsc; |
221 | struct file *file = nd->intent.open.file; | 222 | struct file *file = nd->intent.open.file; |
222 | struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); | 223 | struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); |
223 | struct ceph_mds_request *req; | 224 | struct ceph_mds_request *req; |
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file) | |||
270 | } | 271 | } |
271 | 272 | ||
272 | /* | 273 | /* |
273 | * build a vector of user pages | ||
274 | */ | ||
275 | static struct page **get_direct_page_vector(const char __user *data, | ||
276 | int num_pages, | ||
277 | loff_t off, size_t len) | ||
278 | { | ||
279 | struct page **pages; | ||
280 | int rc; | ||
281 | |||
282 | pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); | ||
283 | if (!pages) | ||
284 | return ERR_PTR(-ENOMEM); | ||
285 | |||
286 | down_read(&current->mm->mmap_sem); | ||
287 | rc = get_user_pages(current, current->mm, (unsigned long)data, | ||
288 | num_pages, 0, 0, pages, NULL); | ||
289 | up_read(&current->mm->mmap_sem); | ||
290 | if (rc < 0) | ||
291 | goto fail; | ||
292 | return pages; | ||
293 | |||
294 | fail: | ||
295 | kfree(pages); | ||
296 | return ERR_PTR(rc); | ||
297 | } | ||
298 | |||
299 | static void put_page_vector(struct page **pages, int num_pages) | ||
300 | { | ||
301 | int i; | ||
302 | |||
303 | for (i = 0; i < num_pages; i++) | ||
304 | put_page(pages[i]); | ||
305 | kfree(pages); | ||
306 | } | ||
307 | |||
308 | void ceph_release_page_vector(struct page **pages, int num_pages) | ||
309 | { | ||
310 | int i; | ||
311 | |||
312 | for (i = 0; i < num_pages; i++) | ||
313 | __free_pages(pages[i], 0); | ||
314 | kfree(pages); | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * allocate a vector new pages | ||
319 | */ | ||
320 | static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) | ||
321 | { | ||
322 | struct page **pages; | ||
323 | int i; | ||
324 | |||
325 | pages = kmalloc(sizeof(*pages) * num_pages, flags); | ||
326 | if (!pages) | ||
327 | return ERR_PTR(-ENOMEM); | ||
328 | for (i = 0; i < num_pages; i++) { | ||
329 | pages[i] = __page_cache_alloc(flags); | ||
330 | if (pages[i] == NULL) { | ||
331 | ceph_release_page_vector(pages, i); | ||
332 | return ERR_PTR(-ENOMEM); | ||
333 | } | ||
334 | } | ||
335 | return pages; | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * copy user data into a page vector | ||
340 | */ | ||
341 | static int copy_user_to_page_vector(struct page **pages, | ||
342 | const char __user *data, | ||
343 | loff_t off, size_t len) | ||
344 | { | ||
345 | int i = 0; | ||
346 | int po = off & ~PAGE_CACHE_MASK; | ||
347 | int left = len; | ||
348 | int l, bad; | ||
349 | |||
350 | while (left > 0) { | ||
351 | l = min_t(int, PAGE_CACHE_SIZE-po, left); | ||
352 | bad = copy_from_user(page_address(pages[i]) + po, data, l); | ||
353 | if (bad == l) | ||
354 | return -EFAULT; | ||
355 | data += l - bad; | ||
356 | left -= l - bad; | ||
357 | po += l - bad; | ||
358 | if (po == PAGE_CACHE_SIZE) { | ||
359 | po = 0; | ||
360 | i++; | ||
361 | } | ||
362 | } | ||
363 | return len; | ||
364 | } | ||
365 | |||
366 | /* | ||
367 | * copy user data from a page vector into a user pointer | ||
368 | */ | ||
369 | static int copy_page_vector_to_user(struct page **pages, char __user *data, | ||
370 | loff_t off, size_t len) | ||
371 | { | ||
372 | int i = 0; | ||
373 | int po = off & ~PAGE_CACHE_MASK; | ||
374 | int left = len; | ||
375 | int l, bad; | ||
376 | |||
377 | while (left > 0) { | ||
378 | l = min_t(int, left, PAGE_CACHE_SIZE-po); | ||
379 | bad = copy_to_user(data, page_address(pages[i]) + po, l); | ||
380 | if (bad == l) | ||
381 | return -EFAULT; | ||
382 | data += l - bad; | ||
383 | left -= l - bad; | ||
384 | if (po) { | ||
385 | po += l - bad; | ||
386 | if (po == PAGE_CACHE_SIZE) | ||
387 | po = 0; | ||
388 | } | ||
389 | i++; | ||
390 | } | ||
391 | return len; | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * Zero an extent within a page vector. Offset is relative to the | ||
396 | * start of the first page. | ||
397 | */ | ||
398 | static void zero_page_vector_range(int off, int len, struct page **pages) | ||
399 | { | ||
400 | int i = off >> PAGE_CACHE_SHIFT; | ||
401 | |||
402 | off &= ~PAGE_CACHE_MASK; | ||
403 | |||
404 | dout("zero_page_vector_page %u~%u\n", off, len); | ||
405 | |||
406 | /* leading partial page? */ | ||
407 | if (off) { | ||
408 | int end = min((int)PAGE_CACHE_SIZE, off + len); | ||
409 | dout("zeroing %d %p head from %d\n", i, pages[i], | ||
410 | (int)off); | ||
411 | zero_user_segment(pages[i], off, end); | ||
412 | len -= (end - off); | ||
413 | i++; | ||
414 | } | ||
415 | while (len >= PAGE_CACHE_SIZE) { | ||
416 | dout("zeroing %d %p len=%d\n", i, pages[i], len); | ||
417 | zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); | ||
418 | len -= PAGE_CACHE_SIZE; | ||
419 | i++; | ||
420 | } | ||
421 | /* trailing partial page? */ | ||
422 | if (len) { | ||
423 | dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); | ||
424 | zero_user_segment(pages[i], 0, len); | ||
425 | } | ||
426 | } | ||
427 | |||
428 | |||
429 | /* | ||
430 | * Read a range of bytes striped over one or more objects. Iterate over | 274 | * Read a range of bytes striped over one or more objects. Iterate over |
431 | * objects we stripe over. (That's not atomic, but good enough for now.) | 275 | * objects we stripe over. (That's not atomic, but good enough for now.) |
432 | * | 276 | * |
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode, | |||
438 | struct page **pages, int num_pages, | 282 | struct page **pages, int num_pages, |
439 | int *checkeof) | 283 | int *checkeof) |
440 | { | 284 | { |
441 | struct ceph_client *client = ceph_inode_to_client(inode); | 285 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
442 | struct ceph_inode_info *ci = ceph_inode(inode); | 286 | struct ceph_inode_info *ci = ceph_inode(inode); |
443 | u64 pos, this_len; | 287 | u64 pos, this_len; |
444 | int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ | 288 | int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ |
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode, | |||
459 | 303 | ||
460 | more: | 304 | more: |
461 | this_len = left; | 305 | this_len = left; |
462 | ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), | 306 | ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), |
463 | &ci->i_layout, pos, &this_len, | 307 | &ci->i_layout, pos, &this_len, |
464 | ci->i_truncate_seq, | 308 | ci->i_truncate_seq, |
465 | ci->i_truncate_size, | 309 | ci->i_truncate_size, |
@@ -477,8 +321,8 @@ more: | |||
477 | 321 | ||
478 | if (read < pos - off) { | 322 | if (read < pos - off) { |
479 | dout(" zero gap %llu to %llu\n", off + read, pos); | 323 | dout(" zero gap %llu to %llu\n", off + read, pos); |
480 | zero_page_vector_range(page_off + read, | 324 | ceph_zero_page_vector_range(page_off + read, |
481 | pos - off - read, pages); | 325 | pos - off - read, pages); |
482 | } | 326 | } |
483 | pos += ret; | 327 | pos += ret; |
484 | read = pos - off; | 328 | read = pos - off; |
@@ -495,8 +339,8 @@ more: | |||
495 | /* was original extent fully inside i_size? */ | 339 | /* was original extent fully inside i_size? */ |
496 | if (pos + left <= inode->i_size) { | 340 | if (pos + left <= inode->i_size) { |
497 | dout("zero tail\n"); | 341 | dout("zero tail\n"); |
498 | zero_page_vector_range(page_off + read, len - read, | 342 | ceph_zero_page_vector_range(page_off + read, len - read, |
499 | pages); | 343 | pages); |
500 | read = len; | 344 | read = len; |
501 | goto out; | 345 | goto out; |
502 | } | 346 | } |
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, | |||
531 | (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); | 375 | (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); |
532 | 376 | ||
533 | if (file->f_flags & O_DIRECT) { | 377 | if (file->f_flags & O_DIRECT) { |
534 | pages = get_direct_page_vector(data, num_pages, off, len); | 378 | pages = ceph_get_direct_page_vector(data, num_pages, off, len); |
535 | 379 | ||
536 | /* | 380 | /* |
537 | * flush any page cache pages in this range. this | 381 | * flush any page cache pages in this range. this |
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, | |||
552 | ret = striped_read(inode, off, len, pages, num_pages, checkeof); | 396 | ret = striped_read(inode, off, len, pages, num_pages, checkeof); |
553 | 397 | ||
554 | if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) | 398 | if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) |
555 | ret = copy_page_vector_to_user(pages, data, off, ret); | 399 | ret = ceph_copy_page_vector_to_user(pages, data, off, ret); |
556 | if (ret >= 0) | 400 | if (ret >= 0) |
557 | *poff = off + ret; | 401 | *poff = off + ret; |
558 | 402 | ||
559 | done: | 403 | done: |
560 | if (file->f_flags & O_DIRECT) | 404 | if (file->f_flags & O_DIRECT) |
561 | put_page_vector(pages, num_pages); | 405 | ceph_put_page_vector(pages, num_pages); |
562 | else | 406 | else |
563 | ceph_release_page_vector(pages, num_pages); | 407 | ceph_release_page_vector(pages, num_pages); |
564 | dout("sync_read result %d\n", ret); | 408 | dout("sync_read result %d\n", ret); |
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, | |||
594 | { | 438 | { |
595 | struct inode *inode = file->f_dentry->d_inode; | 439 | struct inode *inode = file->f_dentry->d_inode; |
596 | struct ceph_inode_info *ci = ceph_inode(inode); | 440 | struct ceph_inode_info *ci = ceph_inode(inode); |
597 | struct ceph_client *client = ceph_inode_to_client(inode); | 441 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
598 | struct ceph_osd_request *req; | 442 | struct ceph_osd_request *req; |
599 | struct page **pages; | 443 | struct page **pages; |
600 | int num_pages; | 444 | int num_pages; |
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, | |||
642 | */ | 486 | */ |
643 | more: | 487 | more: |
644 | len = left; | 488 | len = left; |
645 | req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, | 489 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, |
646 | ceph_vino(inode), pos, &len, | 490 | ceph_vino(inode), pos, &len, |
647 | CEPH_OSD_OP_WRITE, flags, | 491 | CEPH_OSD_OP_WRITE, flags, |
648 | ci->i_snap_realm->cached_context, | 492 | ci->i_snap_realm->cached_context, |
@@ -655,7 +499,7 @@ more: | |||
655 | num_pages = calc_pages_for(pos, len); | 499 | num_pages = calc_pages_for(pos, len); |
656 | 500 | ||
657 | if (file->f_flags & O_DIRECT) { | 501 | if (file->f_flags & O_DIRECT) { |
658 | pages = get_direct_page_vector(data, num_pages, pos, len); | 502 | pages = ceph_get_direct_page_vector(data, num_pages, pos, len); |
659 | if (IS_ERR(pages)) { | 503 | if (IS_ERR(pages)) { |
660 | ret = PTR_ERR(pages); | 504 | ret = PTR_ERR(pages); |
661 | goto out; | 505 | goto out; |
@@ -673,7 +517,7 @@ more: | |||
673 | ret = PTR_ERR(pages); | 517 | ret = PTR_ERR(pages); |
674 | goto out; | 518 | goto out; |
675 | } | 519 | } |
676 | ret = copy_user_to_page_vector(pages, data, pos, len); | 520 | ret = ceph_copy_user_to_page_vector(pages, data, pos, len); |
677 | if (ret < 0) { | 521 | if (ret < 0) { |
678 | ceph_release_page_vector(pages, num_pages); | 522 | ceph_release_page_vector(pages, num_pages); |
679 | goto out; | 523 | goto out; |
@@ -689,7 +533,7 @@ more: | |||
689 | req->r_num_pages = num_pages; | 533 | req->r_num_pages = num_pages; |
690 | req->r_inode = inode; | 534 | req->r_inode = inode; |
691 | 535 | ||
692 | ret = ceph_osdc_start_request(&client->osdc, req, false); | 536 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); |
693 | if (!ret) { | 537 | if (!ret) { |
694 | if (req->r_safe_callback) { | 538 | if (req->r_safe_callback) { |
695 | /* | 539 | /* |
@@ -697,15 +541,15 @@ more: | |||
697 | * start_request so that a tid has been assigned. | 541 | * start_request so that a tid has been assigned. |
698 | */ | 542 | */ |
699 | spin_lock(&ci->i_unsafe_lock); | 543 | spin_lock(&ci->i_unsafe_lock); |
700 | list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); | 544 | list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); |
701 | spin_unlock(&ci->i_unsafe_lock); | 545 | spin_unlock(&ci->i_unsafe_lock); |
702 | ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); | 546 | ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); |
703 | } | 547 | } |
704 | ret = ceph_osdc_wait_request(&client->osdc, req); | 548 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
705 | } | 549 | } |
706 | 550 | ||
707 | if (file->f_flags & O_DIRECT) | 551 | if (file->f_flags & O_DIRECT) |
708 | put_page_vector(pages, num_pages); | 552 | ceph_put_page_vector(pages, num_pages); |
709 | else if (file->f_flags & O_SYNC) | 553 | else if (file->f_flags & O_SYNC) |
710 | ceph_release_page_vector(pages, num_pages); | 554 | ceph_release_page_vector(pages, num_pages); |
711 | 555 | ||
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
814 | struct ceph_file_info *fi = file->private_data; | 658 | struct ceph_file_info *fi = file->private_data; |
815 | struct inode *inode = file->f_dentry->d_inode; | 659 | struct inode *inode = file->f_dentry->d_inode; |
816 | struct ceph_inode_info *ci = ceph_inode(inode); | 660 | struct ceph_inode_info *ci = ceph_inode(inode); |
817 | struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; | 661 | struct ceph_osd_client *osdc = |
662 | &ceph_sb_to_client(inode->i_sb)->client->osdc; | ||
818 | loff_t endoff = pos + iov->iov_len; | 663 | loff_t endoff = pos + iov->iov_len; |
819 | int want, got = 0; | 664 | int want, got = 0; |
820 | int ret, err; | 665 | int ret, err; |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 62377ec37edf..1d6a45b5a04c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -1,4 +1,4 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/module.h> | 3 | #include <linux/module.h> |
4 | #include <linux/fs.h> | 4 | #include <linux/fs.h> |
@@ -13,7 +13,8 @@ | |||
13 | #include <linux/pagevec.h> | 13 | #include <linux/pagevec.h> |
14 | 14 | ||
15 | #include "super.h" | 15 | #include "super.h" |
16 | #include "decode.h" | 16 | #include "mds_client.h" |
17 | #include <linux/ceph/decode.h> | ||
17 | 18 | ||
18 | /* | 19 | /* |
19 | * Ceph inode operations | 20 | * Ceph inode operations |
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode) | |||
384 | */ | 385 | */ |
385 | if (ci->i_snap_realm) { | 386 | if (ci->i_snap_realm) { |
386 | struct ceph_mds_client *mdsc = | 387 | struct ceph_mds_client *mdsc = |
387 | &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; | 388 | ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; |
388 | struct ceph_snap_realm *realm = ci->i_snap_realm; | 389 | struct ceph_snap_realm *realm = ci->i_snap_realm; |
389 | 390 | ||
390 | dout(" dropping residual ref to snap realm %p\n", realm); | 391 | dout(" dropping residual ref to snap realm %p\n", realm); |
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode, | |||
685 | } | 686 | } |
686 | 687 | ||
687 | /* it may be better to set st_size in getattr instead? */ | 688 | /* it may be better to set st_size in getattr instead? */ |
688 | if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) | 689 | if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) |
689 | inode->i_size = ci->i_rbytes; | 690 | inode->i_size = ci->i_rbytes; |
690 | break; | 691 | break; |
691 | default: | 692 | default: |
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, | |||
901 | struct inode *in = NULL; | 902 | struct inode *in = NULL; |
902 | struct ceph_mds_reply_inode *ininfo; | 903 | struct ceph_mds_reply_inode *ininfo; |
903 | struct ceph_vino vino; | 904 | struct ceph_vino vino; |
904 | struct ceph_client *client = ceph_sb_to_client(sb); | 905 | struct ceph_fs_client *fsc = ceph_sb_to_client(sb); |
905 | int i = 0; | 906 | int i = 0; |
906 | int err = 0; | 907 | int err = 0; |
907 | 908 | ||
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, | |||
965 | */ | 966 | */ |
966 | if (rinfo->head->is_dentry && !req->r_aborted && | 967 | if (rinfo->head->is_dentry && !req->r_aborted && |
967 | (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, | 968 | (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, |
968 | client->mount_args->snapdir_name, | 969 | fsc->mount_options->snapdir_name, |
969 | req->r_dentry->d_name.len))) { | 970 | req->r_dentry->d_name.len))) { |
970 | /* | 971 | /* |
971 | * lookup link rename : null -> possibly existing inode | 972 | * lookup link rename : null -> possibly existing inode |
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1533 | struct inode *parent_inode = dentry->d_parent->d_inode; | 1534 | struct inode *parent_inode = dentry->d_parent->d_inode; |
1534 | const unsigned int ia_valid = attr->ia_valid; | 1535 | const unsigned int ia_valid = attr->ia_valid; |
1535 | struct ceph_mds_request *req; | 1536 | struct ceph_mds_request *req; |
1536 | struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; | 1537 | struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; |
1537 | int issued; | 1538 | int issued; |
1538 | int release = 0, dirtied = 0; | 1539 | int release = 0, dirtied = 0; |
1539 | int mask = 0; | 1540 | int mask = 0; |
@@ -1728,8 +1729,8 @@ out: | |||
1728 | */ | 1729 | */ |
1729 | int ceph_do_getattr(struct inode *inode, int mask) | 1730 | int ceph_do_getattr(struct inode *inode, int mask) |
1730 | { | 1731 | { |
1731 | struct ceph_client *client = ceph_sb_to_client(inode->i_sb); | 1732 | struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); |
1732 | struct ceph_mds_client *mdsc = &client->mdsc; | 1733 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1733 | struct ceph_mds_request *req; | 1734 | struct ceph_mds_request *req; |
1734 | int err; | 1735 | int err; |
1735 | 1736 | ||
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 76e307d2aba1..8888c9ba68db 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c | |||
@@ -1,8 +1,10 @@ | |||
1 | #include <linux/in.h> | 1 | #include <linux/in.h> |
2 | 2 | ||
3 | #include "ioctl.h" | ||
4 | #include "super.h" | 3 | #include "super.h" |
5 | #include "ceph_debug.h" | 4 | #include "mds_client.h" |
5 | #include <linux/ceph/ceph_debug.h> | ||
6 | |||
7 | #include "ioctl.h" | ||
6 | 8 | ||
7 | 9 | ||
8 | /* | 10 | /* |
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) | |||
37 | { | 39 | { |
38 | struct inode *inode = file->f_dentry->d_inode; | 40 | struct inode *inode = file->f_dentry->d_inode; |
39 | struct inode *parent_inode = file->f_dentry->d_parent->d_inode; | 41 | struct inode *parent_inode = file->f_dentry->d_parent->d_inode; |
40 | struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; | 42 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
41 | struct ceph_mds_request *req; | 43 | struct ceph_mds_request *req; |
42 | struct ceph_ioctl_layout l; | 44 | struct ceph_ioctl_layout l; |
43 | int err, i; | 45 | int err, i; |
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) | |||
90 | } | 92 | } |
91 | 93 | ||
92 | /* | 94 | /* |
95 | * Set a layout policy on a directory inode. All items in the tree | ||
96 | * rooted at this inode will inherit this layout on creation, | ||
97 | * (It doesn't apply retroactively ) | ||
98 | * unless a subdirectory has its own layout policy. | ||
99 | */ | ||
100 | static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) | ||
101 | { | ||
102 | struct inode *inode = file->f_dentry->d_inode; | ||
103 | struct ceph_mds_request *req; | ||
104 | struct ceph_ioctl_layout l; | ||
105 | int err, i; | ||
106 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | ||
107 | |||
108 | /* copy and validate */ | ||
109 | if (copy_from_user(&l, arg, sizeof(l))) | ||
110 | return -EFAULT; | ||
111 | |||
112 | if ((l.object_size & ~PAGE_MASK) || | ||
113 | (l.stripe_unit & ~PAGE_MASK) || | ||
114 | !l.stripe_unit || | ||
115 | (l.object_size && | ||
116 | (unsigned)l.object_size % (unsigned)l.stripe_unit)) | ||
117 | return -EINVAL; | ||
118 | |||
119 | /* make sure it's a valid data pool */ | ||
120 | if (l.data_pool > 0) { | ||
121 | mutex_lock(&mdsc->mutex); | ||
122 | err = -EINVAL; | ||
123 | for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) | ||
124 | if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { | ||
125 | err = 0; | ||
126 | break; | ||
127 | } | ||
128 | mutex_unlock(&mdsc->mutex); | ||
129 | if (err) | ||
130 | return err; | ||
131 | } | ||
132 | |||
133 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, | ||
134 | USE_AUTH_MDS); | ||
135 | |||
136 | if (IS_ERR(req)) | ||
137 | return PTR_ERR(req); | ||
138 | req->r_inode = igrab(inode); | ||
139 | |||
140 | req->r_args.setlayout.layout.fl_stripe_unit = | ||
141 | cpu_to_le32(l.stripe_unit); | ||
142 | req->r_args.setlayout.layout.fl_stripe_count = | ||
143 | cpu_to_le32(l.stripe_count); | ||
144 | req->r_args.setlayout.layout.fl_object_size = | ||
145 | cpu_to_le32(l.object_size); | ||
146 | req->r_args.setlayout.layout.fl_pg_pool = | ||
147 | cpu_to_le32(l.data_pool); | ||
148 | req->r_args.setlayout.layout.fl_pg_preferred = | ||
149 | cpu_to_le32(l.preferred_osd); | ||
150 | |||
151 | err = ceph_mdsc_do_request(mdsc, inode, req); | ||
152 | ceph_mdsc_put_request(req); | ||
153 | return err; | ||
154 | } | ||
155 | |||
156 | /* | ||
93 | * Return object name, size/offset information, and location (OSD | 157 | * Return object name, size/offset information, and location (OSD |
94 | * number, network address) for a given file offset. | 158 | * number, network address) for a given file offset. |
95 | */ | 159 | */ |
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
98 | struct ceph_ioctl_dataloc dl; | 162 | struct ceph_ioctl_dataloc dl; |
99 | struct inode *inode = file->f_dentry->d_inode; | 163 | struct inode *inode = file->f_dentry->d_inode; |
100 | struct ceph_inode_info *ci = ceph_inode(inode); | 164 | struct ceph_inode_info *ci = ceph_inode(inode); |
101 | struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; | 165 | struct ceph_osd_client *osdc = |
166 | &ceph_sb_to_client(inode->i_sb)->client->osdc; | ||
102 | u64 len = 1, olen; | 167 | u64 len = 1, olen; |
103 | u64 tmp; | 168 | u64 tmp; |
104 | struct ceph_object_layout ol; | 169 | struct ceph_object_layout ol; |
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
174 | case CEPH_IOC_SET_LAYOUT: | 239 | case CEPH_IOC_SET_LAYOUT: |
175 | return ceph_ioctl_set_layout(file, (void __user *)arg); | 240 | return ceph_ioctl_set_layout(file, (void __user *)arg); |
176 | 241 | ||
242 | case CEPH_IOC_SET_LAYOUT_POLICY: | ||
243 | return ceph_ioctl_set_layout_policy(file, (void __user *)arg); | ||
244 | |||
177 | case CEPH_IOC_GET_DATALOC: | 245 | case CEPH_IOC_GET_DATALOC: |
178 | return ceph_ioctl_get_dataloc(file, (void __user *)arg); | 246 | return ceph_ioctl_get_dataloc(file, (void __user *)arg); |
179 | 247 | ||
180 | case CEPH_IOC_LAZYIO: | 248 | case CEPH_IOC_LAZYIO: |
181 | return ceph_ioctl_lazyio(file); | 249 | return ceph_ioctl_lazyio(file); |
182 | } | 250 | } |
251 | |||
183 | return -ENOTTY; | 252 | return -ENOTTY; |
184 | } | 253 | } |
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 88451a3b6857..a6ce54e94eb5 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h | |||
@@ -4,7 +4,7 @@ | |||
4 | #include <linux/ioctl.h> | 4 | #include <linux/ioctl.h> |
5 | #include <linux/types.h> | 5 | #include <linux/types.h> |
6 | 6 | ||
7 | #define CEPH_IOCTL_MAGIC 0x97 | 7 | #define CEPH_IOCTL_MAGIC 0x98 |
8 | 8 | ||
9 | /* just use u64 to align sanely on all archs */ | 9 | /* just use u64 to align sanely on all archs */ |
10 | struct ceph_ioctl_layout { | 10 | struct ceph_ioctl_layout { |
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout { | |||
17 | struct ceph_ioctl_layout) | 17 | struct ceph_ioctl_layout) |
18 | #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ | 18 | #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ |
19 | struct ceph_ioctl_layout) | 19 | struct ceph_ioctl_layout) |
20 | #define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ | ||
21 | struct ceph_ioctl_layout) | ||
20 | 22 | ||
21 | /* | 23 | /* |
22 | * Extract identity, address of the OSD and object storing a given | 24 | * Extract identity, address of the OSD and object storing a given |
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ff4e753aae92..40abde93c345 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c | |||
@@ -1,11 +1,11 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/file.h> | 3 | #include <linux/file.h> |
4 | #include <linux/namei.h> | 4 | #include <linux/namei.h> |
5 | 5 | ||
6 | #include "super.h" | 6 | #include "super.h" |
7 | #include "mds_client.h" | 7 | #include "mds_client.h" |
8 | #include "pagelist.h" | 8 | #include <linux/ceph/pagelist.h> |
9 | 9 | ||
10 | /** | 10 | /** |
11 | * Implement fcntl and flock locking functions. | 11 | * Implement fcntl and flock locking functions. |
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, | |||
16 | { | 16 | { |
17 | struct inode *inode = file->f_dentry->d_inode; | 17 | struct inode *inode = file->f_dentry->d_inode; |
18 | struct ceph_mds_client *mdsc = | 18 | struct ceph_mds_client *mdsc = |
19 | &ceph_sb_to_client(inode->i_sb)->mdsc; | 19 | ceph_sb_to_client(inode->i_sb)->mdsc; |
20 | struct ceph_mds_request *req; | 20 | struct ceph_mds_request *req; |
21 | int err; | 21 | int err; |
22 | 22 | ||
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) | |||
181 | * Encode the flock and fcntl locks for the given inode into the pagelist. | 181 | * Encode the flock and fcntl locks for the given inode into the pagelist. |
182 | * Format is: #fcntl locks, sequential fcntl locks, #flock locks, | 182 | * Format is: #fcntl locks, sequential fcntl locks, #flock locks, |
183 | * sequential flock locks. | 183 | * sequential flock locks. |
184 | * Must be called with BLK already held, and the lock numbers should have | 184 | * Must be called with lock_flocks() already held. |
185 | * been gathered under the same lock holding window. | 185 | * If we encounter more of a specific lock type than expected, |
186 | * we return -ENOSPC. | ||
186 | */ | 187 | */ |
187 | int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | 188 | int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, |
188 | int num_fcntl_locks, int num_flock_locks) | 189 | int num_fcntl_locks, int num_flock_locks) |
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | |||
190 | struct file_lock *lock; | 191 | struct file_lock *lock; |
191 | struct ceph_filelock cephlock; | 192 | struct ceph_filelock cephlock; |
192 | int err = 0; | 193 | int err = 0; |
194 | int seen_fcntl = 0; | ||
195 | int seen_flock = 0; | ||
193 | 196 | ||
194 | dout("encoding %d flock and %d fcntl locks", num_flock_locks, | 197 | dout("encoding %d flock and %d fcntl locks", num_flock_locks, |
195 | num_fcntl_locks); | 198 | num_fcntl_locks); |
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | |||
198 | goto fail; | 201 | goto fail; |
199 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { | 202 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { |
200 | if (lock->fl_flags & FL_POSIX) { | 203 | if (lock->fl_flags & FL_POSIX) { |
204 | ++seen_fcntl; | ||
205 | if (seen_fcntl > num_fcntl_locks) { | ||
206 | err = -ENOSPC; | ||
207 | goto fail; | ||
208 | } | ||
201 | err = lock_to_ceph_filelock(lock, &cephlock); | 209 | err = lock_to_ceph_filelock(lock, &cephlock); |
202 | if (err) | 210 | if (err) |
203 | goto fail; | 211 | goto fail; |
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | |||
213 | goto fail; | 221 | goto fail; |
214 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { | 222 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { |
215 | if (lock->fl_flags & FL_FLOCK) { | 223 | if (lock->fl_flags & FL_FLOCK) { |
224 | ++seen_flock; | ||
225 | if (seen_flock > num_flock_locks) { | ||
226 | err = -ENOSPC; | ||
227 | goto fail; | ||
228 | } | ||
216 | err = lock_to_ceph_filelock(lock, &cephlock); | 229 | err = lock_to_ceph_filelock(lock, &cephlock); |
217 | if (err) | 230 | if (err) |
218 | goto fail; | 231 | goto fail; |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fad95f8f2608..3142b15940c2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -1,17 +1,21 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/fs.h> | ||
3 | #include <linux/wait.h> | 4 | #include <linux/wait.h> |
4 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
5 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/debugfs.h> | ||
8 | #include <linux/seq_file.h> | ||
6 | #include <linux/smp_lock.h> | 9 | #include <linux/smp_lock.h> |
7 | 10 | ||
8 | #include "mds_client.h" | ||
9 | #include "mon_client.h" | ||
10 | #include "super.h" | 11 | #include "super.h" |
11 | #include "messenger.h" | 12 | #include "mds_client.h" |
12 | #include "decode.h" | 13 | |
13 | #include "auth.h" | 14 | #include <linux/ceph/messenger.h> |
14 | #include "pagelist.h" | 15 | #include <linux/ceph/decode.h> |
16 | #include <linux/ceph/pagelist.h> | ||
17 | #include <linux/ceph/auth.h> | ||
18 | #include <linux/ceph/debugfs.h> | ||
15 | 19 | ||
16 | /* | 20 | /* |
17 | * A cluster of MDS (metadata server) daemons is responsible for | 21 | * A cluster of MDS (metadata server) daemons is responsible for |
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) | |||
286 | atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); | 290 | atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); |
287 | if (atomic_dec_and_test(&s->s_ref)) { | 291 | if (atomic_dec_and_test(&s->s_ref)) { |
288 | if (s->s_authorizer) | 292 | if (s->s_authorizer) |
289 | s->s_mdsc->client->monc.auth->ops->destroy_authorizer( | 293 | s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( |
290 | s->s_mdsc->client->monc.auth, s->s_authorizer); | 294 | s->s_mdsc->fsc->client->monc.auth, |
295 | s->s_authorizer); | ||
291 | kfree(s); | 296 | kfree(s); |
292 | } | 297 | } |
293 | } | 298 | } |
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, | |||
344 | s->s_seq = 0; | 349 | s->s_seq = 0; |
345 | mutex_init(&s->s_mutex); | 350 | mutex_init(&s->s_mutex); |
346 | 351 | ||
347 | ceph_con_init(mdsc->client->msgr, &s->s_con); | 352 | ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); |
348 | s->s_con.private = s; | 353 | s->s_con.private = s; |
349 | s->s_con.ops = &mds_con_ops; | 354 | s->s_con.ops = &mds_con_ops; |
350 | s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; | 355 | s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; |
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
599 | } else if (req->r_dentry) { | 604 | } else if (req->r_dentry) { |
600 | struct inode *dir = req->r_dentry->d_parent->d_inode; | 605 | struct inode *dir = req->r_dentry->d_parent->d_inode; |
601 | 606 | ||
602 | if (dir->i_sb != mdsc->client->sb) { | 607 | if (dir->i_sb != mdsc->fsc->sb) { |
603 | /* not this fs! */ | 608 | /* not this fs! */ |
604 | inode = req->r_dentry->d_inode; | 609 | inode = req->r_dentry->d_inode; |
605 | } else if (ceph_snap(dir) != CEPH_NOSNAP) { | 610 | } else if (ceph_snap(dir) != CEPH_NOSNAP) { |
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
884 | __ceph_remove_cap(cap); | 889 | __ceph_remove_cap(cap); |
885 | if (!__ceph_is_any_real_caps(ci)) { | 890 | if (!__ceph_is_any_real_caps(ci)) { |
886 | struct ceph_mds_client *mdsc = | 891 | struct ceph_mds_client *mdsc = |
887 | &ceph_sb_to_client(inode->i_sb)->mdsc; | 892 | ceph_sb_to_client(inode->i_sb)->mdsc; |
888 | 893 | ||
889 | spin_lock(&mdsc->cap_dirty_lock); | 894 | spin_lock(&mdsc->cap_dirty_lock); |
890 | if (!list_empty(&ci->i_dirty_item)) { | 895 | if (!list_empty(&ci->i_dirty_item)) { |
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, | |||
1146 | struct ceph_msg *msg, *partial = NULL; | 1151 | struct ceph_msg *msg, *partial = NULL; |
1147 | struct ceph_mds_cap_release *head; | 1152 | struct ceph_mds_cap_release *head; |
1148 | int err = -ENOMEM; | 1153 | int err = -ENOMEM; |
1149 | int extra = mdsc->client->mount_args->cap_release_safety; | 1154 | int extra = mdsc->fsc->mount_options->cap_release_safety; |
1150 | int num; | 1155 | int num; |
1151 | 1156 | ||
1152 | dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, | 1157 | dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, |
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
2085 | 2090 | ||
2086 | /* insert trace into our cache */ | 2091 | /* insert trace into our cache */ |
2087 | mutex_lock(&req->r_fill_mutex); | 2092 | mutex_lock(&req->r_fill_mutex); |
2088 | err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); | 2093 | err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); |
2089 | if (err == 0) { | 2094 | if (err == 0) { |
2090 | if (result == 0 && rinfo->dir_nr) | 2095 | if (result == 0 && rinfo->dir_nr) |
2091 | ceph_readdir_prepopulate(req, req->r_session); | 2096 | ceph_readdir_prepopulate(req, req->r_session); |
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
2361 | 2366 | ||
2362 | if (recon_state->flock) { | 2367 | if (recon_state->flock) { |
2363 | int num_fcntl_locks, num_flock_locks; | 2368 | int num_fcntl_locks, num_flock_locks; |
2364 | 2369 | struct ceph_pagelist_cursor trunc_point; | |
2365 | lock_kernel(); | 2370 | |
2366 | ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); | 2371 | ceph_pagelist_set_cursor(pagelist, &trunc_point); |
2367 | rec.v2.flock_len = (2*sizeof(u32) + | 2372 | do { |
2368 | (num_fcntl_locks+num_flock_locks) * | 2373 | lock_flocks(); |
2369 | sizeof(struct ceph_filelock)); | 2374 | ceph_count_locks(inode, &num_fcntl_locks, |
2370 | 2375 | &num_flock_locks); | |
2371 | err = ceph_pagelist_append(pagelist, &rec, reclen); | 2376 | rec.v2.flock_len = (2*sizeof(u32) + |
2372 | if (!err) | 2377 | (num_fcntl_locks+num_flock_locks) * |
2373 | err = ceph_encode_locks(inode, pagelist, | 2378 | sizeof(struct ceph_filelock)); |
2374 | num_fcntl_locks, | 2379 | unlock_flocks(); |
2375 | num_flock_locks); | 2380 | |
2376 | unlock_kernel(); | 2381 | /* pre-alloc pagelist */ |
2382 | ceph_pagelist_truncate(pagelist, &trunc_point); | ||
2383 | err = ceph_pagelist_append(pagelist, &rec, reclen); | ||
2384 | if (!err) | ||
2385 | err = ceph_pagelist_reserve(pagelist, | ||
2386 | rec.v2.flock_len); | ||
2387 | |||
2388 | /* encode locks */ | ||
2389 | if (!err) { | ||
2390 | lock_flocks(); | ||
2391 | err = ceph_encode_locks(inode, | ||
2392 | pagelist, | ||
2393 | num_fcntl_locks, | ||
2394 | num_flock_locks); | ||
2395 | unlock_flocks(); | ||
2396 | } | ||
2397 | } while (err == -ENOSPC); | ||
2377 | } else { | 2398 | } else { |
2378 | err = ceph_pagelist_append(pagelist, &rec, reclen); | 2399 | err = ceph_pagelist_append(pagelist, &rec, reclen); |
2379 | } | 2400 | } |
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, | |||
2613 | struct ceph_mds_session *session, | 2634 | struct ceph_mds_session *session, |
2614 | struct ceph_msg *msg) | 2635 | struct ceph_msg *msg) |
2615 | { | 2636 | { |
2616 | struct super_block *sb = mdsc->client->sb; | 2637 | struct super_block *sb = mdsc->fsc->sb; |
2617 | struct inode *inode; | 2638 | struct inode *inode; |
2618 | struct ceph_inode_info *ci; | 2639 | struct ceph_inode_info *ci; |
2619 | struct dentry *parent, *dentry; | 2640 | struct dentry *parent, *dentry; |
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work) | |||
2891 | schedule_delayed(mdsc); | 2912 | schedule_delayed(mdsc); |
2892 | } | 2913 | } |
2893 | 2914 | ||
2915 | int ceph_mdsc_init(struct ceph_fs_client *fsc) | ||
2894 | 2916 | ||
2895 | int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) | ||
2896 | { | 2917 | { |
2897 | mdsc->client = client; | 2918 | struct ceph_mds_client *mdsc; |
2919 | |||
2920 | mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); | ||
2921 | if (!mdsc) | ||
2922 | return -ENOMEM; | ||
2923 | mdsc->fsc = fsc; | ||
2924 | fsc->mdsc = mdsc; | ||
2898 | mutex_init(&mdsc->mutex); | 2925 | mutex_init(&mdsc->mutex); |
2899 | mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); | 2926 | mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); |
2900 | if (mdsc->mdsmap == NULL) | 2927 | if (mdsc->mdsmap == NULL) |
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) | |||
2927 | INIT_LIST_HEAD(&mdsc->dentry_lru); | 2954 | INIT_LIST_HEAD(&mdsc->dentry_lru); |
2928 | 2955 | ||
2929 | ceph_caps_init(mdsc); | 2956 | ceph_caps_init(mdsc); |
2930 | ceph_adjust_min_caps(mdsc, client->min_caps); | 2957 | ceph_adjust_min_caps(mdsc, fsc->min_caps); |
2931 | 2958 | ||
2932 | return 0; | 2959 | return 0; |
2933 | } | 2960 | } |
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) | |||
2939 | static void wait_requests(struct ceph_mds_client *mdsc) | 2966 | static void wait_requests(struct ceph_mds_client *mdsc) |
2940 | { | 2967 | { |
2941 | struct ceph_mds_request *req; | 2968 | struct ceph_mds_request *req; |
2942 | struct ceph_client *client = mdsc->client; | 2969 | struct ceph_fs_client *fsc = mdsc->fsc; |
2943 | 2970 | ||
2944 | mutex_lock(&mdsc->mutex); | 2971 | mutex_lock(&mdsc->mutex); |
2945 | if (__get_oldest_req(mdsc)) { | 2972 | if (__get_oldest_req(mdsc)) { |
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) | |||
2947 | 2974 | ||
2948 | dout("wait_requests waiting for requests\n"); | 2975 | dout("wait_requests waiting for requests\n"); |
2949 | wait_for_completion_timeout(&mdsc->safe_umount_waiters, | 2976 | wait_for_completion_timeout(&mdsc->safe_umount_waiters, |
2950 | client->mount_args->mount_timeout * HZ); | 2977 | fsc->client->options->mount_timeout * HZ); |
2951 | 2978 | ||
2952 | /* tear down remaining requests */ | 2979 | /* tear down remaining requests */ |
2953 | mutex_lock(&mdsc->mutex); | 2980 | mutex_lock(&mdsc->mutex); |
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) | |||
3030 | { | 3057 | { |
3031 | u64 want_tid, want_flush; | 3058 | u64 want_tid, want_flush; |
3032 | 3059 | ||
3033 | if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) | 3060 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) |
3034 | return; | 3061 | return; |
3035 | 3062 | ||
3036 | dout("sync\n"); | 3063 | dout("sync\n"); |
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc) | |||
3053 | { | 3080 | { |
3054 | int i, n = 0; | 3081 | int i, n = 0; |
3055 | 3082 | ||
3056 | if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) | 3083 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) |
3057 | return true; | 3084 | return true; |
3058 | 3085 | ||
3059 | mutex_lock(&mdsc->mutex); | 3086 | mutex_lock(&mdsc->mutex); |
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) | |||
3071 | { | 3098 | { |
3072 | struct ceph_mds_session *session; | 3099 | struct ceph_mds_session *session; |
3073 | int i; | 3100 | int i; |
3074 | struct ceph_client *client = mdsc->client; | 3101 | struct ceph_fs_client *fsc = mdsc->fsc; |
3075 | unsigned long timeout = client->mount_args->mount_timeout * HZ; | 3102 | unsigned long timeout = fsc->client->options->mount_timeout * HZ; |
3076 | 3103 | ||
3077 | dout("close_sessions\n"); | 3104 | dout("close_sessions\n"); |
3078 | 3105 | ||
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) | |||
3119 | dout("stopped\n"); | 3146 | dout("stopped\n"); |
3120 | } | 3147 | } |
3121 | 3148 | ||
3122 | void ceph_mdsc_stop(struct ceph_mds_client *mdsc) | 3149 | static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) |
3123 | { | 3150 | { |
3124 | dout("stop\n"); | 3151 | dout("stop\n"); |
3125 | cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ | 3152 | cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ |
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc) | |||
3129 | ceph_caps_finalize(mdsc); | 3156 | ceph_caps_finalize(mdsc); |
3130 | } | 3157 | } |
3131 | 3158 | ||
3159 | void ceph_mdsc_destroy(struct ceph_fs_client *fsc) | ||
3160 | { | ||
3161 | struct ceph_mds_client *mdsc = fsc->mdsc; | ||
3162 | |||
3163 | ceph_mdsc_stop(mdsc); | ||
3164 | fsc->mdsc = NULL; | ||
3165 | kfree(mdsc); | ||
3166 | } | ||
3167 | |||
3132 | 3168 | ||
3133 | /* | 3169 | /* |
3134 | * handle mds map update. | 3170 | * handle mds map update. |
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) | |||
3145 | 3181 | ||
3146 | ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); | 3182 | ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); |
3147 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); | 3183 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); |
3148 | if (ceph_check_fsid(mdsc->client, &fsid) < 0) | 3184 | if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) |
3149 | return; | 3185 | return; |
3150 | epoch = ceph_decode_32(&p); | 3186 | epoch = ceph_decode_32(&p); |
3151 | maplen = ceph_decode_32(&p); | 3187 | maplen = ceph_decode_32(&p); |
3152 | dout("handle_map epoch %u len %d\n", epoch, (int)maplen); | 3188 | dout("handle_map epoch %u len %d\n", epoch, (int)maplen); |
3153 | 3189 | ||
3154 | /* do we need it? */ | 3190 | /* do we need it? */ |
3155 | ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); | 3191 | ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); |
3156 | mutex_lock(&mdsc->mutex); | 3192 | mutex_lock(&mdsc->mutex); |
3157 | if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { | 3193 | if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { |
3158 | dout("handle_map epoch %u <= our %u\n", | 3194 | dout("handle_map epoch %u <= our %u\n", |
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) | |||
3176 | } else { | 3212 | } else { |
3177 | mdsc->mdsmap = newmap; /* first mds map */ | 3213 | mdsc->mdsmap = newmap; /* first mds map */ |
3178 | } | 3214 | } |
3179 | mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; | 3215 | mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; |
3180 | 3216 | ||
3181 | __wake_requests(mdsc, &mdsc->waiting_for_map); | 3217 | __wake_requests(mdsc, &mdsc->waiting_for_map); |
3182 | 3218 | ||
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con, | |||
3277 | { | 3313 | { |
3278 | struct ceph_mds_session *s = con->private; | 3314 | struct ceph_mds_session *s = con->private; |
3279 | struct ceph_mds_client *mdsc = s->s_mdsc; | 3315 | struct ceph_mds_client *mdsc = s->s_mdsc; |
3280 | struct ceph_auth_client *ac = mdsc->client->monc.auth; | 3316 | struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; |
3281 | int ret = 0; | 3317 | int ret = 0; |
3282 | 3318 | ||
3283 | if (force_new && s->s_authorizer) { | 3319 | if (force_new && s->s_authorizer) { |
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) | |||
3311 | { | 3347 | { |
3312 | struct ceph_mds_session *s = con->private; | 3348 | struct ceph_mds_session *s = con->private; |
3313 | struct ceph_mds_client *mdsc = s->s_mdsc; | 3349 | struct ceph_mds_client *mdsc = s->s_mdsc; |
3314 | struct ceph_auth_client *ac = mdsc->client->monc.auth; | 3350 | struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; |
3315 | 3351 | ||
3316 | return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); | 3352 | return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); |
3317 | } | 3353 | } |
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con) | |||
3320 | { | 3356 | { |
3321 | struct ceph_mds_session *s = con->private; | 3357 | struct ceph_mds_session *s = con->private; |
3322 | struct ceph_mds_client *mdsc = s->s_mdsc; | 3358 | struct ceph_mds_client *mdsc = s->s_mdsc; |
3323 | struct ceph_auth_client *ac = mdsc->client->monc.auth; | 3359 | struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; |
3324 | 3360 | ||
3325 | if (ac->ops->invalidate_authorizer) | 3361 | if (ac->ops->invalidate_authorizer) |
3326 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); | 3362 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); |
3327 | 3363 | ||
3328 | return ceph_monc_validate_auth(&mdsc->client->monc); | 3364 | return ceph_monc_validate_auth(&mdsc->fsc->client->monc); |
3329 | } | 3365 | } |
3330 | 3366 | ||
3331 | static const struct ceph_connection_operations mds_con_ops = { | 3367 | static const struct ceph_connection_operations mds_con_ops = { |
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = { | |||
3338 | .peer_reset = peer_reset, | 3374 | .peer_reset = peer_reset, |
3339 | }; | 3375 | }; |
3340 | 3376 | ||
3341 | |||
3342 | |||
3343 | |||
3344 | /* eof */ | 3377 | /* eof */ |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c98267ce6d2a..d66d63c72355 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
@@ -8,9 +8,9 @@ | |||
8 | #include <linux/rbtree.h> | 8 | #include <linux/rbtree.h> |
9 | #include <linux/spinlock.h> | 9 | #include <linux/spinlock.h> |
10 | 10 | ||
11 | #include "types.h" | 11 | #include <linux/ceph/types.h> |
12 | #include "messenger.h" | 12 | #include <linux/ceph/messenger.h> |
13 | #include "mdsmap.h" | 13 | #include <linux/ceph/mdsmap.h> |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * Some lock dependencies: | 16 | * Some lock dependencies: |
@@ -26,7 +26,7 @@ | |||
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
29 | struct ceph_client; | 29 | struct ceph_fs_client; |
30 | struct ceph_cap; | 30 | struct ceph_cap; |
31 | 31 | ||
32 | /* | 32 | /* |
@@ -230,7 +230,7 @@ struct ceph_mds_request { | |||
230 | * mds client state | 230 | * mds client state |
231 | */ | 231 | */ |
232 | struct ceph_mds_client { | 232 | struct ceph_mds_client { |
233 | struct ceph_client *client; | 233 | struct ceph_fs_client *fsc; |
234 | struct mutex mutex; /* all nested structures */ | 234 | struct mutex mutex; /* all nested structures */ |
235 | 235 | ||
236 | struct ceph_mdsmap *mdsmap; | 236 | struct ceph_mdsmap *mdsmap; |
@@ -289,11 +289,6 @@ struct ceph_mds_client { | |||
289 | int caps_avail_count; /* unused, unreserved */ | 289 | int caps_avail_count; /* unused, unreserved */ |
290 | int caps_min_count; /* keep at least this many | 290 | int caps_min_count; /* keep at least this many |
291 | (unreserved) */ | 291 | (unreserved) */ |
292 | |||
293 | #ifdef CONFIG_DEBUG_FS | ||
294 | struct dentry *debugfs_file; | ||
295 | #endif | ||
296 | |||
297 | spinlock_t dentry_lru_lock; | 292 | spinlock_t dentry_lru_lock; |
298 | struct list_head dentry_lru; | 293 | struct list_head dentry_lru; |
299 | int num_dentry; | 294 | int num_dentry; |
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s); | |||
316 | extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, | 311 | extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, |
317 | struct ceph_msg *msg, int mds); | 312 | struct ceph_msg *msg, int mds); |
318 | 313 | ||
319 | extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, | 314 | extern int ceph_mdsc_init(struct ceph_fs_client *fsc); |
320 | struct ceph_client *client); | ||
321 | extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); | 315 | extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); |
322 | extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); | 316 | extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); |
323 | 317 | ||
324 | extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); | 318 | extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); |
325 | 319 | ||
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 040be6d1150b..73b7d44e8a35 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c | |||
@@ -1,4 +1,4 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/bug.h> | 3 | #include <linux/bug.h> |
4 | #include <linux/err.h> | 4 | #include <linux/err.h> |
@@ -6,9 +6,9 @@ | |||
6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | 8 | ||
9 | #include "mdsmap.h" | 9 | #include <linux/ceph/mdsmap.h> |
10 | #include "messenger.h" | 10 | #include <linux/ceph/messenger.h> |
11 | #include "decode.h" | 11 | #include <linux/ceph/decode.h> |
12 | 12 | ||
13 | #include "super.h" | 13 | #include "super.h" |
14 | 14 | ||
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
117 | } | 117 | } |
118 | 118 | ||
119 | dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", | 119 | dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", |
120 | i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), | 120 | i+1, n, global_id, mds, inc, |
121 | ceph_pr_addr(&addr.in_addr), | ||
121 | ceph_mds_state_name(state)); | 122 | ceph_mds_state_name(state)); |
122 | if (mds >= 0 && mds < m->m_max_mds && state > 0) { | 123 | if (mds >= 0 && mds < m->m_max_mds && state > 0) { |
123 | m->m_info[mds].global_id = global_id; | 124 | m->m_info[mds].global_id = global_id; |
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h deleted file mode 100644 index 4c5cb0880bba..000000000000 --- a/fs/ceph/mdsmap.h +++ /dev/null | |||
@@ -1,62 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_MDSMAP_H | ||
2 | #define _FS_CEPH_MDSMAP_H | ||
3 | |||
4 | #include "types.h" | ||
5 | |||
6 | /* | ||
7 | * mds map - describe servers in the mds cluster. | ||
8 | * | ||
9 | * we limit fields to those the client actually cares about | ||
10 | */ | ||
11 | struct ceph_mds_info { | ||
12 | u64 global_id; | ||
13 | struct ceph_entity_addr addr; | ||
14 | s32 state; | ||
15 | int num_export_targets; | ||
16 | bool laggy; | ||
17 | u32 *export_targets; | ||
18 | }; | ||
19 | |||
20 | struct ceph_mdsmap { | ||
21 | u32 m_epoch, m_client_epoch, m_last_failure; | ||
22 | u32 m_root; | ||
23 | u32 m_session_timeout; /* seconds */ | ||
24 | u32 m_session_autoclose; /* seconds */ | ||
25 | u64 m_max_file_size; | ||
26 | u32 m_max_mds; /* size of m_addr, m_state arrays */ | ||
27 | struct ceph_mds_info *m_info; | ||
28 | |||
29 | /* which object pools file data can be stored in */ | ||
30 | int m_num_data_pg_pools; | ||
31 | u32 *m_data_pg_pools; | ||
32 | u32 m_cas_pg_pool; | ||
33 | }; | ||
34 | |||
35 | static inline struct ceph_entity_addr * | ||
36 | ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) | ||
37 | { | ||
38 | if (w >= m->m_max_mds) | ||
39 | return NULL; | ||
40 | return &m->m_info[w].addr; | ||
41 | } | ||
42 | |||
43 | static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) | ||
44 | { | ||
45 | BUG_ON(w < 0); | ||
46 | if (w >= m->m_max_mds) | ||
47 | return CEPH_MDS_STATE_DNE; | ||
48 | return m->m_info[w].state; | ||
49 | } | ||
50 | |||
51 | static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) | ||
52 | { | ||
53 | if (w >= 0 && w < m->m_max_mds) | ||
54 | return m->m_info[w].laggy; | ||
55 | return false; | ||
56 | } | ||
57 | |||
58 | extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); | ||
59 | extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); | ||
60 | extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); | ||
61 | |||
62 | #endif | ||
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c deleted file mode 100644 index 2502d76fcec1..000000000000 --- a/fs/ceph/messenger.c +++ /dev/null | |||
@@ -1,2277 +0,0 @@ | |||
1 | #include "ceph_debug.h" | ||
2 | |||
3 | #include <linux/crc32c.h> | ||
4 | #include <linux/ctype.h> | ||
5 | #include <linux/highmem.h> | ||
6 | #include <linux/inet.h> | ||
7 | #include <linux/kthread.h> | ||
8 | #include <linux/net.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/socket.h> | ||
11 | #include <linux/string.h> | ||
12 | #include <net/tcp.h> | ||
13 | |||
14 | #include "super.h" | ||
15 | #include "messenger.h" | ||
16 | #include "decode.h" | ||
17 | #include "pagelist.h" | ||
18 | |||
19 | /* | ||
20 | * Ceph uses the messenger to exchange ceph_msg messages with other | ||
21 | * hosts in the system. The messenger provides ordered and reliable | ||
22 | * delivery. We tolerate TCP disconnects by reconnecting (with | ||
23 | * exponential backoff) in the case of a fault (disconnection, bad | ||
24 | * crc, protocol error). Acks allow sent messages to be discarded by | ||
25 | * the sender. | ||
26 | */ | ||
27 | |||
28 | /* static tag bytes (protocol control messages) */ | ||
29 | static char tag_msg = CEPH_MSGR_TAG_MSG; | ||
30 | static char tag_ack = CEPH_MSGR_TAG_ACK; | ||
31 | static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; | ||
32 | |||
33 | #ifdef CONFIG_LOCKDEP | ||
34 | static struct lock_class_key socket_class; | ||
35 | #endif | ||
36 | |||
37 | |||
38 | static void queue_con(struct ceph_connection *con); | ||
39 | static void con_work(struct work_struct *); | ||
40 | static void ceph_fault(struct ceph_connection *con); | ||
41 | |||
42 | /* | ||
43 | * nicely render a sockaddr as a string. | ||
44 | */ | ||
45 | #define MAX_ADDR_STR 20 | ||
46 | #define MAX_ADDR_STR_LEN 60 | ||
47 | static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; | ||
48 | static DEFINE_SPINLOCK(addr_str_lock); | ||
49 | static int last_addr_str; | ||
50 | |||
51 | const char *pr_addr(const struct sockaddr_storage *ss) | ||
52 | { | ||
53 | int i; | ||
54 | char *s; | ||
55 | struct sockaddr_in *in4 = (void *)ss; | ||
56 | struct sockaddr_in6 *in6 = (void *)ss; | ||
57 | |||
58 | spin_lock(&addr_str_lock); | ||
59 | i = last_addr_str++; | ||
60 | if (last_addr_str == MAX_ADDR_STR) | ||
61 | last_addr_str = 0; | ||
62 | spin_unlock(&addr_str_lock); | ||
63 | s = addr_str[i]; | ||
64 | |||
65 | switch (ss->ss_family) { | ||
66 | case AF_INET: | ||
67 | snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, | ||
68 | (unsigned int)ntohs(in4->sin_port)); | ||
69 | break; | ||
70 | |||
71 | case AF_INET6: | ||
72 | snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, | ||
73 | (unsigned int)ntohs(in6->sin6_port)); | ||
74 | break; | ||
75 | |||
76 | default: | ||
77 | sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); | ||
78 | } | ||
79 | |||
80 | return s; | ||
81 | } | ||
82 | |||
83 | static void encode_my_addr(struct ceph_messenger *msgr) | ||
84 | { | ||
85 | memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); | ||
86 | ceph_encode_addr(&msgr->my_enc_addr); | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * work queue for all reading and writing to/from the socket. | ||
91 | */ | ||
92 | struct workqueue_struct *ceph_msgr_wq; | ||
93 | |||
94 | int __init ceph_msgr_init(void) | ||
95 | { | ||
96 | ceph_msgr_wq = create_workqueue("ceph-msgr"); | ||
97 | if (IS_ERR(ceph_msgr_wq)) { | ||
98 | int ret = PTR_ERR(ceph_msgr_wq); | ||
99 | pr_err("msgr_init failed to create workqueue: %d\n", ret); | ||
100 | ceph_msgr_wq = NULL; | ||
101 | return ret; | ||
102 | } | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | void ceph_msgr_exit(void) | ||
107 | { | ||
108 | destroy_workqueue(ceph_msgr_wq); | ||
109 | } | ||
110 | |||
111 | void ceph_msgr_flush(void) | ||
112 | { | ||
113 | flush_workqueue(ceph_msgr_wq); | ||
114 | } | ||
115 | |||
116 | |||
117 | /* | ||
118 | * socket callback functions | ||
119 | */ | ||
120 | |||
121 | /* data available on socket, or listen socket received a connect */ | ||
122 | static void ceph_data_ready(struct sock *sk, int count_unused) | ||
123 | { | ||
124 | struct ceph_connection *con = | ||
125 | (struct ceph_connection *)sk->sk_user_data; | ||
126 | if (sk->sk_state != TCP_CLOSE_WAIT) { | ||
127 | dout("ceph_data_ready on %p state = %lu, queueing work\n", | ||
128 | con, con->state); | ||
129 | queue_con(con); | ||
130 | } | ||
131 | } | ||
132 | |||
133 | /* socket has buffer space for writing */ | ||
134 | static void ceph_write_space(struct sock *sk) | ||
135 | { | ||
136 | struct ceph_connection *con = | ||
137 | (struct ceph_connection *)sk->sk_user_data; | ||
138 | |||
139 | /* only queue to workqueue if there is data we want to write. */ | ||
140 | if (test_bit(WRITE_PENDING, &con->state)) { | ||
141 | dout("ceph_write_space %p queueing write work\n", con); | ||
142 | queue_con(con); | ||
143 | } else { | ||
144 | dout("ceph_write_space %p nothing to write\n", con); | ||
145 | } | ||
146 | |||
147 | /* since we have our own write_space, clear the SOCK_NOSPACE flag */ | ||
148 | clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | ||
149 | } | ||
150 | |||
151 | /* socket's state has changed */ | ||
152 | static void ceph_state_change(struct sock *sk) | ||
153 | { | ||
154 | struct ceph_connection *con = | ||
155 | (struct ceph_connection *)sk->sk_user_data; | ||
156 | |||
157 | dout("ceph_state_change %p state = %lu sk_state = %u\n", | ||
158 | con, con->state, sk->sk_state); | ||
159 | |||
160 | if (test_bit(CLOSED, &con->state)) | ||
161 | return; | ||
162 | |||
163 | switch (sk->sk_state) { | ||
164 | case TCP_CLOSE: | ||
165 | dout("ceph_state_change TCP_CLOSE\n"); | ||
166 | case TCP_CLOSE_WAIT: | ||
167 | dout("ceph_state_change TCP_CLOSE_WAIT\n"); | ||
168 | if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { | ||
169 | if (test_bit(CONNECTING, &con->state)) | ||
170 | con->error_msg = "connection failed"; | ||
171 | else | ||
172 | con->error_msg = "socket closed"; | ||
173 | queue_con(con); | ||
174 | } | ||
175 | break; | ||
176 | case TCP_ESTABLISHED: | ||
177 | dout("ceph_state_change TCP_ESTABLISHED\n"); | ||
178 | queue_con(con); | ||
179 | break; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * set up socket callbacks | ||
185 | */ | ||
186 | static void set_sock_callbacks(struct socket *sock, | ||
187 | struct ceph_connection *con) | ||
188 | { | ||
189 | struct sock *sk = sock->sk; | ||
190 | sk->sk_user_data = (void *)con; | ||
191 | sk->sk_data_ready = ceph_data_ready; | ||
192 | sk->sk_write_space = ceph_write_space; | ||
193 | sk->sk_state_change = ceph_state_change; | ||
194 | } | ||
195 | |||
196 | |||
197 | /* | ||
198 | * socket helpers | ||
199 | */ | ||
200 | |||
201 | /* | ||
202 | * initiate connection to a remote socket. | ||
203 | */ | ||
204 | static struct socket *ceph_tcp_connect(struct ceph_connection *con) | ||
205 | { | ||
206 | struct sockaddr_storage *paddr = &con->peer_addr.in_addr; | ||
207 | struct socket *sock; | ||
208 | int ret; | ||
209 | |||
210 | BUG_ON(con->sock); | ||
211 | ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, | ||
212 | IPPROTO_TCP, &sock); | ||
213 | if (ret) | ||
214 | return ERR_PTR(ret); | ||
215 | con->sock = sock; | ||
216 | sock->sk->sk_allocation = GFP_NOFS; | ||
217 | |||
218 | #ifdef CONFIG_LOCKDEP | ||
219 | lockdep_set_class(&sock->sk->sk_lock, &socket_class); | ||
220 | #endif | ||
221 | |||
222 | set_sock_callbacks(sock, con); | ||
223 | |||
224 | dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); | ||
225 | |||
226 | ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), | ||
227 | O_NONBLOCK); | ||
228 | if (ret == -EINPROGRESS) { | ||
229 | dout("connect %s EINPROGRESS sk_state = %u\n", | ||
230 | pr_addr(&con->peer_addr.in_addr), | ||
231 | sock->sk->sk_state); | ||
232 | ret = 0; | ||
233 | } | ||
234 | if (ret < 0) { | ||
235 | pr_err("connect %s error %d\n", | ||
236 | pr_addr(&con->peer_addr.in_addr), ret); | ||
237 | sock_release(sock); | ||
238 | con->sock = NULL; | ||
239 | con->error_msg = "connect error"; | ||
240 | } | ||
241 | |||
242 | if (ret < 0) | ||
243 | return ERR_PTR(ret); | ||
244 | return sock; | ||
245 | } | ||
246 | |||
247 | static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) | ||
248 | { | ||
249 | struct kvec iov = {buf, len}; | ||
250 | struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; | ||
251 | |||
252 | return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * write something. @more is true if caller will be sending more data | ||
257 | * shortly. | ||
258 | */ | ||
259 | static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, | ||
260 | size_t kvlen, size_t len, int more) | ||
261 | { | ||
262 | struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; | ||
263 | |||
264 | if (more) | ||
265 | msg.msg_flags |= MSG_MORE; | ||
266 | else | ||
267 | msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ | ||
268 | |||
269 | return kernel_sendmsg(sock, &msg, iov, kvlen, len); | ||
270 | } | ||
271 | |||
272 | |||
273 | /* | ||
274 | * Shutdown/close the socket for the given connection. | ||
275 | */ | ||
276 | static int con_close_socket(struct ceph_connection *con) | ||
277 | { | ||
278 | int rc; | ||
279 | |||
280 | dout("con_close_socket on %p sock %p\n", con, con->sock); | ||
281 | if (!con->sock) | ||
282 | return 0; | ||
283 | set_bit(SOCK_CLOSED, &con->state); | ||
284 | rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); | ||
285 | sock_release(con->sock); | ||
286 | con->sock = NULL; | ||
287 | clear_bit(SOCK_CLOSED, &con->state); | ||
288 | return rc; | ||
289 | } | ||
290 | |||
291 | /* | ||
292 | * Reset a connection. Discard all incoming and outgoing messages | ||
293 | * and clear *_seq state. | ||
294 | */ | ||
295 | static void ceph_msg_remove(struct ceph_msg *msg) | ||
296 | { | ||
297 | list_del_init(&msg->list_head); | ||
298 | ceph_msg_put(msg); | ||
299 | } | ||
300 | static void ceph_msg_remove_list(struct list_head *head) | ||
301 | { | ||
302 | while (!list_empty(head)) { | ||
303 | struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, | ||
304 | list_head); | ||
305 | ceph_msg_remove(msg); | ||
306 | } | ||
307 | } | ||
308 | |||
309 | static void reset_connection(struct ceph_connection *con) | ||
310 | { | ||
311 | /* reset connection, out_queue, msg_ and connect_seq */ | ||
312 | /* discard existing out_queue and msg_seq */ | ||
313 | ceph_msg_remove_list(&con->out_queue); | ||
314 | ceph_msg_remove_list(&con->out_sent); | ||
315 | |||
316 | if (con->in_msg) { | ||
317 | ceph_msg_put(con->in_msg); | ||
318 | con->in_msg = NULL; | ||
319 | } | ||
320 | |||
321 | con->connect_seq = 0; | ||
322 | con->out_seq = 0; | ||
323 | if (con->out_msg) { | ||
324 | ceph_msg_put(con->out_msg); | ||
325 | con->out_msg = NULL; | ||
326 | } | ||
327 | con->out_keepalive_pending = false; | ||
328 | con->in_seq = 0; | ||
329 | con->in_seq_acked = 0; | ||
330 | } | ||
331 | |||
332 | /* | ||
333 | * mark a peer down. drop any open connections. | ||
334 | */ | ||
335 | void ceph_con_close(struct ceph_connection *con) | ||
336 | { | ||
337 | dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); | ||
338 | set_bit(CLOSED, &con->state); /* in case there's queued work */ | ||
339 | clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ | ||
340 | clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ | ||
341 | clear_bit(KEEPALIVE_PENDING, &con->state); | ||
342 | clear_bit(WRITE_PENDING, &con->state); | ||
343 | mutex_lock(&con->mutex); | ||
344 | reset_connection(con); | ||
345 | con->peer_global_seq = 0; | ||
346 | cancel_delayed_work(&con->work); | ||
347 | mutex_unlock(&con->mutex); | ||
348 | queue_con(con); | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Reopen a closed connection, with a new peer address. | ||
353 | */ | ||
354 | void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) | ||
355 | { | ||
356 | dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); | ||
357 | set_bit(OPENING, &con->state); | ||
358 | clear_bit(CLOSED, &con->state); | ||
359 | memcpy(&con->peer_addr, addr, sizeof(*addr)); | ||
360 | con->delay = 0; /* reset backoff memory */ | ||
361 | queue_con(con); | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * return true if this connection ever successfully opened | ||
366 | */ | ||
367 | bool ceph_con_opened(struct ceph_connection *con) | ||
368 | { | ||
369 | return con->connect_seq > 0; | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * generic get/put | ||
374 | */ | ||
375 | struct ceph_connection *ceph_con_get(struct ceph_connection *con) | ||
376 | { | ||
377 | dout("con_get %p nref = %d -> %d\n", con, | ||
378 | atomic_read(&con->nref), atomic_read(&con->nref) + 1); | ||
379 | if (atomic_inc_not_zero(&con->nref)) | ||
380 | return con; | ||
381 | return NULL; | ||
382 | } | ||
383 | |||
384 | void ceph_con_put(struct ceph_connection *con) | ||
385 | { | ||
386 | dout("con_put %p nref = %d -> %d\n", con, | ||
387 | atomic_read(&con->nref), atomic_read(&con->nref) - 1); | ||
388 | BUG_ON(atomic_read(&con->nref) == 0); | ||
389 | if (atomic_dec_and_test(&con->nref)) { | ||
390 | BUG_ON(con->sock); | ||
391 | kfree(con); | ||
392 | } | ||
393 | } | ||
394 | |||
395 | /* | ||
396 | * initialize a new connection. | ||
397 | */ | ||
398 | void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) | ||
399 | { | ||
400 | dout("con_init %p\n", con); | ||
401 | memset(con, 0, sizeof(*con)); | ||
402 | atomic_set(&con->nref, 1); | ||
403 | con->msgr = msgr; | ||
404 | mutex_init(&con->mutex); | ||
405 | INIT_LIST_HEAD(&con->out_queue); | ||
406 | INIT_LIST_HEAD(&con->out_sent); | ||
407 | INIT_DELAYED_WORK(&con->work, con_work); | ||
408 | } | ||
409 | |||
410 | |||
411 | /* | ||
412 | * We maintain a global counter to order connection attempts. Get | ||
413 | * a unique seq greater than @gt. | ||
414 | */ | ||
415 | static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) | ||
416 | { | ||
417 | u32 ret; | ||
418 | |||
419 | spin_lock(&msgr->global_seq_lock); | ||
420 | if (msgr->global_seq < gt) | ||
421 | msgr->global_seq = gt; | ||
422 | ret = ++msgr->global_seq; | ||
423 | spin_unlock(&msgr->global_seq_lock); | ||
424 | return ret; | ||
425 | } | ||
426 | |||
427 | |||
428 | /* | ||
429 | * Prepare footer for currently outgoing message, and finish things | ||
430 | * off. Assumes out_kvec* are already valid.. we just add on to the end. | ||
431 | */ | ||
432 | static void prepare_write_message_footer(struct ceph_connection *con, int v) | ||
433 | { | ||
434 | struct ceph_msg *m = con->out_msg; | ||
435 | |||
436 | dout("prepare_write_message_footer %p\n", con); | ||
437 | con->out_kvec_is_msg = true; | ||
438 | con->out_kvec[v].iov_base = &m->footer; | ||
439 | con->out_kvec[v].iov_len = sizeof(m->footer); | ||
440 | con->out_kvec_bytes += sizeof(m->footer); | ||
441 | con->out_kvec_left++; | ||
442 | con->out_more = m->more_to_follow; | ||
443 | con->out_msg_done = true; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Prepare headers for the next outgoing message. | ||
448 | */ | ||
449 | static void prepare_write_message(struct ceph_connection *con) | ||
450 | { | ||
451 | struct ceph_msg *m; | ||
452 | int v = 0; | ||
453 | |||
454 | con->out_kvec_bytes = 0; | ||
455 | con->out_kvec_is_msg = true; | ||
456 | con->out_msg_done = false; | ||
457 | |||
458 | /* Sneak an ack in there first? If we can get it into the same | ||
459 | * TCP packet that's a good thing. */ | ||
460 | if (con->in_seq > con->in_seq_acked) { | ||
461 | con->in_seq_acked = con->in_seq; | ||
462 | con->out_kvec[v].iov_base = &tag_ack; | ||
463 | con->out_kvec[v++].iov_len = 1; | ||
464 | con->out_temp_ack = cpu_to_le64(con->in_seq_acked); | ||
465 | con->out_kvec[v].iov_base = &con->out_temp_ack; | ||
466 | con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); | ||
467 | con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); | ||
468 | } | ||
469 | |||
470 | m = list_first_entry(&con->out_queue, | ||
471 | struct ceph_msg, list_head); | ||
472 | con->out_msg = m; | ||
473 | if (test_bit(LOSSYTX, &con->state)) { | ||
474 | list_del_init(&m->list_head); | ||
475 | } else { | ||
476 | /* put message on sent list */ | ||
477 | ceph_msg_get(m); | ||
478 | list_move_tail(&m->list_head, &con->out_sent); | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * only assign outgoing seq # if we haven't sent this message | ||
483 | * yet. if it is requeued, resend with it's original seq. | ||
484 | */ | ||
485 | if (m->needs_out_seq) { | ||
486 | m->hdr.seq = cpu_to_le64(++con->out_seq); | ||
487 | m->needs_out_seq = false; | ||
488 | } | ||
489 | |||
490 | dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", | ||
491 | m, con->out_seq, le16_to_cpu(m->hdr.type), | ||
492 | le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), | ||
493 | le32_to_cpu(m->hdr.data_len), | ||
494 | m->nr_pages); | ||
495 | BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); | ||
496 | |||
497 | /* tag + hdr + front + middle */ | ||
498 | con->out_kvec[v].iov_base = &tag_msg; | ||
499 | con->out_kvec[v++].iov_len = 1; | ||
500 | con->out_kvec[v].iov_base = &m->hdr; | ||
501 | con->out_kvec[v++].iov_len = sizeof(m->hdr); | ||
502 | con->out_kvec[v++] = m->front; | ||
503 | if (m->middle) | ||
504 | con->out_kvec[v++] = m->middle->vec; | ||
505 | con->out_kvec_left = v; | ||
506 | con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + | ||
507 | (m->middle ? m->middle->vec.iov_len : 0); | ||
508 | con->out_kvec_cur = con->out_kvec; | ||
509 | |||
510 | /* fill in crc (except data pages), footer */ | ||
511 | con->out_msg->hdr.crc = | ||
512 | cpu_to_le32(crc32c(0, (void *)&m->hdr, | ||
513 | sizeof(m->hdr) - sizeof(m->hdr.crc))); | ||
514 | con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; | ||
515 | con->out_msg->footer.front_crc = | ||
516 | cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); | ||
517 | if (m->middle) | ||
518 | con->out_msg->footer.middle_crc = | ||
519 | cpu_to_le32(crc32c(0, m->middle->vec.iov_base, | ||
520 | m->middle->vec.iov_len)); | ||
521 | else | ||
522 | con->out_msg->footer.middle_crc = 0; | ||
523 | con->out_msg->footer.data_crc = 0; | ||
524 | dout("prepare_write_message front_crc %u data_crc %u\n", | ||
525 | le32_to_cpu(con->out_msg->footer.front_crc), | ||
526 | le32_to_cpu(con->out_msg->footer.middle_crc)); | ||
527 | |||
528 | /* is there a data payload? */ | ||
529 | if (le32_to_cpu(m->hdr.data_len) > 0) { | ||
530 | /* initialize page iterator */ | ||
531 | con->out_msg_pos.page = 0; | ||
532 | con->out_msg_pos.page_pos = | ||
533 | le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; | ||
534 | con->out_msg_pos.data_pos = 0; | ||
535 | con->out_msg_pos.did_page_crc = 0; | ||
536 | con->out_more = 1; /* data + footer will follow */ | ||
537 | } else { | ||
538 | /* no, queue up footer too and be done */ | ||
539 | prepare_write_message_footer(con, v); | ||
540 | } | ||
541 | |||
542 | set_bit(WRITE_PENDING, &con->state); | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * Prepare an ack. | ||
547 | */ | ||
548 | static void prepare_write_ack(struct ceph_connection *con) | ||
549 | { | ||
550 | dout("prepare_write_ack %p %llu -> %llu\n", con, | ||
551 | con->in_seq_acked, con->in_seq); | ||
552 | con->in_seq_acked = con->in_seq; | ||
553 | |||
554 | con->out_kvec[0].iov_base = &tag_ack; | ||
555 | con->out_kvec[0].iov_len = 1; | ||
556 | con->out_temp_ack = cpu_to_le64(con->in_seq_acked); | ||
557 | con->out_kvec[1].iov_base = &con->out_temp_ack; | ||
558 | con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); | ||
559 | con->out_kvec_left = 2; | ||
560 | con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); | ||
561 | con->out_kvec_cur = con->out_kvec; | ||
562 | con->out_more = 1; /* more will follow.. eventually.. */ | ||
563 | set_bit(WRITE_PENDING, &con->state); | ||
564 | } | ||
565 | |||
566 | /* | ||
567 | * Prepare to write keepalive byte. | ||
568 | */ | ||
569 | static void prepare_write_keepalive(struct ceph_connection *con) | ||
570 | { | ||
571 | dout("prepare_write_keepalive %p\n", con); | ||
572 | con->out_kvec[0].iov_base = &tag_keepalive; | ||
573 | con->out_kvec[0].iov_len = 1; | ||
574 | con->out_kvec_left = 1; | ||
575 | con->out_kvec_bytes = 1; | ||
576 | con->out_kvec_cur = con->out_kvec; | ||
577 | set_bit(WRITE_PENDING, &con->state); | ||
578 | } | ||
579 | |||
580 | /* | ||
581 | * Connection negotiation. | ||
582 | */ | ||
583 | |||
584 | static void prepare_connect_authorizer(struct ceph_connection *con) | ||
585 | { | ||
586 | void *auth_buf; | ||
587 | int auth_len = 0; | ||
588 | int auth_protocol = 0; | ||
589 | |||
590 | mutex_unlock(&con->mutex); | ||
591 | if (con->ops->get_authorizer) | ||
592 | con->ops->get_authorizer(con, &auth_buf, &auth_len, | ||
593 | &auth_protocol, &con->auth_reply_buf, | ||
594 | &con->auth_reply_buf_len, | ||
595 | con->auth_retry); | ||
596 | mutex_lock(&con->mutex); | ||
597 | |||
598 | con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); | ||
599 | con->out_connect.authorizer_len = cpu_to_le32(auth_len); | ||
600 | |||
601 | con->out_kvec[con->out_kvec_left].iov_base = auth_buf; | ||
602 | con->out_kvec[con->out_kvec_left].iov_len = auth_len; | ||
603 | con->out_kvec_left++; | ||
604 | con->out_kvec_bytes += auth_len; | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * We connected to a peer and are saying hello. | ||
609 | */ | ||
610 | static void prepare_write_banner(struct ceph_messenger *msgr, | ||
611 | struct ceph_connection *con) | ||
612 | { | ||
613 | int len = strlen(CEPH_BANNER); | ||
614 | |||
615 | con->out_kvec[0].iov_base = CEPH_BANNER; | ||
616 | con->out_kvec[0].iov_len = len; | ||
617 | con->out_kvec[1].iov_base = &msgr->my_enc_addr; | ||
618 | con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); | ||
619 | con->out_kvec_left = 2; | ||
620 | con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); | ||
621 | con->out_kvec_cur = con->out_kvec; | ||
622 | con->out_more = 0; | ||
623 | set_bit(WRITE_PENDING, &con->state); | ||
624 | } | ||
625 | |||
626 | static void prepare_write_connect(struct ceph_messenger *msgr, | ||
627 | struct ceph_connection *con, | ||
628 | int after_banner) | ||
629 | { | ||
630 | unsigned global_seq = get_global_seq(con->msgr, 0); | ||
631 | int proto; | ||
632 | |||
633 | switch (con->peer_name.type) { | ||
634 | case CEPH_ENTITY_TYPE_MON: | ||
635 | proto = CEPH_MONC_PROTOCOL; | ||
636 | break; | ||
637 | case CEPH_ENTITY_TYPE_OSD: | ||
638 | proto = CEPH_OSDC_PROTOCOL; | ||
639 | break; | ||
640 | case CEPH_ENTITY_TYPE_MDS: | ||
641 | proto = CEPH_MDSC_PROTOCOL; | ||
642 | break; | ||
643 | default: | ||
644 | BUG(); | ||
645 | } | ||
646 | |||
647 | dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, | ||
648 | con->connect_seq, global_seq, proto); | ||
649 | |||
650 | con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); | ||
651 | con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); | ||
652 | con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); | ||
653 | con->out_connect.global_seq = cpu_to_le32(global_seq); | ||
654 | con->out_connect.protocol_version = cpu_to_le32(proto); | ||
655 | con->out_connect.flags = 0; | ||
656 | |||
657 | if (!after_banner) { | ||
658 | con->out_kvec_left = 0; | ||
659 | con->out_kvec_bytes = 0; | ||
660 | } | ||
661 | con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; | ||
662 | con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); | ||
663 | con->out_kvec_left++; | ||
664 | con->out_kvec_bytes += sizeof(con->out_connect); | ||
665 | con->out_kvec_cur = con->out_kvec; | ||
666 | con->out_more = 0; | ||
667 | set_bit(WRITE_PENDING, &con->state); | ||
668 | |||
669 | prepare_connect_authorizer(con); | ||
670 | } | ||
671 | |||
672 | |||
673 | /* | ||
674 | * write as much of pending kvecs to the socket as we can. | ||
675 | * 1 -> done | ||
676 | * 0 -> socket full, but more to do | ||
677 | * <0 -> error | ||
678 | */ | ||
679 | static int write_partial_kvec(struct ceph_connection *con) | ||
680 | { | ||
681 | int ret; | ||
682 | |||
683 | dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); | ||
684 | while (con->out_kvec_bytes > 0) { | ||
685 | ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, | ||
686 | con->out_kvec_left, con->out_kvec_bytes, | ||
687 | con->out_more); | ||
688 | if (ret <= 0) | ||
689 | goto out; | ||
690 | con->out_kvec_bytes -= ret; | ||
691 | if (con->out_kvec_bytes == 0) | ||
692 | break; /* done */ | ||
693 | while (ret > 0) { | ||
694 | if (ret >= con->out_kvec_cur->iov_len) { | ||
695 | ret -= con->out_kvec_cur->iov_len; | ||
696 | con->out_kvec_cur++; | ||
697 | con->out_kvec_left--; | ||
698 | } else { | ||
699 | con->out_kvec_cur->iov_len -= ret; | ||
700 | con->out_kvec_cur->iov_base += ret; | ||
701 | ret = 0; | ||
702 | break; | ||
703 | } | ||
704 | } | ||
705 | } | ||
706 | con->out_kvec_left = 0; | ||
707 | con->out_kvec_is_msg = false; | ||
708 | ret = 1; | ||
709 | out: | ||
710 | dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, | ||
711 | con->out_kvec_bytes, con->out_kvec_left, ret); | ||
712 | return ret; /* done! */ | ||
713 | } | ||
714 | |||
715 | /* | ||
716 | * Write as much message data payload as we can. If we finish, queue | ||
717 | * up the footer. | ||
718 | * 1 -> done, footer is now queued in out_kvec[]. | ||
719 | * 0 -> socket full, but more to do | ||
720 | * <0 -> error | ||
721 | */ | ||
722 | static int write_partial_msg_pages(struct ceph_connection *con) | ||
723 | { | ||
724 | struct ceph_msg *msg = con->out_msg; | ||
725 | unsigned data_len = le32_to_cpu(msg->hdr.data_len); | ||
726 | size_t len; | ||
727 | int crc = con->msgr->nocrc; | ||
728 | int ret; | ||
729 | |||
730 | dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", | ||
731 | con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, | ||
732 | con->out_msg_pos.page_pos); | ||
733 | |||
734 | while (con->out_msg_pos.page < con->out_msg->nr_pages) { | ||
735 | struct page *page = NULL; | ||
736 | void *kaddr = NULL; | ||
737 | |||
738 | /* | ||
739 | * if we are calculating the data crc (the default), we need | ||
740 | * to map the page. if our pages[] has been revoked, use the | ||
741 | * zero page. | ||
742 | */ | ||
743 | if (msg->pages) { | ||
744 | page = msg->pages[con->out_msg_pos.page]; | ||
745 | if (crc) | ||
746 | kaddr = kmap(page); | ||
747 | } else if (msg->pagelist) { | ||
748 | page = list_first_entry(&msg->pagelist->head, | ||
749 | struct page, lru); | ||
750 | if (crc) | ||
751 | kaddr = kmap(page); | ||
752 | } else { | ||
753 | page = con->msgr->zero_page; | ||
754 | if (crc) | ||
755 | kaddr = page_address(con->msgr->zero_page); | ||
756 | } | ||
757 | len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), | ||
758 | (int)(data_len - con->out_msg_pos.data_pos)); | ||
759 | if (crc && !con->out_msg_pos.did_page_crc) { | ||
760 | void *base = kaddr + con->out_msg_pos.page_pos; | ||
761 | u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); | ||
762 | |||
763 | BUG_ON(kaddr == NULL); | ||
764 | con->out_msg->footer.data_crc = | ||
765 | cpu_to_le32(crc32c(tmpcrc, base, len)); | ||
766 | con->out_msg_pos.did_page_crc = 1; | ||
767 | } | ||
768 | |||
769 | ret = kernel_sendpage(con->sock, page, | ||
770 | con->out_msg_pos.page_pos, len, | ||
771 | MSG_DONTWAIT | MSG_NOSIGNAL | | ||
772 | MSG_MORE); | ||
773 | |||
774 | if (crc && (msg->pages || msg->pagelist)) | ||
775 | kunmap(page); | ||
776 | |||
777 | if (ret <= 0) | ||
778 | goto out; | ||
779 | |||
780 | con->out_msg_pos.data_pos += ret; | ||
781 | con->out_msg_pos.page_pos += ret; | ||
782 | if (ret == len) { | ||
783 | con->out_msg_pos.page_pos = 0; | ||
784 | con->out_msg_pos.page++; | ||
785 | con->out_msg_pos.did_page_crc = 0; | ||
786 | if (msg->pagelist) | ||
787 | list_move_tail(&page->lru, | ||
788 | &msg->pagelist->head); | ||
789 | } | ||
790 | } | ||
791 | |||
792 | dout("write_partial_msg_pages %p msg %p done\n", con, msg); | ||
793 | |||
794 | /* prepare and queue up footer, too */ | ||
795 | if (!crc) | ||
796 | con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; | ||
797 | con->out_kvec_bytes = 0; | ||
798 | con->out_kvec_left = 0; | ||
799 | con->out_kvec_cur = con->out_kvec; | ||
800 | prepare_write_message_footer(con, 0); | ||
801 | ret = 1; | ||
802 | out: | ||
803 | return ret; | ||
804 | } | ||
805 | |||
806 | /* | ||
807 | * write some zeros | ||
808 | */ | ||
809 | static int write_partial_skip(struct ceph_connection *con) | ||
810 | { | ||
811 | int ret; | ||
812 | |||
813 | while (con->out_skip > 0) { | ||
814 | struct kvec iov = { | ||
815 | .iov_base = page_address(con->msgr->zero_page), | ||
816 | .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) | ||
817 | }; | ||
818 | |||
819 | ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); | ||
820 | if (ret <= 0) | ||
821 | goto out; | ||
822 | con->out_skip -= ret; | ||
823 | } | ||
824 | ret = 1; | ||
825 | out: | ||
826 | return ret; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * Prepare to read connection handshake, or an ack. | ||
831 | */ | ||
832 | static void prepare_read_banner(struct ceph_connection *con) | ||
833 | { | ||
834 | dout("prepare_read_banner %p\n", con); | ||
835 | con->in_base_pos = 0; | ||
836 | } | ||
837 | |||
838 | static void prepare_read_connect(struct ceph_connection *con) | ||
839 | { | ||
840 | dout("prepare_read_connect %p\n", con); | ||
841 | con->in_base_pos = 0; | ||
842 | } | ||
843 | |||
844 | static void prepare_read_ack(struct ceph_connection *con) | ||
845 | { | ||
846 | dout("prepare_read_ack %p\n", con); | ||
847 | con->in_base_pos = 0; | ||
848 | } | ||
849 | |||
850 | static void prepare_read_tag(struct ceph_connection *con) | ||
851 | { | ||
852 | dout("prepare_read_tag %p\n", con); | ||
853 | con->in_base_pos = 0; | ||
854 | con->in_tag = CEPH_MSGR_TAG_READY; | ||
855 | } | ||
856 | |||
857 | /* | ||
858 | * Prepare to read a message. | ||
859 | */ | ||
860 | static int prepare_read_message(struct ceph_connection *con) | ||
861 | { | ||
862 | dout("prepare_read_message %p\n", con); | ||
863 | BUG_ON(con->in_msg != NULL); | ||
864 | con->in_base_pos = 0; | ||
865 | con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; | ||
866 | return 0; | ||
867 | } | ||
868 | |||
869 | |||
870 | static int read_partial(struct ceph_connection *con, | ||
871 | int *to, int size, void *object) | ||
872 | { | ||
873 | *to += size; | ||
874 | while (con->in_base_pos < *to) { | ||
875 | int left = *to - con->in_base_pos; | ||
876 | int have = size - left; | ||
877 | int ret = ceph_tcp_recvmsg(con->sock, object + have, left); | ||
878 | if (ret <= 0) | ||
879 | return ret; | ||
880 | con->in_base_pos += ret; | ||
881 | } | ||
882 | return 1; | ||
883 | } | ||
884 | |||
885 | |||
886 | /* | ||
887 | * Read all or part of the connect-side handshake on a new connection | ||
888 | */ | ||
889 | static int read_partial_banner(struct ceph_connection *con) | ||
890 | { | ||
891 | int ret, to = 0; | ||
892 | |||
893 | dout("read_partial_banner %p at %d\n", con, con->in_base_pos); | ||
894 | |||
895 | /* peer's banner */ | ||
896 | ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); | ||
897 | if (ret <= 0) | ||
898 | goto out; | ||
899 | ret = read_partial(con, &to, sizeof(con->actual_peer_addr), | ||
900 | &con->actual_peer_addr); | ||
901 | if (ret <= 0) | ||
902 | goto out; | ||
903 | ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), | ||
904 | &con->peer_addr_for_me); | ||
905 | if (ret <= 0) | ||
906 | goto out; | ||
907 | out: | ||
908 | return ret; | ||
909 | } | ||
910 | |||
911 | static int read_partial_connect(struct ceph_connection *con) | ||
912 | { | ||
913 | int ret, to = 0; | ||
914 | |||
915 | dout("read_partial_connect %p at %d\n", con, con->in_base_pos); | ||
916 | |||
917 | ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); | ||
918 | if (ret <= 0) | ||
919 | goto out; | ||
920 | ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), | ||
921 | con->auth_reply_buf); | ||
922 | if (ret <= 0) | ||
923 | goto out; | ||
924 | |||
925 | dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", | ||
926 | con, (int)con->in_reply.tag, | ||
927 | le32_to_cpu(con->in_reply.connect_seq), | ||
928 | le32_to_cpu(con->in_reply.global_seq)); | ||
929 | out: | ||
930 | return ret; | ||
931 | |||
932 | } | ||
933 | |||
934 | /* | ||
935 | * Verify the hello banner looks okay. | ||
936 | */ | ||
937 | static int verify_hello(struct ceph_connection *con) | ||
938 | { | ||
939 | if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { | ||
940 | pr_err("connect to %s got bad banner\n", | ||
941 | pr_addr(&con->peer_addr.in_addr)); | ||
942 | con->error_msg = "protocol error, bad banner"; | ||
943 | return -1; | ||
944 | } | ||
945 | return 0; | ||
946 | } | ||
947 | |||
/*
 * Return true if @ss holds an all-zero IPv4 or IPv6 address.  Any
 * other address family is reported as not blank.
 */
static bool addr_is_blank(struct sockaddr_storage *ss)
{
	static const struct in6_addr zero6;	/* all-zero IPv6 address */

	if (ss->ss_family == AF_INET) {
		struct sockaddr_in *in4 = (struct sockaddr_in *)ss;

		return in4->sin_addr.s_addr == 0;
	}
	if (ss->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)ss;

		return memcmp(&in6->sin6_addr, &zero6, sizeof(zero6)) == 0;
	}
	return false;
}
962 | |||
/*
 * Return the port of @ss in host byte order, or 0 if the address
 * family is neither AF_INET nor AF_INET6.
 */
static int addr_port(struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET)
		return ntohs(((struct sockaddr_in *)ss)->sin_port);
	if (ss->ss_family == AF_INET6)
		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
	return 0;
}
973 | |||
/*
 * Store port @p (host byte order) into @ss in network byte order.
 * Address families other than AF_INET and AF_INET6 are left untouched.
 */
static void addr_set_port(struct sockaddr_storage *ss, int p)
{
	switch (ss->ss_family) {
	case AF_INET:
		((struct sockaddr_in *)ss)->sin_port = htons(p);
		break;	/* don't fall through into the IPv6 store */
	case AF_INET6:
		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
		break;
	}
}
983 | |||
984 | /* | ||
985 | * Parse an ip[:port] list into an addr array. Use the default | ||
986 | * monitor port if a port isn't specified. | ||
987 | */ | ||
988 | int ceph_parse_ips(const char *c, const char *end, | ||
989 | struct ceph_entity_addr *addr, | ||
990 | int max_count, int *count) | ||
991 | { | ||
992 | int i; | ||
993 | const char *p = c; | ||
994 | |||
995 | dout("parse_ips on '%.*s'\n", (int)(end-c), c); | ||
996 | for (i = 0; i < max_count; i++) { | ||
997 | const char *ipend; | ||
998 | struct sockaddr_storage *ss = &addr[i].in_addr; | ||
999 | struct sockaddr_in *in4 = (void *)ss; | ||
1000 | struct sockaddr_in6 *in6 = (void *)ss; | ||
1001 | int port; | ||
1002 | char delim = ','; | ||
1003 | |||
1004 | if (*p == '[') { | ||
1005 | delim = ']'; | ||
1006 | p++; | ||
1007 | } | ||
1008 | |||
1009 | memset(ss, 0, sizeof(*ss)); | ||
1010 | if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, | ||
1011 | delim, &ipend)) | ||
1012 | ss->ss_family = AF_INET; | ||
1013 | else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, | ||
1014 | delim, &ipend)) | ||
1015 | ss->ss_family = AF_INET6; | ||
1016 | else | ||
1017 | goto bad; | ||
1018 | p = ipend; | ||
1019 | |||
1020 | if (delim == ']') { | ||
1021 | if (*p != ']') { | ||
1022 | dout("missing matching ']'\n"); | ||
1023 | goto bad; | ||
1024 | } | ||
1025 | p++; | ||
1026 | } | ||
1027 | |||
1028 | /* port? */ | ||
1029 | if (p < end && *p == ':') { | ||
1030 | port = 0; | ||
1031 | p++; | ||
1032 | while (p < end && *p >= '0' && *p <= '9') { | ||
1033 | port = (port * 10) + (*p - '0'); | ||
1034 | p++; | ||
1035 | } | ||
1036 | if (port > 65535 || port == 0) | ||
1037 | goto bad; | ||
1038 | } else { | ||
1039 | port = CEPH_MON_PORT; | ||
1040 | } | ||
1041 | |||
1042 | addr_set_port(ss, port); | ||
1043 | |||
1044 | dout("parse_ips got %s\n", pr_addr(ss)); | ||
1045 | |||
1046 | if (p == end) | ||
1047 | break; | ||
1048 | if (*p != ',') | ||
1049 | goto bad; | ||
1050 | p++; | ||
1051 | } | ||
1052 | |||
1053 | if (p != end) | ||
1054 | goto bad; | ||
1055 | |||
1056 | if (count) | ||
1057 | *count = i + 1; | ||
1058 | return 0; | ||
1059 | |||
1060 | bad: | ||
1061 | pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); | ||
1062 | return -EINVAL; | ||
1063 | } | ||
1064 | |||
1065 | static int process_banner(struct ceph_connection *con) | ||
1066 | { | ||
1067 | dout("process_banner on %p\n", con); | ||
1068 | |||
1069 | if (verify_hello(con) < 0) | ||
1070 | return -1; | ||
1071 | |||
1072 | ceph_decode_addr(&con->actual_peer_addr); | ||
1073 | ceph_decode_addr(&con->peer_addr_for_me); | ||
1074 | |||
1075 | /* | ||
1076 | * Make sure the other end is who we wanted. note that the other | ||
1077 | * end may not yet know their ip address, so if it's 0.0.0.0, give | ||
1078 | * them the benefit of the doubt. | ||
1079 | */ | ||
1080 | if (memcmp(&con->peer_addr, &con->actual_peer_addr, | ||
1081 | sizeof(con->peer_addr)) != 0 && | ||
1082 | !(addr_is_blank(&con->actual_peer_addr.in_addr) && | ||
1083 | con->actual_peer_addr.nonce == con->peer_addr.nonce)) { | ||
1084 | pr_warning("wrong peer, want %s/%d, got %s/%d\n", | ||
1085 | pr_addr(&con->peer_addr.in_addr), | ||
1086 | (int)le32_to_cpu(con->peer_addr.nonce), | ||
1087 | pr_addr(&con->actual_peer_addr.in_addr), | ||
1088 | (int)le32_to_cpu(con->actual_peer_addr.nonce)); | ||
1089 | con->error_msg = "wrong peer at address"; | ||
1090 | return -1; | ||
1091 | } | ||
1092 | |||
1093 | /* | ||
1094 | * did we learn our address? | ||
1095 | */ | ||
1096 | if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { | ||
1097 | int port = addr_port(&con->msgr->inst.addr.in_addr); | ||
1098 | |||
1099 | memcpy(&con->msgr->inst.addr.in_addr, | ||
1100 | &con->peer_addr_for_me.in_addr, | ||
1101 | sizeof(con->peer_addr_for_me.in_addr)); | ||
1102 | addr_set_port(&con->msgr->inst.addr.in_addr, port); | ||
1103 | encode_my_addr(con->msgr); | ||
1104 | dout("process_banner learned my addr is %s\n", | ||
1105 | pr_addr(&con->msgr->inst.addr.in_addr)); | ||
1106 | } | ||
1107 | |||
1108 | set_bit(NEGOTIATING, &con->state); | ||
1109 | prepare_read_connect(con); | ||
1110 | return 0; | ||
1111 | } | ||
1112 | |||
1113 | static void fail_protocol(struct ceph_connection *con) | ||
1114 | { | ||
1115 | reset_connection(con); | ||
1116 | set_bit(CLOSED, &con->state); /* in case there's queued work */ | ||
1117 | |||
1118 | mutex_unlock(&con->mutex); | ||
1119 | if (con->ops->bad_proto) | ||
1120 | con->ops->bad_proto(con); | ||
1121 | mutex_lock(&con->mutex); | ||
1122 | } | ||
1123 | |||
1124 | static int process_connect(struct ceph_connection *con) | ||
1125 | { | ||
1126 | u64 sup_feat = CEPH_FEATURE_SUPPORTED; | ||
1127 | u64 req_feat = CEPH_FEATURE_REQUIRED; | ||
1128 | u64 server_feat = le64_to_cpu(con->in_reply.features); | ||
1129 | |||
1130 | dout("process_connect on %p tag %d\n", con, (int)con->in_tag); | ||
1131 | |||
1132 | switch (con->in_reply.tag) { | ||
1133 | case CEPH_MSGR_TAG_FEATURES: | ||
1134 | pr_err("%s%lld %s feature set mismatch," | ||
1135 | " my %llx < server's %llx, missing %llx\n", | ||
1136 | ENTITY_NAME(con->peer_name), | ||
1137 | pr_addr(&con->peer_addr.in_addr), | ||
1138 | sup_feat, server_feat, server_feat & ~sup_feat); | ||
1139 | con->error_msg = "missing required protocol features"; | ||
1140 | fail_protocol(con); | ||
1141 | return -1; | ||
1142 | |||
1143 | case CEPH_MSGR_TAG_BADPROTOVER: | ||
1144 | pr_err("%s%lld %s protocol version mismatch," | ||
1145 | " my %d != server's %d\n", | ||
1146 | ENTITY_NAME(con->peer_name), | ||
1147 | pr_addr(&con->peer_addr.in_addr), | ||
1148 | le32_to_cpu(con->out_connect.protocol_version), | ||
1149 | le32_to_cpu(con->in_reply.protocol_version)); | ||
1150 | con->error_msg = "protocol version mismatch"; | ||
1151 | fail_protocol(con); | ||
1152 | return -1; | ||
1153 | |||
1154 | case CEPH_MSGR_TAG_BADAUTHORIZER: | ||
1155 | con->auth_retry++; | ||
1156 | dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, | ||
1157 | con->auth_retry); | ||
1158 | if (con->auth_retry == 2) { | ||
1159 | con->error_msg = "connect authorization failure"; | ||
1160 | reset_connection(con); | ||
1161 | set_bit(CLOSED, &con->state); | ||
1162 | return -1; | ||
1163 | } | ||
1164 | con->auth_retry = 1; | ||
1165 | prepare_write_connect(con->msgr, con, 0); | ||
1166 | prepare_read_connect(con); | ||
1167 | break; | ||
1168 | |||
1169 | case CEPH_MSGR_TAG_RESETSESSION: | ||
1170 | /* | ||
1171 | * If we connected with a large connect_seq but the peer | ||
1172 | * has no record of a session with us (no connection, or | ||
1173 | * connect_seq == 0), they will send RESETSESION to indicate | ||
1174 | * that they must have reset their session, and may have | ||
1175 | * dropped messages. | ||
1176 | */ | ||
1177 | dout("process_connect got RESET peer seq %u\n", | ||
1178 | le32_to_cpu(con->in_connect.connect_seq)); | ||
1179 | pr_err("%s%lld %s connection reset\n", | ||
1180 | ENTITY_NAME(con->peer_name), | ||
1181 | pr_addr(&con->peer_addr.in_addr)); | ||
1182 | reset_connection(con); | ||
1183 | prepare_write_connect(con->msgr, con, 0); | ||
1184 | prepare_read_connect(con); | ||
1185 | |||
1186 | /* Tell ceph about it. */ | ||
1187 | mutex_unlock(&con->mutex); | ||
1188 | pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); | ||
1189 | if (con->ops->peer_reset) | ||
1190 | con->ops->peer_reset(con); | ||
1191 | mutex_lock(&con->mutex); | ||
1192 | break; | ||
1193 | |||
1194 | case CEPH_MSGR_TAG_RETRY_SESSION: | ||
1195 | /* | ||
1196 | * If we sent a smaller connect_seq than the peer has, try | ||
1197 | * again with a larger value. | ||
1198 | */ | ||
1199 | dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", | ||
1200 | le32_to_cpu(con->out_connect.connect_seq), | ||
1201 | le32_to_cpu(con->in_connect.connect_seq)); | ||
1202 | con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); | ||
1203 | prepare_write_connect(con->msgr, con, 0); | ||
1204 | prepare_read_connect(con); | ||
1205 | break; | ||
1206 | |||
1207 | case CEPH_MSGR_TAG_RETRY_GLOBAL: | ||
1208 | /* | ||
1209 | * If we sent a smaller global_seq than the peer has, try | ||
1210 | * again with a larger value. | ||
1211 | */ | ||
1212 | dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", | ||
1213 | con->peer_global_seq, | ||
1214 | le32_to_cpu(con->in_connect.global_seq)); | ||
1215 | get_global_seq(con->msgr, | ||
1216 | le32_to_cpu(con->in_connect.global_seq)); | ||
1217 | prepare_write_connect(con->msgr, con, 0); | ||
1218 | prepare_read_connect(con); | ||
1219 | break; | ||
1220 | |||
1221 | case CEPH_MSGR_TAG_READY: | ||
1222 | if (req_feat & ~server_feat) { | ||
1223 | pr_err("%s%lld %s protocol feature mismatch," | ||
1224 | " my required %llx > server's %llx, need %llx\n", | ||
1225 | ENTITY_NAME(con->peer_name), | ||
1226 | pr_addr(&con->peer_addr.in_addr), | ||
1227 | req_feat, server_feat, req_feat & ~server_feat); | ||
1228 | con->error_msg = "missing required protocol features"; | ||
1229 | fail_protocol(con); | ||
1230 | return -1; | ||
1231 | } | ||
1232 | clear_bit(CONNECTING, &con->state); | ||
1233 | con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); | ||
1234 | con->connect_seq++; | ||
1235 | con->peer_features = server_feat; | ||
1236 | dout("process_connect got READY gseq %d cseq %d (%d)\n", | ||
1237 | con->peer_global_seq, | ||
1238 | le32_to_cpu(con->in_reply.connect_seq), | ||
1239 | con->connect_seq); | ||
1240 | WARN_ON(con->connect_seq != | ||
1241 | le32_to_cpu(con->in_reply.connect_seq)); | ||
1242 | |||
1243 | if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) | ||
1244 | set_bit(LOSSYTX, &con->state); | ||
1245 | |||
1246 | prepare_read_tag(con); | ||
1247 | break; | ||
1248 | |||
1249 | case CEPH_MSGR_TAG_WAIT: | ||
1250 | /* | ||
1251 | * If there is a connection race (we are opening | ||
1252 | * connections to each other), one of us may just have | ||
1253 | * to WAIT. This shouldn't happen if we are the | ||
1254 | * client. | ||
1255 | */ | ||
1256 | pr_err("process_connect peer connecting WAIT\n"); | ||
1257 | |||
1258 | default: | ||
1259 | pr_err("connect protocol error, will retry\n"); | ||
1260 | con->error_msg = "protocol error, garbage tag during connect"; | ||
1261 | return -1; | ||
1262 | } | ||
1263 | return 0; | ||
1264 | } | ||
1265 | |||
1266 | |||
1267 | /* | ||
1268 | * read (part of) an ack | ||
1269 | */ | ||
1270 | static int read_partial_ack(struct ceph_connection *con) | ||
1271 | { | ||
1272 | int to = 0; | ||
1273 | |||
1274 | return read_partial(con, &to, sizeof(con->in_temp_ack), | ||
1275 | &con->in_temp_ack); | ||
1276 | } | ||
1277 | |||
1278 | |||
1279 | /* | ||
1280 | * We can finally discard anything that's been acked. | ||
1281 | */ | ||
1282 | static void process_ack(struct ceph_connection *con) | ||
1283 | { | ||
1284 | struct ceph_msg *m; | ||
1285 | u64 ack = le64_to_cpu(con->in_temp_ack); | ||
1286 | u64 seq; | ||
1287 | |||
1288 | while (!list_empty(&con->out_sent)) { | ||
1289 | m = list_first_entry(&con->out_sent, struct ceph_msg, | ||
1290 | list_head); | ||
1291 | seq = le64_to_cpu(m->hdr.seq); | ||
1292 | if (seq > ack) | ||
1293 | break; | ||
1294 | dout("got ack for seq %llu type %d at %p\n", seq, | ||
1295 | le16_to_cpu(m->hdr.type), m); | ||
1296 | ceph_msg_remove(m); | ||
1297 | } | ||
1298 | prepare_read_tag(con); | ||
1299 | } | ||
1300 | |||
1301 | |||
1302 | |||
1303 | |||
1304 | static int read_partial_message_section(struct ceph_connection *con, | ||
1305 | struct kvec *section, | ||
1306 | unsigned int sec_len, u32 *crc) | ||
1307 | { | ||
1308 | int left; | ||
1309 | int ret; | ||
1310 | |||
1311 | BUG_ON(!section); | ||
1312 | |||
1313 | while (section->iov_len < sec_len) { | ||
1314 | BUG_ON(section->iov_base == NULL); | ||
1315 | left = sec_len - section->iov_len; | ||
1316 | ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + | ||
1317 | section->iov_len, left); | ||
1318 | if (ret <= 0) | ||
1319 | return ret; | ||
1320 | section->iov_len += ret; | ||
1321 | if (section->iov_len == sec_len) | ||
1322 | *crc = crc32c(0, section->iov_base, | ||
1323 | section->iov_len); | ||
1324 | } | ||
1325 | |||
1326 | return 1; | ||
1327 | } | ||
1328 | |||
1329 | static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, | ||
1330 | struct ceph_msg_header *hdr, | ||
1331 | int *skip); | ||
1332 | /* | ||
1333 | * read (part of) a message. | ||
1334 | */ | ||
1335 | static int read_partial_message(struct ceph_connection *con) | ||
1336 | { | ||
1337 | struct ceph_msg *m = con->in_msg; | ||
1338 | void *p; | ||
1339 | int ret; | ||
1340 | int to, left; | ||
1341 | unsigned front_len, middle_len, data_len, data_off; | ||
1342 | int datacrc = con->msgr->nocrc; | ||
1343 | int skip; | ||
1344 | u64 seq; | ||
1345 | |||
1346 | dout("read_partial_message con %p msg %p\n", con, m); | ||
1347 | |||
1348 | /* header */ | ||
1349 | while (con->in_base_pos < sizeof(con->in_hdr)) { | ||
1350 | left = sizeof(con->in_hdr) - con->in_base_pos; | ||
1351 | ret = ceph_tcp_recvmsg(con->sock, | ||
1352 | (char *)&con->in_hdr + con->in_base_pos, | ||
1353 | left); | ||
1354 | if (ret <= 0) | ||
1355 | return ret; | ||
1356 | con->in_base_pos += ret; | ||
1357 | if (con->in_base_pos == sizeof(con->in_hdr)) { | ||
1358 | u32 crc = crc32c(0, (void *)&con->in_hdr, | ||
1359 | sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); | ||
1360 | if (crc != le32_to_cpu(con->in_hdr.crc)) { | ||
1361 | pr_err("read_partial_message bad hdr " | ||
1362 | " crc %u != expected %u\n", | ||
1363 | crc, con->in_hdr.crc); | ||
1364 | return -EBADMSG; | ||
1365 | } | ||
1366 | } | ||
1367 | } | ||
1368 | front_len = le32_to_cpu(con->in_hdr.front_len); | ||
1369 | if (front_len > CEPH_MSG_MAX_FRONT_LEN) | ||
1370 | return -EIO; | ||
1371 | middle_len = le32_to_cpu(con->in_hdr.middle_len); | ||
1372 | if (middle_len > CEPH_MSG_MAX_DATA_LEN) | ||
1373 | return -EIO; | ||
1374 | data_len = le32_to_cpu(con->in_hdr.data_len); | ||
1375 | if (data_len > CEPH_MSG_MAX_DATA_LEN) | ||
1376 | return -EIO; | ||
1377 | data_off = le16_to_cpu(con->in_hdr.data_off); | ||
1378 | |||
1379 | /* verify seq# */ | ||
1380 | seq = le64_to_cpu(con->in_hdr.seq); | ||
1381 | if ((s64)seq - (s64)con->in_seq < 1) { | ||
1382 | pr_info("skipping %s%lld %s seq %lld, expected %lld\n", | ||
1383 | ENTITY_NAME(con->peer_name), | ||
1384 | pr_addr(&con->peer_addr.in_addr), | ||
1385 | seq, con->in_seq + 1); | ||
1386 | con->in_base_pos = -front_len - middle_len - data_len - | ||
1387 | sizeof(m->footer); | ||
1388 | con->in_tag = CEPH_MSGR_TAG_READY; | ||
1389 | con->in_seq++; | ||
1390 | return 0; | ||
1391 | } else if ((s64)seq - (s64)con->in_seq > 1) { | ||
1392 | pr_err("read_partial_message bad seq %lld expected %lld\n", | ||
1393 | seq, con->in_seq + 1); | ||
1394 | con->error_msg = "bad message sequence # for incoming message"; | ||
1395 | return -EBADMSG; | ||
1396 | } | ||
1397 | |||
1398 | /* allocate message? */ | ||
1399 | if (!con->in_msg) { | ||
1400 | dout("got hdr type %d front %d data %d\n", con->in_hdr.type, | ||
1401 | con->in_hdr.front_len, con->in_hdr.data_len); | ||
1402 | skip = 0; | ||
1403 | con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); | ||
1404 | if (skip) { | ||
1405 | /* skip this message */ | ||
1406 | dout("alloc_msg said skip message\n"); | ||
1407 | BUG_ON(con->in_msg); | ||
1408 | con->in_base_pos = -front_len - middle_len - data_len - | ||
1409 | sizeof(m->footer); | ||
1410 | con->in_tag = CEPH_MSGR_TAG_READY; | ||
1411 | con->in_seq++; | ||
1412 | return 0; | ||
1413 | } | ||
1414 | if (!con->in_msg) { | ||
1415 | con->error_msg = | ||
1416 | "error allocating memory for incoming message"; | ||
1417 | return -ENOMEM; | ||
1418 | } | ||
1419 | m = con->in_msg; | ||
1420 | m->front.iov_len = 0; /* haven't read it yet */ | ||
1421 | if (m->middle) | ||
1422 | m->middle->vec.iov_len = 0; | ||
1423 | |||
1424 | con->in_msg_pos.page = 0; | ||
1425 | con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; | ||
1426 | con->in_msg_pos.data_pos = 0; | ||
1427 | } | ||
1428 | |||
1429 | /* front */ | ||
1430 | ret = read_partial_message_section(con, &m->front, front_len, | ||
1431 | &con->in_front_crc); | ||
1432 | if (ret <= 0) | ||
1433 | return ret; | ||
1434 | |||
1435 | /* middle */ | ||
1436 | if (m->middle) { | ||
1437 | ret = read_partial_message_section(con, &m->middle->vec, | ||
1438 | middle_len, | ||
1439 | &con->in_middle_crc); | ||
1440 | if (ret <= 0) | ||
1441 | return ret; | ||
1442 | } | ||
1443 | |||
1444 | /* (page) data */ | ||
1445 | while (con->in_msg_pos.data_pos < data_len) { | ||
1446 | left = min((int)(data_len - con->in_msg_pos.data_pos), | ||
1447 | (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); | ||
1448 | BUG_ON(m->pages == NULL); | ||
1449 | p = kmap(m->pages[con->in_msg_pos.page]); | ||
1450 | ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, | ||
1451 | left); | ||
1452 | if (ret > 0 && datacrc) | ||
1453 | con->in_data_crc = | ||
1454 | crc32c(con->in_data_crc, | ||
1455 | p + con->in_msg_pos.page_pos, ret); | ||
1456 | kunmap(m->pages[con->in_msg_pos.page]); | ||
1457 | if (ret <= 0) | ||
1458 | return ret; | ||
1459 | con->in_msg_pos.data_pos += ret; | ||
1460 | con->in_msg_pos.page_pos += ret; | ||
1461 | if (con->in_msg_pos.page_pos == PAGE_SIZE) { | ||
1462 | con->in_msg_pos.page_pos = 0; | ||
1463 | con->in_msg_pos.page++; | ||
1464 | } | ||
1465 | } | ||
1466 | |||
1467 | /* footer */ | ||
1468 | to = sizeof(m->hdr) + sizeof(m->footer); | ||
1469 | while (con->in_base_pos < to) { | ||
1470 | left = to - con->in_base_pos; | ||
1471 | ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + | ||
1472 | (con->in_base_pos - sizeof(m->hdr)), | ||
1473 | left); | ||
1474 | if (ret <= 0) | ||
1475 | return ret; | ||
1476 | con->in_base_pos += ret; | ||
1477 | } | ||
1478 | dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", | ||
1479 | m, front_len, m->footer.front_crc, middle_len, | ||
1480 | m->footer.middle_crc, data_len, m->footer.data_crc); | ||
1481 | |||
1482 | /* crc ok? */ | ||
1483 | if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { | ||
1484 | pr_err("read_partial_message %p front crc %u != exp. %u\n", | ||
1485 | m, con->in_front_crc, m->footer.front_crc); | ||
1486 | return -EBADMSG; | ||
1487 | } | ||
1488 | if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { | ||
1489 | pr_err("read_partial_message %p middle crc %u != exp %u\n", | ||
1490 | m, con->in_middle_crc, m->footer.middle_crc); | ||
1491 | return -EBADMSG; | ||
1492 | } | ||
1493 | if (datacrc && | ||
1494 | (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && | ||
1495 | con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { | ||
1496 | pr_err("read_partial_message %p data crc %u != exp. %u\n", m, | ||
1497 | con->in_data_crc, le32_to_cpu(m->footer.data_crc)); | ||
1498 | return -EBADMSG; | ||
1499 | } | ||
1500 | |||
1501 | return 1; /* done! */ | ||
1502 | } | ||
1503 | |||
1504 | /* | ||
1505 | * Process message. This happens in the worker thread. The callback should | ||
1506 | * be careful not to do anything that waits on other incoming messages or it | ||
1507 | * may deadlock. | ||
1508 | */ | ||
1509 | static void process_message(struct ceph_connection *con) | ||
1510 | { | ||
1511 | struct ceph_msg *msg; | ||
1512 | |||
1513 | msg = con->in_msg; | ||
1514 | con->in_msg = NULL; | ||
1515 | |||
1516 | /* if first message, set peer_name */ | ||
1517 | if (con->peer_name.type == 0) | ||
1518 | con->peer_name = msg->hdr.src; | ||
1519 | |||
1520 | con->in_seq++; | ||
1521 | mutex_unlock(&con->mutex); | ||
1522 | |||
1523 | dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", | ||
1524 | msg, le64_to_cpu(msg->hdr.seq), | ||
1525 | ENTITY_NAME(msg->hdr.src), | ||
1526 | le16_to_cpu(msg->hdr.type), | ||
1527 | ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), | ||
1528 | le32_to_cpu(msg->hdr.front_len), | ||
1529 | le32_to_cpu(msg->hdr.data_len), | ||
1530 | con->in_front_crc, con->in_middle_crc, con->in_data_crc); | ||
1531 | con->ops->dispatch(con, msg); | ||
1532 | |||
1533 | mutex_lock(&con->mutex); | ||
1534 | prepare_read_tag(con); | ||
1535 | } | ||
1536 | |||
1537 | |||
1538 | /* | ||
1539 | * Write something to the socket. Called in a worker thread when the | ||
1540 | * socket appears to be writeable and we have something ready to send. | ||
1541 | */ | ||
1542 | static int try_write(struct ceph_connection *con) | ||
1543 | { | ||
1544 | struct ceph_messenger *msgr = con->msgr; | ||
1545 | int ret = 1; | ||
1546 | |||
1547 | dout("try_write start %p state %lu nref %d\n", con, con->state, | ||
1548 | atomic_read(&con->nref)); | ||
1549 | |||
1550 | more: | ||
1551 | dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); | ||
1552 | |||
1553 | /* open the socket first? */ | ||
1554 | if (con->sock == NULL) { | ||
1555 | /* | ||
1556 | * if we were STANDBY and are reconnecting _this_ | ||
1557 | * connection, bump connect_seq now. Always bump | ||
1558 | * global_seq. | ||
1559 | */ | ||
1560 | if (test_and_clear_bit(STANDBY, &con->state)) | ||
1561 | con->connect_seq++; | ||
1562 | |||
1563 | prepare_write_banner(msgr, con); | ||
1564 | prepare_write_connect(msgr, con, 1); | ||
1565 | prepare_read_banner(con); | ||
1566 | set_bit(CONNECTING, &con->state); | ||
1567 | clear_bit(NEGOTIATING, &con->state); | ||
1568 | |||
1569 | BUG_ON(con->in_msg); | ||
1570 | con->in_tag = CEPH_MSGR_TAG_READY; | ||
1571 | dout("try_write initiating connect on %p new state %lu\n", | ||
1572 | con, con->state); | ||
1573 | con->sock = ceph_tcp_connect(con); | ||
1574 | if (IS_ERR(con->sock)) { | ||
1575 | con->sock = NULL; | ||
1576 | con->error_msg = "connect error"; | ||
1577 | ret = -1; | ||
1578 | goto out; | ||
1579 | } | ||
1580 | } | ||
1581 | |||
1582 | more_kvec: | ||
1583 | /* kvec data queued? */ | ||
1584 | if (con->out_skip) { | ||
1585 | ret = write_partial_skip(con); | ||
1586 | if (ret <= 0) | ||
1587 | goto done; | ||
1588 | if (ret < 0) { | ||
1589 | dout("try_write write_partial_skip err %d\n", ret); | ||
1590 | goto done; | ||
1591 | } | ||
1592 | } | ||
1593 | if (con->out_kvec_left) { | ||
1594 | ret = write_partial_kvec(con); | ||
1595 | if (ret <= 0) | ||
1596 | goto done; | ||
1597 | } | ||
1598 | |||
1599 | /* msg pages? */ | ||
1600 | if (con->out_msg) { | ||
1601 | if (con->out_msg_done) { | ||
1602 | ceph_msg_put(con->out_msg); | ||
1603 | con->out_msg = NULL; /* we're done with this one */ | ||
1604 | goto do_next; | ||
1605 | } | ||
1606 | |||
1607 | ret = write_partial_msg_pages(con); | ||
1608 | if (ret == 1) | ||
1609 | goto more_kvec; /* we need to send the footer, too! */ | ||
1610 | if (ret == 0) | ||
1611 | goto done; | ||
1612 | if (ret < 0) { | ||
1613 | dout("try_write write_partial_msg_pages err %d\n", | ||
1614 | ret); | ||
1615 | goto done; | ||
1616 | } | ||
1617 | } | ||
1618 | |||
1619 | do_next: | ||
1620 | if (!test_bit(CONNECTING, &con->state)) { | ||
1621 | /* is anything else pending? */ | ||
1622 | if (!list_empty(&con->out_queue)) { | ||
1623 | prepare_write_message(con); | ||
1624 | goto more; | ||
1625 | } | ||
1626 | if (con->in_seq > con->in_seq_acked) { | ||
1627 | prepare_write_ack(con); | ||
1628 | goto more; | ||
1629 | } | ||
1630 | if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { | ||
1631 | prepare_write_keepalive(con); | ||
1632 | goto more; | ||
1633 | } | ||
1634 | } | ||
1635 | |||
1636 | /* Nothing to do! */ | ||
1637 | clear_bit(WRITE_PENDING, &con->state); | ||
1638 | dout("try_write nothing else to write.\n"); | ||
1639 | done: | ||
1640 | ret = 0; | ||
1641 | out: | ||
1642 | dout("try_write done on %p\n", con); | ||
1643 | return ret; | ||
1644 | } | ||
1645 | |||
1646 | |||
1647 | |||
1648 | /* | ||
1649 | * Read what we can from the socket. | ||
1650 | */ | ||
1651 | static int try_read(struct ceph_connection *con) | ||
1652 | { | ||
1653 | int ret = -1; | ||
1654 | |||
1655 | if (!con->sock) | ||
1656 | return 0; | ||
1657 | |||
1658 | if (test_bit(STANDBY, &con->state)) | ||
1659 | return 0; | ||
1660 | |||
1661 | dout("try_read start on %p\n", con); | ||
1662 | |||
1663 | more: | ||
1664 | dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, | ||
1665 | con->in_base_pos); | ||
1666 | if (test_bit(CONNECTING, &con->state)) { | ||
1667 | if (!test_bit(NEGOTIATING, &con->state)) { | ||
1668 | dout("try_read connecting\n"); | ||
1669 | ret = read_partial_banner(con); | ||
1670 | if (ret <= 0) | ||
1671 | goto done; | ||
1672 | if (process_banner(con) < 0) { | ||
1673 | ret = -1; | ||
1674 | goto out; | ||
1675 | } | ||
1676 | } | ||
1677 | ret = read_partial_connect(con); | ||
1678 | if (ret <= 0) | ||
1679 | goto done; | ||
1680 | if (process_connect(con) < 0) { | ||
1681 | ret = -1; | ||
1682 | goto out; | ||
1683 | } | ||
1684 | goto more; | ||
1685 | } | ||
1686 | |||
1687 | if (con->in_base_pos < 0) { | ||
1688 | /* | ||
1689 | * skipping + discarding content. | ||
1690 | * | ||
1691 | * FIXME: there must be a better way to do this! | ||
1692 | */ | ||
1693 | static char buf[1024]; | ||
1694 | int skip = min(1024, -con->in_base_pos); | ||
1695 | dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); | ||
1696 | ret = ceph_tcp_recvmsg(con->sock, buf, skip); | ||
1697 | if (ret <= 0) | ||
1698 | goto done; | ||
1699 | con->in_base_pos += ret; | ||
1700 | if (con->in_base_pos) | ||
1701 | goto more; | ||
1702 | } | ||
1703 | if (con->in_tag == CEPH_MSGR_TAG_READY) { | ||
1704 | /* | ||
1705 | * what's next? | ||
1706 | */ | ||
1707 | ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); | ||
1708 | if (ret <= 0) | ||
1709 | goto done; | ||
1710 | dout("try_read got tag %d\n", (int)con->in_tag); | ||
1711 | switch (con->in_tag) { | ||
1712 | case CEPH_MSGR_TAG_MSG: | ||
1713 | prepare_read_message(con); | ||
1714 | break; | ||
1715 | case CEPH_MSGR_TAG_ACK: | ||
1716 | prepare_read_ack(con); | ||
1717 | break; | ||
1718 | case CEPH_MSGR_TAG_CLOSE: | ||
1719 | set_bit(CLOSED, &con->state); /* fixme */ | ||
1720 | goto done; | ||
1721 | default: | ||
1722 | goto bad_tag; | ||
1723 | } | ||
1724 | } | ||
1725 | if (con->in_tag == CEPH_MSGR_TAG_MSG) { | ||
1726 | ret = read_partial_message(con); | ||
1727 | if (ret <= 0) { | ||
1728 | switch (ret) { | ||
1729 | case -EBADMSG: | ||
1730 | con->error_msg = "bad crc"; | ||
1731 | ret = -EIO; | ||
1732 | goto out; | ||
1733 | case -EIO: | ||
1734 | con->error_msg = "io error"; | ||
1735 | goto out; | ||
1736 | default: | ||
1737 | goto done; | ||
1738 | } | ||
1739 | } | ||
1740 | if (con->in_tag == CEPH_MSGR_TAG_READY) | ||
1741 | goto more; | ||
1742 | process_message(con); | ||
1743 | goto more; | ||
1744 | } | ||
1745 | if (con->in_tag == CEPH_MSGR_TAG_ACK) { | ||
1746 | ret = read_partial_ack(con); | ||
1747 | if (ret <= 0) | ||
1748 | goto done; | ||
1749 | process_ack(con); | ||
1750 | goto more; | ||
1751 | } | ||
1752 | |||
1753 | done: | ||
1754 | ret = 0; | ||
1755 | out: | ||
1756 | dout("try_read done on %p\n", con); | ||
1757 | return ret; | ||
1758 | |||
1759 | bad_tag: | ||
1760 | pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); | ||
1761 | con->error_msg = "protocol error, garbage tag"; | ||
1762 | ret = -1; | ||
1763 | goto out; | ||
1764 | } | ||
1765 | |||
1766 | |||
1767 | /* | ||
1768 | * Atomically queue work on a connection. Bump @con reference to | ||
1769 | * avoid races with connection teardown. | ||
1770 | * | ||
1771 | * There is some trickery going on with QUEUED and BUSY because we | ||
1772 | * only want a _single_ thread operating on each connection at any | ||
1773 | * point in time, but we want to use all available CPUs. | ||
1774 | * | ||
1775 | * The worker thread only proceeds if it can atomically set BUSY. It | ||
1776 | * clears QUEUED and does it's thing. When it thinks it's done, it | ||
1777 | * clears BUSY, then rechecks QUEUED.. if it's set again, it loops | ||
1778 | * (tries again to set BUSY). | ||
1779 | * | ||
1780 | * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we | ||
1781 | * try to queue work. If that fails (work is already queued, or BUSY) | ||
1782 | * we give up (work also already being done or is queued) but leave QUEUED | ||
1783 | * set so that the worker thread will loop if necessary. | ||
1784 | */ | ||
1785 | static void queue_con(struct ceph_connection *con) | ||
1786 | { | ||
1787 | if (test_bit(DEAD, &con->state)) { | ||
1788 | dout("queue_con %p ignoring: DEAD\n", | ||
1789 | con); | ||
1790 | return; | ||
1791 | } | ||
1792 | |||
1793 | if (!con->ops->get(con)) { | ||
1794 | dout("queue_con %p ref count 0\n", con); | ||
1795 | return; | ||
1796 | } | ||
1797 | |||
1798 | set_bit(QUEUED, &con->state); | ||
1799 | if (test_bit(BUSY, &con->state)) { | ||
1800 | dout("queue_con %p - already BUSY\n", con); | ||
1801 | con->ops->put(con); | ||
1802 | } else if (!queue_work(ceph_msgr_wq, &con->work.work)) { | ||
1803 | dout("queue_con %p - already queued\n", con); | ||
1804 | con->ops->put(con); | ||
1805 | } else { | ||
1806 | dout("queue_con %p\n", con); | ||
1807 | } | ||
1808 | } | ||
1809 | |||
1810 | /* | ||
1811 | * Do some work on a connection. Drop a connection ref when we're done. | ||
1812 | */ | ||
1813 | static void con_work(struct work_struct *work) | ||
1814 | { | ||
1815 | struct ceph_connection *con = container_of(work, struct ceph_connection, | ||
1816 | work.work); | ||
1817 | int backoff = 0; | ||
1818 | |||
1819 | more: | ||
1820 | if (test_and_set_bit(BUSY, &con->state) != 0) { | ||
1821 | dout("con_work %p BUSY already set\n", con); | ||
1822 | goto out; | ||
1823 | } | ||
1824 | dout("con_work %p start, clearing QUEUED\n", con); | ||
1825 | clear_bit(QUEUED, &con->state); | ||
1826 | |||
1827 | mutex_lock(&con->mutex); | ||
1828 | |||
1829 | if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ | ||
1830 | dout("con_work CLOSED\n"); | ||
1831 | con_close_socket(con); | ||
1832 | goto done; | ||
1833 | } | ||
1834 | if (test_and_clear_bit(OPENING, &con->state)) { | ||
1835 | /* reopen w/ new peer */ | ||
1836 | dout("con_work OPENING\n"); | ||
1837 | con_close_socket(con); | ||
1838 | } | ||
1839 | |||
1840 | if (test_and_clear_bit(SOCK_CLOSED, &con->state) || | ||
1841 | try_read(con) < 0 || | ||
1842 | try_write(con) < 0) { | ||
1843 | mutex_unlock(&con->mutex); | ||
1844 | backoff = 1; | ||
1845 | ceph_fault(con); /* error/fault path */ | ||
1846 | goto done_unlocked; | ||
1847 | } | ||
1848 | |||
1849 | done: | ||
1850 | mutex_unlock(&con->mutex); | ||
1851 | |||
1852 | done_unlocked: | ||
1853 | clear_bit(BUSY, &con->state); | ||
1854 | dout("con->state=%lu\n", con->state); | ||
1855 | if (test_bit(QUEUED, &con->state)) { | ||
1856 | if (!backoff || test_bit(OPENING, &con->state)) { | ||
1857 | dout("con_work %p QUEUED reset, looping\n", con); | ||
1858 | goto more; | ||
1859 | } | ||
1860 | dout("con_work %p QUEUED reset, but just faulted\n", con); | ||
1861 | clear_bit(QUEUED, &con->state); | ||
1862 | } | ||
1863 | dout("con_work %p done\n", con); | ||
1864 | |||
1865 | out: | ||
1866 | con->ops->put(con); | ||
1867 | } | ||
1868 | |||
1869 | |||
1870 | /* | ||
1871 | * Generic error/fault handler. A retry mechanism is used with | ||
1872 | * exponential backoff | ||
1873 | */ | ||
1874 | static void ceph_fault(struct ceph_connection *con) | ||
1875 | { | ||
1876 | pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), | ||
1877 | pr_addr(&con->peer_addr.in_addr), con->error_msg); | ||
1878 | dout("fault %p state %lu to peer %s\n", | ||
1879 | con, con->state, pr_addr(&con->peer_addr.in_addr)); | ||
1880 | |||
1881 | if (test_bit(LOSSYTX, &con->state)) { | ||
1882 | dout("fault on LOSSYTX channel\n"); | ||
1883 | goto out; | ||
1884 | } | ||
1885 | |||
1886 | mutex_lock(&con->mutex); | ||
1887 | if (test_bit(CLOSED, &con->state)) | ||
1888 | goto out_unlock; | ||
1889 | |||
1890 | con_close_socket(con); | ||
1891 | |||
1892 | if (con->in_msg) { | ||
1893 | ceph_msg_put(con->in_msg); | ||
1894 | con->in_msg = NULL; | ||
1895 | } | ||
1896 | |||
1897 | /* Requeue anything that hasn't been acked */ | ||
1898 | list_splice_init(&con->out_sent, &con->out_queue); | ||
1899 | |||
1900 | /* If there are no messages in the queue, place the connection | ||
1901 | * in a STANDBY state (i.e., don't try to reconnect just yet). */ | ||
1902 | if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { | ||
1903 | dout("fault setting STANDBY\n"); | ||
1904 | set_bit(STANDBY, &con->state); | ||
1905 | } else { | ||
1906 | /* retry after a delay. */ | ||
1907 | if (con->delay == 0) | ||
1908 | con->delay = BASE_DELAY_INTERVAL; | ||
1909 | else if (con->delay < MAX_DELAY_INTERVAL) | ||
1910 | con->delay *= 2; | ||
1911 | dout("fault queueing %p delay %lu\n", con, con->delay); | ||
1912 | con->ops->get(con); | ||
1913 | if (queue_delayed_work(ceph_msgr_wq, &con->work, | ||
1914 | round_jiffies_relative(con->delay)) == 0) | ||
1915 | con->ops->put(con); | ||
1916 | } | ||
1917 | |||
1918 | out_unlock: | ||
1919 | mutex_unlock(&con->mutex); | ||
1920 | out: | ||
1921 | /* | ||
1922 | * in case we faulted due to authentication, invalidate our | ||
1923 | * current tickets so that we can get new ones. | ||
1924 | */ | ||
1925 | if (con->auth_retry && con->ops->invalidate_authorizer) { | ||
1926 | dout("calling invalidate_authorizer()\n"); | ||
1927 | con->ops->invalidate_authorizer(con); | ||
1928 | } | ||
1929 | |||
1930 | if (con->ops->fault) | ||
1931 | con->ops->fault(con); | ||
1932 | } | ||
1933 | |||
1934 | |||
1935 | |||
1936 | /* | ||
1937 | * create a new messenger instance | ||
1938 | */ | ||
1939 | struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) | ||
1940 | { | ||
1941 | struct ceph_messenger *msgr; | ||
1942 | |||
1943 | msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); | ||
1944 | if (msgr == NULL) | ||
1945 | return ERR_PTR(-ENOMEM); | ||
1946 | |||
1947 | spin_lock_init(&msgr->global_seq_lock); | ||
1948 | |||
1949 | /* the zero page is needed if a request is "canceled" while the message | ||
1950 | * is being written over the socket */ | ||
1951 | msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); | ||
1952 | if (!msgr->zero_page) { | ||
1953 | kfree(msgr); | ||
1954 | return ERR_PTR(-ENOMEM); | ||
1955 | } | ||
1956 | kmap(msgr->zero_page); | ||
1957 | |||
1958 | if (myaddr) | ||
1959 | msgr->inst.addr = *myaddr; | ||
1960 | |||
1961 | /* select a random nonce */ | ||
1962 | msgr->inst.addr.type = 0; | ||
1963 | get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); | ||
1964 | encode_my_addr(msgr); | ||
1965 | |||
1966 | dout("messenger_create %p\n", msgr); | ||
1967 | return msgr; | ||
1968 | } | ||
1969 | |||
1970 | void ceph_messenger_destroy(struct ceph_messenger *msgr) | ||
1971 | { | ||
1972 | dout("destroy %p\n", msgr); | ||
1973 | kunmap(msgr->zero_page); | ||
1974 | __free_page(msgr->zero_page); | ||
1975 | kfree(msgr); | ||
1976 | dout("destroyed messenger %p\n", msgr); | ||
1977 | } | ||
1978 | |||
1979 | /* | ||
1980 | * Queue up an outgoing message on the given connection. | ||
1981 | */ | ||
1982 | void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) | ||
1983 | { | ||
1984 | if (test_bit(CLOSED, &con->state)) { | ||
1985 | dout("con_send %p closed, dropping %p\n", con, msg); | ||
1986 | ceph_msg_put(msg); | ||
1987 | return; | ||
1988 | } | ||
1989 | |||
1990 | /* set src+dst */ | ||
1991 | msg->hdr.src = con->msgr->inst.name; | ||
1992 | |||
1993 | BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); | ||
1994 | |||
1995 | msg->needs_out_seq = true; | ||
1996 | |||
1997 | /* queue */ | ||
1998 | mutex_lock(&con->mutex); | ||
1999 | BUG_ON(!list_empty(&msg->list_head)); | ||
2000 | list_add_tail(&msg->list_head, &con->out_queue); | ||
2001 | dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, | ||
2002 | ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), | ||
2003 | ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), | ||
2004 | le32_to_cpu(msg->hdr.front_len), | ||
2005 | le32_to_cpu(msg->hdr.middle_len), | ||
2006 | le32_to_cpu(msg->hdr.data_len)); | ||
2007 | mutex_unlock(&con->mutex); | ||
2008 | |||
2009 | /* if there wasn't anything waiting to send before, queue | ||
2010 | * new work */ | ||
2011 | if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) | ||
2012 | queue_con(con); | ||
2013 | } | ||
2014 | |||
2015 | /* | ||
2016 | * Revoke a message that was previously queued for send | ||
2017 | */ | ||
2018 | void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) | ||
2019 | { | ||
2020 | mutex_lock(&con->mutex); | ||
2021 | if (!list_empty(&msg->list_head)) { | ||
2022 | dout("con_revoke %p msg %p - was on queue\n", con, msg); | ||
2023 | list_del_init(&msg->list_head); | ||
2024 | ceph_msg_put(msg); | ||
2025 | msg->hdr.seq = 0; | ||
2026 | } | ||
2027 | if (con->out_msg == msg) { | ||
2028 | dout("con_revoke %p msg %p - was sending\n", con, msg); | ||
2029 | con->out_msg = NULL; | ||
2030 | if (con->out_kvec_is_msg) { | ||
2031 | con->out_skip = con->out_kvec_bytes; | ||
2032 | con->out_kvec_is_msg = false; | ||
2033 | } | ||
2034 | ceph_msg_put(msg); | ||
2035 | msg->hdr.seq = 0; | ||
2036 | } | ||
2037 | mutex_unlock(&con->mutex); | ||
2038 | } | ||
2039 | |||
2040 | /* | ||
2041 | * Revoke a message that we may be reading data into | ||
2042 | */ | ||
2043 | void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) | ||
2044 | { | ||
2045 | mutex_lock(&con->mutex); | ||
2046 | if (con->in_msg && con->in_msg == msg) { | ||
2047 | unsigned front_len = le32_to_cpu(con->in_hdr.front_len); | ||
2048 | unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); | ||
2049 | unsigned data_len = le32_to_cpu(con->in_hdr.data_len); | ||
2050 | |||
2051 | /* skip rest of message */ | ||
2052 | dout("con_revoke_pages %p msg %p revoked\n", con, msg); | ||
2053 | con->in_base_pos = con->in_base_pos - | ||
2054 | sizeof(struct ceph_msg_header) - | ||
2055 | front_len - | ||
2056 | middle_len - | ||
2057 | data_len - | ||
2058 | sizeof(struct ceph_msg_footer); | ||
2059 | ceph_msg_put(con->in_msg); | ||
2060 | con->in_msg = NULL; | ||
2061 | con->in_tag = CEPH_MSGR_TAG_READY; | ||
2062 | con->in_seq++; | ||
2063 | } else { | ||
2064 | dout("con_revoke_pages %p msg %p pages %p no-op\n", | ||
2065 | con, con->in_msg, msg); | ||
2066 | } | ||
2067 | mutex_unlock(&con->mutex); | ||
2068 | } | ||
2069 | |||
2070 | /* | ||
2071 | * Queue a keepalive byte to ensure the tcp connection is alive. | ||
2072 | */ | ||
2073 | void ceph_con_keepalive(struct ceph_connection *con) | ||
2074 | { | ||
2075 | if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && | ||
2076 | test_and_set_bit(WRITE_PENDING, &con->state) == 0) | ||
2077 | queue_con(con); | ||
2078 | } | ||
2079 | |||
2080 | |||
2081 | /* | ||
2082 | * construct a new message with given type, size | ||
2083 | * the new msg has a ref count of 1. | ||
2084 | */ | ||
2085 | struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) | ||
2086 | { | ||
2087 | struct ceph_msg *m; | ||
2088 | |||
2089 | m = kmalloc(sizeof(*m), flags); | ||
2090 | if (m == NULL) | ||
2091 | goto out; | ||
2092 | kref_init(&m->kref); | ||
2093 | INIT_LIST_HEAD(&m->list_head); | ||
2094 | |||
2095 | m->hdr.tid = 0; | ||
2096 | m->hdr.type = cpu_to_le16(type); | ||
2097 | m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); | ||
2098 | m->hdr.version = 0; | ||
2099 | m->hdr.front_len = cpu_to_le32(front_len); | ||
2100 | m->hdr.middle_len = 0; | ||
2101 | m->hdr.data_len = 0; | ||
2102 | m->hdr.data_off = 0; | ||
2103 | m->hdr.reserved = 0; | ||
2104 | m->footer.front_crc = 0; | ||
2105 | m->footer.middle_crc = 0; | ||
2106 | m->footer.data_crc = 0; | ||
2107 | m->footer.flags = 0; | ||
2108 | m->front_max = front_len; | ||
2109 | m->front_is_vmalloc = false; | ||
2110 | m->more_to_follow = false; | ||
2111 | m->pool = NULL; | ||
2112 | |||
2113 | /* front */ | ||
2114 | if (front_len) { | ||
2115 | if (front_len > PAGE_CACHE_SIZE) { | ||
2116 | m->front.iov_base = __vmalloc(front_len, flags, | ||
2117 | PAGE_KERNEL); | ||
2118 | m->front_is_vmalloc = true; | ||
2119 | } else { | ||
2120 | m->front.iov_base = kmalloc(front_len, flags); | ||
2121 | } | ||
2122 | if (m->front.iov_base == NULL) { | ||
2123 | pr_err("msg_new can't allocate %d bytes\n", | ||
2124 | front_len); | ||
2125 | goto out2; | ||
2126 | } | ||
2127 | } else { | ||
2128 | m->front.iov_base = NULL; | ||
2129 | } | ||
2130 | m->front.iov_len = front_len; | ||
2131 | |||
2132 | /* middle */ | ||
2133 | m->middle = NULL; | ||
2134 | |||
2135 | /* data */ | ||
2136 | m->nr_pages = 0; | ||
2137 | m->pages = NULL; | ||
2138 | m->pagelist = NULL; | ||
2139 | |||
2140 | dout("ceph_msg_new %p front %d\n", m, front_len); | ||
2141 | return m; | ||
2142 | |||
2143 | out2: | ||
2144 | ceph_msg_put(m); | ||
2145 | out: | ||
2146 | pr_err("msg_new can't create type %d front %d\n", type, front_len); | ||
2147 | return NULL; | ||
2148 | } | ||
2149 | |||
2150 | /* | ||
2151 | * Allocate "middle" portion of a message, if it is needed and wasn't | ||
2152 | * allocated by alloc_msg. This allows us to read a small fixed-size | ||
2153 | * per-type header in the front and then gracefully fail (i.e., | ||
2154 | * propagate the error to the caller based on info in the front) when | ||
2155 | * the middle is too large. | ||
2156 | */ | ||
2157 | static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) | ||
2158 | { | ||
2159 | int type = le16_to_cpu(msg->hdr.type); | ||
2160 | int middle_len = le32_to_cpu(msg->hdr.middle_len); | ||
2161 | |||
2162 | dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, | ||
2163 | ceph_msg_type_name(type), middle_len); | ||
2164 | BUG_ON(!middle_len); | ||
2165 | BUG_ON(msg->middle); | ||
2166 | |||
2167 | msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); | ||
2168 | if (!msg->middle) | ||
2169 | return -ENOMEM; | ||
2170 | return 0; | ||
2171 | } | ||
2172 | |||
2173 | /* | ||
2174 | * Generic message allocator, for incoming messages. | ||
2175 | */ | ||
2176 | static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, | ||
2177 | struct ceph_msg_header *hdr, | ||
2178 | int *skip) | ||
2179 | { | ||
2180 | int type = le16_to_cpu(hdr->type); | ||
2181 | int front_len = le32_to_cpu(hdr->front_len); | ||
2182 | int middle_len = le32_to_cpu(hdr->middle_len); | ||
2183 | struct ceph_msg *msg = NULL; | ||
2184 | int ret; | ||
2185 | |||
2186 | if (con->ops->alloc_msg) { | ||
2187 | mutex_unlock(&con->mutex); | ||
2188 | msg = con->ops->alloc_msg(con, hdr, skip); | ||
2189 | mutex_lock(&con->mutex); | ||
2190 | if (!msg || *skip) | ||
2191 | return NULL; | ||
2192 | } | ||
2193 | if (!msg) { | ||
2194 | *skip = 0; | ||
2195 | msg = ceph_msg_new(type, front_len, GFP_NOFS); | ||
2196 | if (!msg) { | ||
2197 | pr_err("unable to allocate msg type %d len %d\n", | ||
2198 | type, front_len); | ||
2199 | return NULL; | ||
2200 | } | ||
2201 | } | ||
2202 | memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); | ||
2203 | |||
2204 | if (middle_len && !msg->middle) { | ||
2205 | ret = ceph_alloc_middle(con, msg); | ||
2206 | if (ret < 0) { | ||
2207 | ceph_msg_put(msg); | ||
2208 | return NULL; | ||
2209 | } | ||
2210 | } | ||
2211 | |||
2212 | return msg; | ||
2213 | } | ||
2214 | |||
2215 | |||
2216 | /* | ||
2217 | * Free a generically kmalloc'd message. | ||
2218 | */ | ||
2219 | void ceph_msg_kfree(struct ceph_msg *m) | ||
2220 | { | ||
2221 | dout("msg_kfree %p\n", m); | ||
2222 | if (m->front_is_vmalloc) | ||
2223 | vfree(m->front.iov_base); | ||
2224 | else | ||
2225 | kfree(m->front.iov_base); | ||
2226 | kfree(m); | ||
2227 | } | ||
2228 | |||
2229 | /* | ||
2230 | * Drop a msg ref. Destroy as needed. | ||
2231 | */ | ||
2232 | void ceph_msg_last_put(struct kref *kref) | ||
2233 | { | ||
2234 | struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); | ||
2235 | |||
2236 | dout("ceph_msg_put last one on %p\n", m); | ||
2237 | WARN_ON(!list_empty(&m->list_head)); | ||
2238 | |||
2239 | /* drop middle, data, if any */ | ||
2240 | if (m->middle) { | ||
2241 | ceph_buffer_put(m->middle); | ||
2242 | m->middle = NULL; | ||
2243 | } | ||
2244 | m->nr_pages = 0; | ||
2245 | m->pages = NULL; | ||
2246 | |||
2247 | if (m->pagelist) { | ||
2248 | ceph_pagelist_release(m->pagelist); | ||
2249 | kfree(m->pagelist); | ||
2250 | m->pagelist = NULL; | ||
2251 | } | ||
2252 | |||
2253 | if (m->pool) | ||
2254 | ceph_msgpool_put(m->pool, m); | ||
2255 | else | ||
2256 | ceph_msg_kfree(m); | ||
2257 | } | ||
2258 | |||
2259 | void ceph_msg_dump(struct ceph_msg *msg) | ||
2260 | { | ||
2261 | pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, | ||
2262 | msg->front_max, msg->nr_pages); | ||
2263 | print_hex_dump(KERN_DEBUG, "header: ", | ||
2264 | DUMP_PREFIX_OFFSET, 16, 1, | ||
2265 | &msg->hdr, sizeof(msg->hdr), true); | ||
2266 | print_hex_dump(KERN_DEBUG, " front: ", | ||
2267 | DUMP_PREFIX_OFFSET, 16, 1, | ||
2268 | msg->front.iov_base, msg->front.iov_len, true); | ||
2269 | if (msg->middle) | ||
2270 | print_hex_dump(KERN_DEBUG, "middle: ", | ||
2271 | DUMP_PREFIX_OFFSET, 16, 1, | ||
2272 | msg->middle->vec.iov_base, | ||
2273 | msg->middle->vec.iov_len, true); | ||
2274 | print_hex_dump(KERN_DEBUG, "footer: ", | ||
2275 | DUMP_PREFIX_OFFSET, 16, 1, | ||
2276 | &msg->footer, sizeof(msg->footer), true); | ||
2277 | } | ||
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h deleted file mode 100644 index 76fbc957bc13..000000000000 --- a/fs/ceph/messenger.h +++ /dev/null | |||
@@ -1,253 +0,0 @@ | |||
1 | #ifndef __FS_CEPH_MESSENGER_H | ||
2 | #define __FS_CEPH_MESSENGER_H | ||
3 | |||
4 | #include <linux/kref.h> | ||
5 | #include <linux/mutex.h> | ||
6 | #include <linux/net.h> | ||
7 | #include <linux/radix-tree.h> | ||
8 | #include <linux/uio.h> | ||
9 | #include <linux/version.h> | ||
10 | #include <linux/workqueue.h> | ||
11 | |||
12 | #include "types.h" | ||
13 | #include "buffer.h" | ||
14 | |||
15 | struct ceph_msg; | ||
16 | struct ceph_connection; | ||
17 | |||
18 | extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */ | ||
19 | |||
20 | /* | ||
21 | * Ceph defines these callbacks for handling connection events. | ||
22 | */ | ||
23 | struct ceph_connection_operations { | ||
24 | struct ceph_connection *(*get)(struct ceph_connection *); /* NOTE(review): presumably takes a reference on the connection -- confirm */ | ||
25 | void (*put)(struct ceph_connection *); /* invoked by the messenger as con->ops->put(con); presumably drops a reference -- confirm */ | ||
26 | |||
27 | /* handle an incoming message. */ | ||
28 | void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m); | ||
29 | |||
30 | /* authorize an outgoing connection */ | ||
31 | int (*get_authorizer) (struct ceph_connection *con, | ||
32 | void **buf, int *len, int *proto, | ||
33 | void **reply_buf, int *reply_len, int force_new); | ||
34 | int (*verify_authorizer_reply) (struct ceph_connection *con, int len); | ||
35 | int (*invalidate_authorizer)(struct ceph_connection *con); /* optional; called on a fault when con->auth_retry is set */ | ||
36 | |||
37 | /* protocol version mismatch */ | ||
38 | void (*bad_proto) (struct ceph_connection *con); | ||
39 | |||
40 | /* there was some error on the socket (disconnect, whatever); optional */ | ||
41 | void (*fault) (struct ceph_connection *con); | ||
42 | |||
43 | /* a remote host has terminated a message exchange session, and messages | ||
44 | * we sent (or they tried to send us) may be lost. */ | ||
45 | void (*peer_reset) (struct ceph_connection *con); | ||
46 | |||
47 | struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, | ||
48 | struct ceph_msg_header *hdr, | ||
49 | int *skip); /* optional allocator for incoming messages; ceph_alloc_msg() drops con->mutex around the call and gives up on the message if this returns NULL or sets *skip */ | ||
50 | }; | ||
51 | |||
52 | /* use format string %s%lld */ | ||
53 | #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) | ||
54 | |||
55 | struct ceph_messenger { | ||
56 | struct ceph_entity_inst inst; /* my name+address */ | ||
57 | struct ceph_entity_addr my_enc_addr; /* NOTE(review): presumably the encoded form of inst.addr, filled by encode_my_addr() -- confirm */ | ||
58 | struct page *zero_page; /* zero-filled page; needed if a request is canceled while its message is being written */ | ||
59 | |||
60 | bool nocrc; /* NOTE(review): presumably disables message CRCs -- confirm */ | ||
61 | |||
62 | /* | ||
63 | * the global_seq counts connections I (attempt to) initiate | ||
64 | * in order to disambiguate certain connect race conditions. | ||
65 | */ | ||
66 | u32 global_seq; | ||
67 | spinlock_t global_seq_lock; /* NOTE(review): presumably guards global_seq -- confirm */ | ||
68 | }; | ||
69 | |||
70 | /* | ||
71 | * a single message. it contains a header (src, dest, message type, etc.), | ||
72 | * footer (crc values, mainly), a "front" message body, an optional "middle" | ||
73 | * buffer, and possibly a data payload (stored in pages or a pagelist). | ||
74 | */ | ||
75 | struct ceph_msg { | ||
76 | struct ceph_msg_header hdr; /* header */ | ||
77 | struct ceph_msg_footer footer; /* footer */ | ||
78 | struct kvec front; /* unaligned blobs of message */ | ||
79 | struct ceph_buffer *middle; /* optional middle section; NULL if none */ | ||
80 | struct page **pages; /* data payload. NOT OWNER. */ | ||
81 | unsigned nr_pages; /* size of page array */ | ||
82 | struct ceph_pagelist *pagelist; /* instead of pages */ | ||
83 | struct list_head list_head; /* links the message into a connection's out_queue */ | ||
84 | struct kref kref; /* reference count; ceph_msg_last_put() runs on the final put */ | ||
85 | bool front_is_vmalloc; /* front came from __vmalloc, so it is released with vfree */ | ||
86 | bool more_to_follow; | ||
87 | bool needs_out_seq; /* set by ceph_con_send() when the message is queued */ | ||
88 | int front_max; /* ceph_msg_new() sets this to the allocated front length */ | ||
89 | |||
90 | struct ceph_msgpool *pool; /* if set, the final put returns the message to this pool instead of freeing it */ | ||
91 | }; | ||
92 | |||
93 | struct ceph_msg_pos { /* position within a message's data payload; one each is kept for the in and out message of a connection */ | ||
94 | int page, page_pos; /* which page; offset in page */ | ||
95 | int data_pos; /* offset in data payload */ | ||
96 | int did_page_crc; /* true if we've calculated crc for current page */ | ||
97 | }; | ||
98 | |||
99 | /* ceph connection fault delay defaults, for exponential backoff */ | ||
100 | #define BASE_DELAY_INTERVAL (HZ/2) | ||
101 | #define MAX_DELAY_INTERVAL (5 * 60 * HZ) | ||
102 | |||
103 | /* | ||
104 | * ceph_connection state bit flags (bit numbers within ceph_connection.state, | ||
105 | * used with test_bit()/test_and_set_bit()) | ||
106 | * QUEUED and BUSY are used together to ensure that only a single | ||
107 | * thread is currently opening, reading or writing data to the socket. | ||
108 | */ | ||
109 | #define LOSSYTX 0 /* we can close channel or drop messages on errors */ | ||
110 | #define CONNECTING 1 | ||
111 | #define NEGOTIATING 2 | ||
112 | #define KEEPALIVE_PENDING 3 /* set by ceph_con_keepalive() */ | ||
113 | #define WRITE_PENDING 4 /* we have data ready to send */ | ||
114 | #define QUEUED 5 /* there is work queued on this connection */ | ||
115 | #define BUSY 6 /* work is being done */ | ||
116 | #define STANDBY 8 /* no outgoing messages, socket closed. we keep | ||
117 | * the ceph_connection around to maintain shared | ||
118 | * state with the peer. */ | ||
119 | #define CLOSED 10 /* we've closed the connection */ | ||
120 | #define SOCK_CLOSED 11 /* socket state changed to closed */ | ||
121 | #define OPENING 13 /* open connection w/ (possibly new) peer */ | ||
122 | #define DEAD 14 /* dead, about to kfree */ | ||
123 | |||
124 | /* | ||
125 | * A single connection with another host. | ||
126 | * | ||
127 | * We maintain a queue of outgoing messages, and some session state to | ||
128 | * ensure that we can preserve the lossless, ordered delivery of | ||
129 | * messages in the case of a TCP disconnect. | ||
130 | */ | ||
131 | struct ceph_connection { | ||
132 | void *private; /* owner's context pointer (the mon client stores itself here) */ | ||
133 | atomic_t nref; | ||
134 | |||
135 | const struct ceph_connection_operations *ops; | ||
136 | |||
137 | struct ceph_messenger *msgr; | ||
138 | struct socket *sock; | ||
139 | unsigned long state; /* connection state (see flags above) */ | ||
140 | const char *error_msg; /* error message, if any */ | ||
141 | |||
142 | struct ceph_entity_addr peer_addr; /* peer address */ | ||
143 | struct ceph_entity_name peer_name; /* peer name */ | ||
144 | struct ceph_entity_addr peer_addr_for_me; | ||
145 | unsigned peer_features; | ||
146 | u32 connect_seq; /* identify the most recent connection | ||
147 | attempt for this connection, client */ | ||
148 | u32 peer_global_seq; /* peer's global seq for this connection */ | ||
149 | |||
150 | int auth_retry; /* true if we need a newer authorizer */ | ||
151 | void *auth_reply_buf; /* where to put the authorizer reply */ | ||
152 | int auth_reply_buf_len; | ||
153 | |||
154 | struct mutex mutex; /* held while touching the out queue and the in/out message state */ | ||
155 | |||
156 | /* out queue */ | ||
157 | struct list_head out_queue; /* messages queued by ceph_con_send() */ | ||
158 | struct list_head out_sent; /* sending or sent but unacked */ | ||
159 | u64 out_seq; /* last message queued for send */ | ||
160 | bool out_keepalive_pending; | ||
161 | |||
162 | u64 in_seq, in_seq_acked; /* last message received, acked */ | ||
163 | |||
164 | /* connection negotiation temps */ | ||
165 | char in_banner[CEPH_BANNER_MAX_LEN]; | ||
166 | union { | ||
167 | struct { /* outgoing connection */ | ||
168 | struct ceph_msg_connect out_connect; | ||
169 | struct ceph_msg_connect_reply in_reply; | ||
170 | }; | ||
171 | struct { /* incoming */ | ||
172 | struct ceph_msg_connect in_connect; | ||
173 | struct ceph_msg_connect_reply out_reply; | ||
174 | }; | ||
175 | }; | ||
176 | struct ceph_entity_addr actual_peer_addr; | ||
177 | |||
178 | /* message out temps */ | ||
179 | struct ceph_msg *out_msg; /* sending message (== tail of | ||
180 | out_sent) */ | ||
181 | bool out_msg_done; | ||
182 | struct ceph_msg_pos out_msg_pos; | ||
183 | |||
184 | struct kvec out_kvec[8], /* sending header/footer data */ | ||
185 | *out_kvec_cur; | ||
186 | int out_kvec_left; /* kvec's left in out_kvec */ | ||
187 | int out_skip; /* skip this many bytes */ | ||
188 | int out_kvec_bytes; /* total bytes left */ | ||
189 | bool out_kvec_is_msg; /* kvec refers to out_msg */ | ||
190 | int out_more; /* there is more data after the kvecs */ | ||
191 | __le64 out_temp_ack; /* for writing an ack */ | ||
192 | |||
193 | /* message in temps */ | ||
194 | struct ceph_msg_header in_hdr; | ||
195 | struct ceph_msg *in_msg; | ||
196 | struct ceph_msg_pos in_msg_pos; | ||
197 | u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ | ||
198 | |||
199 | char in_tag; /* protocol control byte */ | ||
200 | int in_base_pos; /* bytes read; ceph_con_revoke_message() subtracts the whole message size to skip the rest of a revoked message */ | ||
201 | __le64 in_temp_ack; /* for reading an ack */ | ||
202 | |||
203 | struct delayed_work work; /* send|recv work */ | ||
204 | unsigned long delay; /* current delay interval */ | ||
205 | }; | ||
206 | |||
207 | |||
208 | extern const char *pr_addr(const struct sockaddr_storage *ss); | ||
209 | extern int ceph_parse_ips(const char *c, const char *end, | ||
210 | struct ceph_entity_addr *addr, | ||
211 | int max_count, int *count); | ||
212 | |||
213 | |||
214 | extern int ceph_msgr_init(void); | ||
215 | extern void ceph_msgr_exit(void); | ||
216 | extern void ceph_msgr_flush(void); | ||
217 | |||
218 | extern struct ceph_messenger *ceph_messenger_create( | ||
219 | struct ceph_entity_addr *myaddr); | ||
220 | extern void ceph_messenger_destroy(struct ceph_messenger *); | ||
221 | |||
222 | extern void ceph_con_init(struct ceph_messenger *msgr, | ||
223 | struct ceph_connection *con); | ||
224 | extern void ceph_con_open(struct ceph_connection *con, | ||
225 | struct ceph_entity_addr *addr); | ||
226 | extern bool ceph_con_opened(struct ceph_connection *con); | ||
227 | extern void ceph_con_close(struct ceph_connection *con); | ||
228 | extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); | ||
229 | extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); | ||
230 | extern void ceph_con_revoke_message(struct ceph_connection *con, | ||
231 | struct ceph_msg *msg); | ||
232 | extern void ceph_con_keepalive(struct ceph_connection *con); | ||
233 | extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); | ||
234 | extern void ceph_con_put(struct ceph_connection *con); | ||
235 | |||
236 | extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); | ||
237 | extern void ceph_msg_kfree(struct ceph_msg *m); | ||
238 | |||
239 | |||
240 | static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) | ||
241 | { | ||
242 | kref_get(&msg->kref); | ||
243 | return msg; | ||
244 | } | ||
245 | extern void ceph_msg_last_put(struct kref *kref); | ||
246 | static inline void ceph_msg_put(struct ceph_msg *msg) | ||
247 | { | ||
248 | kref_put(&msg->kref, ceph_msg_last_put); | ||
249 | } | ||
250 | |||
251 | extern void ceph_msg_dump(struct ceph_msg *msg); | ||
252 | |||
253 | #endif | ||
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c deleted file mode 100644 index b2a5a3e4a671..000000000000 --- a/fs/ceph/mon_client.c +++ /dev/null | |||
@@ -1,1018 +0,0 @@ | |||
1 | #include "ceph_debug.h" | ||
2 | |||
3 | #include <linux/types.h> | ||
4 | #include <linux/slab.h> | ||
5 | #include <linux/random.h> | ||
6 | #include <linux/sched.h> | ||
7 | |||
8 | #include "mon_client.h" | ||
9 | #include "super.h" | ||
10 | #include "auth.h" | ||
11 | #include "decode.h" | ||
12 | |||
13 | /* | ||
14 | * Interact with Ceph monitor cluster. Handle requests for new map | ||
15 | * versions, and periodically resend as needed. Also implement | ||
16 | * statfs() and umount(). | ||
17 | * | ||
18 | * A small cluster of Ceph "monitors" are responsible for managing critical | ||
19 | * cluster configuration and state information. An odd number (e.g., 3, 5) | ||
20 | * of cmon daemons use a modified version of the Paxos part-time parliament | ||
21 | * algorithm to manage the MDS map (mds cluster membership), OSD map, and | ||
22 | * list of clients who have mounted the file system. | ||
23 | * | ||
24 | * We maintain an open, active session with a monitor at all times in order to | ||
25 | * receive timely MDSMap updates. We periodically send a keepalive byte on the | ||
26 | * TCP socket to ensure we detect a failure. If the connection does break, we | ||
27 | * randomly hunt for a new monitor. Once the connection is reestablished, we | ||
28 | * resend any outstanding requests. | ||
29 | */ | ||
30 | |||
31 | static const struct ceph_connection_operations mon_con_ops; | ||
32 | |||
33 | static int __validate_auth(struct ceph_mon_client *monc); | ||
34 | |||
35 | /* | ||
36 | * Decode a monmap blob (e.g., during mount). | ||
37 | */ | ||
38 | struct ceph_monmap *ceph_monmap_decode(void *p, void *end) | ||
39 | { | ||
40 | struct ceph_monmap *m = NULL; | ||
41 | int i, err = -EINVAL; | ||
42 | struct ceph_fsid fsid; | ||
43 | u32 epoch, num_mon; | ||
44 | u16 version; | ||
45 | u32 len; | ||
46 | |||
47 | ceph_decode_32_safe(&p, end, len, bad); | ||
48 | ceph_decode_need(&p, end, len, bad); | ||
49 | |||
50 | dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); | ||
51 | |||
52 | ceph_decode_16_safe(&p, end, version, bad); | ||
53 | |||
54 | ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); | ||
55 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); | ||
56 | epoch = ceph_decode_32(&p); | ||
57 | |||
58 | num_mon = ceph_decode_32(&p); | ||
59 | ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); | ||
60 | |||
61 | if (num_mon >= CEPH_MAX_MON) | ||
62 | goto bad; | ||
63 | m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); | ||
64 | if (m == NULL) | ||
65 | return ERR_PTR(-ENOMEM); | ||
66 | m->fsid = fsid; | ||
67 | m->epoch = epoch; | ||
68 | m->num_mon = num_mon; | ||
69 | ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); | ||
70 | for (i = 0; i < num_mon; i++) | ||
71 | ceph_decode_addr(&m->mon_inst[i].addr); | ||
72 | |||
73 | dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, | ||
74 | m->num_mon); | ||
75 | for (i = 0; i < m->num_mon; i++) | ||
76 | dout("monmap_decode mon%d is %s\n", i, | ||
77 | pr_addr(&m->mon_inst[i].addr.in_addr)); | ||
78 | return m; | ||
79 | |||
80 | bad: | ||
81 | dout("monmap_decode failed with %d\n", err); | ||
82 | kfree(m); | ||
83 | return ERR_PTR(err); | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | * return true if *addr is included in the monmap. | ||
88 | */ | ||
89 | int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) | ||
90 | { | ||
91 | int i; | ||
92 | |||
93 | for (i = 0; i < m->num_mon; i++) | ||
94 | if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) | ||
95 | return 1; | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * Send an auth request. | ||
101 | */ | ||
102 | static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) | ||
103 | { | ||
104 | monc->pending_auth = 1; | ||
105 | monc->m_auth->front.iov_len = len; | ||
106 | monc->m_auth->hdr.front_len = cpu_to_le32(len); | ||
107 | ceph_con_revoke(monc->con, monc->m_auth); | ||
108 | ceph_msg_get(monc->m_auth); /* keep our ref */ | ||
109 | ceph_con_send(monc->con, monc->m_auth); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Close monitor session, if any. | ||
114 | */ | ||
115 | static void __close_session(struct ceph_mon_client *monc) | ||
116 | { | ||
117 | if (monc->con) { | ||
118 | dout("__close_session closing mon%d\n", monc->cur_mon); | ||
119 | ceph_con_revoke(monc->con, monc->m_auth); | ||
120 | ceph_con_close(monc->con); | ||
121 | monc->cur_mon = -1; | ||
122 | monc->pending_auth = 0; | ||
123 | ceph_auth_reset(monc->auth); | ||
124 | } | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Open a session with a (new) monitor. | ||
129 | */ | ||
130 | static int __open_session(struct ceph_mon_client *monc) | ||
131 | { | ||
132 | char r; | ||
133 | int ret; | ||
134 | |||
135 | if (monc->cur_mon < 0) { | ||
136 | get_random_bytes(&r, 1); | ||
137 | monc->cur_mon = r % monc->monmap->num_mon; | ||
138 | dout("open_session num=%d r=%d -> mon%d\n", | ||
139 | monc->monmap->num_mon, r, monc->cur_mon); | ||
140 | monc->sub_sent = 0; | ||
141 | monc->sub_renew_after = jiffies; /* i.e., expired */ | ||
142 | monc->want_next_osdmap = !!monc->want_next_osdmap; | ||
143 | |||
144 | dout("open_session mon%d opening\n", monc->cur_mon); | ||
145 | monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; | ||
146 | monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); | ||
147 | ceph_con_open(monc->con, | ||
148 | &monc->monmap->mon_inst[monc->cur_mon].addr); | ||
149 | |||
150 | /* initiatiate authentication handshake */ | ||
151 | ret = ceph_auth_build_hello(monc->auth, | ||
152 | monc->m_auth->front.iov_base, | ||
153 | monc->m_auth->front_max); | ||
154 | __send_prepared_auth_request(monc, ret); | ||
155 | } else { | ||
156 | dout("open_session mon%d already open\n", monc->cur_mon); | ||
157 | } | ||
158 | return 0; | ||
159 | } | ||
160 | |||
161 | static bool __sub_expired(struct ceph_mon_client *monc) | ||
162 | { | ||
163 | return time_after_eq(jiffies, monc->sub_renew_after); | ||
164 | } | ||
165 | |||
166 | /* | ||
167 | * Reschedule delayed work timer. | ||
168 | */ | ||
169 | static void __schedule_delayed(struct ceph_mon_client *monc) | ||
170 | { | ||
171 | unsigned delay; | ||
172 | |||
173 | if (monc->cur_mon < 0 || __sub_expired(monc)) | ||
174 | delay = 10 * HZ; | ||
175 | else | ||
176 | delay = 20 * HZ; | ||
177 | dout("__schedule_delayed after %u\n", delay); | ||
178 | schedule_delayed_work(&monc->delayed_work, delay); | ||
179 | } | ||
180 | |||
181 | /* | ||
182 | * Send subscribe request for mdsmap and/or osdmap. | ||
183 | */ | ||
184 | static void __send_subscribe(struct ceph_mon_client *monc) | ||
185 | { | ||
186 | dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", | ||
187 | (unsigned)monc->sub_sent, __sub_expired(monc), | ||
188 | monc->want_next_osdmap); | ||
189 | if ((__sub_expired(monc) && !monc->sub_sent) || | ||
190 | monc->want_next_osdmap == 1) { | ||
191 | struct ceph_msg *msg = monc->m_subscribe; | ||
192 | struct ceph_mon_subscribe_item *i; | ||
193 | void *p, *end; | ||
194 | |||
195 | p = msg->front.iov_base; | ||
196 | end = p + msg->front_max; | ||
197 | |||
198 | dout("__send_subscribe to 'mdsmap' %u+\n", | ||
199 | (unsigned)monc->have_mdsmap); | ||
200 | if (monc->want_next_osdmap) { | ||
201 | dout("__send_subscribe to 'osdmap' %u\n", | ||
202 | (unsigned)monc->have_osdmap); | ||
203 | ceph_encode_32(&p, 3); | ||
204 | ceph_encode_string(&p, end, "osdmap", 6); | ||
205 | i = p; | ||
206 | i->have = cpu_to_le64(monc->have_osdmap); | ||
207 | i->onetime = 1; | ||
208 | p += sizeof(*i); | ||
209 | monc->want_next_osdmap = 2; /* requested */ | ||
210 | } else { | ||
211 | ceph_encode_32(&p, 2); | ||
212 | } | ||
213 | ceph_encode_string(&p, end, "mdsmap", 6); | ||
214 | i = p; | ||
215 | i->have = cpu_to_le64(monc->have_mdsmap); | ||
216 | i->onetime = 0; | ||
217 | p += sizeof(*i); | ||
218 | ceph_encode_string(&p, end, "monmap", 6); | ||
219 | i = p; | ||
220 | i->have = 0; | ||
221 | i->onetime = 0; | ||
222 | p += sizeof(*i); | ||
223 | |||
224 | msg->front.iov_len = p - msg->front.iov_base; | ||
225 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); | ||
226 | ceph_con_revoke(monc->con, msg); | ||
227 | ceph_con_send(monc->con, ceph_msg_get(msg)); | ||
228 | |||
229 | monc->sub_sent = jiffies | 1; /* never 0 */ | ||
230 | } | ||
231 | } | ||
232 | |||
233 | static void handle_subscribe_ack(struct ceph_mon_client *monc, | ||
234 | struct ceph_msg *msg) | ||
235 | { | ||
236 | unsigned seconds; | ||
237 | struct ceph_mon_subscribe_ack *h = msg->front.iov_base; | ||
238 | |||
239 | if (msg->front.iov_len < sizeof(*h)) | ||
240 | goto bad; | ||
241 | seconds = le32_to_cpu(h->duration); | ||
242 | |||
243 | mutex_lock(&monc->mutex); | ||
244 | if (monc->hunting) { | ||
245 | pr_info("mon%d %s session established\n", | ||
246 | monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); | ||
247 | monc->hunting = false; | ||
248 | } | ||
249 | dout("handle_subscribe_ack after %d seconds\n", seconds); | ||
250 | monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; | ||
251 | monc->sub_sent = 0; | ||
252 | mutex_unlock(&monc->mutex); | ||
253 | return; | ||
254 | bad: | ||
255 | pr_err("got corrupt subscribe-ack msg\n"); | ||
256 | ceph_msg_dump(msg); | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Keep track of which maps we have | ||
261 | */ | ||
262 | int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) | ||
263 | { | ||
264 | mutex_lock(&monc->mutex); | ||
265 | monc->have_mdsmap = got; | ||
266 | mutex_unlock(&monc->mutex); | ||
267 | return 0; | ||
268 | } | ||
269 | |||
270 | int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) | ||
271 | { | ||
272 | mutex_lock(&monc->mutex); | ||
273 | monc->have_osdmap = got; | ||
274 | monc->want_next_osdmap = 0; | ||
275 | mutex_unlock(&monc->mutex); | ||
276 | return 0; | ||
277 | } | ||
278 | |||
279 | /* | ||
280 | * Register interest in the next osdmap | ||
281 | */ | ||
282 | void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) | ||
283 | { | ||
284 | dout("request_next_osdmap have %u\n", monc->have_osdmap); | ||
285 | mutex_lock(&monc->mutex); | ||
286 | if (!monc->want_next_osdmap) | ||
287 | monc->want_next_osdmap = 1; | ||
288 | if (monc->want_next_osdmap < 2) | ||
289 | __send_subscribe(monc); | ||
290 | mutex_unlock(&monc->mutex); | ||
291 | } | ||
292 | |||
293 | /* | ||
294 | * | ||
295 | */ | ||
296 | int ceph_monc_open_session(struct ceph_mon_client *monc) | ||
297 | { | ||
298 | if (!monc->con) { | ||
299 | monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); | ||
300 | if (!monc->con) | ||
301 | return -ENOMEM; | ||
302 | ceph_con_init(monc->client->msgr, monc->con); | ||
303 | monc->con->private = monc; | ||
304 | monc->con->ops = &mon_con_ops; | ||
305 | } | ||
306 | |||
307 | mutex_lock(&monc->mutex); | ||
308 | __open_session(monc); | ||
309 | __schedule_delayed(monc); | ||
310 | mutex_unlock(&monc->mutex); | ||
311 | return 0; | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * The monitor responds with a mount ack to indicate mount success. The | ||
316 | * included client ticket allows the client to talk to MDSs and OSDs. | ||
317 | */ | ||
318 | static void ceph_monc_handle_map(struct ceph_mon_client *monc, | ||
319 | struct ceph_msg *msg) | ||
320 | { | ||
321 | struct ceph_client *client = monc->client; | ||
322 | struct ceph_monmap *monmap = NULL, *old = monc->monmap; | ||
323 | void *p, *end; | ||
324 | |||
325 | mutex_lock(&monc->mutex); | ||
326 | |||
327 | dout("handle_monmap\n"); | ||
328 | p = msg->front.iov_base; | ||
329 | end = p + msg->front.iov_len; | ||
330 | |||
331 | monmap = ceph_monmap_decode(p, end); | ||
332 | if (IS_ERR(monmap)) { | ||
333 | pr_err("problem decoding monmap, %d\n", | ||
334 | (int)PTR_ERR(monmap)); | ||
335 | goto out; | ||
336 | } | ||
337 | |||
338 | if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { | ||
339 | kfree(monmap); | ||
340 | goto out; | ||
341 | } | ||
342 | |||
343 | client->monc.monmap = monmap; | ||
344 | kfree(old); | ||
345 | |||
346 | out: | ||
347 | mutex_unlock(&monc->mutex); | ||
348 | wake_up_all(&client->auth_wq); | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * generic requests (e.g., statfs, poolop) | ||
353 | */ | ||
354 | static struct ceph_mon_generic_request *__lookup_generic_req( | ||
355 | struct ceph_mon_client *monc, u64 tid) | ||
356 | { | ||
357 | struct ceph_mon_generic_request *req; | ||
358 | struct rb_node *n = monc->generic_request_tree.rb_node; | ||
359 | |||
360 | while (n) { | ||
361 | req = rb_entry(n, struct ceph_mon_generic_request, node); | ||
362 | if (tid < req->tid) | ||
363 | n = n->rb_left; | ||
364 | else if (tid > req->tid) | ||
365 | n = n->rb_right; | ||
366 | else | ||
367 | return req; | ||
368 | } | ||
369 | return NULL; | ||
370 | } | ||
371 | |||
372 | static void __insert_generic_request(struct ceph_mon_client *monc, | ||
373 | struct ceph_mon_generic_request *new) | ||
374 | { | ||
375 | struct rb_node **p = &monc->generic_request_tree.rb_node; | ||
376 | struct rb_node *parent = NULL; | ||
377 | struct ceph_mon_generic_request *req = NULL; | ||
378 | |||
379 | while (*p) { | ||
380 | parent = *p; | ||
381 | req = rb_entry(parent, struct ceph_mon_generic_request, node); | ||
382 | if (new->tid < req->tid) | ||
383 | p = &(*p)->rb_left; | ||
384 | else if (new->tid > req->tid) | ||
385 | p = &(*p)->rb_right; | ||
386 | else | ||
387 | BUG(); | ||
388 | } | ||
389 | |||
390 | rb_link_node(&new->node, parent, p); | ||
391 | rb_insert_color(&new->node, &monc->generic_request_tree); | ||
392 | } | ||
393 | |||
394 | static void release_generic_request(struct kref *kref) | ||
395 | { | ||
396 | struct ceph_mon_generic_request *req = | ||
397 | container_of(kref, struct ceph_mon_generic_request, kref); | ||
398 | |||
399 | if (req->reply) | ||
400 | ceph_msg_put(req->reply); | ||
401 | if (req->request) | ||
402 | ceph_msg_put(req->request); | ||
403 | |||
404 | kfree(req); | ||
405 | } | ||
406 | |||
407 | static void put_generic_request(struct ceph_mon_generic_request *req) | ||
408 | { | ||
409 | kref_put(&req->kref, release_generic_request); | ||
410 | } | ||
411 | |||
412 | static void get_generic_request(struct ceph_mon_generic_request *req) | ||
413 | { | ||
414 | kref_get(&req->kref); | ||
415 | } | ||
416 | |||
417 | static struct ceph_msg *get_generic_reply(struct ceph_connection *con, | ||
418 | struct ceph_msg_header *hdr, | ||
419 | int *skip) | ||
420 | { | ||
421 | struct ceph_mon_client *monc = con->private; | ||
422 | struct ceph_mon_generic_request *req; | ||
423 | u64 tid = le64_to_cpu(hdr->tid); | ||
424 | struct ceph_msg *m; | ||
425 | |||
426 | mutex_lock(&monc->mutex); | ||
427 | req = __lookup_generic_req(monc, tid); | ||
428 | if (!req) { | ||
429 | dout("get_generic_reply %lld dne\n", tid); | ||
430 | *skip = 1; | ||
431 | m = NULL; | ||
432 | } else { | ||
433 | dout("get_generic_reply %lld got %p\n", tid, req->reply); | ||
434 | m = ceph_msg_get(req->reply); | ||
435 | /* | ||
436 | * we don't need to track the connection reading into | ||
437 | * this reply because we only have one open connection | ||
438 | * at a time, ever. | ||
439 | */ | ||
440 | } | ||
441 | mutex_unlock(&monc->mutex); | ||
442 | return m; | ||
443 | } | ||
444 | |||
445 | static int do_generic_request(struct ceph_mon_client *monc, | ||
446 | struct ceph_mon_generic_request *req) | ||
447 | { | ||
448 | int err; | ||
449 | |||
450 | /* register request */ | ||
451 | mutex_lock(&monc->mutex); | ||
452 | req->tid = ++monc->last_tid; | ||
453 | req->request->hdr.tid = cpu_to_le64(req->tid); | ||
454 | __insert_generic_request(monc, req); | ||
455 | monc->num_generic_requests++; | ||
456 | ceph_con_send(monc->con, ceph_msg_get(req->request)); | ||
457 | mutex_unlock(&monc->mutex); | ||
458 | |||
459 | err = wait_for_completion_interruptible(&req->completion); | ||
460 | |||
461 | mutex_lock(&monc->mutex); | ||
462 | rb_erase(&req->node, &monc->generic_request_tree); | ||
463 | monc->num_generic_requests--; | ||
464 | mutex_unlock(&monc->mutex); | ||
465 | |||
466 | if (!err) | ||
467 | err = req->result; | ||
468 | return err; | ||
469 | } | ||
470 | |||
471 | /* | ||
472 | * statfs | ||
473 | */ | ||
474 | static void handle_statfs_reply(struct ceph_mon_client *monc, | ||
475 | struct ceph_msg *msg) | ||
476 | { | ||
477 | struct ceph_mon_generic_request *req; | ||
478 | struct ceph_mon_statfs_reply *reply = msg->front.iov_base; | ||
479 | u64 tid = le64_to_cpu(msg->hdr.tid); | ||
480 | |||
481 | if (msg->front.iov_len != sizeof(*reply)) | ||
482 | goto bad; | ||
483 | dout("handle_statfs_reply %p tid %llu\n", msg, tid); | ||
484 | |||
485 | mutex_lock(&monc->mutex); | ||
486 | req = __lookup_generic_req(monc, tid); | ||
487 | if (req) { | ||
488 | *(struct ceph_statfs *)req->buf = reply->st; | ||
489 | req->result = 0; | ||
490 | get_generic_request(req); | ||
491 | } | ||
492 | mutex_unlock(&monc->mutex); | ||
493 | if (req) { | ||
494 | complete_all(&req->completion); | ||
495 | put_generic_request(req); | ||
496 | } | ||
497 | return; | ||
498 | |||
499 | bad: | ||
500 | pr_err("corrupt generic reply, tid %llu\n", tid); | ||
501 | ceph_msg_dump(msg); | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Do a synchronous statfs(). | ||
506 | */ | ||
507 | int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) | ||
508 | { | ||
509 | struct ceph_mon_generic_request *req; | ||
510 | struct ceph_mon_statfs *h; | ||
511 | int err; | ||
512 | |||
513 | req = kzalloc(sizeof(*req), GFP_NOFS); | ||
514 | if (!req) | ||
515 | return -ENOMEM; | ||
516 | |||
517 | kref_init(&req->kref); | ||
518 | req->buf = buf; | ||
519 | req->buf_len = sizeof(*buf); | ||
520 | init_completion(&req->completion); | ||
521 | |||
522 | err = -ENOMEM; | ||
523 | req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); | ||
524 | if (!req->request) | ||
525 | goto out; | ||
526 | req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); | ||
527 | if (!req->reply) | ||
528 | goto out; | ||
529 | |||
530 | /* fill out request */ | ||
531 | h = req->request->front.iov_base; | ||
532 | h->monhdr.have_version = 0; | ||
533 | h->monhdr.session_mon = cpu_to_le16(-1); | ||
534 | h->monhdr.session_mon_tid = 0; | ||
535 | h->fsid = monc->monmap->fsid; | ||
536 | |||
537 | err = do_generic_request(monc, req); | ||
538 | |||
539 | out: | ||
540 | kref_put(&req->kref, release_generic_request); | ||
541 | return err; | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * pool ops | ||
546 | */ | ||
547 | static int get_poolop_reply_buf(const char *src, size_t src_len, | ||
548 | char *dst, size_t dst_len) | ||
549 | { | ||
550 | u32 buf_len; | ||
551 | |||
552 | if (src_len != sizeof(u32) + dst_len) | ||
553 | return -EINVAL; | ||
554 | |||
555 | buf_len = le32_to_cpu(*(u32 *)src); | ||
556 | if (buf_len != dst_len) | ||
557 | return -EINVAL; | ||
558 | |||
559 | memcpy(dst, src + sizeof(u32), dst_len); | ||
560 | return 0; | ||
561 | } | ||
562 | |||
563 | static void handle_poolop_reply(struct ceph_mon_client *monc, | ||
564 | struct ceph_msg *msg) | ||
565 | { | ||
566 | struct ceph_mon_generic_request *req; | ||
567 | struct ceph_mon_poolop_reply *reply = msg->front.iov_base; | ||
568 | u64 tid = le64_to_cpu(msg->hdr.tid); | ||
569 | |||
570 | if (msg->front.iov_len < sizeof(*reply)) | ||
571 | goto bad; | ||
572 | dout("handle_poolop_reply %p tid %llu\n", msg, tid); | ||
573 | |||
574 | mutex_lock(&monc->mutex); | ||
575 | req = __lookup_generic_req(monc, tid); | ||
576 | if (req) { | ||
577 | if (req->buf_len && | ||
578 | get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), | ||
579 | msg->front.iov_len - sizeof(*reply), | ||
580 | req->buf, req->buf_len) < 0) { | ||
581 | mutex_unlock(&monc->mutex); | ||
582 | goto bad; | ||
583 | } | ||
584 | req->result = le32_to_cpu(reply->reply_code); | ||
585 | get_generic_request(req); | ||
586 | } | ||
587 | mutex_unlock(&monc->mutex); | ||
588 | if (req) { | ||
589 | complete(&req->completion); | ||
590 | put_generic_request(req); | ||
591 | } | ||
592 | return; | ||
593 | |||
594 | bad: | ||
595 | pr_err("corrupt generic reply, tid %llu\n", tid); | ||
596 | ceph_msg_dump(msg); | ||
597 | } | ||
598 | |||
599 | /* | ||
600 | * Do a synchronous pool op. | ||
601 | */ | ||
602 | int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, | ||
603 | u32 pool, u64 snapid, | ||
604 | char *buf, int len) | ||
605 | { | ||
606 | struct ceph_mon_generic_request *req; | ||
607 | struct ceph_mon_poolop *h; | ||
608 | int err; | ||
609 | |||
610 | req = kzalloc(sizeof(*req), GFP_NOFS); | ||
611 | if (!req) | ||
612 | return -ENOMEM; | ||
613 | |||
614 | kref_init(&req->kref); | ||
615 | req->buf = buf; | ||
616 | req->buf_len = len; | ||
617 | init_completion(&req->completion); | ||
618 | |||
619 | err = -ENOMEM; | ||
620 | req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS); | ||
621 | if (!req->request) | ||
622 | goto out; | ||
623 | req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS); | ||
624 | if (!req->reply) | ||
625 | goto out; | ||
626 | |||
627 | /* fill out request */ | ||
628 | req->request->hdr.version = cpu_to_le16(2); | ||
629 | h = req->request->front.iov_base; | ||
630 | h->monhdr.have_version = 0; | ||
631 | h->monhdr.session_mon = cpu_to_le16(-1); | ||
632 | h->monhdr.session_mon_tid = 0; | ||
633 | h->fsid = monc->monmap->fsid; | ||
634 | h->pool = cpu_to_le32(pool); | ||
635 | h->op = cpu_to_le32(op); | ||
636 | h->auid = 0; | ||
637 | h->snapid = cpu_to_le64(snapid); | ||
638 | h->name_len = 0; | ||
639 | |||
640 | err = do_generic_request(monc, req); | ||
641 | |||
642 | out: | ||
643 | kref_put(&req->kref, release_generic_request); | ||
644 | return err; | ||
645 | } | ||
646 | |||
647 | int ceph_monc_create_snapid(struct ceph_mon_client *monc, | ||
648 | u32 pool, u64 *snapid) | ||
649 | { | ||
650 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | ||
651 | pool, 0, (char *)snapid, sizeof(*snapid)); | ||
652 | |||
653 | } | ||
654 | |||
655 | int ceph_monc_delete_snapid(struct ceph_mon_client *monc, | ||
656 | u32 pool, u64 snapid) | ||
657 | { | ||
658 | return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | ||
659 | pool, snapid, 0, 0); | ||
660 | |||
661 | } | ||
662 | |||
663 | /* | ||
664 | * Resend pending generic requests. | ||
665 | */ | ||
666 | static void __resend_generic_request(struct ceph_mon_client *monc) | ||
667 | { | ||
668 | struct ceph_mon_generic_request *req; | ||
669 | struct rb_node *p; | ||
670 | |||
671 | for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { | ||
672 | req = rb_entry(p, struct ceph_mon_generic_request, node); | ||
673 | ceph_con_revoke(monc->con, req->request); | ||
674 | ceph_con_send(monc->con, ceph_msg_get(req->request)); | ||
675 | } | ||
676 | } | ||
677 | |||
678 | /* | ||
679 | * Delayed work. If we haven't mounted yet, retry. Otherwise, | ||
680 | * renew/retry subscription as needed (in case it is timing out, or we | ||
681 | * got an ENOMEM). And keep the monitor connection alive. | ||
682 | */ | ||
683 | static void delayed_work(struct work_struct *work) | ||
684 | { | ||
685 | struct ceph_mon_client *monc = | ||
686 | container_of(work, struct ceph_mon_client, delayed_work.work); | ||
687 | |||
688 | dout("monc delayed_work\n"); | ||
689 | mutex_lock(&monc->mutex); | ||
690 | if (monc->hunting) { | ||
691 | __close_session(monc); | ||
692 | __open_session(monc); /* continue hunting */ | ||
693 | } else { | ||
694 | ceph_con_keepalive(monc->con); | ||
695 | |||
696 | __validate_auth(monc); | ||
697 | |||
698 | if (monc->auth->ops->is_authenticated(monc->auth)) | ||
699 | __send_subscribe(monc); | ||
700 | } | ||
701 | __schedule_delayed(monc); | ||
702 | mutex_unlock(&monc->mutex); | ||
703 | } | ||
704 | |||
705 | /* | ||
706 | * On startup, we build a temporary monmap populated with the IPs | ||
707 | * provided by mount(2). | ||
708 | */ | ||
709 | static int build_initial_monmap(struct ceph_mon_client *monc) | ||
710 | { | ||
711 | struct ceph_mount_args *args = monc->client->mount_args; | ||
712 | struct ceph_entity_addr *mon_addr = args->mon_addr; | ||
713 | int num_mon = args->num_mon; | ||
714 | int i; | ||
715 | |||
716 | /* build initial monmap */ | ||
717 | monc->monmap = kzalloc(sizeof(*monc->monmap) + | ||
718 | num_mon*sizeof(monc->monmap->mon_inst[0]), | ||
719 | GFP_KERNEL); | ||
720 | if (!monc->monmap) | ||
721 | return -ENOMEM; | ||
722 | for (i = 0; i < num_mon; i++) { | ||
723 | monc->monmap->mon_inst[i].addr = mon_addr[i]; | ||
724 | monc->monmap->mon_inst[i].addr.nonce = 0; | ||
725 | monc->monmap->mon_inst[i].name.type = | ||
726 | CEPH_ENTITY_TYPE_MON; | ||
727 | monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); | ||
728 | } | ||
729 | monc->monmap->num_mon = num_mon; | ||
730 | monc->have_fsid = false; | ||
731 | |||
732 | /* release addr memory */ | ||
733 | kfree(args->mon_addr); | ||
734 | args->mon_addr = NULL; | ||
735 | args->num_mon = 0; | ||
736 | return 0; | ||
737 | } | ||
738 | |||
739 | int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) | ||
740 | { | ||
741 | int err = 0; | ||
742 | |||
743 | dout("init\n"); | ||
744 | memset(monc, 0, sizeof(*monc)); | ||
745 | monc->client = cl; | ||
746 | monc->monmap = NULL; | ||
747 | mutex_init(&monc->mutex); | ||
748 | |||
749 | err = build_initial_monmap(monc); | ||
750 | if (err) | ||
751 | goto out; | ||
752 | |||
753 | monc->con = NULL; | ||
754 | |||
755 | /* authentication */ | ||
756 | monc->auth = ceph_auth_init(cl->mount_args->name, | ||
757 | cl->mount_args->secret); | ||
758 | if (IS_ERR(monc->auth)) | ||
759 | return PTR_ERR(monc->auth); | ||
760 | monc->auth->want_keys = | ||
761 | CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | | ||
762 | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; | ||
763 | |||
764 | /* msgs */ | ||
765 | err = -ENOMEM; | ||
766 | monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, | ||
767 | sizeof(struct ceph_mon_subscribe_ack), | ||
768 | GFP_NOFS); | ||
769 | if (!monc->m_subscribe_ack) | ||
770 | goto out_monmap; | ||
771 | |||
772 | monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); | ||
773 | if (!monc->m_subscribe) | ||
774 | goto out_subscribe_ack; | ||
775 | |||
776 | monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); | ||
777 | if (!monc->m_auth_reply) | ||
778 | goto out_subscribe; | ||
779 | |||
780 | monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); | ||
781 | monc->pending_auth = 0; | ||
782 | if (!monc->m_auth) | ||
783 | goto out_auth_reply; | ||
784 | |||
785 | monc->cur_mon = -1; | ||
786 | monc->hunting = true; | ||
787 | monc->sub_renew_after = jiffies; | ||
788 | monc->sub_sent = 0; | ||
789 | |||
790 | INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); | ||
791 | monc->generic_request_tree = RB_ROOT; | ||
792 | monc->num_generic_requests = 0; | ||
793 | monc->last_tid = 0; | ||
794 | |||
795 | monc->have_mdsmap = 0; | ||
796 | monc->have_osdmap = 0; | ||
797 | monc->want_next_osdmap = 1; | ||
798 | return 0; | ||
799 | |||
800 | out_auth_reply: | ||
801 | ceph_msg_put(monc->m_auth_reply); | ||
802 | out_subscribe: | ||
803 | ceph_msg_put(monc->m_subscribe); | ||
804 | out_subscribe_ack: | ||
805 | ceph_msg_put(monc->m_subscribe_ack); | ||
806 | out_monmap: | ||
807 | kfree(monc->monmap); | ||
808 | out: | ||
809 | return err; | ||
810 | } | ||
811 | |||
812 | void ceph_monc_stop(struct ceph_mon_client *monc) | ||
813 | { | ||
814 | dout("stop\n"); | ||
815 | cancel_delayed_work_sync(&monc->delayed_work); | ||
816 | |||
817 | mutex_lock(&monc->mutex); | ||
818 | __close_session(monc); | ||
819 | if (monc->con) { | ||
820 | monc->con->private = NULL; | ||
821 | monc->con->ops->put(monc->con); | ||
822 | monc->con = NULL; | ||
823 | } | ||
824 | mutex_unlock(&monc->mutex); | ||
825 | |||
826 | ceph_auth_destroy(monc->auth); | ||
827 | |||
828 | ceph_msg_put(monc->m_auth); | ||
829 | ceph_msg_put(monc->m_auth_reply); | ||
830 | ceph_msg_put(monc->m_subscribe); | ||
831 | ceph_msg_put(monc->m_subscribe_ack); | ||
832 | |||
833 | kfree(monc->monmap); | ||
834 | } | ||
835 | |||
836 | static void handle_auth_reply(struct ceph_mon_client *monc, | ||
837 | struct ceph_msg *msg) | ||
838 | { | ||
839 | int ret; | ||
840 | int was_auth = 0; | ||
841 | |||
842 | mutex_lock(&monc->mutex); | ||
843 | if (monc->auth->ops) | ||
844 | was_auth = monc->auth->ops->is_authenticated(monc->auth); | ||
845 | monc->pending_auth = 0; | ||
846 | ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, | ||
847 | msg->front.iov_len, | ||
848 | monc->m_auth->front.iov_base, | ||
849 | monc->m_auth->front_max); | ||
850 | if (ret < 0) { | ||
851 | monc->client->auth_err = ret; | ||
852 | wake_up_all(&monc->client->auth_wq); | ||
853 | } else if (ret > 0) { | ||
854 | __send_prepared_auth_request(monc, ret); | ||
855 | } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { | ||
856 | dout("authenticated, starting session\n"); | ||
857 | |||
858 | monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; | ||
859 | monc->client->msgr->inst.name.num = | ||
860 | cpu_to_le64(monc->auth->global_id); | ||
861 | |||
862 | __send_subscribe(monc); | ||
863 | __resend_generic_request(monc); | ||
864 | } | ||
865 | mutex_unlock(&monc->mutex); | ||
866 | } | ||
867 | |||
868 | static int __validate_auth(struct ceph_mon_client *monc) | ||
869 | { | ||
870 | int ret; | ||
871 | |||
872 | if (monc->pending_auth) | ||
873 | return 0; | ||
874 | |||
875 | ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, | ||
876 | monc->m_auth->front_max); | ||
877 | if (ret <= 0) | ||
878 | return ret; /* either an error, or no need to authenticate */ | ||
879 | __send_prepared_auth_request(monc, ret); | ||
880 | return 0; | ||
881 | } | ||
882 | |||
883 | int ceph_monc_validate_auth(struct ceph_mon_client *monc) | ||
884 | { | ||
885 | int ret; | ||
886 | |||
887 | mutex_lock(&monc->mutex); | ||
888 | ret = __validate_auth(monc); | ||
889 | mutex_unlock(&monc->mutex); | ||
890 | return ret; | ||
891 | } | ||
892 | |||
893 | /* | ||
894 | * handle incoming message | ||
895 | */ | ||
896 | static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) | ||
897 | { | ||
898 | struct ceph_mon_client *monc = con->private; | ||
899 | int type = le16_to_cpu(msg->hdr.type); | ||
900 | |||
901 | if (!monc) | ||
902 | return; | ||
903 | |||
904 | switch (type) { | ||
905 | case CEPH_MSG_AUTH_REPLY: | ||
906 | handle_auth_reply(monc, msg); | ||
907 | break; | ||
908 | |||
909 | case CEPH_MSG_MON_SUBSCRIBE_ACK: | ||
910 | handle_subscribe_ack(monc, msg); | ||
911 | break; | ||
912 | |||
913 | case CEPH_MSG_STATFS_REPLY: | ||
914 | handle_statfs_reply(monc, msg); | ||
915 | break; | ||
916 | |||
917 | case CEPH_MSG_POOLOP_REPLY: | ||
918 | handle_poolop_reply(monc, msg); | ||
919 | break; | ||
920 | |||
921 | case CEPH_MSG_MON_MAP: | ||
922 | ceph_monc_handle_map(monc, msg); | ||
923 | break; | ||
924 | |||
925 | case CEPH_MSG_MDS_MAP: | ||
926 | ceph_mdsc_handle_map(&monc->client->mdsc, msg); | ||
927 | break; | ||
928 | |||
929 | case CEPH_MSG_OSD_MAP: | ||
930 | ceph_osdc_handle_map(&monc->client->osdc, msg); | ||
931 | break; | ||
932 | |||
933 | default: | ||
934 | pr_err("received unknown message type %d %s\n", type, | ||
935 | ceph_msg_type_name(type)); | ||
936 | } | ||
937 | ceph_msg_put(msg); | ||
938 | } | ||
939 | |||
940 | /* | ||
941 | * Allocate memory for incoming message | ||
942 | */ | ||
943 | static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, | ||
944 | struct ceph_msg_header *hdr, | ||
945 | int *skip) | ||
946 | { | ||
947 | struct ceph_mon_client *monc = con->private; | ||
948 | int type = le16_to_cpu(hdr->type); | ||
949 | int front_len = le32_to_cpu(hdr->front_len); | ||
950 | struct ceph_msg *m = NULL; | ||
951 | |||
952 | *skip = 0; | ||
953 | |||
954 | switch (type) { | ||
955 | case CEPH_MSG_MON_SUBSCRIBE_ACK: | ||
956 | m = ceph_msg_get(monc->m_subscribe_ack); | ||
957 | break; | ||
958 | case CEPH_MSG_POOLOP_REPLY: | ||
959 | case CEPH_MSG_STATFS_REPLY: | ||
960 | return get_generic_reply(con, hdr, skip); | ||
961 | case CEPH_MSG_AUTH_REPLY: | ||
962 | m = ceph_msg_get(monc->m_auth_reply); | ||
963 | break; | ||
964 | case CEPH_MSG_MON_MAP: | ||
965 | case CEPH_MSG_MDS_MAP: | ||
966 | case CEPH_MSG_OSD_MAP: | ||
967 | m = ceph_msg_new(type, front_len, GFP_NOFS); | ||
968 | break; | ||
969 | } | ||
970 | |||
971 | if (!m) { | ||
972 | pr_info("alloc_msg unknown type %d\n", type); | ||
973 | *skip = 1; | ||
974 | } | ||
975 | return m; | ||
976 | } | ||
977 | |||
978 | /* | ||
979 | * If the monitor connection resets, pick a new monitor and resubmit | ||
980 | * any pending requests. | ||
981 | */ | ||
982 | static void mon_fault(struct ceph_connection *con) | ||
983 | { | ||
984 | struct ceph_mon_client *monc = con->private; | ||
985 | |||
986 | if (!monc) | ||
987 | return; | ||
988 | |||
989 | dout("mon_fault\n"); | ||
990 | mutex_lock(&monc->mutex); | ||
991 | if (!con->private) | ||
992 | goto out; | ||
993 | |||
994 | if (monc->con && !monc->hunting) | ||
995 | pr_info("mon%d %s session lost, " | ||
996 | "hunting for new mon\n", monc->cur_mon, | ||
997 | pr_addr(&monc->con->peer_addr.in_addr)); | ||
998 | |||
999 | __close_session(monc); | ||
1000 | if (!monc->hunting) { | ||
1001 | /* start hunting */ | ||
1002 | monc->hunting = true; | ||
1003 | __open_session(monc); | ||
1004 | } else { | ||
1005 | /* already hunting, let's wait a bit */ | ||
1006 | __schedule_delayed(monc); | ||
1007 | } | ||
1008 | out: | ||
1009 | mutex_unlock(&monc->mutex); | ||
1010 | } | ||
1011 | |||
1012 | static const struct ceph_connection_operations mon_con_ops = { | ||
1013 | .get = ceph_con_get, | ||
1014 | .put = ceph_con_put, | ||
1015 | .dispatch = dispatch, | ||
1016 | .fault = mon_fault, | ||
1017 | .alloc_msg = mon_alloc_msg, | ||
1018 | }; | ||
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h deleted file mode 100644 index 8e396f2c0963..000000000000 --- a/fs/ceph/mon_client.h +++ /dev/null | |||
@@ -1,121 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_MON_CLIENT_H | ||
2 | #define _FS_CEPH_MON_CLIENT_H | ||
3 | |||
4 | #include <linux/completion.h> | ||
5 | #include <linux/kref.h> | ||
6 | #include <linux/rbtree.h> | ||
7 | |||
8 | #include "messenger.h" | ||
9 | |||
10 | struct ceph_client; | ||
11 | struct ceph_mount_args; | ||
12 | struct ceph_auth_client; | ||
13 | |||
14 | /* | ||
15 | * The monitor map enumerates the set of all monitors. | ||
16 | */ | ||
17 | struct ceph_monmap { | ||
18 | struct ceph_fsid fsid; | ||
19 | u32 epoch; | ||
20 | u32 num_mon; | ||
21 | struct ceph_entity_inst mon_inst[0]; | ||
22 | }; | ||
23 | |||
24 | struct ceph_mon_client; | ||
25 | struct ceph_mon_generic_request; | ||
26 | |||
27 | |||
28 | /* | ||
29 | * Generic mechanism for resending monitor requests. | ||
30 | */ | ||
31 | typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc, | ||
32 | int newmon); | ||
33 | |||
34 | /* a pending monitor request */ | ||
35 | struct ceph_mon_request { | ||
36 | struct ceph_mon_client *monc; | ||
37 | struct delayed_work delayed_work; | ||
38 | unsigned long delay; | ||
39 | ceph_monc_request_func_t do_request; | ||
40 | }; | ||
41 | |||
42 | /* | ||
43 | * ceph_mon_generic_request is being used for the statfs and poolop requests | ||
44 | * which are being done a bit differently because we need to get data back | ||
45 | * to the caller | ||
46 | */ | ||
47 | struct ceph_mon_generic_request { | ||
48 | struct kref kref; | ||
49 | u64 tid; | ||
50 | struct rb_node node; | ||
51 | int result; | ||
52 | void *buf; | ||
53 | int buf_len; | ||
54 | struct completion completion; | ||
55 | struct ceph_msg *request; /* original request */ | ||
56 | struct ceph_msg *reply; /* and reply */ | ||
57 | }; | ||
58 | |||
59 | struct ceph_mon_client { | ||
60 | struct ceph_client *client; | ||
61 | struct ceph_monmap *monmap; | ||
62 | |||
63 | struct mutex mutex; | ||
64 | struct delayed_work delayed_work; | ||
65 | |||
66 | struct ceph_auth_client *auth; | ||
67 | struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; | ||
68 | int pending_auth; | ||
69 | |||
70 | bool hunting; | ||
71 | int cur_mon; /* last monitor i contacted */ | ||
72 | unsigned long sub_sent, sub_renew_after; | ||
73 | struct ceph_connection *con; | ||
74 | bool have_fsid; | ||
75 | |||
76 | /* pending generic requests */ | ||
77 | struct rb_root generic_request_tree; | ||
78 | int num_generic_requests; | ||
79 | u64 last_tid; | ||
80 | |||
81 | /* mds/osd map */ | ||
82 | int want_next_osdmap; /* 1 = want, 2 = want+asked */ | ||
83 | u32 have_osdmap, have_mdsmap; | ||
84 | |||
85 | #ifdef CONFIG_DEBUG_FS | ||
86 | struct dentry *debugfs_file; | ||
87 | #endif | ||
88 | }; | ||
89 | |||
90 | extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); | ||
91 | extern int ceph_monmap_contains(struct ceph_monmap *m, | ||
92 | struct ceph_entity_addr *addr); | ||
93 | |||
94 | extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); | ||
95 | extern void ceph_monc_stop(struct ceph_mon_client *monc); | ||
96 | |||
97 | /* | ||
98 | * The model here is to indicate that we need a new map of at least | ||
99 | * epoch @want, and also call in when we receive a map. We will | ||
100 | * periodically rerequest the map from the monitor cluster until we | ||
101 | * get what we want. | ||
102 | */ | ||
103 | extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); | ||
104 | extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); | ||
105 | |||
106 | extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); | ||
107 | |||
108 | extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, | ||
109 | struct ceph_statfs *buf); | ||
110 | |||
111 | extern int ceph_monc_open_session(struct ceph_mon_client *monc); | ||
112 | |||
113 | extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); | ||
114 | |||
115 | extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, | ||
116 | u32 pool, u64 *snapid); | ||
117 | |||
118 | extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, | ||
119 | u32 pool, u64 snapid); | ||
120 | |||
121 | #endif | ||
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c deleted file mode 100644 index dd65a6438131..000000000000 --- a/fs/ceph/msgpool.c +++ /dev/null | |||
@@ -1,64 +0,0 @@ | |||
1 | #include "ceph_debug.h" | ||
2 | |||
3 | #include <linux/err.h> | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/vmalloc.h> | ||
7 | |||
8 | #include "msgpool.h" | ||
9 | |||
10 | static void *alloc_fn(gfp_t gfp_mask, void *arg) | ||
11 | { | ||
12 | struct ceph_msgpool *pool = arg; | ||
13 | void *p; | ||
14 | |||
15 | p = ceph_msg_new(0, pool->front_len, gfp_mask); | ||
16 | if (!p) | ||
17 | pr_err("msgpool %s alloc failed\n", pool->name); | ||
18 | return p; | ||
19 | } | ||
20 | |||
/* mempool element destructor: drop the reference on the message. */
static void free_fn(void *element, void *arg)
{
	struct ceph_msg *msg = element;

	ceph_msg_put(msg);
}
25 | |||
26 | int ceph_msgpool_init(struct ceph_msgpool *pool, | ||
27 | int front_len, int size, bool blocking, const char *name) | ||
28 | { | ||
29 | pool->front_len = front_len; | ||
30 | pool->pool = mempool_create(size, alloc_fn, free_fn, pool); | ||
31 | if (!pool->pool) | ||
32 | return -ENOMEM; | ||
33 | pool->name = name; | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | void ceph_msgpool_destroy(struct ceph_msgpool *pool) | ||
38 | { | ||
39 | mempool_destroy(pool->pool); | ||
40 | } | ||
41 | |||
42 | struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, | ||
43 | int front_len) | ||
44 | { | ||
45 | if (front_len > pool->front_len) { | ||
46 | pr_err("msgpool_get pool %s need front %d, pool size is %d\n", | ||
47 | pool->name, front_len, pool->front_len); | ||
48 | WARN_ON(1); | ||
49 | |||
50 | /* try to alloc a fresh message */ | ||
51 | return ceph_msg_new(0, front_len, GFP_NOFS); | ||
52 | } | ||
53 | |||
54 | return mempool_alloc(pool->pool, GFP_NOFS); | ||
55 | } | ||
56 | |||
57 | void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) | ||
58 | { | ||
59 | /* reset msg front_len; user may have changed it */ | ||
60 | msg->front.iov_len = pool->front_len; | ||
61 | msg->hdr.front_len = cpu_to_le32(pool->front_len); | ||
62 | |||
63 | kref_init(&msg->kref); /* retake single ref */ | ||
64 | } | ||
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h deleted file mode 100644 index a362605f9368..000000000000 --- a/fs/ceph/msgpool.h +++ /dev/null | |||
@@ -1,25 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_MSGPOOL | ||
2 | #define _FS_CEPH_MSGPOOL | ||
3 | |||
4 | #include <linux/mempool.h> | ||
5 | #include "messenger.h" | ||
6 | |||
7 | /* | ||
8 | * we use memory pools for preallocating messages we may receive, to | ||
9 | * avoid unexpected OOM conditions. | ||
10 | */ | ||
11 | struct ceph_msgpool { | ||
12 | const char *name; | ||
13 | mempool_t *pool; | ||
14 | int front_len; /* preallocated payload size */ | ||
15 | }; | ||
16 | |||
17 | extern int ceph_msgpool_init(struct ceph_msgpool *pool, | ||
18 | int front_len, int size, bool blocking, | ||
19 | const char *name); | ||
20 | extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); | ||
21 | extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, | ||
22 | int front_len); | ||
23 | extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); | ||
24 | |||
25 | #endif | ||
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h deleted file mode 100644 index 680d3d648cac..000000000000 --- a/fs/ceph/msgr.h +++ /dev/null | |||
@@ -1,175 +0,0 @@ | |||
1 | #ifndef CEPH_MSGR_H | ||
2 | #define CEPH_MSGR_H | ||
3 | |||
4 | /* | ||
5 | * Data types for message passing layer used by Ceph. | ||
6 | */ | ||
7 | |||
8 | #define CEPH_MON_PORT 6789 /* default monitor port */ | ||
9 | |||
10 | /* | ||
11 | * client-side processes will try to bind to ports in this | ||
12 | * range, simply for the benefit of tools like nmap or wireshark | ||
13 | * that would like to identify the protocol. | ||
14 | */ | ||
15 | #define CEPH_PORT_FIRST 6789 | ||
16 | #define CEPH_PORT_START 6800 /* non-monitors start here */ | ||
17 | #define CEPH_PORT_LAST 6900 | ||
18 | |||
19 | /* | ||
20 | * tcp connection banner. include a protocol version. and adjust | ||
21 | * whenever the wire protocol changes. try to keep this string length | ||
22 | * constant. | ||
23 | */ | ||
24 | #define CEPH_BANNER "ceph v027" | ||
25 | #define CEPH_BANNER_MAX_LEN 30 | ||
26 | |||
27 | |||
/*
 * Rollover-safe type and comparator for 32-bit sequence numbers.
 *
 * The comparator returns a negative value, zero, or a positive value
 * when @a is respectively before, equal to, or after @b, where "before"
 * and "after" are taken modulo 2^32.
 */
typedef __u32 ceph_seq_t;

static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
{
	/*
	 * Subtract as unsigned, where wraparound is well defined, and only
	 * then reinterpret the distance as signed.  Casting each operand
	 * to signed first would make the subtraction itself overflow for
	 * values that are far apart.
	 */
	return (__s32)(a - b);
}
38 | |||
39 | |||
/*
 * entity_name -- the logical name of a process taking part in the
 * network: a type plus a number, e.g. 'mds0' or 'osd3'.
 */
struct ceph_entity_name {
	__u8 type;	/* one of CEPH_ENTITY_TYPE_* */
	__le64 num;	/* number within that type */
} __attribute__ ((packed));
48 | |||
49 | #define CEPH_ENTITY_TYPE_MON 0x01 | ||
50 | #define CEPH_ENTITY_TYPE_MDS 0x02 | ||
51 | #define CEPH_ENTITY_TYPE_OSD 0x04 | ||
52 | #define CEPH_ENTITY_TYPE_CLIENT 0x08 | ||
53 | #define CEPH_ENTITY_TYPE_AUTH 0x20 | ||
54 | |||
55 | #define CEPH_ENTITY_TYPE_ANY 0xFF | ||
56 | |||
57 | extern const char *ceph_entity_type_name(int type); | ||
58 | |||
/*
 * entity_addr -- the network address of an entity
 */
struct ceph_entity_addr {
	__le32 type;
	__le32 nonce;				/* unique id for process (e.g. pid) */
	struct sockaddr_storage in_addr;	/* socket address */
} __attribute__ ((packed));
67 | |||
68 | struct ceph_entity_inst { | ||
69 | struct ceph_entity_name name; | ||
70 | struct ceph_entity_addr addr; | ||
71 | } __attribute__ ((packed)); | ||
72 | |||
73 | |||
74 | /* used by message exchange protocol */ | ||
75 | #define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ | ||
76 | #define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ | ||
77 | #define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing | ||
78 | incoming connection */ | ||
79 | #define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again | ||
80 | with higher cseq */ | ||
81 | #define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again | ||
82 | with higher gseq */ | ||
83 | #define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ | ||
84 | #define CEPH_MSGR_TAG_MSG 7 /* message */ | ||
85 | #define CEPH_MSGR_TAG_ACK 8 /* message ack */ | ||
86 | #define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ | ||
87 | #define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ | ||
88 | #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ | ||
89 | #define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ | ||
90 | |||
91 | |||
/*
 * connection negotiation: the connect request
 */
struct ceph_msg_connect {
	__le64 features;		/* supported feature bits */
	__le32 host_type;		/* CEPH_ENTITY_TYPE_* */
	__le32 global_seq;		/* count connections initiated by this host */
	__le32 connect_seq;		/* count connections initiated in this session */
	__le32 protocol_version;
	__le32 authorizer_protocol;
	__le32 authorizer_len;
	__u8 flags;			/* CEPH_MSG_CONNECT_* */
} __attribute__ ((packed));
105 | |||
/*
 * connection negotiation: the reply to a connect request
 */
struct ceph_msg_connect_reply {
	__u8 tag;			/* presumably one of CEPH_MSGR_TAG_* -- TODO confirm */
	__le64 features;		/* feature bits for this session */
	__le32 global_seq;
	__le32 connect_seq;
	__le32 protocol_version;
	__le32 authorizer_len;
	__u8 flags;
} __attribute__ ((packed));
115 | |||
116 | #define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ | ||
117 | |||
118 | |||
119 | /* | ||
120 | * message header | ||
121 | */ | ||
122 | struct ceph_msg_header_old { | ||
123 | __le64 seq; /* message seq# for this session */ | ||
124 | __le64 tid; /* transaction id */ | ||
125 | __le16 type; /* message type */ | ||
126 | __le16 priority; /* priority. higher value == higher priority */ | ||
127 | __le16 version; /* version of message encoding */ | ||
128 | |||
129 | __le32 front_len; /* bytes in main payload */ | ||
130 | __le32 middle_len;/* bytes in middle payload */ | ||
131 | __le32 data_len; /* bytes of data payload */ | ||
132 | __le16 data_off; /* sender: include full offset; | ||
133 | receiver: mask against ~PAGE_MASK */ | ||
134 | |||
135 | struct ceph_entity_inst src, orig_src; | ||
136 | __le32 reserved; | ||
137 | __le32 crc; /* header crc32c */ | ||
138 | } __attribute__ ((packed)); | ||
139 | |||
140 | struct ceph_msg_header { | ||
141 | __le64 seq; /* message seq# for this session */ | ||
142 | __le64 tid; /* transaction id */ | ||
143 | __le16 type; /* message type */ | ||
144 | __le16 priority; /* priority. higher value == higher priority */ | ||
145 | __le16 version; /* version of message encoding */ | ||
146 | |||
147 | __le32 front_len; /* bytes in main payload */ | ||
148 | __le32 middle_len;/* bytes in middle payload */ | ||
149 | __le32 data_len; /* bytes of data payload */ | ||
150 | __le16 data_off; /* sender: include full offset; | ||
151 | receiver: mask against ~PAGE_MASK */ | ||
152 | |||
153 | struct ceph_entity_name src; | ||
154 | __le32 reserved; | ||
155 | __le32 crc; /* header crc32c */ | ||
156 | } __attribute__ ((packed)); | ||
157 | |||
158 | #define CEPH_MSG_PRIO_LOW 64 | ||
159 | #define CEPH_MSG_PRIO_DEFAULT 127 | ||
160 | #define CEPH_MSG_PRIO_HIGH 196 | ||
161 | #define CEPH_MSG_PRIO_HIGHEST 255 | ||
162 | |||
/*
 * message footer: follows the data payload
 */
struct ceph_msg_footer {
	__le32 front_crc;
	__le32 middle_crc;
	__le32 data_crc;
	__u8 flags;		/* CEPH_MSG_FOOTER_* */
} __attribute__ ((packed));
170 | |||
171 | #define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ | ||
172 | #define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ | ||
173 | |||
174 | |||
175 | #endif | ||
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c deleted file mode 100644 index dfced1dacbcd..000000000000 --- a/fs/ceph/osd_client.c +++ /dev/null | |||
@@ -1,1539 +0,0 @@ | |||
1 | #include "ceph_debug.h" | ||
2 | |||
3 | #include <linux/err.h> | ||
4 | #include <linux/highmem.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/pagemap.h> | ||
7 | #include <linux/slab.h> | ||
8 | #include <linux/uaccess.h> | ||
9 | |||
10 | #include "super.h" | ||
11 | #include "osd_client.h" | ||
12 | #include "messenger.h" | ||
13 | #include "decode.h" | ||
14 | #include "auth.h" | ||
15 | |||
16 | #define OSD_OP_FRONT_LEN 4096 | ||
17 | #define OSD_OPREPLY_FRONT_LEN 512 | ||
18 | |||
19 | static const struct ceph_connection_operations osd_con_ops; | ||
20 | static int __kick_requests(struct ceph_osd_client *osdc, | ||
21 | struct ceph_osd *kickosd); | ||
22 | |||
23 | static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); | ||
24 | |||
25 | /* | ||
26 | * Implement client access to distributed object storage cluster. | ||
27 | * | ||
28 | * All data objects are stored within a cluster/cloud of OSDs, or | ||
29 | * "object storage devices." (Note that Ceph OSDs have _nothing_ to | ||
30 | * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply | ||
31 | * remote daemons serving up and coordinating consistent and safe | ||
32 | * access to storage. | ||
33 | * | ||
34 | * Cluster membership and the mapping of data objects onto storage devices | ||
35 | * are described by the osd map. | ||
36 | * | ||
37 | * We keep track of pending OSD requests (read, write), resubmit | ||
38 | * requests to different OSDs when the cluster topology/data layout | ||
39 | * change, or retry the affected requests when the communications | ||
40 | * channel with an OSD is reset. | ||
41 | */ | ||
42 | |||
43 | /* | ||
44 | * calculate the mapping of a file extent onto an object, and fill out the | ||
45 | * request accordingly. shorten extent as necessary if it crosses an | ||
46 | * object boundary. | ||
47 | * | ||
48 | * fill osd op in request message. | ||
49 | */ | ||
50 | static void calc_layout(struct ceph_osd_client *osdc, | ||
51 | struct ceph_vino vino, struct ceph_file_layout *layout, | ||
52 | u64 off, u64 *plen, | ||
53 | struct ceph_osd_request *req) | ||
54 | { | ||
55 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; | ||
56 | struct ceph_osd_op *op = (void *)(reqhead + 1); | ||
57 | u64 orig_len = *plen; | ||
58 | u64 objoff, objlen; /* extent in object */ | ||
59 | u64 bno; | ||
60 | |||
61 | reqhead->snapid = cpu_to_le64(vino.snap); | ||
62 | |||
63 | /* object extent? */ | ||
64 | ceph_calc_file_object_mapping(layout, off, plen, &bno, | ||
65 | &objoff, &objlen); | ||
66 | if (*plen < orig_len) | ||
67 | dout(" skipping last %llu, final file extent %llu~%llu\n", | ||
68 | orig_len - *plen, off, *plen); | ||
69 | |||
70 | sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); | ||
71 | req->r_oid_len = strlen(req->r_oid); | ||
72 | |||
73 | op->extent.offset = cpu_to_le64(objoff); | ||
74 | op->extent.length = cpu_to_le64(objlen); | ||
75 | req->r_num_pages = calc_pages_for(off, *plen); | ||
76 | |||
77 | dout("calc_layout %s (%d) %llu~%llu (%d pages)\n", | ||
78 | req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * requests | ||
83 | */ | ||
84 | void ceph_osdc_release_request(struct kref *kref) | ||
85 | { | ||
86 | struct ceph_osd_request *req = container_of(kref, | ||
87 | struct ceph_osd_request, | ||
88 | r_kref); | ||
89 | |||
90 | if (req->r_request) | ||
91 | ceph_msg_put(req->r_request); | ||
92 | if (req->r_reply) | ||
93 | ceph_msg_put(req->r_reply); | ||
94 | if (req->r_con_filling_msg) { | ||
95 | dout("release_request revoking pages %p from con %p\n", | ||
96 | req->r_pages, req->r_con_filling_msg); | ||
97 | ceph_con_revoke_message(req->r_con_filling_msg, | ||
98 | req->r_reply); | ||
99 | ceph_con_put(req->r_con_filling_msg); | ||
100 | } | ||
101 | if (req->r_own_pages) | ||
102 | ceph_release_page_vector(req->r_pages, | ||
103 | req->r_num_pages); | ||
104 | ceph_put_snap_context(req->r_snapc); | ||
105 | if (req->r_mempool) | ||
106 | mempool_free(req, req->r_osdc->req_mempool); | ||
107 | else | ||
108 | kfree(req); | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * build new request AND message, calculate layout, and adjust file | ||
113 | * extent as needed. | ||
114 | * | ||
115 | * if the file was recently truncated, we include information about its | ||
116 | * old and new size so that the object can be updated appropriately. (we | ||
117 | * avoid synchronously deleting truncated objects because it's slow.) | ||
118 | * | ||
119 | * if @do_sync, include a 'startsync' command so that the osd will flush | ||
120 | * data quickly. | ||
121 | */ | ||
122 | struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | ||
123 | struct ceph_file_layout *layout, | ||
124 | struct ceph_vino vino, | ||
125 | u64 off, u64 *plen, | ||
126 | int opcode, int flags, | ||
127 | struct ceph_snap_context *snapc, | ||
128 | int do_sync, | ||
129 | u32 truncate_seq, | ||
130 | u64 truncate_size, | ||
131 | struct timespec *mtime, | ||
132 | bool use_mempool, int num_reply) | ||
133 | { | ||
134 | struct ceph_osd_request *req; | ||
135 | struct ceph_msg *msg; | ||
136 | struct ceph_osd_request_head *head; | ||
137 | struct ceph_osd_op *op; | ||
138 | void *p; | ||
139 | int num_op = 1 + do_sync; | ||
140 | size_t msg_size = sizeof(*head) + num_op*sizeof(*op); | ||
141 | int i; | ||
142 | |||
143 | if (use_mempool) { | ||
144 | req = mempool_alloc(osdc->req_mempool, GFP_NOFS); | ||
145 | memset(req, 0, sizeof(*req)); | ||
146 | } else { | ||
147 | req = kzalloc(sizeof(*req), GFP_NOFS); | ||
148 | } | ||
149 | if (req == NULL) | ||
150 | return NULL; | ||
151 | |||
152 | req->r_osdc = osdc; | ||
153 | req->r_mempool = use_mempool; | ||
154 | kref_init(&req->r_kref); | ||
155 | init_completion(&req->r_completion); | ||
156 | init_completion(&req->r_safe_completion); | ||
157 | INIT_LIST_HEAD(&req->r_unsafe_item); | ||
158 | req->r_flags = flags; | ||
159 | |||
160 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); | ||
161 | |||
162 | /* create reply message */ | ||
163 | if (use_mempool) | ||
164 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); | ||
165 | else | ||
166 | msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, | ||
167 | OSD_OPREPLY_FRONT_LEN, GFP_NOFS); | ||
168 | if (!msg) { | ||
169 | ceph_osdc_put_request(req); | ||
170 | return NULL; | ||
171 | } | ||
172 | req->r_reply = msg; | ||
173 | |||
174 | /* create request message; allow space for oid */ | ||
175 | msg_size += 40; | ||
176 | if (snapc) | ||
177 | msg_size += sizeof(u64) * snapc->num_snaps; | ||
178 | if (use_mempool) | ||
179 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); | ||
180 | else | ||
181 | msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); | ||
182 | if (!msg) { | ||
183 | ceph_osdc_put_request(req); | ||
184 | return NULL; | ||
185 | } | ||
186 | msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); | ||
187 | memset(msg->front.iov_base, 0, msg->front.iov_len); | ||
188 | head = msg->front.iov_base; | ||
189 | op = (void *)(head + 1); | ||
190 | p = (void *)(op + num_op); | ||
191 | |||
192 | req->r_request = msg; | ||
193 | req->r_snapc = ceph_get_snap_context(snapc); | ||
194 | |||
195 | head->client_inc = cpu_to_le32(1); /* always, for now. */ | ||
196 | head->flags = cpu_to_le32(flags); | ||
197 | if (flags & CEPH_OSD_FLAG_WRITE) | ||
198 | ceph_encode_timespec(&head->mtime, mtime); | ||
199 | head->num_ops = cpu_to_le16(num_op); | ||
200 | op->op = cpu_to_le16(opcode); | ||
201 | |||
202 | /* calculate max write size */ | ||
203 | calc_layout(osdc, vino, layout, off, plen, req); | ||
204 | req->r_file_layout = *layout; /* keep a copy */ | ||
205 | |||
206 | if (flags & CEPH_OSD_FLAG_WRITE) { | ||
207 | req->r_request->hdr.data_off = cpu_to_le16(off); | ||
208 | req->r_request->hdr.data_len = cpu_to_le32(*plen); | ||
209 | op->payload_len = cpu_to_le32(*plen); | ||
210 | } | ||
211 | op->extent.truncate_size = cpu_to_le64(truncate_size); | ||
212 | op->extent.truncate_seq = cpu_to_le32(truncate_seq); | ||
213 | |||
214 | /* fill in oid */ | ||
215 | head->object_len = cpu_to_le32(req->r_oid_len); | ||
216 | memcpy(p, req->r_oid, req->r_oid_len); | ||
217 | p += req->r_oid_len; | ||
218 | |||
219 | if (do_sync) { | ||
220 | op++; | ||
221 | op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC); | ||
222 | } | ||
223 | if (snapc) { | ||
224 | head->snap_seq = cpu_to_le64(snapc->seq); | ||
225 | head->num_snaps = cpu_to_le32(snapc->num_snaps); | ||
226 | for (i = 0; i < snapc->num_snaps; i++) { | ||
227 | put_unaligned_le64(snapc->snaps[i], p); | ||
228 | p += sizeof(u64); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | BUG_ON(p > msg->front.iov_base + msg->front.iov_len); | ||
233 | msg_size = p - msg->front.iov_base; | ||
234 | msg->front.iov_len = msg_size; | ||
235 | msg->hdr.front_len = cpu_to_le32(msg_size); | ||
236 | return req; | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * We keep osd requests in an rbtree, sorted by ->r_tid. | ||
241 | */ | ||
242 | static void __insert_request(struct ceph_osd_client *osdc, | ||
243 | struct ceph_osd_request *new) | ||
244 | { | ||
245 | struct rb_node **p = &osdc->requests.rb_node; | ||
246 | struct rb_node *parent = NULL; | ||
247 | struct ceph_osd_request *req = NULL; | ||
248 | |||
249 | while (*p) { | ||
250 | parent = *p; | ||
251 | req = rb_entry(parent, struct ceph_osd_request, r_node); | ||
252 | if (new->r_tid < req->r_tid) | ||
253 | p = &(*p)->rb_left; | ||
254 | else if (new->r_tid > req->r_tid) | ||
255 | p = &(*p)->rb_right; | ||
256 | else | ||
257 | BUG(); | ||
258 | } | ||
259 | |||
260 | rb_link_node(&new->r_node, parent, p); | ||
261 | rb_insert_color(&new->r_node, &osdc->requests); | ||
262 | } | ||
263 | |||
264 | static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, | ||
265 | u64 tid) | ||
266 | { | ||
267 | struct ceph_osd_request *req; | ||
268 | struct rb_node *n = osdc->requests.rb_node; | ||
269 | |||
270 | while (n) { | ||
271 | req = rb_entry(n, struct ceph_osd_request, r_node); | ||
272 | if (tid < req->r_tid) | ||
273 | n = n->rb_left; | ||
274 | else if (tid > req->r_tid) | ||
275 | n = n->rb_right; | ||
276 | else | ||
277 | return req; | ||
278 | } | ||
279 | return NULL; | ||
280 | } | ||
281 | |||
282 | static struct ceph_osd_request * | ||
283 | __lookup_request_ge(struct ceph_osd_client *osdc, | ||
284 | u64 tid) | ||
285 | { | ||
286 | struct ceph_osd_request *req; | ||
287 | struct rb_node *n = osdc->requests.rb_node; | ||
288 | |||
289 | while (n) { | ||
290 | req = rb_entry(n, struct ceph_osd_request, r_node); | ||
291 | if (tid < req->r_tid) { | ||
292 | if (!n->rb_left) | ||
293 | return req; | ||
294 | n = n->rb_left; | ||
295 | } else if (tid > req->r_tid) { | ||
296 | n = n->rb_right; | ||
297 | } else { | ||
298 | return req; | ||
299 | } | ||
300 | } | ||
301 | return NULL; | ||
302 | } | ||
303 | |||
304 | |||
305 | /* | ||
306 | * If the osd connection drops, we need to resubmit all requests. | ||
307 | */ | ||
308 | static void osd_reset(struct ceph_connection *con) | ||
309 | { | ||
310 | struct ceph_osd *osd = con->private; | ||
311 | struct ceph_osd_client *osdc; | ||
312 | |||
313 | if (!osd) | ||
314 | return; | ||
315 | dout("osd_reset osd%d\n", osd->o_osd); | ||
316 | osdc = osd->o_osdc; | ||
317 | down_read(&osdc->map_sem); | ||
318 | kick_requests(osdc, osd); | ||
319 | up_read(&osdc->map_sem); | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Track open sessions with osds. | ||
324 | */ | ||
325 | static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) | ||
326 | { | ||
327 | struct ceph_osd *osd; | ||
328 | |||
329 | osd = kzalloc(sizeof(*osd), GFP_NOFS); | ||
330 | if (!osd) | ||
331 | return NULL; | ||
332 | |||
333 | atomic_set(&osd->o_ref, 1); | ||
334 | osd->o_osdc = osdc; | ||
335 | INIT_LIST_HEAD(&osd->o_requests); | ||
336 | INIT_LIST_HEAD(&osd->o_osd_lru); | ||
337 | osd->o_incarnation = 1; | ||
338 | |||
339 | ceph_con_init(osdc->client->msgr, &osd->o_con); | ||
340 | osd->o_con.private = osd; | ||
341 | osd->o_con.ops = &osd_con_ops; | ||
342 | osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; | ||
343 | |||
344 | INIT_LIST_HEAD(&osd->o_keepalive_item); | ||
345 | return osd; | ||
346 | } | ||
347 | |||
348 | static struct ceph_osd *get_osd(struct ceph_osd *osd) | ||
349 | { | ||
350 | if (atomic_inc_not_zero(&osd->o_ref)) { | ||
351 | dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, | ||
352 | atomic_read(&osd->o_ref)); | ||
353 | return osd; | ||
354 | } else { | ||
355 | dout("get_osd %p FAIL\n", osd); | ||
356 | return NULL; | ||
357 | } | ||
358 | } | ||
359 | |||
360 | static void put_osd(struct ceph_osd *osd) | ||
361 | { | ||
362 | dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), | ||
363 | atomic_read(&osd->o_ref) - 1); | ||
364 | if (atomic_dec_and_test(&osd->o_ref)) { | ||
365 | struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; | ||
366 | |||
367 | if (osd->o_authorizer) | ||
368 | ac->ops->destroy_authorizer(ac, osd->o_authorizer); | ||
369 | kfree(osd); | ||
370 | } | ||
371 | } | ||
372 | |||
373 | /* | ||
374 | * remove an osd from our map | ||
375 | */ | ||
376 | static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | ||
377 | { | ||
378 | dout("__remove_osd %p\n", osd); | ||
379 | BUG_ON(!list_empty(&osd->o_requests)); | ||
380 | rb_erase(&osd->o_node, &osdc->osds); | ||
381 | list_del_init(&osd->o_osd_lru); | ||
382 | ceph_con_close(&osd->o_con); | ||
383 | put_osd(osd); | ||
384 | } | ||
385 | |||
386 | static void __move_osd_to_lru(struct ceph_osd_client *osdc, | ||
387 | struct ceph_osd *osd) | ||
388 | { | ||
389 | dout("__move_osd_to_lru %p\n", osd); | ||
390 | BUG_ON(!list_empty(&osd->o_osd_lru)); | ||
391 | list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); | ||
392 | osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; | ||
393 | } | ||
394 | |||
395 | static void __remove_osd_from_lru(struct ceph_osd *osd) | ||
396 | { | ||
397 | dout("__remove_osd_from_lru %p\n", osd); | ||
398 | if (!list_empty(&osd->o_osd_lru)) | ||
399 | list_del_init(&osd->o_osd_lru); | ||
400 | } | ||
401 | |||
402 | static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all) | ||
403 | { | ||
404 | struct ceph_osd *osd, *nosd; | ||
405 | |||
406 | dout("__remove_old_osds %p\n", osdc); | ||
407 | mutex_lock(&osdc->request_mutex); | ||
408 | list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { | ||
409 | if (!remove_all && time_before(jiffies, osd->lru_ttl)) | ||
410 | break; | ||
411 | __remove_osd(osdc, osd); | ||
412 | } | ||
413 | mutex_unlock(&osdc->request_mutex); | ||
414 | } | ||
415 | |||
416 | /* | ||
417 | * reset osd connect | ||
418 | */ | ||
419 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | ||
420 | { | ||
421 | struct ceph_osd_request *req; | ||
422 | int ret = 0; | ||
423 | |||
424 | dout("__reset_osd %p osd%d\n", osd, osd->o_osd); | ||
425 | if (list_empty(&osd->o_requests)) { | ||
426 | __remove_osd(osdc, osd); | ||
427 | } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], | ||
428 | &osd->o_con.peer_addr, | ||
429 | sizeof(osd->o_con.peer_addr)) == 0 && | ||
430 | !ceph_con_opened(&osd->o_con)) { | ||
431 | dout(" osd addr hasn't changed and connection never opened," | ||
432 | " letting msgr retry"); | ||
433 | /* touch each r_stamp for handle_timeout()'s benfit */ | ||
434 | list_for_each_entry(req, &osd->o_requests, r_osd_item) | ||
435 | req->r_stamp = jiffies; | ||
436 | ret = -EAGAIN; | ||
437 | } else { | ||
438 | ceph_con_close(&osd->o_con); | ||
439 | ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); | ||
440 | osd->o_incarnation++; | ||
441 | } | ||
442 | return ret; | ||
443 | } | ||
444 | |||
445 | static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) | ||
446 | { | ||
447 | struct rb_node **p = &osdc->osds.rb_node; | ||
448 | struct rb_node *parent = NULL; | ||
449 | struct ceph_osd *osd = NULL; | ||
450 | |||
451 | while (*p) { | ||
452 | parent = *p; | ||
453 | osd = rb_entry(parent, struct ceph_osd, o_node); | ||
454 | if (new->o_osd < osd->o_osd) | ||
455 | p = &(*p)->rb_left; | ||
456 | else if (new->o_osd > osd->o_osd) | ||
457 | p = &(*p)->rb_right; | ||
458 | else | ||
459 | BUG(); | ||
460 | } | ||
461 | |||
462 | rb_link_node(&new->o_node, parent, p); | ||
463 | rb_insert_color(&new->o_node, &osdc->osds); | ||
464 | } | ||
465 | |||
466 | static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) | ||
467 | { | ||
468 | struct ceph_osd *osd; | ||
469 | struct rb_node *n = osdc->osds.rb_node; | ||
470 | |||
471 | while (n) { | ||
472 | osd = rb_entry(n, struct ceph_osd, o_node); | ||
473 | if (o < osd->o_osd) | ||
474 | n = n->rb_left; | ||
475 | else if (o > osd->o_osd) | ||
476 | n = n->rb_right; | ||
477 | else | ||
478 | return osd; | ||
479 | } | ||
480 | return NULL; | ||
481 | } | ||
482 | |||
483 | static void __schedule_osd_timeout(struct ceph_osd_client *osdc) | ||
484 | { | ||
485 | schedule_delayed_work(&osdc->timeout_work, | ||
486 | osdc->client->mount_args->osd_keepalive_timeout * HZ); | ||
487 | } | ||
488 | |||
489 | static void __cancel_osd_timeout(struct ceph_osd_client *osdc) | ||
490 | { | ||
491 | cancel_delayed_work(&osdc->timeout_work); | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Register request, assign tid. If this is the first request, set up | ||
496 | * the timeout event. | ||
497 | */ | ||
498 | static void register_request(struct ceph_osd_client *osdc, | ||
499 | struct ceph_osd_request *req) | ||
500 | { | ||
501 | mutex_lock(&osdc->request_mutex); | ||
502 | req->r_tid = ++osdc->last_tid; | ||
503 | req->r_request->hdr.tid = cpu_to_le64(req->r_tid); | ||
504 | INIT_LIST_HEAD(&req->r_req_lru_item); | ||
505 | |||
506 | dout("register_request %p tid %lld\n", req, req->r_tid); | ||
507 | __insert_request(osdc, req); | ||
508 | ceph_osdc_get_request(req); | ||
509 | osdc->num_requests++; | ||
510 | |||
511 | if (osdc->num_requests == 1) { | ||
512 | dout(" first request, scheduling timeout\n"); | ||
513 | __schedule_osd_timeout(osdc); | ||
514 | } | ||
515 | mutex_unlock(&osdc->request_mutex); | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * called under osdc->request_mutex | ||
520 | */ | ||
521 | static void __unregister_request(struct ceph_osd_client *osdc, | ||
522 | struct ceph_osd_request *req) | ||
523 | { | ||
524 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); | ||
525 | rb_erase(&req->r_node, &osdc->requests); | ||
526 | osdc->num_requests--; | ||
527 | |||
528 | if (req->r_osd) { | ||
529 | /* make sure the original request isn't in flight. */ | ||
530 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); | ||
531 | |||
532 | list_del_init(&req->r_osd_item); | ||
533 | if (list_empty(&req->r_osd->o_requests)) | ||
534 | __move_osd_to_lru(osdc, req->r_osd); | ||
535 | req->r_osd = NULL; | ||
536 | } | ||
537 | |||
538 | ceph_osdc_put_request(req); | ||
539 | |||
540 | list_del_init(&req->r_req_lru_item); | ||
541 | if (osdc->num_requests == 0) { | ||
542 | dout(" no requests, canceling timeout\n"); | ||
543 | __cancel_osd_timeout(osdc); | ||
544 | } | ||
545 | } | ||
546 | |||
547 | /* | ||
548 | * Cancel a previously queued request message | ||
549 | */ | ||
550 | static void __cancel_request(struct ceph_osd_request *req) | ||
551 | { | ||
552 | if (req->r_sent) { | ||
553 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); | ||
554 | req->r_sent = 0; | ||
555 | } | ||
556 | list_del_init(&req->r_req_lru_item); | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * Pick an osd (the first 'up' osd in the pg), allocate the osd struct | ||
561 | * (as needed), and set the request r_osd appropriately. If there is | ||
562 | * no up osd, set r_osd to NULL. | ||
563 | * | ||
564 | * Return 0 if unchanged, 1 if changed, or negative on error. | ||
565 | * | ||
566 | * Caller should hold map_sem for read and request_mutex. | ||
567 | */ | ||
568 | static int __map_osds(struct ceph_osd_client *osdc, | ||
569 | struct ceph_osd_request *req) | ||
570 | { | ||
571 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; | ||
572 | struct ceph_pg pgid; | ||
573 | int acting[CEPH_PG_MAX_SIZE]; | ||
574 | int o = -1, num = 0; | ||
575 | int err; | ||
576 | |||
577 | dout("map_osds %p tid %lld\n", req, req->r_tid); | ||
578 | err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, | ||
579 | &req->r_file_layout, osdc->osdmap); | ||
580 | if (err) | ||
581 | return err; | ||
582 | pgid = reqhead->layout.ol_pgid; | ||
583 | req->r_pgid = pgid; | ||
584 | |||
585 | err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); | ||
586 | if (err > 0) { | ||
587 | o = acting[0]; | ||
588 | num = err; | ||
589 | } | ||
590 | |||
591 | if ((req->r_osd && req->r_osd->o_osd == o && | ||
592 | req->r_sent >= req->r_osd->o_incarnation && | ||
593 | req->r_num_pg_osds == num && | ||
594 | memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || | ||
595 | (req->r_osd == NULL && o == -1)) | ||
596 | return 0; /* no change */ | ||
597 | |||
598 | dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", | ||
599 | req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, | ||
600 | req->r_osd ? req->r_osd->o_osd : -1); | ||
601 | |||
602 | /* record full pg acting set */ | ||
603 | memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); | ||
604 | req->r_num_pg_osds = num; | ||
605 | |||
606 | if (req->r_osd) { | ||
607 | __cancel_request(req); | ||
608 | list_del_init(&req->r_osd_item); | ||
609 | req->r_osd = NULL; | ||
610 | } | ||
611 | |||
612 | req->r_osd = __lookup_osd(osdc, o); | ||
613 | if (!req->r_osd && o >= 0) { | ||
614 | err = -ENOMEM; | ||
615 | req->r_osd = create_osd(osdc); | ||
616 | if (!req->r_osd) | ||
617 | goto out; | ||
618 | |||
619 | dout("map_osds osd %p is osd%d\n", req->r_osd, o); | ||
620 | req->r_osd->o_osd = o; | ||
621 | req->r_osd->o_con.peer_name.num = cpu_to_le64(o); | ||
622 | __insert_osd(osdc, req->r_osd); | ||
623 | |||
624 | ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); | ||
625 | } | ||
626 | |||
627 | if (req->r_osd) { | ||
628 | __remove_osd_from_lru(req->r_osd); | ||
629 | list_add(&req->r_osd_item, &req->r_osd->o_requests); | ||
630 | } | ||
631 | err = 1; /* osd or pg changed */ | ||
632 | |||
633 | out: | ||
634 | return err; | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * caller should hold map_sem (for read) and request_mutex | ||
639 | */ | ||
640 | static int __send_request(struct ceph_osd_client *osdc, | ||
641 | struct ceph_osd_request *req) | ||
642 | { | ||
643 | struct ceph_osd_request_head *reqhead; | ||
644 | int err; | ||
645 | |||
646 | err = __map_osds(osdc, req); | ||
647 | if (err < 0) | ||
648 | return err; | ||
649 | if (req->r_osd == NULL) { | ||
650 | dout("send_request %p no up osds in pg\n", req); | ||
651 | ceph_monc_request_next_osdmap(&osdc->client->monc); | ||
652 | return 0; | ||
653 | } | ||
654 | |||
655 | dout("send_request %p tid %llu to osd%d flags %d\n", | ||
656 | req, req->r_tid, req->r_osd->o_osd, req->r_flags); | ||
657 | |||
658 | reqhead = req->r_request->front.iov_base; | ||
659 | reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); | ||
660 | reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ | ||
661 | reqhead->reassert_version = req->r_reassert_version; | ||
662 | |||
663 | req->r_stamp = jiffies; | ||
664 | list_move_tail(&req->r_req_lru_item, &osdc->req_lru); | ||
665 | |||
666 | ceph_msg_get(req->r_request); /* send consumes a ref */ | ||
667 | ceph_con_send(&req->r_osd->o_con, req->r_request); | ||
668 | req->r_sent = req->r_osd->o_incarnation; | ||
669 | return 0; | ||
670 | } | ||
671 | |||
672 | /* | ||
673 | * Timeout callback, called every N seconds when 1 or more osd | ||
674 | * requests has been active for more than N seconds. When this | ||
675 | * happens, we ping all OSDs with requests who have timed out to | ||
676 | * ensure any communications channel reset is detected. Reset the | ||
677 | * request timeouts another N seconds in the future as we go. | ||
678 | * Reschedule the timeout event another N seconds in future (unless | ||
679 | * there are no open requests). | ||
680 | */ | ||
681 | static void handle_timeout(struct work_struct *work) | ||
682 | { | ||
683 | struct ceph_osd_client *osdc = | ||
684 | container_of(work, struct ceph_osd_client, timeout_work.work); | ||
685 | struct ceph_osd_request *req, *last_req = NULL; | ||
686 | struct ceph_osd *osd; | ||
687 | unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; | ||
688 | unsigned long keepalive = | ||
689 | osdc->client->mount_args->osd_keepalive_timeout * HZ; | ||
690 | unsigned long last_stamp = 0; | ||
691 | struct rb_node *p; | ||
692 | struct list_head slow_osds; | ||
693 | |||
694 | dout("timeout\n"); | ||
695 | down_read(&osdc->map_sem); | ||
696 | |||
697 | ceph_monc_request_next_osdmap(&osdc->client->monc); | ||
698 | |||
699 | mutex_lock(&osdc->request_mutex); | ||
700 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | ||
701 | req = rb_entry(p, struct ceph_osd_request, r_node); | ||
702 | |||
703 | if (req->r_resend) { | ||
704 | int err; | ||
705 | |||
706 | dout("osdc resending prev failed %lld\n", req->r_tid); | ||
707 | err = __send_request(osdc, req); | ||
708 | if (err) | ||
709 | dout("osdc failed again on %lld\n", req->r_tid); | ||
710 | else | ||
711 | req->r_resend = false; | ||
712 | continue; | ||
713 | } | ||
714 | } | ||
715 | |||
716 | /* | ||
717 | * reset osds that appear to be _really_ unresponsive. this | ||
718 | * is a failsafe measure.. we really shouldn't be getting to | ||
719 | * this point if the system is working properly. the monitors | ||
720 | * should mark the osd as failed and we should find out about | ||
721 | * it from an updated osd map. | ||
722 | */ | ||
723 | while (timeout && !list_empty(&osdc->req_lru)) { | ||
724 | req = list_entry(osdc->req_lru.next, struct ceph_osd_request, | ||
725 | r_req_lru_item); | ||
726 | |||
727 | if (time_before(jiffies, req->r_stamp + timeout)) | ||
728 | break; | ||
729 | |||
730 | BUG_ON(req == last_req && req->r_stamp == last_stamp); | ||
731 | last_req = req; | ||
732 | last_stamp = req->r_stamp; | ||
733 | |||
734 | osd = req->r_osd; | ||
735 | BUG_ON(!osd); | ||
736 | pr_warning(" tid %llu timed out on osd%d, will reset osd\n", | ||
737 | req->r_tid, osd->o_osd); | ||
738 | __kick_requests(osdc, osd); | ||
739 | } | ||
740 | |||
741 | /* | ||
742 | * ping osds that are a bit slow. this ensures that if there | ||
743 | * is a break in the TCP connection we will notice, and reopen | ||
744 | * a connection with that osd (from the fault callback). | ||
745 | */ | ||
746 | INIT_LIST_HEAD(&slow_osds); | ||
747 | list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { | ||
748 | if (time_before(jiffies, req->r_stamp + keepalive)) | ||
749 | break; | ||
750 | |||
751 | osd = req->r_osd; | ||
752 | BUG_ON(!osd); | ||
753 | dout(" tid %llu is slow, will send keepalive on osd%d\n", | ||
754 | req->r_tid, osd->o_osd); | ||
755 | list_move_tail(&osd->o_keepalive_item, &slow_osds); | ||
756 | } | ||
757 | while (!list_empty(&slow_osds)) { | ||
758 | osd = list_entry(slow_osds.next, struct ceph_osd, | ||
759 | o_keepalive_item); | ||
760 | list_del_init(&osd->o_keepalive_item); | ||
761 | ceph_con_keepalive(&osd->o_con); | ||
762 | } | ||
763 | |||
764 | __schedule_osd_timeout(osdc); | ||
765 | mutex_unlock(&osdc->request_mutex); | ||
766 | |||
767 | up_read(&osdc->map_sem); | ||
768 | } | ||
769 | |||
770 | static void handle_osds_timeout(struct work_struct *work) | ||
771 | { | ||
772 | struct ceph_osd_client *osdc = | ||
773 | container_of(work, struct ceph_osd_client, | ||
774 | osds_timeout_work.work); | ||
775 | unsigned long delay = | ||
776 | osdc->client->mount_args->osd_idle_ttl * HZ >> 2; | ||
777 | |||
778 | dout("osds timeout\n"); | ||
779 | down_read(&osdc->map_sem); | ||
780 | remove_old_osds(osdc, 0); | ||
781 | up_read(&osdc->map_sem); | ||
782 | |||
783 | schedule_delayed_work(&osdc->osds_timeout_work, | ||
784 | round_jiffies_relative(delay)); | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | * handle osd op reply. either call the callback if it is specified, | ||
789 | * or do the completion to wake up the waiting thread. | ||
790 | */ | ||
791 | static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | ||
792 | struct ceph_connection *con) | ||
793 | { | ||
794 | struct ceph_osd_reply_head *rhead = msg->front.iov_base; | ||
795 | struct ceph_osd_request *req; | ||
796 | u64 tid; | ||
797 | int numops, object_len, flags; | ||
798 | s32 result; | ||
799 | |||
800 | tid = le64_to_cpu(msg->hdr.tid); | ||
801 | if (msg->front.iov_len < sizeof(*rhead)) | ||
802 | goto bad; | ||
803 | numops = le32_to_cpu(rhead->num_ops); | ||
804 | object_len = le32_to_cpu(rhead->object_len); | ||
805 | result = le32_to_cpu(rhead->result); | ||
806 | if (msg->front.iov_len != sizeof(*rhead) + object_len + | ||
807 | numops * sizeof(struct ceph_osd_op)) | ||
808 | goto bad; | ||
809 | dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); | ||
810 | |||
811 | /* lookup */ | ||
812 | mutex_lock(&osdc->request_mutex); | ||
813 | req = __lookup_request(osdc, tid); | ||
814 | if (req == NULL) { | ||
815 | dout("handle_reply tid %llu dne\n", tid); | ||
816 | mutex_unlock(&osdc->request_mutex); | ||
817 | return; | ||
818 | } | ||
819 | ceph_osdc_get_request(req); | ||
820 | flags = le32_to_cpu(rhead->flags); | ||
821 | |||
822 | /* | ||
823 | * if this connection filled our message, drop our reference now, to | ||
824 | * avoid a (safe but slower) revoke later. | ||
825 | */ | ||
826 | if (req->r_con_filling_msg == con && req->r_reply == msg) { | ||
827 | dout(" dropping con_filling_msg ref %p\n", con); | ||
828 | req->r_con_filling_msg = NULL; | ||
829 | ceph_con_put(con); | ||
830 | } | ||
831 | |||
832 | if (!req->r_got_reply) { | ||
833 | unsigned bytes; | ||
834 | |||
835 | req->r_result = le32_to_cpu(rhead->result); | ||
836 | bytes = le32_to_cpu(msg->hdr.data_len); | ||
837 | dout("handle_reply result %d bytes %d\n", req->r_result, | ||
838 | bytes); | ||
839 | if (req->r_result == 0) | ||
840 | req->r_result = bytes; | ||
841 | |||
842 | /* in case this is a write and we need to replay, */ | ||
843 | req->r_reassert_version = rhead->reassert_version; | ||
844 | |||
845 | req->r_got_reply = 1; | ||
846 | } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { | ||
847 | dout("handle_reply tid %llu dup ack\n", tid); | ||
848 | mutex_unlock(&osdc->request_mutex); | ||
849 | goto done; | ||
850 | } | ||
851 | |||
852 | dout("handle_reply tid %llu flags %d\n", tid, flags); | ||
853 | |||
854 | /* either this is a read, or we got the safe response */ | ||
855 | if (result < 0 || | ||
856 | (flags & CEPH_OSD_FLAG_ONDISK) || | ||
857 | ((flags & CEPH_OSD_FLAG_WRITE) == 0)) | ||
858 | __unregister_request(osdc, req); | ||
859 | |||
860 | mutex_unlock(&osdc->request_mutex); | ||
861 | |||
862 | if (req->r_callback) | ||
863 | req->r_callback(req, msg); | ||
864 | else | ||
865 | complete_all(&req->r_completion); | ||
866 | |||
867 | if (flags & CEPH_OSD_FLAG_ONDISK) { | ||
868 | if (req->r_safe_callback) | ||
869 | req->r_safe_callback(req, msg); | ||
870 | complete_all(&req->r_safe_completion); /* fsync waiter */ | ||
871 | } | ||
872 | |||
873 | done: | ||
874 | ceph_osdc_put_request(req); | ||
875 | return; | ||
876 | |||
877 | bad: | ||
878 | pr_err("corrupt osd_op_reply got %d %d expected %d\n", | ||
879 | (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), | ||
880 | (int)sizeof(*rhead)); | ||
881 | ceph_msg_dump(msg); | ||
882 | } | ||
883 | |||
884 | |||
885 | static int __kick_requests(struct ceph_osd_client *osdc, | ||
886 | struct ceph_osd *kickosd) | ||
887 | { | ||
888 | struct ceph_osd_request *req; | ||
889 | struct rb_node *p, *n; | ||
890 | int needmap = 0; | ||
891 | int err; | ||
892 | |||
893 | dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); | ||
894 | if (kickosd) { | ||
895 | err = __reset_osd(osdc, kickosd); | ||
896 | if (err == -EAGAIN) | ||
897 | return 1; | ||
898 | } else { | ||
899 | for (p = rb_first(&osdc->osds); p; p = n) { | ||
900 | struct ceph_osd *osd = | ||
901 | rb_entry(p, struct ceph_osd, o_node); | ||
902 | |||
903 | n = rb_next(p); | ||
904 | if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || | ||
905 | memcmp(&osd->o_con.peer_addr, | ||
906 | ceph_osd_addr(osdc->osdmap, | ||
907 | osd->o_osd), | ||
908 | sizeof(struct ceph_entity_addr)) != 0) | ||
909 | __reset_osd(osdc, osd); | ||
910 | } | ||
911 | } | ||
912 | |||
913 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | ||
914 | req = rb_entry(p, struct ceph_osd_request, r_node); | ||
915 | |||
916 | if (req->r_resend) { | ||
917 | dout(" r_resend set on tid %llu\n", req->r_tid); | ||
918 | __cancel_request(req); | ||
919 | goto kick; | ||
920 | } | ||
921 | if (req->r_osd && kickosd == req->r_osd) { | ||
922 | __cancel_request(req); | ||
923 | goto kick; | ||
924 | } | ||
925 | |||
926 | err = __map_osds(osdc, req); | ||
927 | if (err == 0) | ||
928 | continue; /* no change */ | ||
929 | if (err < 0) { | ||
930 | /* | ||
931 | * FIXME: really, we should set the request | ||
932 | * error and fail if this isn't a 'nofail' | ||
933 | * request, but that's a fair bit more | ||
934 | * complicated to do. So retry! | ||
935 | */ | ||
936 | dout(" setting r_resend on %llu\n", req->r_tid); | ||
937 | req->r_resend = true; | ||
938 | continue; | ||
939 | } | ||
940 | if (req->r_osd == NULL) { | ||
941 | dout("tid %llu maps to no valid osd\n", req->r_tid); | ||
942 | needmap++; /* request a newer map */ | ||
943 | continue; | ||
944 | } | ||
945 | |||
946 | kick: | ||
947 | dout("kicking %p tid %llu osd%d\n", req, req->r_tid, | ||
948 | req->r_osd ? req->r_osd->o_osd : -1); | ||
949 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | ||
950 | err = __send_request(osdc, req); | ||
951 | if (err) { | ||
952 | dout(" setting r_resend on %llu\n", req->r_tid); | ||
953 | req->r_resend = true; | ||
954 | } | ||
955 | } | ||
956 | |||
957 | return needmap; | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Resubmit osd requests whose osd or osd address has changed. Request | ||
962 | * a new osd map if osds are down, or we are otherwise unable to determine | ||
963 | * how to direct a request. | ||
964 | * | ||
965 | * Close connections to down osds. | ||
966 | * | ||
967 | * If @who is specified, resubmit requests for that specific osd. | ||
968 | * | ||
969 | * Caller should hold map_sem for read and request_mutex. | ||
970 | */ | ||
971 | static void kick_requests(struct ceph_osd_client *osdc, | ||
972 | struct ceph_osd *kickosd) | ||
973 | { | ||
974 | int needmap; | ||
975 | |||
976 | mutex_lock(&osdc->request_mutex); | ||
977 | needmap = __kick_requests(osdc, kickosd); | ||
978 | mutex_unlock(&osdc->request_mutex); | ||
979 | |||
980 | if (needmap) { | ||
981 | dout("%d requests for down osds, need new map\n", needmap); | ||
982 | ceph_monc_request_next_osdmap(&osdc->client->monc); | ||
983 | } | ||
984 | |||
985 | } | ||
986 | /* | ||
987 | * Process updated osd map. | ||
988 | * | ||
989 | * The message contains any number of incremental and full maps, normally | ||
990 | * indicating some sort of topology change in the cluster. Kick requests | ||
991 | * off to different OSDs as needed. | ||
992 | */ | ||
993 | void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | ||
994 | { | ||
995 | void *p, *end, *next; | ||
996 | u32 nr_maps, maplen; | ||
997 | u32 epoch; | ||
998 | struct ceph_osdmap *newmap = NULL, *oldmap; | ||
999 | int err; | ||
1000 | struct ceph_fsid fsid; | ||
1001 | |||
1002 | dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); | ||
1003 | p = msg->front.iov_base; | ||
1004 | end = p + msg->front.iov_len; | ||
1005 | |||
1006 | /* verify fsid */ | ||
1007 | ceph_decode_need(&p, end, sizeof(fsid), bad); | ||
1008 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); | ||
1009 | if (ceph_check_fsid(osdc->client, &fsid) < 0) | ||
1010 | return; | ||
1011 | |||
1012 | down_write(&osdc->map_sem); | ||
1013 | |||
1014 | /* incremental maps */ | ||
1015 | ceph_decode_32_safe(&p, end, nr_maps, bad); | ||
1016 | dout(" %d inc maps\n", nr_maps); | ||
1017 | while (nr_maps > 0) { | ||
1018 | ceph_decode_need(&p, end, 2*sizeof(u32), bad); | ||
1019 | epoch = ceph_decode_32(&p); | ||
1020 | maplen = ceph_decode_32(&p); | ||
1021 | ceph_decode_need(&p, end, maplen, bad); | ||
1022 | next = p + maplen; | ||
1023 | if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { | ||
1024 | dout("applying incremental map %u len %d\n", | ||
1025 | epoch, maplen); | ||
1026 | newmap = osdmap_apply_incremental(&p, next, | ||
1027 | osdc->osdmap, | ||
1028 | osdc->client->msgr); | ||
1029 | if (IS_ERR(newmap)) { | ||
1030 | err = PTR_ERR(newmap); | ||
1031 | goto bad; | ||
1032 | } | ||
1033 | BUG_ON(!newmap); | ||
1034 | if (newmap != osdc->osdmap) { | ||
1035 | ceph_osdmap_destroy(osdc->osdmap); | ||
1036 | osdc->osdmap = newmap; | ||
1037 | } | ||
1038 | } else { | ||
1039 | dout("ignoring incremental map %u len %d\n", | ||
1040 | epoch, maplen); | ||
1041 | } | ||
1042 | p = next; | ||
1043 | nr_maps--; | ||
1044 | } | ||
1045 | if (newmap) | ||
1046 | goto done; | ||
1047 | |||
1048 | /* full maps */ | ||
1049 | ceph_decode_32_safe(&p, end, nr_maps, bad); | ||
1050 | dout(" %d full maps\n", nr_maps); | ||
1051 | while (nr_maps) { | ||
1052 | ceph_decode_need(&p, end, 2*sizeof(u32), bad); | ||
1053 | epoch = ceph_decode_32(&p); | ||
1054 | maplen = ceph_decode_32(&p); | ||
1055 | ceph_decode_need(&p, end, maplen, bad); | ||
1056 | if (nr_maps > 1) { | ||
1057 | dout("skipping non-latest full map %u len %d\n", | ||
1058 | epoch, maplen); | ||
1059 | } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { | ||
1060 | dout("skipping full map %u len %d, " | ||
1061 | "older than our %u\n", epoch, maplen, | ||
1062 | osdc->osdmap->epoch); | ||
1063 | } else { | ||
1064 | dout("taking full map %u len %d\n", epoch, maplen); | ||
1065 | newmap = osdmap_decode(&p, p+maplen); | ||
1066 | if (IS_ERR(newmap)) { | ||
1067 | err = PTR_ERR(newmap); | ||
1068 | goto bad; | ||
1069 | } | ||
1070 | BUG_ON(!newmap); | ||
1071 | oldmap = osdc->osdmap; | ||
1072 | osdc->osdmap = newmap; | ||
1073 | if (oldmap) | ||
1074 | ceph_osdmap_destroy(oldmap); | ||
1075 | } | ||
1076 | p += maplen; | ||
1077 | nr_maps--; | ||
1078 | } | ||
1079 | |||
1080 | done: | ||
1081 | downgrade_write(&osdc->map_sem); | ||
1082 | ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); | ||
1083 | if (newmap) | ||
1084 | kick_requests(osdc, NULL); | ||
1085 | up_read(&osdc->map_sem); | ||
1086 | wake_up_all(&osdc->client->auth_wq); | ||
1087 | return; | ||
1088 | |||
1089 | bad: | ||
1090 | pr_err("osdc handle_map corrupt msg\n"); | ||
1091 | ceph_msg_dump(msg); | ||
1092 | up_write(&osdc->map_sem); | ||
1093 | return; | ||
1094 | } | ||
1095 | |||
1096 | /* | ||
1097 | * Register request, send initial attempt. | ||
1098 | */ | ||
1099 | int ceph_osdc_start_request(struct ceph_osd_client *osdc, | ||
1100 | struct ceph_osd_request *req, | ||
1101 | bool nofail) | ||
1102 | { | ||
1103 | int rc = 0; | ||
1104 | |||
1105 | req->r_request->pages = req->r_pages; | ||
1106 | req->r_request->nr_pages = req->r_num_pages; | ||
1107 | |||
1108 | register_request(osdc, req); | ||
1109 | |||
1110 | down_read(&osdc->map_sem); | ||
1111 | mutex_lock(&osdc->request_mutex); | ||
1112 | /* | ||
1113 | * a racing kick_requests() may have sent the message for us | ||
1114 | * while we dropped request_mutex above, so only send now if | ||
1115 | * the request still han't been touched yet. | ||
1116 | */ | ||
1117 | if (req->r_sent == 0) { | ||
1118 | rc = __send_request(osdc, req); | ||
1119 | if (rc) { | ||
1120 | if (nofail) { | ||
1121 | dout("osdc_start_request failed send, " | ||
1122 | " marking %lld\n", req->r_tid); | ||
1123 | req->r_resend = true; | ||
1124 | rc = 0; | ||
1125 | } else { | ||
1126 | __unregister_request(osdc, req); | ||
1127 | } | ||
1128 | } | ||
1129 | } | ||
1130 | mutex_unlock(&osdc->request_mutex); | ||
1131 | up_read(&osdc->map_sem); | ||
1132 | return rc; | ||
1133 | } | ||
1134 | |||
1135 | /* | ||
1136 | * wait for a request to complete | ||
1137 | */ | ||
1138 | int ceph_osdc_wait_request(struct ceph_osd_client *osdc, | ||
1139 | struct ceph_osd_request *req) | ||
1140 | { | ||
1141 | int rc; | ||
1142 | |||
1143 | rc = wait_for_completion_interruptible(&req->r_completion); | ||
1144 | if (rc < 0) { | ||
1145 | mutex_lock(&osdc->request_mutex); | ||
1146 | __cancel_request(req); | ||
1147 | __unregister_request(osdc, req); | ||
1148 | mutex_unlock(&osdc->request_mutex); | ||
1149 | dout("wait_request tid %llu canceled/timed out\n", req->r_tid); | ||
1150 | return rc; | ||
1151 | } | ||
1152 | |||
1153 | dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); | ||
1154 | return req->r_result; | ||
1155 | } | ||
1156 | |||
1157 | /* | ||
1158 | * sync - wait for all in-flight requests to flush. avoid starvation. | ||
1159 | */ | ||
1160 | void ceph_osdc_sync(struct ceph_osd_client *osdc) | ||
1161 | { | ||
1162 | struct ceph_osd_request *req; | ||
1163 | u64 last_tid, next_tid = 0; | ||
1164 | |||
1165 | mutex_lock(&osdc->request_mutex); | ||
1166 | last_tid = osdc->last_tid; | ||
1167 | while (1) { | ||
1168 | req = __lookup_request_ge(osdc, next_tid); | ||
1169 | if (!req) | ||
1170 | break; | ||
1171 | if (req->r_tid > last_tid) | ||
1172 | break; | ||
1173 | |||
1174 | next_tid = req->r_tid + 1; | ||
1175 | if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) | ||
1176 | continue; | ||
1177 | |||
1178 | ceph_osdc_get_request(req); | ||
1179 | mutex_unlock(&osdc->request_mutex); | ||
1180 | dout("sync waiting on tid %llu (last is %llu)\n", | ||
1181 | req->r_tid, last_tid); | ||
1182 | wait_for_completion(&req->r_safe_completion); | ||
1183 | mutex_lock(&osdc->request_mutex); | ||
1184 | ceph_osdc_put_request(req); | ||
1185 | } | ||
1186 | mutex_unlock(&osdc->request_mutex); | ||
1187 | dout("sync done (thru tid %llu)\n", last_tid); | ||
1188 | } | ||
1189 | |||
1190 | /* | ||
1191 | * init, shutdown | ||
1192 | */ | ||
1193 | int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | ||
1194 | { | ||
1195 | int err; | ||
1196 | |||
1197 | dout("init\n"); | ||
1198 | osdc->client = client; | ||
1199 | osdc->osdmap = NULL; | ||
1200 | init_rwsem(&osdc->map_sem); | ||
1201 | init_completion(&osdc->map_waiters); | ||
1202 | osdc->last_requested_map = 0; | ||
1203 | mutex_init(&osdc->request_mutex); | ||
1204 | osdc->last_tid = 0; | ||
1205 | osdc->osds = RB_ROOT; | ||
1206 | INIT_LIST_HEAD(&osdc->osd_lru); | ||
1207 | osdc->requests = RB_ROOT; | ||
1208 | INIT_LIST_HEAD(&osdc->req_lru); | ||
1209 | osdc->num_requests = 0; | ||
1210 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); | ||
1211 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); | ||
1212 | |||
1213 | schedule_delayed_work(&osdc->osds_timeout_work, | ||
1214 | round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); | ||
1215 | |||
1216 | err = -ENOMEM; | ||
1217 | osdc->req_mempool = mempool_create_kmalloc_pool(10, | ||
1218 | sizeof(struct ceph_osd_request)); | ||
1219 | if (!osdc->req_mempool) | ||
1220 | goto out; | ||
1221 | |||
1222 | err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, | ||
1223 | "osd_op"); | ||
1224 | if (err < 0) | ||
1225 | goto out_mempool; | ||
1226 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, | ||
1227 | OSD_OPREPLY_FRONT_LEN, 10, true, | ||
1228 | "osd_op_reply"); | ||
1229 | if (err < 0) | ||
1230 | goto out_msgpool; | ||
1231 | return 0; | ||
1232 | |||
1233 | out_msgpool: | ||
1234 | ceph_msgpool_destroy(&osdc->msgpool_op); | ||
1235 | out_mempool: | ||
1236 | mempool_destroy(osdc->req_mempool); | ||
1237 | out: | ||
1238 | return err; | ||
1239 | } | ||
1240 | |||
1241 | void ceph_osdc_stop(struct ceph_osd_client *osdc) | ||
1242 | { | ||
1243 | cancel_delayed_work_sync(&osdc->timeout_work); | ||
1244 | cancel_delayed_work_sync(&osdc->osds_timeout_work); | ||
1245 | if (osdc->osdmap) { | ||
1246 | ceph_osdmap_destroy(osdc->osdmap); | ||
1247 | osdc->osdmap = NULL; | ||
1248 | } | ||
1249 | remove_old_osds(osdc, 1); | ||
1250 | mempool_destroy(osdc->req_mempool); | ||
1251 | ceph_msgpool_destroy(&osdc->msgpool_op); | ||
1252 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); | ||
1253 | } | ||
1254 | |||
1255 | /* | ||
1256 | * Read some contiguous pages. If we cross a stripe boundary, shorten | ||
1257 | * *plen. Return number of bytes read, or error. | ||
1258 | */ | ||
1259 | int ceph_osdc_readpages(struct ceph_osd_client *osdc, | ||
1260 | struct ceph_vino vino, struct ceph_file_layout *layout, | ||
1261 | u64 off, u64 *plen, | ||
1262 | u32 truncate_seq, u64 truncate_size, | ||
1263 | struct page **pages, int num_pages) | ||
1264 | { | ||
1265 | struct ceph_osd_request *req; | ||
1266 | int rc = 0; | ||
1267 | |||
1268 | dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, | ||
1269 | vino.snap, off, *plen); | ||
1270 | req = ceph_osdc_new_request(osdc, layout, vino, off, plen, | ||
1271 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, | ||
1272 | NULL, 0, truncate_seq, truncate_size, NULL, | ||
1273 | false, 1); | ||
1274 | if (!req) | ||
1275 | return -ENOMEM; | ||
1276 | |||
1277 | /* it may be a short read due to an object boundary */ | ||
1278 | req->r_pages = pages; | ||
1279 | |||
1280 | dout("readpages final extent is %llu~%llu (%d pages)\n", | ||
1281 | off, *plen, req->r_num_pages); | ||
1282 | |||
1283 | rc = ceph_osdc_start_request(osdc, req, false); | ||
1284 | if (!rc) | ||
1285 | rc = ceph_osdc_wait_request(osdc, req); | ||
1286 | |||
1287 | ceph_osdc_put_request(req); | ||
1288 | dout("readpages result %d\n", rc); | ||
1289 | return rc; | ||
1290 | } | ||
1291 | |||
1292 | /* | ||
1293 | * do a synchronous write on N pages | ||
1294 | */ | ||
1295 | int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | ||
1296 | struct ceph_file_layout *layout, | ||
1297 | struct ceph_snap_context *snapc, | ||
1298 | u64 off, u64 len, | ||
1299 | u32 truncate_seq, u64 truncate_size, | ||
1300 | struct timespec *mtime, | ||
1301 | struct page **pages, int num_pages, | ||
1302 | int flags, int do_sync, bool nofail) | ||
1303 | { | ||
1304 | struct ceph_osd_request *req; | ||
1305 | int rc = 0; | ||
1306 | |||
1307 | BUG_ON(vino.snap != CEPH_NOSNAP); | ||
1308 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, | ||
1309 | CEPH_OSD_OP_WRITE, | ||
1310 | flags | CEPH_OSD_FLAG_ONDISK | | ||
1311 | CEPH_OSD_FLAG_WRITE, | ||
1312 | snapc, do_sync, | ||
1313 | truncate_seq, truncate_size, mtime, | ||
1314 | nofail, 1); | ||
1315 | if (!req) | ||
1316 | return -ENOMEM; | ||
1317 | |||
1318 | /* it may be a short write due to an object boundary */ | ||
1319 | req->r_pages = pages; | ||
1320 | dout("writepages %llu~%llu (%d pages)\n", off, len, | ||
1321 | req->r_num_pages); | ||
1322 | |||
1323 | rc = ceph_osdc_start_request(osdc, req, nofail); | ||
1324 | if (!rc) | ||
1325 | rc = ceph_osdc_wait_request(osdc, req); | ||
1326 | |||
1327 | ceph_osdc_put_request(req); | ||
1328 | if (rc == 0) | ||
1329 | rc = len; | ||
1330 | dout("writepages result %d\n", rc); | ||
1331 | return rc; | ||
1332 | } | ||
1333 | |||
1334 | /* | ||
1335 | * handle incoming message | ||
1336 | */ | ||
1337 | static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) | ||
1338 | { | ||
1339 | struct ceph_osd *osd = con->private; | ||
1340 | struct ceph_osd_client *osdc; | ||
1341 | int type = le16_to_cpu(msg->hdr.type); | ||
1342 | |||
1343 | if (!osd) | ||
1344 | goto out; | ||
1345 | osdc = osd->o_osdc; | ||
1346 | |||
1347 | switch (type) { | ||
1348 | case CEPH_MSG_OSD_MAP: | ||
1349 | ceph_osdc_handle_map(osdc, msg); | ||
1350 | break; | ||
1351 | case CEPH_MSG_OSD_OPREPLY: | ||
1352 | handle_reply(osdc, msg, con); | ||
1353 | break; | ||
1354 | |||
1355 | default: | ||
1356 | pr_err("received unknown message type %d %s\n", type, | ||
1357 | ceph_msg_type_name(type)); | ||
1358 | } | ||
1359 | out: | ||
1360 | ceph_msg_put(msg); | ||
1361 | } | ||
1362 | |||
1363 | /* | ||
1364 | * lookup and return message for incoming reply. set up reply message | ||
1365 | * pages. | ||
1366 | */ | ||
1367 | static struct ceph_msg *get_reply(struct ceph_connection *con, | ||
1368 | struct ceph_msg_header *hdr, | ||
1369 | int *skip) | ||
1370 | { | ||
1371 | struct ceph_osd *osd = con->private; | ||
1372 | struct ceph_osd_client *osdc = osd->o_osdc; | ||
1373 | struct ceph_msg *m; | ||
1374 | struct ceph_osd_request *req; | ||
1375 | int front = le32_to_cpu(hdr->front_len); | ||
1376 | int data_len = le32_to_cpu(hdr->data_len); | ||
1377 | u64 tid; | ||
1378 | |||
1379 | tid = le64_to_cpu(hdr->tid); | ||
1380 | mutex_lock(&osdc->request_mutex); | ||
1381 | req = __lookup_request(osdc, tid); | ||
1382 | if (!req) { | ||
1383 | *skip = 1; | ||
1384 | m = NULL; | ||
1385 | pr_info("get_reply unknown tid %llu from osd%d\n", tid, | ||
1386 | osd->o_osd); | ||
1387 | goto out; | ||
1388 | } | ||
1389 | |||
1390 | if (req->r_con_filling_msg) { | ||
1391 | dout("get_reply revoking msg %p from old con %p\n", | ||
1392 | req->r_reply, req->r_con_filling_msg); | ||
1393 | ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); | ||
1394 | ceph_con_put(req->r_con_filling_msg); | ||
1395 | req->r_con_filling_msg = NULL; | ||
1396 | } | ||
1397 | |||
1398 | if (front > req->r_reply->front.iov_len) { | ||
1399 | pr_warning("get_reply front %d > preallocated %d\n", | ||
1400 | front, (int)req->r_reply->front.iov_len); | ||
1401 | m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); | ||
1402 | if (!m) | ||
1403 | goto out; | ||
1404 | ceph_msg_put(req->r_reply); | ||
1405 | req->r_reply = m; | ||
1406 | } | ||
1407 | m = ceph_msg_get(req->r_reply); | ||
1408 | |||
1409 | if (data_len > 0) { | ||
1410 | unsigned data_off = le16_to_cpu(hdr->data_off); | ||
1411 | int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); | ||
1412 | |||
1413 | if (unlikely(req->r_num_pages < want)) { | ||
1414 | pr_warning("tid %lld reply %d > expected %d pages\n", | ||
1415 | tid, want, m->nr_pages); | ||
1416 | *skip = 1; | ||
1417 | ceph_msg_put(m); | ||
1418 | m = NULL; | ||
1419 | goto out; | ||
1420 | } | ||
1421 | m->pages = req->r_pages; | ||
1422 | m->nr_pages = req->r_num_pages; | ||
1423 | } | ||
1424 | *skip = 0; | ||
1425 | req->r_con_filling_msg = ceph_con_get(con); | ||
1426 | dout("get_reply tid %lld %p\n", tid, m); | ||
1427 | |||
1428 | out: | ||
1429 | mutex_unlock(&osdc->request_mutex); | ||
1430 | return m; | ||
1431 | |||
1432 | } | ||
1433 | |||
1434 | static struct ceph_msg *alloc_msg(struct ceph_connection *con, | ||
1435 | struct ceph_msg_header *hdr, | ||
1436 | int *skip) | ||
1437 | { | ||
1438 | struct ceph_osd *osd = con->private; | ||
1439 | int type = le16_to_cpu(hdr->type); | ||
1440 | int front = le32_to_cpu(hdr->front_len); | ||
1441 | |||
1442 | switch (type) { | ||
1443 | case CEPH_MSG_OSD_MAP: | ||
1444 | return ceph_msg_new(type, front, GFP_NOFS); | ||
1445 | case CEPH_MSG_OSD_OPREPLY: | ||
1446 | return get_reply(con, hdr, skip); | ||
1447 | default: | ||
1448 | pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, | ||
1449 | osd->o_osd); | ||
1450 | *skip = 1; | ||
1451 | return NULL; | ||
1452 | } | ||
1453 | } | ||
1454 | |||
1455 | /* | ||
1456 | * Wrappers to refcount containing ceph_osd struct | ||
1457 | */ | ||
1458 | static struct ceph_connection *get_osd_con(struct ceph_connection *con) | ||
1459 | { | ||
1460 | struct ceph_osd *osd = con->private; | ||
1461 | if (get_osd(osd)) | ||
1462 | return con; | ||
1463 | return NULL; | ||
1464 | } | ||
1465 | |||
1466 | static void put_osd_con(struct ceph_connection *con) | ||
1467 | { | ||
1468 | struct ceph_osd *osd = con->private; | ||
1469 | put_osd(osd); | ||
1470 | } | ||
1471 | |||
1472 | /* | ||
1473 | * authentication | ||
1474 | */ | ||
1475 | static int get_authorizer(struct ceph_connection *con, | ||
1476 | void **buf, int *len, int *proto, | ||
1477 | void **reply_buf, int *reply_len, int force_new) | ||
1478 | { | ||
1479 | struct ceph_osd *o = con->private; | ||
1480 | struct ceph_osd_client *osdc = o->o_osdc; | ||
1481 | struct ceph_auth_client *ac = osdc->client->monc.auth; | ||
1482 | int ret = 0; | ||
1483 | |||
1484 | if (force_new && o->o_authorizer) { | ||
1485 | ac->ops->destroy_authorizer(ac, o->o_authorizer); | ||
1486 | o->o_authorizer = NULL; | ||
1487 | } | ||
1488 | if (o->o_authorizer == NULL) { | ||
1489 | ret = ac->ops->create_authorizer( | ||
1490 | ac, CEPH_ENTITY_TYPE_OSD, | ||
1491 | &o->o_authorizer, | ||
1492 | &o->o_authorizer_buf, | ||
1493 | &o->o_authorizer_buf_len, | ||
1494 | &o->o_authorizer_reply_buf, | ||
1495 | &o->o_authorizer_reply_buf_len); | ||
1496 | if (ret) | ||
1497 | return ret; | ||
1498 | } | ||
1499 | |||
1500 | *proto = ac->protocol; | ||
1501 | *buf = o->o_authorizer_buf; | ||
1502 | *len = o->o_authorizer_buf_len; | ||
1503 | *reply_buf = o->o_authorizer_reply_buf; | ||
1504 | *reply_len = o->o_authorizer_reply_buf_len; | ||
1505 | return 0; | ||
1506 | } | ||
1507 | |||
1508 | |||
1509 | static int verify_authorizer_reply(struct ceph_connection *con, int len) | ||
1510 | { | ||
1511 | struct ceph_osd *o = con->private; | ||
1512 | struct ceph_osd_client *osdc = o->o_osdc; | ||
1513 | struct ceph_auth_client *ac = osdc->client->monc.auth; | ||
1514 | |||
1515 | return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); | ||
1516 | } | ||
1517 | |||
1518 | static int invalidate_authorizer(struct ceph_connection *con) | ||
1519 | { | ||
1520 | struct ceph_osd *o = con->private; | ||
1521 | struct ceph_osd_client *osdc = o->o_osdc; | ||
1522 | struct ceph_auth_client *ac = osdc->client->monc.auth; | ||
1523 | |||
1524 | if (ac->ops->invalidate_authorizer) | ||
1525 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); | ||
1526 | |||
1527 | return ceph_monc_validate_auth(&osdc->client->monc); | ||
1528 | } | ||
1529 | |||
1530 | static const struct ceph_connection_operations osd_con_ops = { | ||
1531 | .get = get_osd_con, | ||
1532 | .put = put_osd_con, | ||
1533 | .dispatch = dispatch, | ||
1534 | .get_authorizer = get_authorizer, | ||
1535 | .verify_authorizer_reply = verify_authorizer_reply, | ||
1536 | .invalidate_authorizer = invalidate_authorizer, | ||
1537 | .alloc_msg = alloc_msg, | ||
1538 | .fault = osd_reset, | ||
1539 | }; | ||
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h deleted file mode 100644 index ce776989ef6a..000000000000 --- a/fs/ceph/osd_client.h +++ /dev/null | |||
@@ -1,167 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_OSD_CLIENT_H | ||
2 | #define _FS_CEPH_OSD_CLIENT_H | ||
3 | |||
4 | #include <linux/completion.h> | ||
5 | #include <linux/kref.h> | ||
6 | #include <linux/mempool.h> | ||
7 | #include <linux/rbtree.h> | ||
8 | |||
9 | #include "types.h" | ||
10 | #include "osdmap.h" | ||
11 | #include "messenger.h" | ||
12 | |||
13 | struct ceph_msg; | ||
14 | struct ceph_snap_context; | ||
15 | struct ceph_osd_request; | ||
16 | struct ceph_osd_client; | ||
17 | struct ceph_authorizer; | ||
18 | |||
/*
 * Completion callback for async writepages; it is handed the osd
 * request and a message.
 */
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *req,
				     struct ceph_msg *msg);
24 | |||
25 | /* a given osd we're communicating with */ | ||
26 | struct ceph_osd { | ||
27 | atomic_t o_ref; | ||
28 | struct ceph_osd_client *o_osdc; | ||
29 | int o_osd; | ||
30 | int o_incarnation; | ||
31 | struct rb_node o_node; | ||
32 | struct ceph_connection o_con; | ||
33 | struct list_head o_requests; | ||
34 | struct list_head o_osd_lru; | ||
35 | struct ceph_authorizer *o_authorizer; | ||
36 | void *o_authorizer_buf, *o_authorizer_reply_buf; | ||
37 | size_t o_authorizer_buf_len, o_authorizer_reply_buf_len; | ||
38 | unsigned long lru_ttl; | ||
39 | int o_marked_for_keepalive; | ||
40 | struct list_head o_keepalive_item; | ||
41 | }; | ||
42 | |||
43 | /* an in-flight request */ | ||
44 | struct ceph_osd_request { | ||
45 | u64 r_tid; /* unique for this client */ | ||
46 | struct rb_node r_node; | ||
47 | struct list_head r_req_lru_item; | ||
48 | struct list_head r_osd_item; | ||
49 | struct ceph_osd *r_osd; | ||
50 | struct ceph_pg r_pgid; | ||
51 | int r_pg_osds[CEPH_PG_MAX_SIZE]; | ||
52 | int r_num_pg_osds; | ||
53 | |||
54 | struct ceph_connection *r_con_filling_msg; | ||
55 | |||
56 | struct ceph_msg *r_request, *r_reply; | ||
57 | int r_result; | ||
58 | int r_flags; /* any additional flags for the osd */ | ||
59 | u32 r_sent; /* >0 if r_request is sending/sent */ | ||
60 | int r_got_reply; | ||
61 | |||
62 | struct ceph_osd_client *r_osdc; | ||
63 | struct kref r_kref; | ||
64 | bool r_mempool; | ||
65 | struct completion r_completion, r_safe_completion; | ||
66 | ceph_osdc_callback_t r_callback, r_safe_callback; | ||
67 | struct ceph_eversion r_reassert_version; | ||
68 | struct list_head r_unsafe_item; | ||
69 | |||
70 | struct inode *r_inode; /* for use by callbacks */ | ||
71 | |||
72 | char r_oid[40]; /* object name */ | ||
73 | int r_oid_len; | ||
74 | unsigned long r_stamp; /* send OR check time */ | ||
75 | bool r_resend; /* msg send failed, needs retry */ | ||
76 | |||
77 | struct ceph_file_layout r_file_layout; | ||
78 | struct ceph_snap_context *r_snapc; /* snap context for writes */ | ||
79 | unsigned r_num_pages; /* size of page array (follows) */ | ||
80 | struct page **r_pages; /* pages for data payload */ | ||
81 | int r_pages_from_pool; | ||
82 | int r_own_pages; /* if true, i own page list */ | ||
83 | }; | ||
84 | |||
85 | struct ceph_osd_client { | ||
86 | struct ceph_client *client; | ||
87 | |||
88 | struct ceph_osdmap *osdmap; /* current map */ | ||
89 | struct rw_semaphore map_sem; | ||
90 | struct completion map_waiters; | ||
91 | u64 last_requested_map; | ||
92 | |||
93 | struct mutex request_mutex; | ||
94 | struct rb_root osds; /* osds */ | ||
95 | struct list_head osd_lru; /* idle osds */ | ||
96 | u64 timeout_tid; /* tid of timeout triggering rq */ | ||
97 | u64 last_tid; /* tid of last request */ | ||
98 | struct rb_root requests; /* pending requests */ | ||
99 | struct list_head req_lru; /* pending requests lru */ | ||
100 | int num_requests; | ||
101 | struct delayed_work timeout_work; | ||
102 | struct delayed_work osds_timeout_work; | ||
103 | #ifdef CONFIG_DEBUG_FS | ||
104 | struct dentry *debugfs_file; | ||
105 | #endif | ||
106 | |||
107 | mempool_t *req_mempool; | ||
108 | |||
109 | struct ceph_msgpool msgpool_op; | ||
110 | struct ceph_msgpool msgpool_op_reply; | ||
111 | }; | ||
112 | |||
113 | extern int ceph_osdc_init(struct ceph_osd_client *osdc, | ||
114 | struct ceph_client *client); | ||
115 | extern void ceph_osdc_stop(struct ceph_osd_client *osdc); | ||
116 | |||
117 | extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, | ||
118 | struct ceph_msg *msg); | ||
119 | extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, | ||
120 | struct ceph_msg *msg); | ||
121 | |||
122 | extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | ||
123 | struct ceph_file_layout *layout, | ||
124 | struct ceph_vino vino, | ||
125 | u64 offset, u64 *len, int op, int flags, | ||
126 | struct ceph_snap_context *snapc, | ||
127 | int do_sync, u32 truncate_seq, | ||
128 | u64 truncate_size, | ||
129 | struct timespec *mtime, | ||
130 | bool use_mempool, int num_reply); | ||
131 | |||
132 | static inline void ceph_osdc_get_request(struct ceph_osd_request *req) | ||
133 | { | ||
134 | kref_get(&req->r_kref); | ||
135 | } | ||
136 | extern void ceph_osdc_release_request(struct kref *kref); | ||
137 | static inline void ceph_osdc_put_request(struct ceph_osd_request *req) | ||
138 | { | ||
139 | kref_put(&req->r_kref, ceph_osdc_release_request); | ||
140 | } | ||
141 | |||
142 | extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, | ||
143 | struct ceph_osd_request *req, | ||
144 | bool nofail); | ||
145 | extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, | ||
146 | struct ceph_osd_request *req); | ||
147 | extern void ceph_osdc_sync(struct ceph_osd_client *osdc); | ||
148 | |||
149 | extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, | ||
150 | struct ceph_vino vino, | ||
151 | struct ceph_file_layout *layout, | ||
152 | u64 off, u64 *plen, | ||
153 | u32 truncate_seq, u64 truncate_size, | ||
154 | struct page **pages, int nr_pages); | ||
155 | |||
156 | extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | ||
157 | struct ceph_vino vino, | ||
158 | struct ceph_file_layout *layout, | ||
159 | struct ceph_snap_context *sc, | ||
160 | u64 off, u64 len, | ||
161 | u32 truncate_seq, u64 truncate_size, | ||
162 | struct timespec *mtime, | ||
163 | struct page **pages, int nr_pages, | ||
164 | int flags, int do_sync, bool nofail); | ||
165 | |||
166 | #endif | ||
167 | |||
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c deleted file mode 100644 index e31f118f1392..000000000000 --- a/fs/ceph/osdmap.c +++ /dev/null | |||
@@ -1,1110 +0,0 @@ | |||
1 | |||
2 | #include "ceph_debug.h" | ||
3 | |||
4 | #include <linux/slab.h> | ||
5 | #include <asm/div64.h> | ||
6 | |||
7 | #include "super.h" | ||
8 | #include "osdmap.h" | ||
9 | #include "crush/hash.h" | ||
10 | #include "crush/mapper.h" | ||
11 | #include "decode.h" | ||
12 | |||
13 | char *ceph_osdmap_state_str(char *str, int len, int state) | ||
14 | { | ||
15 | int flag = 0; | ||
16 | |||
17 | if (!len) | ||
18 | goto done; | ||
19 | |||
20 | *str = '\0'; | ||
21 | if (state) { | ||
22 | if (state & CEPH_OSD_EXISTS) { | ||
23 | snprintf(str, len, "exists"); | ||
24 | flag = 1; | ||
25 | } | ||
26 | if (state & CEPH_OSD_UP) { | ||
27 | snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), | ||
28 | "up"); | ||
29 | flag = 1; | ||
30 | } | ||
31 | } else { | ||
32 | snprintf(str, len, "doesn't exist"); | ||
33 | } | ||
34 | done: | ||
35 | return str; | ||
36 | } | ||
37 | |||
38 | /* maps */ | ||
39 | |||
/* number of bits needed to represent t (0 when t == 0) */
static int calc_bits_of(unsigned t)
{
	int bits;

	for (bits = 0; t != 0; t >>= 1)
		bits++;
	return bits;
}
49 | |||
50 | /* | ||
51 | * the foo_mask is the smallest value 2^n-1 that is >= foo. | ||
52 | */ | ||
53 | static void calc_pg_masks(struct ceph_pg_pool_info *pi) | ||
54 | { | ||
55 | pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; | ||
56 | pi->pgp_num_mask = | ||
57 | (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; | ||
58 | pi->lpg_num_mask = | ||
59 | (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; | ||
60 | pi->lpgp_num_mask = | ||
61 | (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * decode crush map | ||
66 | */ | ||
67 | static int crush_decode_uniform_bucket(void **p, void *end, | ||
68 | struct crush_bucket_uniform *b) | ||
69 | { | ||
70 | dout("crush_decode_uniform_bucket %p to %p\n", *p, end); | ||
71 | ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); | ||
72 | b->item_weight = ceph_decode_32(p); | ||
73 | return 0; | ||
74 | bad: | ||
75 | return -EINVAL; | ||
76 | } | ||
77 | |||
78 | static int crush_decode_list_bucket(void **p, void *end, | ||
79 | struct crush_bucket_list *b) | ||
80 | { | ||
81 | int j; | ||
82 | dout("crush_decode_list_bucket %p to %p\n", *p, end); | ||
83 | b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | ||
84 | if (b->item_weights == NULL) | ||
85 | return -ENOMEM; | ||
86 | b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | ||
87 | if (b->sum_weights == NULL) | ||
88 | return -ENOMEM; | ||
89 | ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); | ||
90 | for (j = 0; j < b->h.size; j++) { | ||
91 | b->item_weights[j] = ceph_decode_32(p); | ||
92 | b->sum_weights[j] = ceph_decode_32(p); | ||
93 | } | ||
94 | return 0; | ||
95 | bad: | ||
96 | return -EINVAL; | ||
97 | } | ||
98 | |||
99 | static int crush_decode_tree_bucket(void **p, void *end, | ||
100 | struct crush_bucket_tree *b) | ||
101 | { | ||
102 | int j; | ||
103 | dout("crush_decode_tree_bucket %p to %p\n", *p, end); | ||
104 | ceph_decode_32_safe(p, end, b->num_nodes, bad); | ||
105 | b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); | ||
106 | if (b->node_weights == NULL) | ||
107 | return -ENOMEM; | ||
108 | ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); | ||
109 | for (j = 0; j < b->num_nodes; j++) | ||
110 | b->node_weights[j] = ceph_decode_32(p); | ||
111 | return 0; | ||
112 | bad: | ||
113 | return -EINVAL; | ||
114 | } | ||
115 | |||
116 | static int crush_decode_straw_bucket(void **p, void *end, | ||
117 | struct crush_bucket_straw *b) | ||
118 | { | ||
119 | int j; | ||
120 | dout("crush_decode_straw_bucket %p to %p\n", *p, end); | ||
121 | b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | ||
122 | if (b->item_weights == NULL) | ||
123 | return -ENOMEM; | ||
124 | b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); | ||
125 | if (b->straws == NULL) | ||
126 | return -ENOMEM; | ||
127 | ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); | ||
128 | for (j = 0; j < b->h.size; j++) { | ||
129 | b->item_weights[j] = ceph_decode_32(p); | ||
130 | b->straws[j] = ceph_decode_32(p); | ||
131 | } | ||
132 | return 0; | ||
133 | bad: | ||
134 | return -EINVAL; | ||
135 | } | ||
136 | |||
137 | static struct crush_map *crush_decode(void *pbyval, void *end) | ||
138 | { | ||
139 | struct crush_map *c; | ||
140 | int err = -EINVAL; | ||
141 | int i, j; | ||
142 | void **p = &pbyval; | ||
143 | void *start = pbyval; | ||
144 | u32 magic; | ||
145 | |||
146 | dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); | ||
147 | |||
148 | c = kzalloc(sizeof(*c), GFP_NOFS); | ||
149 | if (c == NULL) | ||
150 | return ERR_PTR(-ENOMEM); | ||
151 | |||
152 | ceph_decode_need(p, end, 4*sizeof(u32), bad); | ||
153 | magic = ceph_decode_32(p); | ||
154 | if (magic != CRUSH_MAGIC) { | ||
155 | pr_err("crush_decode magic %x != current %x\n", | ||
156 | (unsigned)magic, (unsigned)CRUSH_MAGIC); | ||
157 | goto bad; | ||
158 | } | ||
159 | c->max_buckets = ceph_decode_32(p); | ||
160 | c->max_rules = ceph_decode_32(p); | ||
161 | c->max_devices = ceph_decode_32(p); | ||
162 | |||
163 | c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); | ||
164 | if (c->device_parents == NULL) | ||
165 | goto badmem; | ||
166 | c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); | ||
167 | if (c->bucket_parents == NULL) | ||
168 | goto badmem; | ||
169 | |||
170 | c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); | ||
171 | if (c->buckets == NULL) | ||
172 | goto badmem; | ||
173 | c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); | ||
174 | if (c->rules == NULL) | ||
175 | goto badmem; | ||
176 | |||
177 | /* buckets */ | ||
178 | for (i = 0; i < c->max_buckets; i++) { | ||
179 | int size = 0; | ||
180 | u32 alg; | ||
181 | struct crush_bucket *b; | ||
182 | |||
183 | ceph_decode_32_safe(p, end, alg, bad); | ||
184 | if (alg == 0) { | ||
185 | c->buckets[i] = NULL; | ||
186 | continue; | ||
187 | } | ||
188 | dout("crush_decode bucket %d off %x %p to %p\n", | ||
189 | i, (int)(*p-start), *p, end); | ||
190 | |||
191 | switch (alg) { | ||
192 | case CRUSH_BUCKET_UNIFORM: | ||
193 | size = sizeof(struct crush_bucket_uniform); | ||
194 | break; | ||
195 | case CRUSH_BUCKET_LIST: | ||
196 | size = sizeof(struct crush_bucket_list); | ||
197 | break; | ||
198 | case CRUSH_BUCKET_TREE: | ||
199 | size = sizeof(struct crush_bucket_tree); | ||
200 | break; | ||
201 | case CRUSH_BUCKET_STRAW: | ||
202 | size = sizeof(struct crush_bucket_straw); | ||
203 | break; | ||
204 | default: | ||
205 | err = -EINVAL; | ||
206 | goto bad; | ||
207 | } | ||
208 | BUG_ON(size == 0); | ||
209 | b = c->buckets[i] = kzalloc(size, GFP_NOFS); | ||
210 | if (b == NULL) | ||
211 | goto badmem; | ||
212 | |||
213 | ceph_decode_need(p, end, 4*sizeof(u32), bad); | ||
214 | b->id = ceph_decode_32(p); | ||
215 | b->type = ceph_decode_16(p); | ||
216 | b->alg = ceph_decode_8(p); | ||
217 | b->hash = ceph_decode_8(p); | ||
218 | b->weight = ceph_decode_32(p); | ||
219 | b->size = ceph_decode_32(p); | ||
220 | |||
221 | dout("crush_decode bucket size %d off %x %p to %p\n", | ||
222 | b->size, (int)(*p-start), *p, end); | ||
223 | |||
224 | b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); | ||
225 | if (b->items == NULL) | ||
226 | goto badmem; | ||
227 | b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); | ||
228 | if (b->perm == NULL) | ||
229 | goto badmem; | ||
230 | b->perm_n = 0; | ||
231 | |||
232 | ceph_decode_need(p, end, b->size*sizeof(u32), bad); | ||
233 | for (j = 0; j < b->size; j++) | ||
234 | b->items[j] = ceph_decode_32(p); | ||
235 | |||
236 | switch (b->alg) { | ||
237 | case CRUSH_BUCKET_UNIFORM: | ||
238 | err = crush_decode_uniform_bucket(p, end, | ||
239 | (struct crush_bucket_uniform *)b); | ||
240 | if (err < 0) | ||
241 | goto bad; | ||
242 | break; | ||
243 | case CRUSH_BUCKET_LIST: | ||
244 | err = crush_decode_list_bucket(p, end, | ||
245 | (struct crush_bucket_list *)b); | ||
246 | if (err < 0) | ||
247 | goto bad; | ||
248 | break; | ||
249 | case CRUSH_BUCKET_TREE: | ||
250 | err = crush_decode_tree_bucket(p, end, | ||
251 | (struct crush_bucket_tree *)b); | ||
252 | if (err < 0) | ||
253 | goto bad; | ||
254 | break; | ||
255 | case CRUSH_BUCKET_STRAW: | ||
256 | err = crush_decode_straw_bucket(p, end, | ||
257 | (struct crush_bucket_straw *)b); | ||
258 | if (err < 0) | ||
259 | goto bad; | ||
260 | break; | ||
261 | } | ||
262 | } | ||
263 | |||
264 | /* rules */ | ||
265 | dout("rule vec is %p\n", c->rules); | ||
266 | for (i = 0; i < c->max_rules; i++) { | ||
267 | u32 yes; | ||
268 | struct crush_rule *r; | ||
269 | |||
270 | ceph_decode_32_safe(p, end, yes, bad); | ||
271 | if (!yes) { | ||
272 | dout("crush_decode NO rule %d off %x %p to %p\n", | ||
273 | i, (int)(*p-start), *p, end); | ||
274 | c->rules[i] = NULL; | ||
275 | continue; | ||
276 | } | ||
277 | |||
278 | dout("crush_decode rule %d off %x %p to %p\n", | ||
279 | i, (int)(*p-start), *p, end); | ||
280 | |||
281 | /* len */ | ||
282 | ceph_decode_32_safe(p, end, yes, bad); | ||
283 | #if BITS_PER_LONG == 32 | ||
284 | err = -EINVAL; | ||
285 | if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) | ||
286 | goto bad; | ||
287 | #endif | ||
288 | r = c->rules[i] = kmalloc(sizeof(*r) + | ||
289 | yes*sizeof(struct crush_rule_step), | ||
290 | GFP_NOFS); | ||
291 | if (r == NULL) | ||
292 | goto badmem; | ||
293 | dout(" rule %d is at %p\n", i, r); | ||
294 | r->len = yes; | ||
295 | ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ | ||
296 | ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); | ||
297 | for (j = 0; j < r->len; j++) { | ||
298 | r->steps[j].op = ceph_decode_32(p); | ||
299 | r->steps[j].arg1 = ceph_decode_32(p); | ||
300 | r->steps[j].arg2 = ceph_decode_32(p); | ||
301 | } | ||
302 | } | ||
303 | |||
304 | /* ignore trailing name maps. */ | ||
305 | |||
306 | dout("crush_decode success\n"); | ||
307 | return c; | ||
308 | |||
309 | badmem: | ||
310 | err = -ENOMEM; | ||
311 | bad: | ||
312 | dout("crush_decode fail %d\n", err); | ||
313 | crush_destroy(c); | ||
314 | return ERR_PTR(err); | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid | ||
319 | * to a set of osds) | ||
320 | */ | ||
321 | static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) | ||
322 | { | ||
323 | u64 a = *(u64 *)&l; | ||
324 | u64 b = *(u64 *)&r; | ||
325 | |||
326 | if (a < b) | ||
327 | return -1; | ||
328 | if (a > b) | ||
329 | return 1; | ||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | static int __insert_pg_mapping(struct ceph_pg_mapping *new, | ||
334 | struct rb_root *root) | ||
335 | { | ||
336 | struct rb_node **p = &root->rb_node; | ||
337 | struct rb_node *parent = NULL; | ||
338 | struct ceph_pg_mapping *pg = NULL; | ||
339 | int c; | ||
340 | |||
341 | while (*p) { | ||
342 | parent = *p; | ||
343 | pg = rb_entry(parent, struct ceph_pg_mapping, node); | ||
344 | c = pgid_cmp(new->pgid, pg->pgid); | ||
345 | if (c < 0) | ||
346 | p = &(*p)->rb_left; | ||
347 | else if (c > 0) | ||
348 | p = &(*p)->rb_right; | ||
349 | else | ||
350 | return -EEXIST; | ||
351 | } | ||
352 | |||
353 | rb_link_node(&new->node, parent, p); | ||
354 | rb_insert_color(&new->node, root); | ||
355 | return 0; | ||
356 | } | ||
357 | |||
358 | static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, | ||
359 | struct ceph_pg pgid) | ||
360 | { | ||
361 | struct rb_node *n = root->rb_node; | ||
362 | struct ceph_pg_mapping *pg; | ||
363 | int c; | ||
364 | |||
365 | while (n) { | ||
366 | pg = rb_entry(n, struct ceph_pg_mapping, node); | ||
367 | c = pgid_cmp(pgid, pg->pgid); | ||
368 | if (c < 0) | ||
369 | n = n->rb_left; | ||
370 | else if (c > 0) | ||
371 | n = n->rb_right; | ||
372 | else | ||
373 | return pg; | ||
374 | } | ||
375 | return NULL; | ||
376 | } | ||
377 | |||
378 | /* | ||
379 | * rbtree of pg pool info | ||
380 | */ | ||
381 | static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) | ||
382 | { | ||
383 | struct rb_node **p = &root->rb_node; | ||
384 | struct rb_node *parent = NULL; | ||
385 | struct ceph_pg_pool_info *pi = NULL; | ||
386 | |||
387 | while (*p) { | ||
388 | parent = *p; | ||
389 | pi = rb_entry(parent, struct ceph_pg_pool_info, node); | ||
390 | if (new->id < pi->id) | ||
391 | p = &(*p)->rb_left; | ||
392 | else if (new->id > pi->id) | ||
393 | p = &(*p)->rb_right; | ||
394 | else | ||
395 | return -EEXIST; | ||
396 | } | ||
397 | |||
398 | rb_link_node(&new->node, parent, p); | ||
399 | rb_insert_color(&new->node, root); | ||
400 | return 0; | ||
401 | } | ||
402 | |||
403 | static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) | ||
404 | { | ||
405 | struct ceph_pg_pool_info *pi; | ||
406 | struct rb_node *n = root->rb_node; | ||
407 | |||
408 | while (n) { | ||
409 | pi = rb_entry(n, struct ceph_pg_pool_info, node); | ||
410 | if (id < pi->id) | ||
411 | n = n->rb_left; | ||
412 | else if (id > pi->id) | ||
413 | n = n->rb_right; | ||
414 | else | ||
415 | return pi; | ||
416 | } | ||
417 | return NULL; | ||
418 | } | ||
419 | |||
420 | static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) | ||
421 | { | ||
422 | rb_erase(&pi->node, root); | ||
423 | kfree(pi->name); | ||
424 | kfree(pi); | ||
425 | } | ||
426 | |||
427 | static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) | ||
428 | { | ||
429 | unsigned n, m; | ||
430 | |||
431 | ceph_decode_copy(p, &pi->v, sizeof(pi->v)); | ||
432 | calc_pg_masks(pi); | ||
433 | |||
434 | /* num_snaps * snap_info_t */ | ||
435 | n = le32_to_cpu(pi->v.num_snaps); | ||
436 | while (n--) { | ||
437 | ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + | ||
438 | sizeof(struct ceph_timespec), bad); | ||
439 | *p += sizeof(u64) + /* key */ | ||
440 | 1 + sizeof(u64) + /* u8, snapid */ | ||
441 | sizeof(struct ceph_timespec); | ||
442 | m = ceph_decode_32(p); /* snap name */ | ||
443 | *p += m; | ||
444 | } | ||
445 | |||
446 | *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; | ||
447 | return 0; | ||
448 | |||
449 | bad: | ||
450 | return -EINVAL; | ||
451 | } | ||
452 | |||
453 | static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) | ||
454 | { | ||
455 | struct ceph_pg_pool_info *pi; | ||
456 | u32 num, len, pool; | ||
457 | |||
458 | ceph_decode_32_safe(p, end, num, bad); | ||
459 | dout(" %d pool names\n", num); | ||
460 | while (num--) { | ||
461 | ceph_decode_32_safe(p, end, pool, bad); | ||
462 | ceph_decode_32_safe(p, end, len, bad); | ||
463 | dout(" pool %d len %d\n", pool, len); | ||
464 | pi = __lookup_pg_pool(&map->pg_pools, pool); | ||
465 | if (pi) { | ||
466 | kfree(pi->name); | ||
467 | pi->name = kmalloc(len + 1, GFP_NOFS); | ||
468 | if (pi->name) { | ||
469 | memcpy(pi->name, *p, len); | ||
470 | pi->name[len] = '\0'; | ||
471 | dout(" name is %s\n", pi->name); | ||
472 | } | ||
473 | } | ||
474 | *p += len; | ||
475 | } | ||
476 | return 0; | ||
477 | |||
478 | bad: | ||
479 | return -EINVAL; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * osd map | ||
484 | */ | ||
485 | void ceph_osdmap_destroy(struct ceph_osdmap *map) | ||
486 | { | ||
487 | dout("osdmap_destroy %p\n", map); | ||
488 | if (map->crush) | ||
489 | crush_destroy(map->crush); | ||
490 | while (!RB_EMPTY_ROOT(&map->pg_temp)) { | ||
491 | struct ceph_pg_mapping *pg = | ||
492 | rb_entry(rb_first(&map->pg_temp), | ||
493 | struct ceph_pg_mapping, node); | ||
494 | rb_erase(&pg->node, &map->pg_temp); | ||
495 | kfree(pg); | ||
496 | } | ||
497 | while (!RB_EMPTY_ROOT(&map->pg_pools)) { | ||
498 | struct ceph_pg_pool_info *pi = | ||
499 | rb_entry(rb_first(&map->pg_pools), | ||
500 | struct ceph_pg_pool_info, node); | ||
501 | __remove_pg_pool(&map->pg_pools, pi); | ||
502 | } | ||
503 | kfree(map->osd_state); | ||
504 | kfree(map->osd_weight); | ||
505 | kfree(map->osd_addr); | ||
506 | kfree(map); | ||
507 | } | ||
508 | |||
509 | /* | ||
510 | * adjust max osd value. reallocate arrays. | ||
511 | */ | ||
512 | static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) | ||
513 | { | ||
514 | u8 *state; | ||
515 | struct ceph_entity_addr *addr; | ||
516 | u32 *weight; | ||
517 | |||
518 | state = kcalloc(max, sizeof(*state), GFP_NOFS); | ||
519 | addr = kcalloc(max, sizeof(*addr), GFP_NOFS); | ||
520 | weight = kcalloc(max, sizeof(*weight), GFP_NOFS); | ||
521 | if (state == NULL || addr == NULL || weight == NULL) { | ||
522 | kfree(state); | ||
523 | kfree(addr); | ||
524 | kfree(weight); | ||
525 | return -ENOMEM; | ||
526 | } | ||
527 | |||
528 | /* copy old? */ | ||
529 | if (map->osd_state) { | ||
530 | memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); | ||
531 | memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); | ||
532 | memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); | ||
533 | kfree(map->osd_state); | ||
534 | kfree(map->osd_addr); | ||
535 | kfree(map->osd_weight); | ||
536 | } | ||
537 | |||
538 | map->osd_state = state; | ||
539 | map->osd_weight = weight; | ||
540 | map->osd_addr = addr; | ||
541 | map->max_osd = max; | ||
542 | return 0; | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * decode a full map. | ||
547 | */ | ||
548 | struct ceph_osdmap *osdmap_decode(void **p, void *end) | ||
549 | { | ||
550 | struct ceph_osdmap *map; | ||
551 | u16 version; | ||
552 | u32 len, max, i; | ||
553 | u8 ev; | ||
554 | int err = -EINVAL; | ||
555 | void *start = *p; | ||
556 | struct ceph_pg_pool_info *pi; | ||
557 | |||
558 | dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); | ||
559 | |||
560 | map = kzalloc(sizeof(*map), GFP_NOFS); | ||
561 | if (map == NULL) | ||
562 | return ERR_PTR(-ENOMEM); | ||
563 | map->pg_temp = RB_ROOT; | ||
564 | |||
565 | ceph_decode_16_safe(p, end, version, bad); | ||
566 | if (version > CEPH_OSDMAP_VERSION) { | ||
567 | pr_warning("got unknown v %d > %d of osdmap\n", version, | ||
568 | CEPH_OSDMAP_VERSION); | ||
569 | goto bad; | ||
570 | } | ||
571 | |||
572 | ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); | ||
573 | ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); | ||
574 | map->epoch = ceph_decode_32(p); | ||
575 | ceph_decode_copy(p, &map->created, sizeof(map->created)); | ||
576 | ceph_decode_copy(p, &map->modified, sizeof(map->modified)); | ||
577 | |||
578 | ceph_decode_32_safe(p, end, max, bad); | ||
579 | while (max--) { | ||
580 | ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); | ||
581 | pi = kzalloc(sizeof(*pi), GFP_NOFS); | ||
582 | if (!pi) | ||
583 | goto bad; | ||
584 | pi->id = ceph_decode_32(p); | ||
585 | ev = ceph_decode_8(p); /* encoding version */ | ||
586 | if (ev > CEPH_PG_POOL_VERSION) { | ||
587 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | ||
588 | ev, CEPH_PG_POOL_VERSION); | ||
589 | kfree(pi); | ||
590 | goto bad; | ||
591 | } | ||
592 | err = __decode_pool(p, end, pi); | ||
593 | if (err < 0) | ||
594 | goto bad; | ||
595 | __insert_pg_pool(&map->pg_pools, pi); | ||
596 | } | ||
597 | |||
598 | if (version >= 5 && __decode_pool_names(p, end, map) < 0) | ||
599 | goto bad; | ||
600 | |||
601 | ceph_decode_32_safe(p, end, map->pool_max, bad); | ||
602 | |||
603 | ceph_decode_32_safe(p, end, map->flags, bad); | ||
604 | |||
605 | max = ceph_decode_32(p); | ||
606 | |||
607 | /* (re)alloc osd arrays */ | ||
608 | err = osdmap_set_max_osd(map, max); | ||
609 | if (err < 0) | ||
610 | goto bad; | ||
611 | dout("osdmap_decode max_osd = %d\n", map->max_osd); | ||
612 | |||
613 | /* osds */ | ||
614 | err = -EINVAL; | ||
615 | ceph_decode_need(p, end, 3*sizeof(u32) + | ||
616 | map->max_osd*(1 + sizeof(*map->osd_weight) + | ||
617 | sizeof(*map->osd_addr)), bad); | ||
618 | *p += 4; /* skip length field (should match max) */ | ||
619 | ceph_decode_copy(p, map->osd_state, map->max_osd); | ||
620 | |||
621 | *p += 4; /* skip length field (should match max) */ | ||
622 | for (i = 0; i < map->max_osd; i++) | ||
623 | map->osd_weight[i] = ceph_decode_32(p); | ||
624 | |||
625 | *p += 4; /* skip length field (should match max) */ | ||
626 | ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); | ||
627 | for (i = 0; i < map->max_osd; i++) | ||
628 | ceph_decode_addr(&map->osd_addr[i]); | ||
629 | |||
630 | /* pg_temp */ | ||
631 | ceph_decode_32_safe(p, end, len, bad); | ||
632 | for (i = 0; i < len; i++) { | ||
633 | int n, j; | ||
634 | struct ceph_pg pgid; | ||
635 | struct ceph_pg_mapping *pg; | ||
636 | |||
637 | ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); | ||
638 | ceph_decode_copy(p, &pgid, sizeof(pgid)); | ||
639 | n = ceph_decode_32(p); | ||
640 | ceph_decode_need(p, end, n * sizeof(u32), bad); | ||
641 | err = -ENOMEM; | ||
642 | pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); | ||
643 | if (!pg) | ||
644 | goto bad; | ||
645 | pg->pgid = pgid; | ||
646 | pg->len = n; | ||
647 | for (j = 0; j < n; j++) | ||
648 | pg->osds[j] = ceph_decode_32(p); | ||
649 | |||
650 | err = __insert_pg_mapping(pg, &map->pg_temp); | ||
651 | if (err) | ||
652 | goto bad; | ||
653 | dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); | ||
654 | } | ||
655 | |||
656 | /* crush */ | ||
657 | ceph_decode_32_safe(p, end, len, bad); | ||
658 | dout("osdmap_decode crush len %d from off 0x%x\n", len, | ||
659 | (int)(*p - start)); | ||
660 | ceph_decode_need(p, end, len, bad); | ||
661 | map->crush = crush_decode(*p, end); | ||
662 | *p += len; | ||
663 | if (IS_ERR(map->crush)) { | ||
664 | err = PTR_ERR(map->crush); | ||
665 | map->crush = NULL; | ||
666 | goto bad; | ||
667 | } | ||
668 | |||
669 | /* ignore the rest of the map */ | ||
670 | *p = end; | ||
671 | |||
672 | dout("osdmap_decode done %p %p\n", *p, end); | ||
673 | return map; | ||
674 | |||
675 | bad: | ||
676 | dout("osdmap_decode fail\n"); | ||
677 | ceph_osdmap_destroy(map); | ||
678 | return ERR_PTR(err); | ||
679 | } | ||
680 | |||
681 | /* | ||
682 | * decode and apply an incremental map update. | ||
683 | */ | ||
684 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | ||
685 | struct ceph_osdmap *map, | ||
686 | struct ceph_messenger *msgr) | ||
687 | { | ||
688 | struct crush_map *newcrush = NULL; | ||
689 | struct ceph_fsid fsid; | ||
690 | u32 epoch = 0; | ||
691 | struct ceph_timespec modified; | ||
692 | u32 len, pool; | ||
693 | __s32 new_pool_max, new_flags, max; | ||
694 | void *start = *p; | ||
695 | int err = -EINVAL; | ||
696 | u16 version; | ||
697 | struct rb_node *rbp; | ||
698 | |||
699 | ceph_decode_16_safe(p, end, version, bad); | ||
700 | if (version > CEPH_OSDMAP_INC_VERSION) { | ||
701 | pr_warning("got unknown v %d > %d of inc osdmap\n", version, | ||
702 | CEPH_OSDMAP_INC_VERSION); | ||
703 | goto bad; | ||
704 | } | ||
705 | |||
706 | ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), | ||
707 | bad); | ||
708 | ceph_decode_copy(p, &fsid, sizeof(fsid)); | ||
709 | epoch = ceph_decode_32(p); | ||
710 | BUG_ON(epoch != map->epoch+1); | ||
711 | ceph_decode_copy(p, &modified, sizeof(modified)); | ||
712 | new_pool_max = ceph_decode_32(p); | ||
713 | new_flags = ceph_decode_32(p); | ||
714 | |||
715 | /* full map? */ | ||
716 | ceph_decode_32_safe(p, end, len, bad); | ||
717 | if (len > 0) { | ||
718 | dout("apply_incremental full map len %d, %p to %p\n", | ||
719 | len, *p, end); | ||
720 | return osdmap_decode(p, min(*p+len, end)); | ||
721 | } | ||
722 | |||
723 | /* new crush? */ | ||
724 | ceph_decode_32_safe(p, end, len, bad); | ||
725 | if (len > 0) { | ||
726 | dout("apply_incremental new crush map len %d, %p to %p\n", | ||
727 | len, *p, end); | ||
728 | newcrush = crush_decode(*p, min(*p+len, end)); | ||
729 | if (IS_ERR(newcrush)) | ||
730 | return ERR_CAST(newcrush); | ||
731 | *p += len; | ||
732 | } | ||
733 | |||
734 | /* new flags? */ | ||
735 | if (new_flags >= 0) | ||
736 | map->flags = new_flags; | ||
737 | if (new_pool_max >= 0) | ||
738 | map->pool_max = new_pool_max; | ||
739 | |||
740 | ceph_decode_need(p, end, 5*sizeof(u32), bad); | ||
741 | |||
742 | /* new max? */ | ||
743 | max = ceph_decode_32(p); | ||
744 | if (max >= 0) { | ||
745 | err = osdmap_set_max_osd(map, max); | ||
746 | if (err < 0) | ||
747 | goto bad; | ||
748 | } | ||
749 | |||
750 | map->epoch++; | ||
751 | map->modified = map->modified; | ||
752 | if (newcrush) { | ||
753 | if (map->crush) | ||
754 | crush_destroy(map->crush); | ||
755 | map->crush = newcrush; | ||
756 | newcrush = NULL; | ||
757 | } | ||
758 | |||
759 | /* new_pool */ | ||
760 | ceph_decode_32_safe(p, end, len, bad); | ||
761 | while (len--) { | ||
762 | __u8 ev; | ||
763 | struct ceph_pg_pool_info *pi; | ||
764 | |||
765 | ceph_decode_32_safe(p, end, pool, bad); | ||
766 | ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); | ||
767 | ev = ceph_decode_8(p); /* encoding version */ | ||
768 | if (ev > CEPH_PG_POOL_VERSION) { | ||
769 | pr_warning("got unknown v %d > %d of ceph_pg_pool\n", | ||
770 | ev, CEPH_PG_POOL_VERSION); | ||
771 | goto bad; | ||
772 | } | ||
773 | pi = __lookup_pg_pool(&map->pg_pools, pool); | ||
774 | if (!pi) { | ||
775 | pi = kzalloc(sizeof(*pi), GFP_NOFS); | ||
776 | if (!pi) { | ||
777 | err = -ENOMEM; | ||
778 | goto bad; | ||
779 | } | ||
780 | pi->id = pool; | ||
781 | __insert_pg_pool(&map->pg_pools, pi); | ||
782 | } | ||
783 | err = __decode_pool(p, end, pi); | ||
784 | if (err < 0) | ||
785 | goto bad; | ||
786 | } | ||
787 | if (version >= 5 && __decode_pool_names(p, end, map) < 0) | ||
788 | goto bad; | ||
789 | |||
790 | /* old_pool */ | ||
791 | ceph_decode_32_safe(p, end, len, bad); | ||
792 | while (len--) { | ||
793 | struct ceph_pg_pool_info *pi; | ||
794 | |||
795 | ceph_decode_32_safe(p, end, pool, bad); | ||
796 | pi = __lookup_pg_pool(&map->pg_pools, pool); | ||
797 | if (pi) | ||
798 | __remove_pg_pool(&map->pg_pools, pi); | ||
799 | } | ||
800 | |||
801 | /* new_up */ | ||
802 | err = -EINVAL; | ||
803 | ceph_decode_32_safe(p, end, len, bad); | ||
804 | while (len--) { | ||
805 | u32 osd; | ||
806 | struct ceph_entity_addr addr; | ||
807 | ceph_decode_32_safe(p, end, osd, bad); | ||
808 | ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); | ||
809 | ceph_decode_addr(&addr); | ||
810 | pr_info("osd%d up\n", osd); | ||
811 | BUG_ON(osd >= map->max_osd); | ||
812 | map->osd_state[osd] |= CEPH_OSD_UP; | ||
813 | map->osd_addr[osd] = addr; | ||
814 | } | ||
815 | |||
816 | /* new_down */ | ||
817 | ceph_decode_32_safe(p, end, len, bad); | ||
818 | while (len--) { | ||
819 | u32 osd; | ||
820 | ceph_decode_32_safe(p, end, osd, bad); | ||
821 | (*p)++; /* clean flag */ | ||
822 | pr_info("osd%d down\n", osd); | ||
823 | if (osd < map->max_osd) | ||
824 | map->osd_state[osd] &= ~CEPH_OSD_UP; | ||
825 | } | ||
826 | |||
827 | /* new_weight */ | ||
828 | ceph_decode_32_safe(p, end, len, bad); | ||
829 | while (len--) { | ||
830 | u32 osd, off; | ||
831 | ceph_decode_need(p, end, sizeof(u32)*2, bad); | ||
832 | osd = ceph_decode_32(p); | ||
833 | off = ceph_decode_32(p); | ||
834 | pr_info("osd%d weight 0x%x %s\n", osd, off, | ||
835 | off == CEPH_OSD_IN ? "(in)" : | ||
836 | (off == CEPH_OSD_OUT ? "(out)" : "")); | ||
837 | if (osd < map->max_osd) | ||
838 | map->osd_weight[osd] = off; | ||
839 | } | ||
840 | |||
841 | /* new_pg_temp */ | ||
842 | rbp = rb_first(&map->pg_temp); | ||
843 | ceph_decode_32_safe(p, end, len, bad); | ||
844 | while (len--) { | ||
845 | struct ceph_pg_mapping *pg; | ||
846 | int j; | ||
847 | struct ceph_pg pgid; | ||
848 | u32 pglen; | ||
849 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); | ||
850 | ceph_decode_copy(p, &pgid, sizeof(pgid)); | ||
851 | pglen = ceph_decode_32(p); | ||
852 | |||
853 | /* remove any? */ | ||
854 | while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, | ||
855 | node)->pgid, pgid) <= 0) { | ||
856 | struct ceph_pg_mapping *cur = | ||
857 | rb_entry(rbp, struct ceph_pg_mapping, node); | ||
858 | |||
859 | rbp = rb_next(rbp); | ||
860 | dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); | ||
861 | rb_erase(&cur->node, &map->pg_temp); | ||
862 | kfree(cur); | ||
863 | } | ||
864 | |||
865 | if (pglen) { | ||
866 | /* insert */ | ||
867 | ceph_decode_need(p, end, pglen*sizeof(u32), bad); | ||
868 | pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); | ||
869 | if (!pg) { | ||
870 | err = -ENOMEM; | ||
871 | goto bad; | ||
872 | } | ||
873 | pg->pgid = pgid; | ||
874 | pg->len = pglen; | ||
875 | for (j = 0; j < pglen; j++) | ||
876 | pg->osds[j] = ceph_decode_32(p); | ||
877 | err = __insert_pg_mapping(pg, &map->pg_temp); | ||
878 | if (err) { | ||
879 | kfree(pg); | ||
880 | goto bad; | ||
881 | } | ||
882 | dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, | ||
883 | pglen); | ||
884 | } | ||
885 | } | ||
886 | while (rbp) { | ||
887 | struct ceph_pg_mapping *cur = | ||
888 | rb_entry(rbp, struct ceph_pg_mapping, node); | ||
889 | |||
890 | rbp = rb_next(rbp); | ||
891 | dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); | ||
892 | rb_erase(&cur->node, &map->pg_temp); | ||
893 | kfree(cur); | ||
894 | } | ||
895 | |||
896 | /* ignore the rest */ | ||
897 | *p = end; | ||
898 | return map; | ||
899 | |||
900 | bad: | ||
901 | pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", | ||
902 | epoch, (int)(*p - start), *p, start, end); | ||
903 | print_hex_dump(KERN_DEBUG, "osdmap: ", | ||
904 | DUMP_PREFIX_OFFSET, 16, 1, | ||
905 | start, end - start, true); | ||
906 | if (newcrush) | ||
907 | crush_destroy(newcrush); | ||
908 | return ERR_PTR(err); | ||
909 | } | ||
910 | |||
911 | |||
912 | |||
913 | |||
914 | /* | ||
915 | * calculate file layout from given offset, length. | ||
916 | * fill in correct oid, logical length, and object extent | ||
917 | * offset, length. | ||
918 | * | ||
919 | * for now, we write only a single su, until we can | ||
920 | * pass a stride back to the caller. | ||
921 | */ | ||
922 | void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | ||
923 | u64 off, u64 *plen, | ||
924 | u64 *ono, | ||
925 | u64 *oxoff, u64 *oxlen) | ||
926 | { | ||
927 | u32 osize = le32_to_cpu(layout->fl_object_size); | ||
928 | u32 su = le32_to_cpu(layout->fl_stripe_unit); | ||
929 | u32 sc = le32_to_cpu(layout->fl_stripe_count); | ||
930 | u32 bl, stripeno, stripepos, objsetno; | ||
931 | u32 su_per_object; | ||
932 | u64 t, su_offset; | ||
933 | |||
934 | dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, | ||
935 | osize, su); | ||
936 | su_per_object = osize / su; | ||
937 | dout("osize %u / su %u = su_per_object %u\n", osize, su, | ||
938 | su_per_object); | ||
939 | |||
940 | BUG_ON((su & ~PAGE_MASK) != 0); | ||
941 | /* bl = *off / su; */ | ||
942 | t = off; | ||
943 | do_div(t, su); | ||
944 | bl = t; | ||
945 | dout("off %llu / su %u = bl %u\n", off, su, bl); | ||
946 | |||
947 | stripeno = bl / sc; | ||
948 | stripepos = bl % sc; | ||
949 | objsetno = stripeno / su_per_object; | ||
950 | |||
951 | *ono = objsetno * sc + stripepos; | ||
952 | dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); | ||
953 | |||
954 | /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ | ||
955 | t = off; | ||
956 | su_offset = do_div(t, su); | ||
957 | *oxoff = su_offset + (stripeno % su_per_object) * su; | ||
958 | |||
959 | /* | ||
960 | * Calculate the length of the extent being written to the selected | ||
961 | * object. This is the minimum of the full length requested (plen) or | ||
962 | * the remainder of the current stripe being written to. | ||
963 | */ | ||
964 | *oxlen = min_t(u64, *plen, su - su_offset); | ||
965 | *plen = *oxlen; | ||
966 | |||
967 | dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); | ||
968 | } | ||
969 | |||
970 | /* | ||
971 | * calculate an object layout (i.e. pgid) from an oid, | ||
972 | * file_layout, and osdmap | ||
973 | */ | ||
974 | int ceph_calc_object_layout(struct ceph_object_layout *ol, | ||
975 | const char *oid, | ||
976 | struct ceph_file_layout *fl, | ||
977 | struct ceph_osdmap *osdmap) | ||
978 | { | ||
979 | unsigned num, num_mask; | ||
980 | struct ceph_pg pgid; | ||
981 | s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); | ||
982 | int poolid = le32_to_cpu(fl->fl_pg_pool); | ||
983 | struct ceph_pg_pool_info *pool; | ||
984 | unsigned ps; | ||
985 | |||
986 | BUG_ON(!osdmap); | ||
987 | |||
988 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); | ||
989 | if (!pool) | ||
990 | return -EIO; | ||
991 | ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); | ||
992 | if (preferred >= 0) { | ||
993 | ps += preferred; | ||
994 | num = le32_to_cpu(pool->v.lpg_num); | ||
995 | num_mask = pool->lpg_num_mask; | ||
996 | } else { | ||
997 | num = le32_to_cpu(pool->v.pg_num); | ||
998 | num_mask = pool->pg_num_mask; | ||
999 | } | ||
1000 | |||
1001 | pgid.ps = cpu_to_le16(ps); | ||
1002 | pgid.preferred = cpu_to_le16(preferred); | ||
1003 | pgid.pool = fl->fl_pg_pool; | ||
1004 | if (preferred >= 0) | ||
1005 | dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, | ||
1006 | (int)preferred); | ||
1007 | else | ||
1008 | dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); | ||
1009 | |||
1010 | ol->ol_pgid = pgid; | ||
1011 | ol->ol_stripe_unit = fl->fl_object_stripe_unit; | ||
1012 | return 0; | ||
1013 | } | ||
1014 | |||
1015 | /* | ||
1016 | * Calculate raw osd vector for the given pgid. Return pointer to osd | ||
1017 | * array, or NULL on failure. | ||
1018 | */ | ||
1019 | static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | ||
1020 | int *osds, int *num) | ||
1021 | { | ||
1022 | struct ceph_pg_mapping *pg; | ||
1023 | struct ceph_pg_pool_info *pool; | ||
1024 | int ruleno; | ||
1025 | unsigned poolid, ps, pps; | ||
1026 | int preferred; | ||
1027 | |||
1028 | /* pg_temp? */ | ||
1029 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); | ||
1030 | if (pg) { | ||
1031 | *num = pg->len; | ||
1032 | return pg->osds; | ||
1033 | } | ||
1034 | |||
1035 | /* crush */ | ||
1036 | poolid = le32_to_cpu(pgid.pool); | ||
1037 | ps = le16_to_cpu(pgid.ps); | ||
1038 | preferred = (s16)le16_to_cpu(pgid.preferred); | ||
1039 | |||
1040 | /* don't forcefeed bad device ids to crush */ | ||
1041 | if (preferred >= osdmap->max_osd || | ||
1042 | preferred >= osdmap->crush->max_devices) | ||
1043 | preferred = -1; | ||
1044 | |||
1045 | pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); | ||
1046 | if (!pool) | ||
1047 | return NULL; | ||
1048 | ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, | ||
1049 | pool->v.type, pool->v.size); | ||
1050 | if (ruleno < 0) { | ||
1051 | pr_err("no crush rule pool %d ruleset %d type %d size %d\n", | ||
1052 | poolid, pool->v.crush_ruleset, pool->v.type, | ||
1053 | pool->v.size); | ||
1054 | return NULL; | ||
1055 | } | ||
1056 | |||
1057 | if (preferred >= 0) | ||
1058 | pps = ceph_stable_mod(ps, | ||
1059 | le32_to_cpu(pool->v.lpgp_num), | ||
1060 | pool->lpgp_num_mask); | ||
1061 | else | ||
1062 | pps = ceph_stable_mod(ps, | ||
1063 | le32_to_cpu(pool->v.pgp_num), | ||
1064 | pool->pgp_num_mask); | ||
1065 | pps += poolid; | ||
1066 | *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, | ||
1067 | min_t(int, pool->v.size, *num), | ||
1068 | preferred, osdmap->osd_weight); | ||
1069 | return osds; | ||
1070 | } | ||
1071 | |||
1072 | /* | ||
1073 | * Return acting set for given pgid. | ||
1074 | */ | ||
1075 | int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | ||
1076 | int *acting) | ||
1077 | { | ||
1078 | int rawosds[CEPH_PG_MAX_SIZE], *osds; | ||
1079 | int i, o, num = CEPH_PG_MAX_SIZE; | ||
1080 | |||
1081 | osds = calc_pg_raw(osdmap, pgid, rawosds, &num); | ||
1082 | if (!osds) | ||
1083 | return -1; | ||
1084 | |||
1085 | /* primary is first up osd */ | ||
1086 | o = 0; | ||
1087 | for (i = 0; i < num; i++) | ||
1088 | if (ceph_osd_is_up(osdmap, osds[i])) | ||
1089 | acting[o++] = osds[i]; | ||
1090 | return o; | ||
1091 | } | ||
1092 | |||
1093 | /* | ||
1094 | * Return primary osd for given pgid, or -1 if none. | ||
1095 | */ | ||
1096 | int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) | ||
1097 | { | ||
1098 | int rawosds[CEPH_PG_MAX_SIZE], *osds; | ||
1099 | int i, num = CEPH_PG_MAX_SIZE; | ||
1100 | |||
1101 | osds = calc_pg_raw(osdmap, pgid, rawosds, &num); | ||
1102 | if (!osds) | ||
1103 | return -1; | ||
1104 | |||
1105 | /* primary is first up osd */ | ||
1106 | for (i = 0; i < num; i++) | ||
1107 | if (ceph_osd_is_up(osdmap, osds[i])) | ||
1108 | return osds[i]; | ||
1109 | return -1; | ||
1110 | } | ||
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h deleted file mode 100644 index 970b547e510d..000000000000 --- a/fs/ceph/osdmap.h +++ /dev/null | |||
@@ -1,128 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_OSDMAP_H | ||
2 | #define _FS_CEPH_OSDMAP_H | ||
3 | |||
4 | #include <linux/rbtree.h> | ||
5 | #include "types.h" | ||
6 | #include "ceph_fs.h" | ||
7 | #include "crush/crush.h" | ||
8 | |||
9 | /* | ||
10 | * The osd map describes the current membership of the osd cluster and | ||
11 | * specifies the mapping of objects to placement groups and placement | ||
12 | * groups to (sets of) osds. That is, it completely specifies the | ||
13 | * (desired) distribution of all data objects in the system at some | ||
14 | * point in time. | ||
15 | * | ||
16 | * Each map version is identified by an epoch, which increases monotonically. | ||
17 | * | ||
18 | * The map can be updated either via an incremental map (diff) describing | ||
19 | * the change between two successive epochs, or as a fully encoded map. | ||
20 | */ | ||
21 | struct ceph_pg_pool_info { | ||
22 | struct rb_node node; | ||
23 | int id; | ||
24 | struct ceph_pg_pool v; | ||
25 | int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; | ||
26 | char *name; | ||
27 | }; | ||
28 | |||
29 | struct ceph_pg_mapping { | ||
30 | struct rb_node node; | ||
31 | struct ceph_pg pgid; | ||
32 | int len; | ||
33 | int osds[]; | ||
34 | }; | ||
35 | |||
36 | struct ceph_osdmap { | ||
37 | struct ceph_fsid fsid; | ||
38 | u32 epoch; | ||
39 | u32 mkfs_epoch; | ||
40 | struct ceph_timespec created, modified; | ||
41 | |||
42 | u32 flags; /* CEPH_OSDMAP_* */ | ||
43 | |||
44 | u32 max_osd; /* size of osd_state, _offload, _addr arrays */ | ||
45 | u8 *osd_state; /* CEPH_OSD_* */ | ||
46 | u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ | ||
47 | struct ceph_entity_addr *osd_addr; | ||
48 | |||
49 | struct rb_root pg_temp; | ||
50 | struct rb_root pg_pools; | ||
51 | u32 pool_max; | ||
52 | |||
53 | /* the CRUSH map specifies the mapping of placement groups to | ||
54 | * the list of osds that store+replicate them. */ | ||
55 | struct crush_map *crush; | ||
56 | }; | ||
57 | |||
58 | /* | ||
59 | * file layout helpers | ||
60 | */ | ||
61 | #define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) | ||
62 | #define ceph_file_layout_stripe_count(l) \ | ||
63 | ((__s32)le32_to_cpu((l).fl_stripe_count)) | ||
64 | #define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) | ||
65 | #define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) | ||
66 | #define ceph_file_layout_object_su(l) \ | ||
67 | ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) | ||
68 | #define ceph_file_layout_pg_preferred(l) \ | ||
69 | ((__s32)le32_to_cpu((l).fl_pg_preferred)) | ||
70 | #define ceph_file_layout_pg_pool(l) \ | ||
71 | ((__s32)le32_to_cpu((l).fl_pg_pool)) | ||
72 | |||
73 | static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) | ||
74 | { | ||
75 | return le32_to_cpu(l->fl_stripe_unit) * | ||
76 | le32_to_cpu(l->fl_stripe_count); | ||
77 | } | ||
78 | |||
79 | /* "period" == bytes before i start on a new set of objects */ | ||
80 | static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) | ||
81 | { | ||
82 | return le32_to_cpu(l->fl_object_size) * | ||
83 | le32_to_cpu(l->fl_stripe_count); | ||
84 | } | ||
85 | |||
86 | |||
87 | static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) | ||
88 | { | ||
89 | return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); | ||
90 | } | ||
91 | |||
92 | static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) | ||
93 | { | ||
94 | return map && (map->flags & flag); | ||
95 | } | ||
96 | |||
97 | extern char *ceph_osdmap_state_str(char *str, int len, int state); | ||
98 | |||
99 | static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, | ||
100 | int osd) | ||
101 | { | ||
102 | if (osd >= map->max_osd) | ||
103 | return NULL; | ||
104 | return &map->osd_addr[osd]; | ||
105 | } | ||
106 | |||
107 | extern struct ceph_osdmap *osdmap_decode(void **p, void *end); | ||
108 | extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | ||
109 | struct ceph_osdmap *map, | ||
110 | struct ceph_messenger *msgr); | ||
111 | extern void ceph_osdmap_destroy(struct ceph_osdmap *map); | ||
112 | |||
113 | /* calculate mapping of a file extent to an object */ | ||
114 | extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | ||
115 | u64 off, u64 *plen, | ||
116 | u64 *bno, u64 *oxoff, u64 *oxlen); | ||
117 | |||
118 | /* calculate mapping of object to a placement group */ | ||
119 | extern int ceph_calc_object_layout(struct ceph_object_layout *ol, | ||
120 | const char *oid, | ||
121 | struct ceph_file_layout *fl, | ||
122 | struct ceph_osdmap *osdmap); | ||
123 | extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | ||
124 | int *acting); | ||
125 | extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, | ||
126 | struct ceph_pg pgid); | ||
127 | |||
128 | #endif | ||
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c deleted file mode 100644 index 46a368b6dce5..000000000000 --- a/fs/ceph/pagelist.c +++ /dev/null | |||
@@ -1,63 +0,0 @@ | |||
1 | |||
2 | #include <linux/gfp.h> | ||
3 | #include <linux/pagemap.h> | ||
4 | #include <linux/highmem.h> | ||
5 | |||
6 | #include "pagelist.h" | ||
7 | |||
8 | static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) | ||
9 | { | ||
10 | struct page *page = list_entry(pl->head.prev, struct page, | ||
11 | lru); | ||
12 | kunmap(page); | ||
13 | } | ||
14 | |||
15 | int ceph_pagelist_release(struct ceph_pagelist *pl) | ||
16 | { | ||
17 | if (pl->mapped_tail) | ||
18 | ceph_pagelist_unmap_tail(pl); | ||
19 | |||
20 | while (!list_empty(&pl->head)) { | ||
21 | struct page *page = list_first_entry(&pl->head, struct page, | ||
22 | lru); | ||
23 | list_del(&page->lru); | ||
24 | __free_page(page); | ||
25 | } | ||
26 | return 0; | ||
27 | } | ||
28 | |||
29 | static int ceph_pagelist_addpage(struct ceph_pagelist *pl) | ||
30 | { | ||
31 | struct page *page = __page_cache_alloc(GFP_NOFS); | ||
32 | if (!page) | ||
33 | return -ENOMEM; | ||
34 | pl->room += PAGE_SIZE; | ||
35 | list_add_tail(&page->lru, &pl->head); | ||
36 | if (pl->mapped_tail) | ||
37 | ceph_pagelist_unmap_tail(pl); | ||
38 | pl->mapped_tail = kmap(page); | ||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len) | ||
43 | { | ||
44 | while (pl->room < len) { | ||
45 | size_t bit = pl->room; | ||
46 | int ret; | ||
47 | |||
48 | memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), | ||
49 | buf, bit); | ||
50 | pl->length += bit; | ||
51 | pl->room -= bit; | ||
52 | buf += bit; | ||
53 | len -= bit; | ||
54 | ret = ceph_pagelist_addpage(pl); | ||
55 | if (ret) | ||
56 | return ret; | ||
57 | } | ||
58 | |||
59 | memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); | ||
60 | pl->length += len; | ||
61 | pl->room -= len; | ||
62 | return 0; | ||
63 | } | ||
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h deleted file mode 100644 index e8a4187e1087..000000000000 --- a/fs/ceph/pagelist.h +++ /dev/null | |||
@@ -1,54 +0,0 @@ | |||
1 | #ifndef __FS_CEPH_PAGELIST_H | ||
2 | #define __FS_CEPH_PAGELIST_H | ||
3 | |||
4 | #include <linux/list.h> | ||
5 | |||
6 | struct ceph_pagelist { | ||
7 | struct list_head head; | ||
8 | void *mapped_tail; | ||
9 | size_t length; | ||
10 | size_t room; | ||
11 | }; | ||
12 | |||
13 | static inline void ceph_pagelist_init(struct ceph_pagelist *pl) | ||
14 | { | ||
15 | INIT_LIST_HEAD(&pl->head); | ||
16 | pl->mapped_tail = NULL; | ||
17 | pl->length = 0; | ||
18 | pl->room = 0; | ||
19 | } | ||
20 | extern int ceph_pagelist_release(struct ceph_pagelist *pl); | ||
21 | |||
22 | extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); | ||
23 | |||
24 | static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) | ||
25 | { | ||
26 | __le64 ev = cpu_to_le64(v); | ||
27 | return ceph_pagelist_append(pl, &ev, sizeof(ev)); | ||
28 | } | ||
29 | static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v) | ||
30 | { | ||
31 | __le32 ev = cpu_to_le32(v); | ||
32 | return ceph_pagelist_append(pl, &ev, sizeof(ev)); | ||
33 | } | ||
34 | static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v) | ||
35 | { | ||
36 | __le16 ev = cpu_to_le16(v); | ||
37 | return ceph_pagelist_append(pl, &ev, sizeof(ev)); | ||
38 | } | ||
39 | static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) | ||
40 | { | ||
41 | return ceph_pagelist_append(pl, &v, 1); | ||
42 | } | ||
43 | static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, | ||
44 | char *s, size_t len) | ||
45 | { | ||
46 | int ret = ceph_pagelist_encode_32(pl, len); | ||
47 | if (ret) | ||
48 | return ret; | ||
49 | if (len) | ||
50 | return ceph_pagelist_append(pl, s, len); | ||
51 | return 0; | ||
52 | } | ||
53 | |||
54 | #endif | ||
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h deleted file mode 100644 index 6d5247f2e81b..000000000000 --- a/fs/ceph/rados.h +++ /dev/null | |||
@@ -1,405 +0,0 @@ | |||
1 | #ifndef CEPH_RADOS_H | ||
2 | #define CEPH_RADOS_H | ||
3 | |||
4 | /* | ||
5 | * Data types for the Ceph distributed object storage layer RADOS | ||
6 | * (Reliable Autonomic Distributed Object Store). | ||
7 | */ | ||
8 | |||
9 | #include "msgr.h" | ||
10 | |||
11 | /* | ||
12 | * osdmap encoding versions | ||
13 | */ | ||
14 | #define CEPH_OSDMAP_INC_VERSION 5 | ||
15 | #define CEPH_OSDMAP_INC_VERSION_EXT 5 | ||
16 | #define CEPH_OSDMAP_VERSION 5 | ||
17 | #define CEPH_OSDMAP_VERSION_EXT 5 | ||
18 | |||
19 | /* | ||
20 | * fs id | ||
21 | */ | ||
struct ceph_fsid {
	unsigned char fsid[16];
};

/* memcmp()-style ordering of two fsids; 0 means they are equal */
static inline int ceph_fsid_compare(const struct ceph_fsid *a,
				    const struct ceph_fsid *b)
{
	return memcmp(a->fsid, b->fsid, sizeof(a->fsid));
}
31 | |||
32 | /* | ||
33 | * ino, object, etc. | ||
34 | */ | ||
35 | typedef __le64 ceph_snapid_t; | ||
36 | #define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ | ||
37 | #define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ | ||
38 | #define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ | ||
39 | |||
40 | struct ceph_timespec { | ||
41 | __le32 tv_sec; | ||
42 | __le32 tv_nsec; | ||
43 | } __attribute__ ((packed)); | ||
44 | |||
45 | |||
46 | /* | ||
47 | * object layout - how objects are mapped into PGs | ||
48 | */ | ||
49 | #define CEPH_OBJECT_LAYOUT_HASH 1 | ||
50 | #define CEPH_OBJECT_LAYOUT_LINEAR 2 | ||
51 | #define CEPH_OBJECT_LAYOUT_HASHINO 3 | ||
52 | |||
53 | /* | ||
54 | * pg layout -- how PGs are mapped onto (sets of) OSDs | ||
55 | */ | ||
56 | #define CEPH_PG_LAYOUT_CRUSH 0 | ||
57 | #define CEPH_PG_LAYOUT_HASH 1 | ||
58 | #define CEPH_PG_LAYOUT_LINEAR 2 | ||
59 | #define CEPH_PG_LAYOUT_HYBRID 3 | ||
60 | |||
61 | #define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ | ||
62 | |||
63 | /* | ||
64 | * placement group. | ||
65 | * we encode this into one __le64. | ||
66 | */ | ||
67 | struct ceph_pg { | ||
68 | __le16 preferred; /* preferred primary osd */ | ||
69 | __le16 ps; /* placement seed */ | ||
70 | __le32 pool; /* object pool */ | ||
71 | } __attribute__ ((packed)); | ||
72 | |||
73 | /* | ||
74 | * pg_pool is a set of pgs storing a pool of objects | ||
75 | * | ||
76 | * pg_num -- base number of pseudorandomly placed pgs | ||
77 | * | ||
78 | * pgp_num -- effective number when calculating pg placement. this | ||
79 | * is used for pg_num increases. new pgs result in data being "split" | ||
80 | * into new pgs. for this to proceed smoothly, new pgs are initially | ||
81 | * colocated with their parents; that is, pgp_num doesn't increase | ||
82 | * until the new pgs have successfully split. only _then_ are the new | ||
83 | * pgs placed independently. | ||
84 | * | ||
85 | * lpg_num -- localized pg count (per device). replicas are randomly | ||
86 | * selected. | ||
87 | * | ||
88 | * lpgp_num -- as above. | ||
89 | */ | ||
90 | #define CEPH_PG_TYPE_REP 1 | ||
91 | #define CEPH_PG_TYPE_RAID4 2 | ||
92 | #define CEPH_PG_POOL_VERSION 2 | ||
93 | struct ceph_pg_pool { | ||
94 | __u8 type; /* CEPH_PG_TYPE_* */ | ||
95 | __u8 size; /* number of osds in each pg */ | ||
96 | __u8 crush_ruleset; /* crush placement rule */ | ||
97 | __u8 object_hash; /* hash mapping object name to ps */ | ||
98 | __le32 pg_num, pgp_num; /* number of pg's */ | ||
99 | __le32 lpg_num, lpgp_num; /* number of localized pg's */ | ||
100 | __le32 last_change; /* most recent epoch changed */ | ||
101 | __le64 snap_seq; /* seq for per-pool snapshot */ | ||
102 | __le32 snap_epoch; /* epoch of last snap */ | ||
103 | __le32 num_snaps; | ||
104 | __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ | ||
105 | __le64 auid; /* who owns the pg */ | ||
106 | } __attribute__ ((packed)); | ||
107 | |||
108 | /* | ||
109 | * stable_mod func is used to control number of placement groups. | ||
110 | * similar to straight-up modulo, but produces a stable mapping as b | ||
111 | * increases over time. b is the number of bins, and bmask is the | ||
112 | * containing power of 2 minus 1. | ||
113 | * | ||
114 | * b <= bmask and bmask=(2**n)-1 | ||
115 | * e.g., b=12 -> bmask=15, b=123 -> bmask=127 | ||
116 | */ | ||
static inline int ceph_stable_mod(int x, int b, int bmask)
{
	int masked = x & bmask;

	/* fall back to the next smaller power-of-2 mask if out of range */
	return (masked < b) ? masked : (x & (bmask >> 1));
}
124 | |||
125 | /* | ||
126 | * object layout - how a given object should be stored. | ||
127 | */ | ||
128 | struct ceph_object_layout { | ||
129 | struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ | ||
130 | __le32 ol_stripe_unit; /* for per-object parity, if any */ | ||
131 | } __attribute__ ((packed)); | ||
132 | |||
133 | /* | ||
134 | * compound epoch+version, used by storage layer to serialize mutations | ||
135 | */ | ||
136 | struct ceph_eversion { | ||
137 | __le32 epoch; | ||
138 | __le64 version; | ||
139 | } __attribute__ ((packed)); | ||
140 | |||
141 | /* | ||
142 | * osd map bits | ||
143 | */ | ||
144 | |||
145 | /* status bits */ | ||
146 | #define CEPH_OSD_EXISTS 1 | ||
147 | #define CEPH_OSD_UP 2 | ||
148 | |||
149 | /* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ | ||
150 | #define CEPH_OSD_IN 0x10000 | ||
151 | #define CEPH_OSD_OUT 0 | ||
152 | |||
153 | |||
154 | /* | ||
155 | * osd map flag bits | ||
156 | */ | ||
157 | #define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ | ||
158 | #define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ | ||
159 | #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ | ||
160 | #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ | ||
161 | #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ | ||
162 | |||
163 | /* | ||
164 | * osd ops | ||
165 | */ | ||
166 | #define CEPH_OSD_OP_MODE 0xf000 | ||
167 | #define CEPH_OSD_OP_MODE_RD 0x1000 | ||
168 | #define CEPH_OSD_OP_MODE_WR 0x2000 | ||
169 | #define CEPH_OSD_OP_MODE_RMW 0x3000 | ||
170 | #define CEPH_OSD_OP_MODE_SUB 0x4000 | ||
171 | |||
172 | #define CEPH_OSD_OP_TYPE 0x0f00 | ||
173 | #define CEPH_OSD_OP_TYPE_LOCK 0x0100 | ||
174 | #define CEPH_OSD_OP_TYPE_DATA 0x0200 | ||
175 | #define CEPH_OSD_OP_TYPE_ATTR 0x0300 | ||
176 | #define CEPH_OSD_OP_TYPE_EXEC 0x0400 | ||
177 | #define CEPH_OSD_OP_TYPE_PG 0x0500 | ||
178 | |||
179 | enum { | ||
180 | /** data **/ | ||
181 | /* read */ | ||
182 | CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, | ||
183 | CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, | ||
184 | |||
185 | /* fancy read */ | ||
186 | CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, | ||
187 | |||
188 | /* write */ | ||
189 | CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, | ||
190 | CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, | ||
191 | CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, | ||
192 | CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, | ||
193 | CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, | ||
194 | |||
195 | /* fancy write */ | ||
196 | CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, | ||
197 | CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, | ||
198 | CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, | ||
199 | CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, | ||
200 | |||
201 | CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, | ||
202 | CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, | ||
203 | CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, | ||
204 | |||
205 | CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, | ||
206 | CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, | ||
207 | |||
208 | /** attrs **/ | ||
209 | /* read */ | ||
210 | CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, | ||
211 | CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, | ||
212 | CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, | ||
213 | |||
214 | /* write */ | ||
215 | CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, | ||
216 | CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, | ||
217 | CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, | ||
218 | CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, | ||
219 | |||
220 | /** subop **/ | ||
221 | CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, | ||
222 | CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, | ||
223 | CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, | ||
224 | CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, | ||
225 | CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, | ||
226 | |||
227 | /** lock **/ | ||
228 | CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, | ||
229 | CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, | ||
230 | CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, | ||
231 | CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, | ||
232 | CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, | ||
233 | CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, | ||
234 | |||
235 | /** exec **/ | ||
236 | CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, | ||
237 | |||
238 | /** pg **/ | ||
239 | CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, | ||
240 | }; | ||
241 | |||
242 | static inline int ceph_osd_op_type_lock(int op) | ||
243 | { | ||
244 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; | ||
245 | } | ||
246 | static inline int ceph_osd_op_type_data(int op) | ||
247 | { | ||
248 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; | ||
249 | } | ||
250 | static inline int ceph_osd_op_type_attr(int op) | ||
251 | { | ||
252 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; | ||
253 | } | ||
254 | static inline int ceph_osd_op_type_exec(int op) | ||
255 | { | ||
256 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; | ||
257 | } | ||
258 | static inline int ceph_osd_op_type_pg(int op) | ||
259 | { | ||
260 | return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; | ||
261 | } | ||
262 | |||
263 | static inline int ceph_osd_op_mode_subop(int op) | ||
264 | { | ||
265 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; | ||
266 | } | ||
267 | static inline int ceph_osd_op_mode_read(int op) | ||
268 | { | ||
269 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; | ||
270 | } | ||
271 | static inline int ceph_osd_op_mode_modify(int op) | ||
272 | { | ||
273 | return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * note that the following tmap stuff is also defined in the ceph librados.h | ||
278 | * any modification here needs to be updated there | ||
279 | */ | ||
280 | #define CEPH_OSD_TMAP_HDR 'h' | ||
281 | #define CEPH_OSD_TMAP_SET 's' | ||
282 | #define CEPH_OSD_TMAP_RM 'r' | ||
283 | |||
284 | extern const char *ceph_osd_op_name(int op); | ||
285 | |||
286 | |||
287 | /* | ||
288 | * osd op flags | ||
289 | * | ||
290 | * An op may be READ, WRITE, or READ|WRITE. | ||
291 | */ | ||
292 | enum { | ||
293 | CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ | ||
294 | CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ | ||
295 | CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ | ||
296 | CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ | ||
297 | CEPH_OSD_FLAG_READ = 16, /* op may read */ | ||
298 | CEPH_OSD_FLAG_WRITE = 32, /* op may write */ | ||
299 | CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ | ||
300 | CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ | ||
301 | CEPH_OSD_FLAG_BALANCE_READS = 256, | ||
302 | CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ | ||
303 | CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ | ||
304 | CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ | ||
305 | CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ | ||
306 | }; | ||
307 | |||
308 | enum { | ||
309 | CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ | ||
310 | }; | ||
311 | |||
312 | #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ | ||
313 | #define EBLACKLISTED ESHUTDOWN /* blacklisted */ | ||
314 | |||
315 | /* xattr comparison */ | ||
316 | enum { | ||
317 | CEPH_OSD_CMPXATTR_OP_NOP = 0, | ||
318 | CEPH_OSD_CMPXATTR_OP_EQ = 1, | ||
319 | CEPH_OSD_CMPXATTR_OP_NE = 2, | ||
320 | CEPH_OSD_CMPXATTR_OP_GT = 3, | ||
321 | CEPH_OSD_CMPXATTR_OP_GTE = 4, | ||
322 | CEPH_OSD_CMPXATTR_OP_LT = 5, | ||
323 | CEPH_OSD_CMPXATTR_OP_LTE = 6 | ||
324 | }; | ||
325 | |||
326 | enum { | ||
327 | CEPH_OSD_CMPXATTR_MODE_STRING = 1, | ||
328 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 | ||
329 | }; | ||
330 | |||
331 | /* | ||
332 | * an individual object operation. each may be accompanied by some data | ||
333 | * payload | ||
334 | */ | ||
335 | struct ceph_osd_op { | ||
336 | __le16 op; /* CEPH_OSD_OP_* */ | ||
337 | __le32 flags; /* CEPH_OSD_FLAG_* */ | ||
338 | union { | ||
339 | struct { | ||
340 | __le64 offset, length; | ||
341 | __le64 truncate_size; | ||
342 | __le32 truncate_seq; | ||
343 | } __attribute__ ((packed)) extent; | ||
344 | struct { | ||
345 | __le32 name_len; | ||
346 | __le32 value_len; | ||
347 | __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ | ||
348 | __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ | ||
349 | } __attribute__ ((packed)) xattr; | ||
350 | struct { | ||
351 | __u8 class_len; | ||
352 | __u8 method_len; | ||
353 | __u8 argc; | ||
354 | __le32 indata_len; | ||
355 | } __attribute__ ((packed)) cls; | ||
356 | struct { | ||
357 | __le64 cookie, count; | ||
358 | } __attribute__ ((packed)) pgls; | ||
359 | struct { | ||
360 | __le64 snapid; | ||
361 | } __attribute__ ((packed)) snap; | ||
362 | }; | ||
363 | __le32 payload_len; | ||
364 | } __attribute__ ((packed)); | ||
365 | |||
366 | /* | ||
367 | * osd request message header. each request may include multiple | ||
368 | * ceph_osd_op object operations. | ||
369 | */ | ||
370 | struct ceph_osd_request_head { | ||
371 | __le32 client_inc; /* client incarnation */ | ||
372 | struct ceph_object_layout layout; /* pgid */ | ||
373 | __le32 osdmap_epoch; /* client's osdmap epoch */ | ||
374 | |||
375 | __le32 flags; | ||
376 | |||
377 | struct ceph_timespec mtime; /* for mutations only */ | ||
378 | struct ceph_eversion reassert_version; /* if we are replaying op */ | ||
379 | |||
380 | __le32 object_len; /* length of object name */ | ||
381 | |||
382 | __le64 snapid; /* snapid to read */ | ||
383 | __le64 snap_seq; /* writer's snap context */ | ||
384 | __le32 num_snaps; | ||
385 | |||
386 | __le16 num_ops; | ||
387 | struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ | ||
388 | } __attribute__ ((packed)); | ||
389 | |||
390 | struct ceph_osd_reply_head { | ||
391 | __le32 client_inc; /* client incarnation */ | ||
392 | __le32 flags; | ||
393 | struct ceph_object_layout layout; | ||
394 | __le32 osdmap_epoch; | ||
395 | struct ceph_eversion reassert_version; /* for replaying uncommitted */ | ||
396 | |||
397 | __le32 result; /* result code */ | ||
398 | |||
399 | __le32 object_len; /* length of object name */ | ||
400 | __le32 num_ops; | ||
401 | struct ceph_osd_op ops[0]; /* ops[], object */ | ||
402 | } __attribute__ ((packed)); | ||
403 | |||
404 | |||
405 | #endif | ||
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 190b6c4a6f2b..39c243acd062 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c | |||
@@ -1,10 +1,12 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/sort.h> | 3 | #include <linux/sort.h> |
4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
5 | 5 | ||
6 | #include "super.h" | 6 | #include "super.h" |
7 | #include "decode.h" | 7 | #include "mds_client.h" |
8 | |||
9 | #include <linux/ceph/decode.h> | ||
8 | 10 | ||
9 | /* | 11 | /* |
10 | * Snapshots in ceph are driven in large part by cooperation from the | 12 | * Snapshots in ceph are driven in large part by cooperation from the |
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, | |||
526 | struct ceph_cap_snap *capsnap) | 528 | struct ceph_cap_snap *capsnap) |
527 | { | 529 | { |
528 | struct inode *inode = &ci->vfs_inode; | 530 | struct inode *inode = &ci->vfs_inode; |
529 | struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; | 531 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
530 | 532 | ||
531 | BUG_ON(capsnap->writing); | 533 | BUG_ON(capsnap->writing); |
532 | capsnap->size = inode->i_size; | 534 | capsnap->size = inode->i_size; |
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
747 | struct ceph_mds_session *session, | 749 | struct ceph_mds_session *session, |
748 | struct ceph_msg *msg) | 750 | struct ceph_msg *msg) |
749 | { | 751 | { |
750 | struct super_block *sb = mdsc->client->sb; | 752 | struct super_block *sb = mdsc->fsc->sb; |
751 | int mds = session->s_mds; | 753 | int mds = session->s_mds; |
752 | u64 split; | 754 | u64 split; |
753 | int op; | 755 | int op; |
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c index c6179d3a26a2..cd5097d7c804 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/strings.c | |||
@@ -1,71 +1,9 @@ | |||
1 | /* | 1 | /* |
2 | * Ceph string constants | 2 | * Ceph fs string constants |
3 | */ | 3 | */ |
4 | #include "types.h" | 4 | #include <linux/module.h> |
5 | #include <linux/ceph/types.h> | ||
5 | 6 | ||
6 | const char *ceph_entity_type_name(int type) | ||
7 | { | ||
8 | switch (type) { | ||
9 | case CEPH_ENTITY_TYPE_MDS: return "mds"; | ||
10 | case CEPH_ENTITY_TYPE_OSD: return "osd"; | ||
11 | case CEPH_ENTITY_TYPE_MON: return "mon"; | ||
12 | case CEPH_ENTITY_TYPE_CLIENT: return "client"; | ||
13 | case CEPH_ENTITY_TYPE_AUTH: return "auth"; | ||
14 | default: return "unknown"; | ||
15 | } | ||
16 | } | ||
17 | |||
18 | const char *ceph_osd_op_name(int op) | ||
19 | { | ||
20 | switch (op) { | ||
21 | case CEPH_OSD_OP_READ: return "read"; | ||
22 | case CEPH_OSD_OP_STAT: return "stat"; | ||
23 | |||
24 | case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; | ||
25 | |||
26 | case CEPH_OSD_OP_WRITE: return "write"; | ||
27 | case CEPH_OSD_OP_DELETE: return "delete"; | ||
28 | case CEPH_OSD_OP_TRUNCATE: return "truncate"; | ||
29 | case CEPH_OSD_OP_ZERO: return "zero"; | ||
30 | case CEPH_OSD_OP_WRITEFULL: return "writefull"; | ||
31 | case CEPH_OSD_OP_ROLLBACK: return "rollback"; | ||
32 | |||
33 | case CEPH_OSD_OP_APPEND: return "append"; | ||
34 | case CEPH_OSD_OP_STARTSYNC: return "startsync"; | ||
35 | case CEPH_OSD_OP_SETTRUNC: return "settrunc"; | ||
36 | case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; | ||
37 | |||
38 | case CEPH_OSD_OP_TMAPUP: return "tmapup"; | ||
39 | case CEPH_OSD_OP_TMAPGET: return "tmapget"; | ||
40 | case CEPH_OSD_OP_TMAPPUT: return "tmapput"; | ||
41 | |||
42 | case CEPH_OSD_OP_GETXATTR: return "getxattr"; | ||
43 | case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; | ||
44 | case CEPH_OSD_OP_SETXATTR: return "setxattr"; | ||
45 | case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; | ||
46 | case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; | ||
47 | case CEPH_OSD_OP_RMXATTR: return "rmxattr"; | ||
48 | case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; | ||
49 | |||
50 | case CEPH_OSD_OP_PULL: return "pull"; | ||
51 | case CEPH_OSD_OP_PUSH: return "push"; | ||
52 | case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; | ||
53 | case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; | ||
54 | case CEPH_OSD_OP_SCRUB: return "scrub"; | ||
55 | |||
56 | case CEPH_OSD_OP_WRLOCK: return "wrlock"; | ||
57 | case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; | ||
58 | case CEPH_OSD_OP_RDLOCK: return "rdlock"; | ||
59 | case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; | ||
60 | case CEPH_OSD_OP_UPLOCK: return "uplock"; | ||
61 | case CEPH_OSD_OP_DNLOCK: return "dnlock"; | ||
62 | |||
63 | case CEPH_OSD_OP_CALL: return "call"; | ||
64 | |||
65 | case CEPH_OSD_OP_PGLS: return "pgls"; | ||
66 | } | ||
67 | return "???"; | ||
68 | } | ||
69 | 7 | ||
70 | const char *ceph_mds_state_name(int s) | 8 | const char *ceph_mds_state_name(int s) |
71 | { | 9 | { |
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o) | |||
177 | } | 115 | } |
178 | return "???"; | 116 | return "???"; |
179 | } | 117 | } |
180 | |||
181 | const char *ceph_pool_op_name(int op) | ||
182 | { | ||
183 | switch (op) { | ||
184 | case POOL_OP_CREATE: return "create"; | ||
185 | case POOL_OP_DELETE: return "delete"; | ||
186 | case POOL_OP_AUID_CHANGE: return "auid change"; | ||
187 | case POOL_OP_CREATE_SNAP: return "create snap"; | ||
188 | case POOL_OP_DELETE_SNAP: return "delete snap"; | ||
189 | case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; | ||
190 | case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; | ||
191 | } | ||
192 | return "???"; | ||
193 | } | ||
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9922628532b2..d6e0e0421891 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | #include "ceph_debug.h" | 2 | #include <linux/ceph/ceph_debug.h> |
3 | 3 | ||
4 | #include <linux/backing-dev.h> | 4 | #include <linux/backing-dev.h> |
5 | #include <linux/ctype.h> | 5 | #include <linux/ctype.h> |
@@ -15,10 +15,13 @@ | |||
15 | #include <linux/statfs.h> | 15 | #include <linux/statfs.h> |
16 | #include <linux/string.h> | 16 | #include <linux/string.h> |
17 | 17 | ||
18 | #include "decode.h" | ||
19 | #include "super.h" | 18 | #include "super.h" |
20 | #include "mon_client.h" | 19 | #include "mds_client.h" |
21 | #include "auth.h" | 20 | |
21 | #include <linux/ceph/decode.h> | ||
22 | #include <linux/ceph/mon_client.h> | ||
23 | #include <linux/ceph/auth.h> | ||
24 | #include <linux/ceph/debugfs.h> | ||
22 | 25 | ||
23 | /* | 26 | /* |
24 | * Ceph superblock operations | 27 | * Ceph superblock operations |
@@ -26,36 +29,22 @@ | |||
26 | * Handle the basics of mounting, unmounting. | 29 | * Handle the basics of mounting, unmounting. |
27 | */ | 30 | */ |
28 | 31 | ||
29 | |||
30 | /* | ||
31 | * find filename portion of a path (/foo/bar/baz -> baz) | ||
32 | */ | ||
33 | const char *ceph_file_part(const char *s, int len) | ||
34 | { | ||
35 | const char *e = s + len; | ||
36 | |||
37 | while (e != s && *(e-1) != '/') | ||
38 | e--; | ||
39 | return e; | ||
40 | } | ||
41 | |||
42 | |||
43 | /* | 32 | /* |
44 | * super ops | 33 | * super ops |
45 | */ | 34 | */ |
46 | static void ceph_put_super(struct super_block *s) | 35 | static void ceph_put_super(struct super_block *s) |
47 | { | 36 | { |
48 | struct ceph_client *client = ceph_sb_to_client(s); | 37 | struct ceph_fs_client *fsc = ceph_sb_to_client(s); |
49 | 38 | ||
50 | dout("put_super\n"); | 39 | dout("put_super\n"); |
51 | ceph_mdsc_close_sessions(&client->mdsc); | 40 | ceph_mdsc_close_sessions(fsc->mdsc); |
52 | 41 | ||
53 | /* | 42 | /* |
54 | * ensure we release the bdi before put_anon_super releases | 43 | * ensure we release the bdi before put_anon_super releases |
55 | * the device name. | 44 | * the device name. |
56 | */ | 45 | */ |
57 | if (s->s_bdi == &client->backing_dev_info) { | 46 | if (s->s_bdi == &fsc->backing_dev_info) { |
58 | bdi_unregister(&client->backing_dev_info); | 47 | bdi_unregister(&fsc->backing_dev_info); |
59 | s->s_bdi = NULL; | 48 | s->s_bdi = NULL; |
60 | } | 49 | } |
61 | 50 | ||
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s) | |||
64 | 53 | ||
65 | static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) | 54 | static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) |
66 | { | 55 | { |
67 | struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); | 56 | struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); |
68 | struct ceph_monmap *monmap = client->monc.monmap; | 57 | struct ceph_monmap *monmap = fsc->client->monc.monmap; |
69 | struct ceph_statfs st; | 58 | struct ceph_statfs st; |
70 | u64 fsid; | 59 | u64 fsid; |
71 | int err; | 60 | int err; |
72 | 61 | ||
73 | dout("statfs\n"); | 62 | dout("statfs\n"); |
74 | err = ceph_monc_do_statfs(&client->monc, &st); | 63 | err = ceph_monc_do_statfs(&fsc->client->monc, &st); |
75 | if (err < 0) | 64 | if (err < 0) |
76 | return err; | 65 | return err; |
77 | 66 | ||
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
104 | 93 | ||
105 | static int ceph_sync_fs(struct super_block *sb, int wait) | 94 | static int ceph_sync_fs(struct super_block *sb, int wait) |
106 | { | 95 | { |
107 | struct ceph_client *client = ceph_sb_to_client(sb); | 96 | struct ceph_fs_client *fsc = ceph_sb_to_client(sb); |
108 | 97 | ||
109 | if (!wait) { | 98 | if (!wait) { |
110 | dout("sync_fs (non-blocking)\n"); | 99 | dout("sync_fs (non-blocking)\n"); |
111 | ceph_flush_dirty_caps(&client->mdsc); | 100 | ceph_flush_dirty_caps(fsc->mdsc); |
112 | dout("sync_fs (non-blocking) done\n"); | 101 | dout("sync_fs (non-blocking) done\n"); |
113 | return 0; | 102 | return 0; |
114 | } | 103 | } |
115 | 104 | ||
116 | dout("sync_fs (blocking)\n"); | 105 | dout("sync_fs (blocking)\n"); |
117 | ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); | 106 | ceph_osdc_sync(&fsc->client->osdc); |
118 | ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); | 107 | ceph_mdsc_sync(fsc->mdsc); |
119 | dout("sync_fs (blocking) done\n"); | 108 | dout("sync_fs (blocking) done\n"); |
120 | return 0; | 109 | return 0; |
121 | } | 110 | } |
122 | 111 | ||
123 | static int default_congestion_kb(void) | ||
124 | { | ||
125 | int congestion_kb; | ||
126 | |||
127 | /* | ||
128 | * Copied from NFS | ||
129 | * | ||
130 | * congestion size, scale with available memory. | ||
131 | * | ||
132 | * 64MB: 8192k | ||
133 | * 128MB: 11585k | ||
134 | * 256MB: 16384k | ||
135 | * 512MB: 23170k | ||
136 | * 1GB: 32768k | ||
137 | * 2GB: 46340k | ||
138 | * 4GB: 65536k | ||
139 | * 8GB: 92681k | ||
140 | * 16GB: 131072k | ||
141 | * | ||
142 | * This allows larger machines to have larger/more transfers. | ||
143 | * Limit the default to 256M | ||
144 | */ | ||
145 | congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); | ||
146 | if (congestion_kb > 256*1024) | ||
147 | congestion_kb = 256*1024; | ||
148 | |||
149 | return congestion_kb; | ||
150 | } | ||
151 | |||
152 | /** | ||
153 | * ceph_show_options - Show mount options in /proc/mounts | ||
154 | * @m: seq_file to write to | ||
155 | * @mnt: mount descriptor | ||
156 | */ | ||
157 | static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) | ||
158 | { | ||
159 | struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb); | ||
160 | struct ceph_mount_args *args = client->mount_args; | ||
161 | |||
162 | if (args->flags & CEPH_OPT_FSID) | ||
163 | seq_printf(m, ",fsid=%pU", &args->fsid); | ||
164 | if (args->flags & CEPH_OPT_NOSHARE) | ||
165 | seq_puts(m, ",noshare"); | ||
166 | if (args->flags & CEPH_OPT_DIRSTAT) | ||
167 | seq_puts(m, ",dirstat"); | ||
168 | if ((args->flags & CEPH_OPT_RBYTES) == 0) | ||
169 | seq_puts(m, ",norbytes"); | ||
170 | if (args->flags & CEPH_OPT_NOCRC) | ||
171 | seq_puts(m, ",nocrc"); | ||
172 | if (args->flags & CEPH_OPT_NOASYNCREADDIR) | ||
173 | seq_puts(m, ",noasyncreaddir"); | ||
174 | |||
175 | if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) | ||
176 | seq_printf(m, ",mount_timeout=%d", args->mount_timeout); | ||
177 | if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) | ||
178 | seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); | ||
179 | if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) | ||
180 | seq_printf(m, ",osdtimeout=%d", args->osd_timeout); | ||
181 | if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) | ||
182 | seq_printf(m, ",osdkeepalivetimeout=%d", | ||
183 | args->osd_keepalive_timeout); | ||
184 | if (args->wsize) | ||
185 | seq_printf(m, ",wsize=%d", args->wsize); | ||
186 | if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) | ||
187 | seq_printf(m, ",rsize=%d", args->rsize); | ||
188 | if (args->congestion_kb != default_congestion_kb()) | ||
189 | seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); | ||
190 | if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) | ||
191 | seq_printf(m, ",caps_wanted_delay_min=%d", | ||
192 | args->caps_wanted_delay_min); | ||
193 | if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) | ||
194 | seq_printf(m, ",caps_wanted_delay_max=%d", | ||
195 | args->caps_wanted_delay_max); | ||
196 | if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) | ||
197 | seq_printf(m, ",cap_release_safety=%d", | ||
198 | args->cap_release_safety); | ||
199 | if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) | ||
200 | seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); | ||
201 | if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) | ||
202 | seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); | ||
203 | if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) | ||
204 | seq_printf(m, ",snapdirname=%s", args->snapdir_name); | ||
205 | if (args->name) | ||
206 | seq_printf(m, ",name=%s", args->name); | ||
207 | if (args->secret) | ||
208 | seq_puts(m, ",secret=<hidden>"); | ||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * caches | ||
214 | */ | ||
215 | struct kmem_cache *ceph_inode_cachep; | ||
216 | struct kmem_cache *ceph_cap_cachep; | ||
217 | struct kmem_cache *ceph_dentry_cachep; | ||
218 | struct kmem_cache *ceph_file_cachep; | ||
219 | |||
220 | static void ceph_inode_init_once(void *foo) | ||
221 | { | ||
222 | struct ceph_inode_info *ci = foo; | ||
223 | inode_init_once(&ci->vfs_inode); | ||
224 | } | ||
225 | |||
226 | static int __init init_caches(void) | ||
227 | { | ||
228 | ceph_inode_cachep = kmem_cache_create("ceph_inode_info", | ||
229 | sizeof(struct ceph_inode_info), | ||
230 | __alignof__(struct ceph_inode_info), | ||
231 | (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), | ||
232 | ceph_inode_init_once); | ||
233 | if (ceph_inode_cachep == NULL) | ||
234 | return -ENOMEM; | ||
235 | |||
236 | ceph_cap_cachep = KMEM_CACHE(ceph_cap, | ||
237 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | ||
238 | if (ceph_cap_cachep == NULL) | ||
239 | goto bad_cap; | ||
240 | |||
241 | ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, | ||
242 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | ||
243 | if (ceph_dentry_cachep == NULL) | ||
244 | goto bad_dentry; | ||
245 | |||
246 | ceph_file_cachep = KMEM_CACHE(ceph_file_info, | ||
247 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | ||
248 | if (ceph_file_cachep == NULL) | ||
249 | goto bad_file; | ||
250 | |||
251 | return 0; | ||
252 | |||
253 | bad_file: | ||
254 | kmem_cache_destroy(ceph_dentry_cachep); | ||
255 | bad_dentry: | ||
256 | kmem_cache_destroy(ceph_cap_cachep); | ||
257 | bad_cap: | ||
258 | kmem_cache_destroy(ceph_inode_cachep); | ||
259 | return -ENOMEM; | ||
260 | } | ||
261 | |||
262 | static void destroy_caches(void) | ||
263 | { | ||
264 | kmem_cache_destroy(ceph_inode_cachep); | ||
265 | kmem_cache_destroy(ceph_cap_cachep); | ||
266 | kmem_cache_destroy(ceph_dentry_cachep); | ||
267 | kmem_cache_destroy(ceph_file_cachep); | ||
268 | } | ||
269 | |||
270 | |||
271 | /* | ||
272 | * ceph_umount_begin - initiate forced umount. Tear down down the | ||
273 | * mount, skipping steps that may hang while waiting for server(s). | ||
274 | */ | ||
275 | static void ceph_umount_begin(struct super_block *sb) | ||
276 | { | ||
277 | struct ceph_client *client = ceph_sb_to_client(sb); | ||
278 | |||
279 | dout("ceph_umount_begin - starting forced umount\n"); | ||
280 | if (!client) | ||
281 | return; | ||
282 | client->mount_state = CEPH_MOUNT_SHUTDOWN; | ||
283 | return; | ||
284 | } | ||
285 | |||
286 | static const struct super_operations ceph_super_ops = { | ||
287 | .alloc_inode = ceph_alloc_inode, | ||
288 | .destroy_inode = ceph_destroy_inode, | ||
289 | .write_inode = ceph_write_inode, | ||
290 | .sync_fs = ceph_sync_fs, | ||
291 | .put_super = ceph_put_super, | ||
292 | .show_options = ceph_show_options, | ||
293 | .statfs = ceph_statfs, | ||
294 | .umount_begin = ceph_umount_begin, | ||
295 | }; | ||
296 | |||
297 | |||
298 | const char *ceph_msg_type_name(int type) | ||
299 | { | ||
300 | switch (type) { | ||
301 | case CEPH_MSG_SHUTDOWN: return "shutdown"; | ||
302 | case CEPH_MSG_PING: return "ping"; | ||
303 | case CEPH_MSG_AUTH: return "auth"; | ||
304 | case CEPH_MSG_AUTH_REPLY: return "auth_reply"; | ||
305 | case CEPH_MSG_MON_MAP: return "mon_map"; | ||
306 | case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; | ||
307 | case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; | ||
308 | case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; | ||
309 | case CEPH_MSG_STATFS: return "statfs"; | ||
310 | case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; | ||
311 | case CEPH_MSG_MDS_MAP: return "mds_map"; | ||
312 | case CEPH_MSG_CLIENT_SESSION: return "client_session"; | ||
313 | case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; | ||
314 | case CEPH_MSG_CLIENT_REQUEST: return "client_request"; | ||
315 | case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; | ||
316 | case CEPH_MSG_CLIENT_REPLY: return "client_reply"; | ||
317 | case CEPH_MSG_CLIENT_CAPS: return "client_caps"; | ||
318 | case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; | ||
319 | case CEPH_MSG_CLIENT_SNAP: return "client_snap"; | ||
320 | case CEPH_MSG_CLIENT_LEASE: return "client_lease"; | ||
321 | case CEPH_MSG_OSD_MAP: return "osd_map"; | ||
322 | case CEPH_MSG_OSD_OP: return "osd_op"; | ||
323 | case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; | ||
324 | default: return "unknown"; | ||
325 | } | ||
326 | } | ||
327 | |||
328 | |||
329 | /* | 112 | /* |
330 | * mount options | 113 | * mount options |
331 | */ | 114 | */ |
332 | enum { | 115 | enum { |
333 | Opt_wsize, | 116 | Opt_wsize, |
334 | Opt_rsize, | 117 | Opt_rsize, |
335 | Opt_osdtimeout, | ||
336 | Opt_osdkeepalivetimeout, | ||
337 | Opt_mount_timeout, | ||
338 | Opt_osd_idle_ttl, | ||
339 | Opt_caps_wanted_delay_min, | 118 | Opt_caps_wanted_delay_min, |
340 | Opt_caps_wanted_delay_max, | 119 | Opt_caps_wanted_delay_max, |
341 | Opt_cap_release_safety, | 120 | Opt_cap_release_safety, |
@@ -344,29 +123,19 @@ enum { | |||
344 | Opt_congestion_kb, | 123 | Opt_congestion_kb, |
345 | Opt_last_int, | 124 | Opt_last_int, |
346 | /* int args above */ | 125 | /* int args above */ |
347 | Opt_fsid, | ||
348 | Opt_snapdirname, | 126 | Opt_snapdirname, |
349 | Opt_name, | ||
350 | Opt_secret, | ||
351 | Opt_last_string, | 127 | Opt_last_string, |
352 | /* string args above */ | 128 | /* string args above */ |
353 | Opt_ip, | ||
354 | Opt_noshare, | ||
355 | Opt_dirstat, | 129 | Opt_dirstat, |
356 | Opt_nodirstat, | 130 | Opt_nodirstat, |
357 | Opt_rbytes, | 131 | Opt_rbytes, |
358 | Opt_norbytes, | 132 | Opt_norbytes, |
359 | Opt_nocrc, | ||
360 | Opt_noasyncreaddir, | 133 | Opt_noasyncreaddir, |
361 | }; | 134 | }; |
362 | 135 | ||
363 | static match_table_t arg_tokens = { | 136 | static match_table_t fsopt_tokens = { |
364 | {Opt_wsize, "wsize=%d"}, | 137 | {Opt_wsize, "wsize=%d"}, |
365 | {Opt_rsize, "rsize=%d"}, | 138 | {Opt_rsize, "rsize=%d"}, |
366 | {Opt_osdtimeout, "osdtimeout=%d"}, | ||
367 | {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, | ||
368 | {Opt_mount_timeout, "mount_timeout=%d"}, | ||
369 | {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, | ||
370 | {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, | 139 | {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, |
371 | {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, | 140 | {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, |
372 | {Opt_cap_release_safety, "cap_release_safety=%d"}, | 141 | {Opt_cap_release_safety, "cap_release_safety=%d"}, |
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = { | |||
374 | {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, | 143 | {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, |
375 | {Opt_congestion_kb, "write_congestion_kb=%d"}, | 144 | {Opt_congestion_kb, "write_congestion_kb=%d"}, |
376 | /* int args above */ | 145 | /* int args above */ |
377 | {Opt_fsid, "fsid=%s"}, | ||
378 | {Opt_snapdirname, "snapdirname=%s"}, | 146 | {Opt_snapdirname, "snapdirname=%s"}, |
379 | {Opt_name, "name=%s"}, | ||
380 | {Opt_secret, "secret=%s"}, | ||
381 | /* string args above */ | 147 | /* string args above */ |
382 | {Opt_ip, "ip=%s"}, | ||
383 | {Opt_noshare, "noshare"}, | ||
384 | {Opt_dirstat, "dirstat"}, | 148 | {Opt_dirstat, "dirstat"}, |
385 | {Opt_nodirstat, "nodirstat"}, | 149 | {Opt_nodirstat, "nodirstat"}, |
386 | {Opt_rbytes, "rbytes"}, | 150 | {Opt_rbytes, "rbytes"}, |
387 | {Opt_norbytes, "norbytes"}, | 151 | {Opt_norbytes, "norbytes"}, |
388 | {Opt_nocrc, "nocrc"}, | ||
389 | {Opt_noasyncreaddir, "noasyncreaddir"}, | 152 | {Opt_noasyncreaddir, "noasyncreaddir"}, |
390 | {-1, NULL} | 153 | {-1, NULL} |
391 | }; | 154 | }; |
392 | 155 | ||
393 | static int parse_fsid(const char *str, struct ceph_fsid *fsid) | 156 | static int parse_fsopt_token(char *c, void *private) |
394 | { | 157 | { |
395 | int i = 0; | 158 | struct ceph_mount_options *fsopt = private; |
396 | char tmp[3]; | 159 | substring_t argstr[MAX_OPT_ARGS]; |
397 | int err = -EINVAL; | 160 | int token, intval, ret; |
398 | int d; | 161 | |
399 | 162 | token = match_token((char *)c, fsopt_tokens, argstr); | |
400 | dout("parse_fsid '%s'\n", str); | 163 | if (token < 0) |
401 | tmp[2] = 0; | 164 | return -EINVAL; |
402 | while (*str && i < 16) { | 165 | |
403 | if (ispunct(*str)) { | 166 | if (token < Opt_last_int) { |
404 | str++; | 167 | ret = match_int(&argstr[0], &intval); |
405 | continue; | 168 | if (ret < 0) { |
169 | pr_err("bad mount option arg (not int) " | ||
170 | "at '%s'\n", c); | ||
171 | return ret; | ||
406 | } | 172 | } |
407 | if (!isxdigit(str[0]) || !isxdigit(str[1])) | 173 | dout("got int token %d val %d\n", token, intval); |
408 | break; | 174 | } else if (token > Opt_last_int && token < Opt_last_string) { |
409 | tmp[0] = str[0]; | 175 | dout("got string token %d val %s\n", token, |
410 | tmp[1] = str[1]; | 176 | argstr[0].from); |
411 | if (sscanf(tmp, "%x", &d) < 1) | 177 | } else { |
412 | break; | 178 | dout("got token %d\n", token); |
413 | fsid->fsid[i] = d & 0xff; | ||
414 | i++; | ||
415 | str += 2; | ||
416 | } | 179 | } |
417 | 180 | ||
418 | if (i == 16) | 181 | switch (token) { |
419 | err = 0; | 182 | case Opt_snapdirname: |
420 | dout("parse_fsid ret %d got fsid %pU", err, fsid); | 183 | kfree(fsopt->snapdir_name); |
421 | return err; | 184 | fsopt->snapdir_name = kstrndup(argstr[0].from, |
185 | argstr[0].to-argstr[0].from, | ||
186 | GFP_KERNEL); | ||
187 | if (!fsopt->snapdir_name) | ||
188 | return -ENOMEM; | ||
189 | break; | ||
190 | |||
191 | /* misc */ | ||
192 | case Opt_wsize: | ||
193 | fsopt->wsize = intval; | ||
194 | break; | ||
195 | case Opt_rsize: | ||
196 | fsopt->rsize = intval; | ||
197 | break; | ||
198 | case Opt_caps_wanted_delay_min: | ||
199 | fsopt->caps_wanted_delay_min = intval; | ||
200 | break; | ||
201 | case Opt_caps_wanted_delay_max: | ||
202 | fsopt->caps_wanted_delay_max = intval; | ||
203 | break; | ||
204 | case Opt_readdir_max_entries: | ||
205 | fsopt->max_readdir = intval; | ||
206 | break; | ||
207 | case Opt_readdir_max_bytes: | ||
208 | fsopt->max_readdir_bytes = intval; | ||
209 | break; | ||
210 | case Opt_congestion_kb: | ||
211 | fsopt->congestion_kb = intval; | ||
212 | break; | ||
213 | case Opt_dirstat: | ||
214 | fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; | ||
215 | break; | ||
216 | case Opt_nodirstat: | ||
217 | fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; | ||
218 | break; | ||
219 | case Opt_rbytes: | ||
220 | fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; | ||
221 | break; | ||
222 | case Opt_norbytes: | ||
223 | fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; | ||
224 | break; | ||
225 | case Opt_noasyncreaddir: | ||
226 | fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; | ||
227 | break; | ||
228 | default: | ||
229 | BUG_ON(token); | ||
230 | } | ||
231 | return 0; | ||
422 | } | 232 | } |
423 | 233 | ||
424 | static struct ceph_mount_args *parse_mount_args(int flags, char *options, | 234 | static void destroy_mount_options(struct ceph_mount_options *args) |
425 | const char *dev_name, | ||
426 | const char **path) | ||
427 | { | 235 | { |
428 | struct ceph_mount_args *args; | 236 | dout("destroy_mount_options %p\n", args); |
429 | const char *c; | 237 | kfree(args->snapdir_name); |
430 | int err = -ENOMEM; | 238 | kfree(args); |
431 | substring_t argstr[MAX_OPT_ARGS]; | 239 | } |
432 | 240 | ||
433 | args = kzalloc(sizeof(*args), GFP_KERNEL); | 241 | static int strcmp_null(const char *s1, const char *s2) |
434 | if (!args) | 242 | { |
435 | return ERR_PTR(-ENOMEM); | 243 | if (!s1 && !s2) |
436 | args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), | 244 | return 0; |
437 | GFP_KERNEL); | 245 | if (s1 && !s2) |
438 | if (!args->mon_addr) | 246 | return -1; |
439 | goto out; | 247 | if (!s1 && s2) |
248 | return 1; | ||
249 | return strcmp(s1, s2); | ||
250 | } | ||
440 | 251 | ||
441 | dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); | 252 | static int compare_mount_options(struct ceph_mount_options *new_fsopt, |
442 | 253 | struct ceph_options *new_opt, | |
443 | /* start with defaults */ | 254 | struct ceph_fs_client *fsc) |
444 | args->sb_flags = flags; | 255 | { |
445 | args->flags = CEPH_OPT_DEFAULT; | 256 | struct ceph_mount_options *fsopt1 = new_fsopt; |
446 | args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; | 257 | struct ceph_mount_options *fsopt2 = fsc->mount_options; |
447 | args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; | 258 | int ofs = offsetof(struct ceph_mount_options, snapdir_name); |
448 | args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ | 259 | int ret; |
449 | args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ | ||
450 | args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; | ||
451 | args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; | ||
452 | args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; | ||
453 | args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); | ||
454 | args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; | ||
455 | args->max_readdir = CEPH_MAX_READDIR_DEFAULT; | ||
456 | args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; | ||
457 | args->congestion_kb = default_congestion_kb(); | ||
458 | |||
459 | /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ | ||
460 | err = -EINVAL; | ||
461 | if (!dev_name) | ||
462 | goto out; | ||
463 | *path = strstr(dev_name, ":/"); | ||
464 | if (*path == NULL) { | ||
465 | pr_err("device name is missing path (no :/ in %s)\n", | ||
466 | dev_name); | ||
467 | goto out; | ||
468 | } | ||
469 | 260 | ||
470 | /* get mon ip(s) */ | 261 | ret = memcmp(fsopt1, fsopt2, ofs); |
471 | err = ceph_parse_ips(dev_name, *path, args->mon_addr, | 262 | if (ret) |
472 | CEPH_MAX_MON, &args->num_mon); | 263 | return ret; |
473 | if (err < 0) | 264 | |
474 | goto out; | 265 | ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); |
266 | if (ret) | ||
267 | return ret; | ||
268 | |||
269 | return ceph_compare_options(new_opt, fsc->client); | ||
270 | } | ||
271 | |||
272 | static int parse_mount_options(struct ceph_mount_options **pfsopt, | ||
273 | struct ceph_options **popt, | ||
274 | int flags, char *options, | ||
275 | const char *dev_name, | ||
276 | const char **path) | ||
277 | { | ||
278 | struct ceph_mount_options *fsopt; | ||
279 | const char *dev_name_end; | ||
280 | int err = -ENOMEM; | ||
281 | |||
282 | fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); | ||
283 | if (!fsopt) | ||
284 | return -ENOMEM; | ||
285 | |||
286 | dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); | ||
287 | |||
288 | fsopt->sb_flags = flags; | ||
289 | fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; | ||
290 | |||
291 | fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; | ||
292 | fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); | ||
293 | fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; | ||
294 | fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; | ||
295 | fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; | ||
296 | fsopt->congestion_kb = default_congestion_kb(); | ||
297 | |||
298 | /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ | ||
299 | err = -EINVAL; | ||
300 | if (!dev_name) | ||
301 | goto out; | ||
302 | *path = strstr(dev_name, ":/"); | ||
303 | if (*path == NULL) { | ||
304 | pr_err("device name is missing path (no :/ in %s)\n", | ||
305 | dev_name); | ||
306 | goto out; | ||
307 | } | ||
308 | dev_name_end = *path; | ||
309 | dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); | ||
475 | 310 | ||
476 | /* path on server */ | 311 | /* path on server */ |
477 | *path += 2; | 312 | *path += 2; |
478 | dout("server path '%s'\n", *path); | 313 | dout("server path '%s'\n", *path); |
479 | 314 | ||
480 | /* parse mount options */ | 315 | err = ceph_parse_options(popt, options, dev_name, dev_name_end, |
481 | while ((c = strsep(&options, ",")) != NULL) { | 316 | parse_fsopt_token, (void *)fsopt); |
482 | int token, intval, ret; | 317 | if (err) |
483 | if (!*c) | 318 | goto out; |
484 | continue; | 319 | |
485 | err = -EINVAL; | 320 | /* success */ |
486 | token = match_token((char *)c, arg_tokens, argstr); | 321 | *pfsopt = fsopt; |
487 | if (token < 0) { | 322 | return 0; |
488 | pr_err("bad mount option at '%s'\n", c); | ||
489 | goto out; | ||
490 | } | ||
491 | if (token < Opt_last_int) { | ||
492 | ret = match_int(&argstr[0], &intval); | ||
493 | if (ret < 0) { | ||
494 | pr_err("bad mount option arg (not int) " | ||
495 | "at '%s'\n", c); | ||
496 | continue; | ||
497 | } | ||
498 | dout("got int token %d val %d\n", token, intval); | ||
499 | } else if (token > Opt_last_int && token < Opt_last_string) { | ||
500 | dout("got string token %d val %s\n", token, | ||
501 | argstr[0].from); | ||
502 | } else { | ||
503 | dout("got token %d\n", token); | ||
504 | } | ||
505 | switch (token) { | ||
506 | case Opt_ip: | ||
507 | err = ceph_parse_ips(argstr[0].from, | ||
508 | argstr[0].to, | ||
509 | &args->my_addr, | ||
510 | 1, NULL); | ||
511 | if (err < 0) | ||
512 | goto out; | ||
513 | args->flags |= CEPH_OPT_MYIP; | ||
514 | break; | ||
515 | |||
516 | case Opt_fsid: | ||
517 | err = parse_fsid(argstr[0].from, &args->fsid); | ||
518 | if (err == 0) | ||
519 | args->flags |= CEPH_OPT_FSID; | ||
520 | break; | ||
521 | case Opt_snapdirname: | ||
522 | kfree(args->snapdir_name); | ||
523 | args->snapdir_name = kstrndup(argstr[0].from, | ||
524 | argstr[0].to-argstr[0].from, | ||
525 | GFP_KERNEL); | ||
526 | break; | ||
527 | case Opt_name: | ||
528 | args->name = kstrndup(argstr[0].from, | ||
529 | argstr[0].to-argstr[0].from, | ||
530 | GFP_KERNEL); | ||
531 | break; | ||
532 | case Opt_secret: | ||
533 | args->secret = kstrndup(argstr[0].from, | ||
534 | argstr[0].to-argstr[0].from, | ||
535 | GFP_KERNEL); | ||
536 | break; | ||
537 | |||
538 | /* misc */ | ||
539 | case Opt_wsize: | ||
540 | args->wsize = intval; | ||
541 | break; | ||
542 | case Opt_rsize: | ||
543 | args->rsize = intval; | ||
544 | break; | ||
545 | case Opt_osdtimeout: | ||
546 | args->osd_timeout = intval; | ||
547 | break; | ||
548 | case Opt_osdkeepalivetimeout: | ||
549 | args->osd_keepalive_timeout = intval; | ||
550 | break; | ||
551 | case Opt_osd_idle_ttl: | ||
552 | args->osd_idle_ttl = intval; | ||
553 | break; | ||
554 | case Opt_mount_timeout: | ||
555 | args->mount_timeout = intval; | ||
556 | break; | ||
557 | case Opt_caps_wanted_delay_min: | ||
558 | args->caps_wanted_delay_min = intval; | ||
559 | break; | ||
560 | case Opt_caps_wanted_delay_max: | ||
561 | args->caps_wanted_delay_max = intval; | ||
562 | break; | ||
563 | case Opt_readdir_max_entries: | ||
564 | args->max_readdir = intval; | ||
565 | break; | ||
566 | case Opt_readdir_max_bytes: | ||
567 | args->max_readdir_bytes = intval; | ||
568 | break; | ||
569 | case Opt_congestion_kb: | ||
570 | args->congestion_kb = intval; | ||
571 | break; | ||
572 | |||
573 | case Opt_noshare: | ||
574 | args->flags |= CEPH_OPT_NOSHARE; | ||
575 | break; | ||
576 | |||
577 | case Opt_dirstat: | ||
578 | args->flags |= CEPH_OPT_DIRSTAT; | ||
579 | break; | ||
580 | case Opt_nodirstat: | ||
581 | args->flags &= ~CEPH_OPT_DIRSTAT; | ||
582 | break; | ||
583 | case Opt_rbytes: | ||
584 | args->flags |= CEPH_OPT_RBYTES; | ||
585 | break; | ||
586 | case Opt_norbytes: | ||
587 | args->flags &= ~CEPH_OPT_RBYTES; | ||
588 | break; | ||
589 | case Opt_nocrc: | ||
590 | args->flags |= CEPH_OPT_NOCRC; | ||
591 | break; | ||
592 | case Opt_noasyncreaddir: | ||
593 | args->flags |= CEPH_OPT_NOASYNCREADDIR; | ||
594 | break; | ||
595 | |||
596 | default: | ||
597 | BUG_ON(token); | ||
598 | } | ||
599 | } | ||
600 | return args; | ||
601 | 323 | ||
602 | out: | 324 | out: |
603 | kfree(args->mon_addr); | 325 | destroy_mount_options(fsopt); |
604 | kfree(args); | 326 | return err; |
605 | return ERR_PTR(err); | ||
606 | } | 327 | } |
607 | 328 | ||
608 | static void destroy_mount_args(struct ceph_mount_args *args) | 329 | /** |
330 | * ceph_show_options - Show mount options in /proc/mounts | ||
331 | * @m: seq_file to write to | ||
332 | * @mnt: mount descriptor | ||
333 | */ | ||
334 | static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) | ||
609 | { | 335 | { |
610 | dout("destroy_mount_args %p\n", args); | 336 | struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); |
611 | kfree(args->snapdir_name); | 337 | struct ceph_mount_options *fsopt = fsc->mount_options; |
612 | args->snapdir_name = NULL; | 338 | struct ceph_options *opt = fsc->client->options; |
613 | kfree(args->name); | 339 | |
614 | args->name = NULL; | 340 | if (opt->flags & CEPH_OPT_FSID) |
615 | kfree(args->secret); | 341 | seq_printf(m, ",fsid=%pU", &opt->fsid); |
616 | args->secret = NULL; | 342 | if (opt->flags & CEPH_OPT_NOSHARE) |
617 | kfree(args); | 343 | seq_puts(m, ",noshare"); |
344 | if (opt->flags & CEPH_OPT_NOCRC) | ||
345 | seq_puts(m, ",nocrc"); | ||
346 | |||
347 | if (opt->name) | ||
348 | seq_printf(m, ",name=%s", opt->name); | ||
349 | if (opt->secret) | ||
350 | seq_puts(m, ",secret=<hidden>"); | ||
351 | |||
352 | if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) | ||
353 | seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); | ||
354 | if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) | ||
355 | seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); | ||
356 | if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) | ||
357 | seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); | ||
358 | if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) | ||
359 | seq_printf(m, ",osdkeepalivetimeout=%d", | ||
360 | opt->osd_keepalive_timeout); | ||
361 | |||
362 | if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) | ||
363 | seq_puts(m, ",dirstat"); | ||
364 | if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) | ||
365 | seq_puts(m, ",norbytes"); | ||
366 | if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) | ||
367 | seq_puts(m, ",noasyncreaddir"); | ||
368 | |||
369 | if (fsopt->wsize) | ||
370 | seq_printf(m, ",wsize=%d", fsopt->wsize); | ||
371 | if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) | ||
372 | seq_printf(m, ",rsize=%d", fsopt->rsize); | ||
373 | if (fsopt->congestion_kb != default_congestion_kb()) | ||
374 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); | ||
375 | if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) | ||
376 | seq_printf(m, ",caps_wanted_delay_min=%d", | ||
377 | fsopt->caps_wanted_delay_min); | ||
378 | if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) | ||
379 | seq_printf(m, ",caps_wanted_delay_max=%d", | ||
380 | fsopt->caps_wanted_delay_max); | ||
381 | if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) | ||
382 | seq_printf(m, ",cap_release_safety=%d", | ||
383 | fsopt->cap_release_safety); | ||
384 | if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) | ||
385 | seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); | ||
386 | if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) | ||
387 | seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); | ||
388 | if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) | ||
389 | seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); | ||
390 | return 0; | ||
618 | } | 391 | } |
619 | 392 | ||
620 | /* | 393 | /* |
621 | * create a fresh client instance | 394 | * handle any mon messages the standard library doesn't understand. |
395 | * return error if we don't either. | ||
622 | */ | 396 | */ |
623 | static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) | 397 | static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) |
624 | { | 398 | { |
625 | struct ceph_client *client; | 399 | struct ceph_fs_client *fsc = client->private; |
400 | int type = le16_to_cpu(msg->hdr.type); | ||
401 | |||
402 | switch (type) { | ||
403 | case CEPH_MSG_MDS_MAP: | ||
404 | ceph_mdsc_handle_map(fsc->mdsc, msg); | ||
405 | return 0; | ||
406 | |||
407 | default: | ||
408 | return -1; | ||
409 | } | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * create a new fs client | ||
414 | */ | ||
415 | struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, | ||
416 | struct ceph_options *opt) | ||
417 | { | ||
418 | struct ceph_fs_client *fsc; | ||
626 | int err = -ENOMEM; | 419 | int err = -ENOMEM; |
627 | 420 | ||
628 | client = kzalloc(sizeof(*client), GFP_KERNEL); | 421 | fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); |
629 | if (client == NULL) | 422 | if (!fsc) |
630 | return ERR_PTR(-ENOMEM); | 423 | return ERR_PTR(-ENOMEM); |
631 | 424 | ||
632 | mutex_init(&client->mount_mutex); | 425 | fsc->client = ceph_create_client(opt, fsc); |
633 | 426 | if (IS_ERR(fsc->client)) { | |
634 | init_waitqueue_head(&client->auth_wq); | 427 | err = PTR_ERR(fsc->client); |
428 | goto fail; | ||
429 | } | ||
430 | fsc->client->extra_mon_dispatch = extra_mon_dispatch; | ||
431 | fsc->client->supported_features |= CEPH_FEATURE_FLOCK; | ||
432 | fsc->client->monc.want_mdsmap = 1; | ||
635 | 433 | ||
636 | client->sb = NULL; | 434 | fsc->mount_options = fsopt; |
637 | client->mount_state = CEPH_MOUNT_MOUNTING; | ||
638 | client->mount_args = args; | ||
639 | 435 | ||
640 | client->msgr = NULL; | 436 | fsc->sb = NULL; |
437 | fsc->mount_state = CEPH_MOUNT_MOUNTING; | ||
641 | 438 | ||
642 | client->auth_err = 0; | 439 | atomic_long_set(&fsc->writeback_count, 0); |
643 | atomic_long_set(&client->writeback_count, 0); | ||
644 | 440 | ||
645 | err = bdi_init(&client->backing_dev_info); | 441 | err = bdi_init(&fsc->backing_dev_info); |
646 | if (err < 0) | 442 | if (err < 0) |
647 | goto fail; | 443 | goto fail_client; |
648 | 444 | ||
649 | err = -ENOMEM; | 445 | err = -ENOMEM; |
650 | client->wb_wq = create_workqueue("ceph-writeback"); | 446 | fsc->wb_wq = create_workqueue("ceph-writeback"); |
651 | if (client->wb_wq == NULL) | 447 | if (fsc->wb_wq == NULL) |
652 | goto fail_bdi; | 448 | goto fail_bdi; |
653 | client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); | 449 | fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); |
654 | if (client->pg_inv_wq == NULL) | 450 | if (fsc->pg_inv_wq == NULL) |
655 | goto fail_wb_wq; | 451 | goto fail_wb_wq; |
656 | client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); | 452 | fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); |
657 | if (client->trunc_wq == NULL) | 453 | if (fsc->trunc_wq == NULL) |
658 | goto fail_pg_inv_wq; | 454 | goto fail_pg_inv_wq; |
659 | 455 | ||
660 | /* set up mempools */ | 456 | /* set up mempools */ |
661 | err = -ENOMEM; | 457 | err = -ENOMEM; |
662 | client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, | 458 | fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, |
663 | client->mount_args->wsize >> PAGE_CACHE_SHIFT); | 459 | fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); |
664 | if (!client->wb_pagevec_pool) | 460 | if (!fsc->wb_pagevec_pool) |
665 | goto fail_trunc_wq; | 461 | goto fail_trunc_wq; |
666 | 462 | ||
667 | /* caps */ | 463 | /* caps */ |
668 | client->min_caps = args->max_readdir; | 464 | fsc->min_caps = fsopt->max_readdir; |
465 | |||
466 | return fsc; | ||
669 | 467 | ||
670 | /* subsystems */ | ||
671 | err = ceph_monc_init(&client->monc, client); | ||
672 | if (err < 0) | ||
673 | goto fail_mempool; | ||
674 | err = ceph_osdc_init(&client->osdc, client); | ||
675 | if (err < 0) | ||
676 | goto fail_monc; | ||
677 | err = ceph_mdsc_init(&client->mdsc, client); | ||
678 | if (err < 0) | ||
679 | goto fail_osdc; | ||
680 | return client; | ||
681 | |||
682 | fail_osdc: | ||
683 | ceph_osdc_stop(&client->osdc); | ||
684 | fail_monc: | ||
685 | ceph_monc_stop(&client->monc); | ||
686 | fail_mempool: | ||
687 | mempool_destroy(client->wb_pagevec_pool); | ||
688 | fail_trunc_wq: | 468 | fail_trunc_wq: |
689 | destroy_workqueue(client->trunc_wq); | 469 | destroy_workqueue(fsc->trunc_wq); |
690 | fail_pg_inv_wq: | 470 | fail_pg_inv_wq: |
691 | destroy_workqueue(client->pg_inv_wq); | 471 | destroy_workqueue(fsc->pg_inv_wq); |
692 | fail_wb_wq: | 472 | fail_wb_wq: |
693 | destroy_workqueue(client->wb_wq); | 473 | destroy_workqueue(fsc->wb_wq); |
694 | fail_bdi: | 474 | fail_bdi: |
695 | bdi_destroy(&client->backing_dev_info); | 475 | bdi_destroy(&fsc->backing_dev_info); |
476 | fail_client: | ||
477 | ceph_destroy_client(fsc->client); | ||
696 | fail: | 478 | fail: |
697 | kfree(client); | 479 | kfree(fsc); |
698 | return ERR_PTR(err); | 480 | return ERR_PTR(err); |
699 | } | 481 | } |
700 | 482 | ||
701 | static void ceph_destroy_client(struct ceph_client *client) | 483 | void destroy_fs_client(struct ceph_fs_client *fsc) |
702 | { | 484 | { |
703 | dout("destroy_client %p\n", client); | 485 | dout("destroy_fs_client %p\n", fsc); |
704 | 486 | ||
705 | /* unmount */ | 487 | destroy_workqueue(fsc->wb_wq); |
706 | ceph_mdsc_stop(&client->mdsc); | 488 | destroy_workqueue(fsc->pg_inv_wq); |
707 | ceph_osdc_stop(&client->osdc); | 489 | destroy_workqueue(fsc->trunc_wq); |
708 | 490 | ||
709 | /* | 491 | bdi_destroy(&fsc->backing_dev_info); |
710 | * make sure mds and osd connections close out before destroying | ||
711 | * the auth module, which is needed to free those connections' | ||
712 | * ceph_authorizers. | ||
713 | */ | ||
714 | ceph_msgr_flush(); | ||
715 | |||
716 | ceph_monc_stop(&client->monc); | ||
717 | 492 | ||
718 | ceph_debugfs_client_cleanup(client); | 493 | mempool_destroy(fsc->wb_pagevec_pool); |
719 | destroy_workqueue(client->wb_wq); | ||
720 | destroy_workqueue(client->pg_inv_wq); | ||
721 | destroy_workqueue(client->trunc_wq); | ||
722 | 494 | ||
723 | bdi_destroy(&client->backing_dev_info); | 495 | destroy_mount_options(fsc->mount_options); |
724 | 496 | ||
725 | if (client->msgr) | 497 | ceph_fs_debugfs_cleanup(fsc); |
726 | ceph_messenger_destroy(client->msgr); | ||
727 | mempool_destroy(client->wb_pagevec_pool); | ||
728 | 498 | ||
729 | destroy_mount_args(client->mount_args); | 499 | ceph_destroy_client(fsc->client); |
730 | 500 | ||
731 | kfree(client); | 501 | kfree(fsc); |
732 | dout("destroy_client %p done\n", client); | 502 | dout("destroy_fs_client %p done\n", fsc); |
733 | } | 503 | } |
734 | 504 | ||
735 | /* | 505 | /* |
736 | * Initially learn our fsid, or verify an fsid matches. | 506 | * caches |
737 | */ | 507 | */ |
738 | int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) | 508 | struct kmem_cache *ceph_inode_cachep; |
509 | struct kmem_cache *ceph_cap_cachep; | ||
510 | struct kmem_cache *ceph_dentry_cachep; | ||
511 | struct kmem_cache *ceph_file_cachep; | ||
512 | |||
513 | static void ceph_inode_init_once(void *foo) | ||
739 | { | 514 | { |
740 | if (client->have_fsid) { | 515 | struct ceph_inode_info *ci = foo; |
741 | if (ceph_fsid_compare(&client->fsid, fsid)) { | 516 | inode_init_once(&ci->vfs_inode); |
742 | pr_err("bad fsid, had %pU got %pU", | 517 | } |
743 | &client->fsid, fsid); | 518 | |
744 | return -1; | 519 | static int __init init_caches(void) |
745 | } | 520 | { |
746 | } else { | 521 | ceph_inode_cachep = kmem_cache_create("ceph_inode_info", |
747 | pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, | 522 | sizeof(struct ceph_inode_info), |
748 | fsid); | 523 | __alignof__(struct ceph_inode_info), |
749 | memcpy(&client->fsid, fsid, sizeof(*fsid)); | 524 | (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), |
750 | ceph_debugfs_client_init(client); | 525 | ceph_inode_init_once); |
751 | client->have_fsid = true; | 526 | if (ceph_inode_cachep == NULL) |
752 | } | 527 | return -ENOMEM; |
528 | |||
529 | ceph_cap_cachep = KMEM_CACHE(ceph_cap, | ||
530 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | ||
531 | if (ceph_cap_cachep == NULL) | ||
532 | goto bad_cap; | ||
533 | |||
534 | ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, | ||
535 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | ||
536 | if (ceph_dentry_cachep == NULL) | ||
537 | goto bad_dentry; | ||
538 | |||
539 | ceph_file_cachep = KMEM_CACHE(ceph_file_info, | ||
540 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | ||
541 | if (ceph_file_cachep == NULL) | ||
542 | goto bad_file; | ||
543 | |||
753 | return 0; | 544 | return 0; |
545 | |||
546 | bad_file: | ||
547 | kmem_cache_destroy(ceph_dentry_cachep); | ||
548 | bad_dentry: | ||
549 | kmem_cache_destroy(ceph_cap_cachep); | ||
550 | bad_cap: | ||
551 | kmem_cache_destroy(ceph_inode_cachep); | ||
552 | return -ENOMEM; | ||
754 | } | 553 | } |
755 | 554 | ||
555 | static void destroy_caches(void) | ||
556 | { | ||
557 | kmem_cache_destroy(ceph_inode_cachep); | ||
558 | kmem_cache_destroy(ceph_cap_cachep); | ||
559 | kmem_cache_destroy(ceph_dentry_cachep); | ||
560 | kmem_cache_destroy(ceph_file_cachep); | ||
561 | } | ||
562 | |||
563 | |||
756 | /* | 564 | /* |
757 | * true if we have the mon map (and have thus joined the cluster) | 565 | * ceph_umount_begin - initiate forced umount. Tear down down the |
566 | * mount, skipping steps that may hang while waiting for server(s). | ||
758 | */ | 567 | */ |
759 | static int have_mon_and_osd_map(struct ceph_client *client) | 568 | static void ceph_umount_begin(struct super_block *sb) |
760 | { | 569 | { |
761 | return client->monc.monmap && client->monc.monmap->epoch && | 570 | struct ceph_fs_client *fsc = ceph_sb_to_client(sb); |
762 | client->osdc.osdmap && client->osdc.osdmap->epoch; | 571 | |
572 | dout("ceph_umount_begin - starting forced umount\n"); | ||
573 | if (!fsc) | ||
574 | return; | ||
575 | fsc->mount_state = CEPH_MOUNT_SHUTDOWN; | ||
576 | return; | ||
763 | } | 577 | } |
764 | 578 | ||
579 | static const struct super_operations ceph_super_ops = { | ||
580 | .alloc_inode = ceph_alloc_inode, | ||
581 | .destroy_inode = ceph_destroy_inode, | ||
582 | .write_inode = ceph_write_inode, | ||
583 | .sync_fs = ceph_sync_fs, | ||
584 | .put_super = ceph_put_super, | ||
585 | .show_options = ceph_show_options, | ||
586 | .statfs = ceph_statfs, | ||
587 | .umount_begin = ceph_umount_begin, | ||
588 | }; | ||
589 | |||
765 | /* | 590 | /* |
766 | * Bootstrap mount by opening the root directory. Note the mount | 591 | * Bootstrap mount by opening the root directory. Note the mount |
767 | * @started time from caller, and time out if this takes too long. | 592 | * @started time from caller, and time out if this takes too long. |
768 | */ | 593 | */ |
769 | static struct dentry *open_root_dentry(struct ceph_client *client, | 594 | static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, |
770 | const char *path, | 595 | const char *path, |
771 | unsigned long started) | 596 | unsigned long started) |
772 | { | 597 | { |
773 | struct ceph_mds_client *mdsc = &client->mdsc; | 598 | struct ceph_mds_client *mdsc = fsc->mdsc; |
774 | struct ceph_mds_request *req = NULL; | 599 | struct ceph_mds_request *req = NULL; |
775 | int err; | 600 | int err; |
776 | struct dentry *root; | 601 | struct dentry *root; |
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client, | |||
784 | req->r_ino1.ino = CEPH_INO_ROOT; | 609 | req->r_ino1.ino = CEPH_INO_ROOT; |
785 | req->r_ino1.snap = CEPH_NOSNAP; | 610 | req->r_ino1.snap = CEPH_NOSNAP; |
786 | req->r_started = started; | 611 | req->r_started = started; |
787 | req->r_timeout = client->mount_args->mount_timeout * HZ; | 612 | req->r_timeout = fsc->client->options->mount_timeout * HZ; |
788 | req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); | 613 | req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); |
789 | req->r_num_caps = 2; | 614 | req->r_num_caps = 2; |
790 | err = ceph_mdsc_do_request(mdsc, NULL, req); | 615 | err = ceph_mdsc_do_request(mdsc, NULL, req); |
791 | if (err == 0) { | 616 | if (err == 0) { |
792 | dout("open_root_inode success\n"); | 617 | dout("open_root_inode success\n"); |
793 | if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && | 618 | if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && |
794 | client->sb->s_root == NULL) | 619 | fsc->sb->s_root == NULL) |
795 | root = d_alloc_root(req->r_target_inode); | 620 | root = d_alloc_root(req->r_target_inode); |
796 | else | 621 | else |
797 | root = d_obtain_alias(req->r_target_inode); | 622 | root = d_obtain_alias(req->r_target_inode); |
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client, | |||
804 | return root; | 629 | return root; |
805 | } | 630 | } |
806 | 631 | ||
632 | |||
633 | |||
634 | |||
807 | /* | 635 | /* |
808 | * mount: join the ceph cluster, and open root directory. | 636 | * mount: join the ceph cluster, and open root directory. |
809 | */ | 637 | */ |
810 | static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, | 638 | static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, |
811 | const char *path) | 639 | const char *path) |
812 | { | 640 | { |
813 | struct ceph_entity_addr *myaddr = NULL; | ||
814 | int err; | 641 | int err; |
815 | unsigned long timeout = client->mount_args->mount_timeout * HZ; | ||
816 | unsigned long started = jiffies; /* note the start time */ | 642 | unsigned long started = jiffies; /* note the start time */ |
817 | struct dentry *root; | 643 | struct dentry *root; |
644 | int first = 0; /* first vfsmount for this super_block */ | ||
818 | 645 | ||
819 | dout("mount start\n"); | 646 | dout("mount start\n"); |
820 | mutex_lock(&client->mount_mutex); | 647 | mutex_lock(&fsc->client->mount_mutex); |
821 | |||
822 | /* initialize the messenger */ | ||
823 | if (client->msgr == NULL) { | ||
824 | if (ceph_test_opt(client, MYIP)) | ||
825 | myaddr = &client->mount_args->my_addr; | ||
826 | client->msgr = ceph_messenger_create(myaddr); | ||
827 | if (IS_ERR(client->msgr)) { | ||
828 | err = PTR_ERR(client->msgr); | ||
829 | client->msgr = NULL; | ||
830 | goto out; | ||
831 | } | ||
832 | client->msgr->nocrc = ceph_test_opt(client, NOCRC); | ||
833 | } | ||
834 | 648 | ||
835 | /* open session, and wait for mon, mds, and osd maps */ | 649 | err = __ceph_open_session(fsc->client, started); |
836 | err = ceph_monc_open_session(&client->monc); | ||
837 | if (err < 0) | 650 | if (err < 0) |
838 | goto out; | 651 | goto out; |
839 | 652 | ||
840 | while (!have_mon_and_osd_map(client)) { | ||
841 | err = -EIO; | ||
842 | if (timeout && time_after_eq(jiffies, started + timeout)) | ||
843 | goto out; | ||
844 | |||
845 | /* wait */ | ||
846 | dout("mount waiting for mon_map\n"); | ||
847 | err = wait_event_interruptible_timeout(client->auth_wq, | ||
848 | have_mon_and_osd_map(client) || (client->auth_err < 0), | ||
849 | timeout); | ||
850 | if (err == -EINTR || err == -ERESTARTSYS) | ||
851 | goto out; | ||
852 | if (client->auth_err < 0) { | ||
853 | err = client->auth_err; | ||
854 | goto out; | ||
855 | } | ||
856 | } | ||
857 | |||
858 | dout("mount opening root\n"); | 653 | dout("mount opening root\n"); |
859 | root = open_root_dentry(client, "", started); | 654 | root = open_root_dentry(fsc, "", started); |
860 | if (IS_ERR(root)) { | 655 | if (IS_ERR(root)) { |
861 | err = PTR_ERR(root); | 656 | err = PTR_ERR(root); |
862 | goto out; | 657 | goto out; |
863 | } | 658 | } |
864 | if (client->sb->s_root) | 659 | if (fsc->sb->s_root) { |
865 | dput(root); | 660 | dput(root); |
866 | else | 661 | } else { |
867 | client->sb->s_root = root; | 662 | fsc->sb->s_root = root; |
663 | first = 1; | ||
664 | |||
665 | err = ceph_fs_debugfs_init(fsc); | ||
666 | if (err < 0) | ||
667 | goto fail; | ||
668 | } | ||
868 | 669 | ||
869 | if (path[0] == 0) { | 670 | if (path[0] == 0) { |
870 | dget(root); | 671 | dget(root); |
871 | } else { | 672 | } else { |
872 | dout("mount opening base mountpoint\n"); | 673 | dout("mount opening base mountpoint\n"); |
873 | root = open_root_dentry(client, path, started); | 674 | root = open_root_dentry(fsc, path, started); |
874 | if (IS_ERR(root)) { | 675 | if (IS_ERR(root)) { |
875 | err = PTR_ERR(root); | 676 | err = PTR_ERR(root); |
876 | dput(client->sb->s_root); | 677 | goto fail; |
877 | client->sb->s_root = NULL; | ||
878 | goto out; | ||
879 | } | 678 | } |
880 | } | 679 | } |
881 | 680 | ||
882 | mnt->mnt_root = root; | 681 | mnt->mnt_root = root; |
883 | mnt->mnt_sb = client->sb; | 682 | mnt->mnt_sb = fsc->sb; |
884 | 683 | ||
885 | client->mount_state = CEPH_MOUNT_MOUNTED; | 684 | fsc->mount_state = CEPH_MOUNT_MOUNTED; |
886 | dout("mount success\n"); | 685 | dout("mount success\n"); |
887 | err = 0; | 686 | err = 0; |
888 | 687 | ||
889 | out: | 688 | out: |
890 | mutex_unlock(&client->mount_mutex); | 689 | mutex_unlock(&fsc->client->mount_mutex); |
891 | return err; | 690 | return err; |
691 | |||
692 | fail: | ||
693 | if (first) { | ||
694 | dput(fsc->sb->s_root); | ||
695 | fsc->sb->s_root = NULL; | ||
696 | } | ||
697 | goto out; | ||
892 | } | 698 | } |
893 | 699 | ||
894 | static int ceph_set_super(struct super_block *s, void *data) | 700 | static int ceph_set_super(struct super_block *s, void *data) |
895 | { | 701 | { |
896 | struct ceph_client *client = data; | 702 | struct ceph_fs_client *fsc = data; |
897 | int ret; | 703 | int ret; |
898 | 704 | ||
899 | dout("set_super %p data %p\n", s, data); | 705 | dout("set_super %p data %p\n", s, data); |
900 | 706 | ||
901 | s->s_flags = client->mount_args->sb_flags; | 707 | s->s_flags = fsc->mount_options->sb_flags; |
902 | s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ | 708 | s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ |
903 | 709 | ||
904 | s->s_fs_info = client; | 710 | s->s_fs_info = fsc; |
905 | client->sb = s; | 711 | fsc->sb = s; |
906 | 712 | ||
907 | s->s_op = &ceph_super_ops; | 713 | s->s_op = &ceph_super_ops; |
908 | s->s_export_op = &ceph_export_ops; | 714 | s->s_export_op = &ceph_export_ops; |
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data) | |||
917 | 723 | ||
918 | fail: | 724 | fail: |
919 | s->s_fs_info = NULL; | 725 | s->s_fs_info = NULL; |
920 | client->sb = NULL; | 726 | fsc->sb = NULL; |
921 | return ret; | 727 | return ret; |
922 | } | 728 | } |
923 | 729 | ||
@@ -926,30 +732,23 @@ fail: | |||
926 | */ | 732 | */ |
927 | static int ceph_compare_super(struct super_block *sb, void *data) | 733 | static int ceph_compare_super(struct super_block *sb, void *data) |
928 | { | 734 | { |
929 | struct ceph_client *new = data; | 735 | struct ceph_fs_client *new = data; |
930 | struct ceph_mount_args *args = new->mount_args; | 736 | struct ceph_mount_options *fsopt = new->mount_options; |
931 | struct ceph_client *other = ceph_sb_to_client(sb); | 737 | struct ceph_options *opt = new->client->options; |
932 | int i; | 738 | struct ceph_fs_client *other = ceph_sb_to_client(sb); |
933 | 739 | ||
934 | dout("ceph_compare_super %p\n", sb); | 740 | dout("ceph_compare_super %p\n", sb); |
935 | if (args->flags & CEPH_OPT_FSID) { | 741 | |
936 | if (ceph_fsid_compare(&args->fsid, &other->fsid)) { | 742 | if (compare_mount_options(fsopt, opt, other)) { |
937 | dout("fsid doesn't match\n"); | 743 | dout("monitor(s)/mount options don't match\n"); |
938 | return 0; | 744 | return 0; |
939 | } | ||
940 | } else { | ||
941 | /* do we share (a) monitor? */ | ||
942 | for (i = 0; i < new->monc.monmap->num_mon; i++) | ||
943 | if (ceph_monmap_contains(other->monc.monmap, | ||
944 | &new->monc.monmap->mon_inst[i].addr)) | ||
945 | break; | ||
946 | if (i == new->monc.monmap->num_mon) { | ||
947 | dout("mon ip not part of monmap\n"); | ||
948 | return 0; | ||
949 | } | ||
950 | dout("mon ip matches existing sb %p\n", sb); | ||
951 | } | 745 | } |
952 | if (args->sb_flags != other->mount_args->sb_flags) { | 746 | if ((opt->flags & CEPH_OPT_FSID) && |
747 | ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { | ||
748 | dout("fsid doesn't match\n"); | ||
749 | return 0; | ||
750 | } | ||
751 | if (fsopt->sb_flags != other->mount_options->sb_flags) { | ||
953 | dout("flags differ\n"); | 752 | dout("flags differ\n"); |
954 | return 0; | 753 | return 0; |
955 | } | 754 | } |
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data) | |||
961 | */ | 760 | */ |
962 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | 761 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); |
963 | 762 | ||
964 | static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) | 763 | static int ceph_register_bdi(struct super_block *sb, |
764 | struct ceph_fs_client *fsc) | ||
965 | { | 765 | { |
966 | int err; | 766 | int err; |
967 | 767 | ||
968 | /* set ra_pages based on rsize mount option? */ | 768 | /* set ra_pages based on rsize mount option? */ |
969 | if (client->mount_args->rsize >= PAGE_CACHE_SIZE) | 769 | if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) |
970 | client->backing_dev_info.ra_pages = | 770 | fsc->backing_dev_info.ra_pages = |
971 | (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) | 771 | (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) |
972 | >> PAGE_SHIFT; | 772 | >> PAGE_SHIFT; |
973 | err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", | 773 | err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", |
974 | atomic_long_inc_return(&bdi_seq)); | 774 | atomic_long_inc_return(&bdi_seq)); |
975 | if (!err) | 775 | if (!err) |
976 | sb->s_bdi = &client->backing_dev_info; | 776 | sb->s_bdi = &fsc->backing_dev_info; |
977 | return err; | 777 | return err; |
978 | } | 778 | } |
979 | 779 | ||
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type, | |||
982 | struct vfsmount *mnt) | 782 | struct vfsmount *mnt) |
983 | { | 783 | { |
984 | struct super_block *sb; | 784 | struct super_block *sb; |
985 | struct ceph_client *client; | 785 | struct ceph_fs_client *fsc; |
986 | int err; | 786 | int err; |
987 | int (*compare_super)(struct super_block *, void *) = ceph_compare_super; | 787 | int (*compare_super)(struct super_block *, void *) = ceph_compare_super; |
988 | const char *path = NULL; | 788 | const char *path = NULL; |
989 | struct ceph_mount_args *args; | 789 | struct ceph_mount_options *fsopt = NULL; |
790 | struct ceph_options *opt = NULL; | ||
990 | 791 | ||
991 | dout("ceph_get_sb\n"); | 792 | dout("ceph_get_sb\n"); |
992 | args = parse_mount_args(flags, data, dev_name, &path); | 793 | err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); |
993 | if (IS_ERR(args)) { | 794 | if (err < 0) |
994 | err = PTR_ERR(args); | ||
995 | goto out_final; | 795 | goto out_final; |
996 | } | ||
997 | 796 | ||
998 | /* create client (which we may/may not use) */ | 797 | /* create client (which we may/may not use) */ |
999 | client = ceph_create_client(args); | 798 | fsc = create_fs_client(fsopt, opt); |
1000 | if (IS_ERR(client)) { | 799 | if (IS_ERR(fsc)) { |
1001 | err = PTR_ERR(client); | 800 | err = PTR_ERR(fsc); |
801 | kfree(fsopt); | ||
802 | kfree(opt); | ||
1002 | goto out_final; | 803 | goto out_final; |
1003 | } | 804 | } |
1004 | 805 | ||
1005 | if (client->mount_args->flags & CEPH_OPT_NOSHARE) | 806 | err = ceph_mdsc_init(fsc); |
807 | if (err < 0) | ||
808 | goto out; | ||
809 | |||
810 | if (ceph_test_opt(fsc->client, NOSHARE)) | ||
1006 | compare_super = NULL; | 811 | compare_super = NULL; |
1007 | sb = sget(fs_type, compare_super, ceph_set_super, client); | 812 | sb = sget(fs_type, compare_super, ceph_set_super, fsc); |
1008 | if (IS_ERR(sb)) { | 813 | if (IS_ERR(sb)) { |
1009 | err = PTR_ERR(sb); | 814 | err = PTR_ERR(sb); |
1010 | goto out; | 815 | goto out; |
1011 | } | 816 | } |
1012 | 817 | ||
1013 | if (ceph_sb_to_client(sb) != client) { | 818 | if (ceph_sb_to_client(sb) != fsc) { |
1014 | ceph_destroy_client(client); | 819 | ceph_mdsc_destroy(fsc); |
1015 | client = ceph_sb_to_client(sb); | 820 | destroy_fs_client(fsc); |
1016 | dout("get_sb got existing client %p\n", client); | 821 | fsc = ceph_sb_to_client(sb); |
822 | dout("get_sb got existing client %p\n", fsc); | ||
1017 | } else { | 823 | } else { |
1018 | dout("get_sb using new client %p\n", client); | 824 | dout("get_sb using new client %p\n", fsc); |
1019 | err = ceph_register_bdi(sb, client); | 825 | err = ceph_register_bdi(sb, fsc); |
1020 | if (err < 0) | 826 | if (err < 0) |
1021 | goto out_splat; | 827 | goto out_splat; |
1022 | } | 828 | } |
1023 | 829 | ||
1024 | err = ceph_mount(client, mnt, path); | 830 | err = ceph_mount(fsc, mnt, path); |
1025 | if (err < 0) | 831 | if (err < 0) |
1026 | goto out_splat; | 832 | goto out_splat; |
1027 | dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, | 833 | dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, |
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type, | |||
1029 | return 0; | 835 | return 0; |
1030 | 836 | ||
1031 | out_splat: | 837 | out_splat: |
1032 | ceph_mdsc_close_sessions(&client->mdsc); | 838 | ceph_mdsc_close_sessions(fsc->mdsc); |
1033 | deactivate_locked_super(sb); | 839 | deactivate_locked_super(sb); |
1034 | goto out_final; | 840 | goto out_final; |
1035 | 841 | ||
1036 | out: | 842 | out: |
1037 | ceph_destroy_client(client); | 843 | ceph_mdsc_destroy(fsc); |
844 | destroy_fs_client(fsc); | ||
1038 | out_final: | 845 | out_final: |
1039 | dout("ceph_get_sb fail %d\n", err); | 846 | dout("ceph_get_sb fail %d\n", err); |
1040 | return err; | 847 | return err; |
@@ -1042,11 +849,12 @@ out_final: | |||
1042 | 849 | ||
1043 | static void ceph_kill_sb(struct super_block *s) | 850 | static void ceph_kill_sb(struct super_block *s) |
1044 | { | 851 | { |
1045 | struct ceph_client *client = ceph_sb_to_client(s); | 852 | struct ceph_fs_client *fsc = ceph_sb_to_client(s); |
1046 | dout("kill_sb %p\n", s); | 853 | dout("kill_sb %p\n", s); |
1047 | ceph_mdsc_pre_umount(&client->mdsc); | 854 | ceph_mdsc_pre_umount(fsc->mdsc); |
1048 | kill_anon_super(s); /* will call put_super after sb is r/o */ | 855 | kill_anon_super(s); /* will call put_super after sb is r/o */ |
1049 | ceph_destroy_client(client); | 856 | ceph_mdsc_destroy(fsc); |
857 | destroy_fs_client(fsc); | ||
1050 | } | 858 | } |
1051 | 859 | ||
1052 | static struct file_system_type ceph_fs_type = { | 860 | static struct file_system_type ceph_fs_type = { |
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = { | |||
1062 | 870 | ||
1063 | static int __init init_ceph(void) | 871 | static int __init init_ceph(void) |
1064 | { | 872 | { |
1065 | int ret = 0; | 873 | int ret = init_caches(); |
1066 | |||
1067 | ret = ceph_debugfs_init(); | ||
1068 | if (ret < 0) | ||
1069 | goto out; | ||
1070 | |||
1071 | ret = ceph_msgr_init(); | ||
1072 | if (ret < 0) | ||
1073 | goto out_debugfs; | ||
1074 | |||
1075 | ret = init_caches(); | ||
1076 | if (ret) | 874 | if (ret) |
1077 | goto out_msgr; | 875 | goto out; |
1078 | 876 | ||
1079 | ret = register_filesystem(&ceph_fs_type); | 877 | ret = register_filesystem(&ceph_fs_type); |
1080 | if (ret) | 878 | if (ret) |
1081 | goto out_icache; | 879 | goto out_icache; |
1082 | 880 | ||
1083 | pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", | 881 | pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); |
1084 | CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, | 882 | |
1085 | CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, | ||
1086 | CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); | ||
1087 | return 0; | 883 | return 0; |
1088 | 884 | ||
1089 | out_icache: | 885 | out_icache: |
1090 | destroy_caches(); | 886 | destroy_caches(); |
1091 | out_msgr: | ||
1092 | ceph_msgr_exit(); | ||
1093 | out_debugfs: | ||
1094 | ceph_debugfs_cleanup(); | ||
1095 | out: | 887 | out: |
1096 | return ret; | 888 | return ret; |
1097 | } | 889 | } |
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void) | |||
1101 | dout("exit_ceph\n"); | 893 | dout("exit_ceph\n"); |
1102 | unregister_filesystem(&ceph_fs_type); | 894 | unregister_filesystem(&ceph_fs_type); |
1103 | destroy_caches(); | 895 | destroy_caches(); |
1104 | ceph_msgr_exit(); | ||
1105 | ceph_debugfs_cleanup(); | ||
1106 | } | 896 | } |
1107 | 897 | ||
1108 | module_init(init_ceph); | 898 | module_init(init_ceph); |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b87638e84c4b..1886294e12f7 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #ifndef _FS_CEPH_SUPER_H | 1 | #ifndef _FS_CEPH_SUPER_H |
2 | #define _FS_CEPH_SUPER_H | 2 | #define _FS_CEPH_SUPER_H |
3 | 3 | ||
4 | #include "ceph_debug.h" | 4 | #include <linux/ceph/ceph_debug.h> |
5 | 5 | ||
6 | #include <asm/unaligned.h> | 6 | #include <asm/unaligned.h> |
7 | #include <linux/backing-dev.h> | 7 | #include <linux/backing-dev.h> |
@@ -14,13 +14,7 @@ | |||
14 | #include <linux/writeback.h> | 14 | #include <linux/writeback.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | 16 | ||
17 | #include "types.h" | 17 | #include <linux/ceph/libceph.h> |
18 | #include "messenger.h" | ||
19 | #include "msgpool.h" | ||
20 | #include "mon_client.h" | ||
21 | #include "mds_client.h" | ||
22 | #include "osd_client.h" | ||
23 | #include "ceph_fs.h" | ||
24 | 18 | ||
25 | /* f_type in struct statfs */ | 19 | /* f_type in struct statfs */ |
26 | #define CEPH_SUPER_MAGIC 0x00c36400 | 20 | #define CEPH_SUPER_MAGIC 0x00c36400 |
@@ -30,42 +24,25 @@ | |||
30 | #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ | 24 | #define CEPH_BLOCK_SHIFT 20 /* 1 MB */ |
31 | #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) | 25 | #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) |
32 | 26 | ||
33 | /* | 27 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ |
34 | * Supported features | 28 | #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ |
35 | */ | 29 | #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ |
36 | #define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK | ||
37 | #define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR | ||
38 | 30 | ||
39 | /* | 31 | #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) |
40 | * mount options | ||
41 | */ | ||
42 | #define CEPH_OPT_FSID (1<<0) | ||
43 | #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ | ||
44 | #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ | ||
45 | #define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ | ||
46 | #define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ | ||
47 | #define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */ | ||
48 | #define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ | ||
49 | 32 | ||
50 | #define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) | 33 | #define ceph_set_mount_opt(fsc, opt) \ |
34 | (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; | ||
35 | #define ceph_test_mount_opt(fsc, opt) \ | ||
36 | (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) | ||
51 | 37 | ||
52 | #define ceph_set_opt(client, opt) \ | 38 | #define CEPH_MAX_READDIR_DEFAULT 1024 |
53 | (client)->mount_args->flags |= CEPH_OPT_##opt; | 39 | #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) |
54 | #define ceph_test_opt(client, opt) \ | 40 | #define CEPH_SNAPDIRNAME_DEFAULT ".snap" |
55 | (!!((client)->mount_args->flags & CEPH_OPT_##opt)) | ||
56 | 41 | ||
57 | 42 | struct ceph_mount_options { | |
58 | struct ceph_mount_args { | ||
59 | int sb_flags; | ||
60 | int flags; | 43 | int flags; |
61 | struct ceph_fsid fsid; | 44 | int sb_flags; |
62 | struct ceph_entity_addr my_addr; | 45 | |
63 | int num_mon; | ||
64 | struct ceph_entity_addr *mon_addr; | ||
65 | int mount_timeout; | ||
66 | int osd_idle_ttl; | ||
67 | int osd_timeout; | ||
68 | int osd_keepalive_timeout; | ||
69 | int wsize; | 46 | int wsize; |
70 | int rsize; /* max readahead */ | 47 | int rsize; /* max readahead */ |
71 | int congestion_kb; /* max writeback in flight */ | 48 | int congestion_kb; /* max writeback in flight */ |
@@ -73,82 +50,25 @@ struct ceph_mount_args { | |||
73 | int cap_release_safety; | 50 | int cap_release_safety; |
74 | int max_readdir; /* max readdir result (entires) */ | 51 | int max_readdir; /* max readdir result (entires) */ |
75 | int max_readdir_bytes; /* max readdir result (bytes) */ | 52 | int max_readdir_bytes; /* max readdir result (bytes) */ |
76 | char *snapdir_name; /* default ".snap" */ | ||
77 | char *name; | ||
78 | char *secret; | ||
79 | }; | ||
80 | 53 | ||
81 | /* | 54 | /* |
82 | * defaults | 55 | * everything above this point can be memcmp'd; everything below |
83 | */ | 56 | * is handled in compare_mount_options() |
84 | #define CEPH_MOUNT_TIMEOUT_DEFAULT 60 | 57 | */ |
85 | #define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ | ||
86 | #define CEPH_OSD_KEEPALIVE_DEFAULT 5 | ||
87 | #define CEPH_OSD_IDLE_TTL_DEFAULT 60 | ||
88 | #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ | ||
89 | #define CEPH_MAX_READDIR_DEFAULT 1024 | ||
90 | #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) | ||
91 | |||
92 | #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) | ||
93 | #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) | ||
94 | |||
95 | #define CEPH_SNAPDIRNAME_DEFAULT ".snap" | ||
96 | #define CEPH_AUTH_NAME_DEFAULT "guest" | ||
97 | /* | ||
98 | * Delay telling the MDS we no longer want caps, in case we reopen | ||
99 | * the file. Delay a minimum amount of time, even if we send a cap | ||
100 | * message for some other reason. Otherwise, take the oppotunity to | ||
101 | * update the mds to avoid sending another message later. | ||
102 | */ | ||
103 | #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ | ||
104 | #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ | ||
105 | |||
106 | #define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) | ||
107 | |||
108 | /* mount state */ | ||
109 | enum { | ||
110 | CEPH_MOUNT_MOUNTING, | ||
111 | CEPH_MOUNT_MOUNTED, | ||
112 | CEPH_MOUNT_UNMOUNTING, | ||
113 | CEPH_MOUNT_UNMOUNTED, | ||
114 | CEPH_MOUNT_SHUTDOWN, | ||
115 | }; | ||
116 | |||
117 | /* | ||
118 | * subtract jiffies | ||
119 | */ | ||
120 | static inline unsigned long time_sub(unsigned long a, unsigned long b) | ||
121 | { | ||
122 | BUG_ON(time_after(b, a)); | ||
123 | return (long)a - (long)b; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * per-filesystem client state | ||
128 | * | ||
129 | * possibly shared by multiple mount points, if they are | ||
130 | * mounting the same ceph filesystem/cluster. | ||
131 | */ | ||
132 | struct ceph_client { | ||
133 | struct ceph_fsid fsid; | ||
134 | bool have_fsid; | ||
135 | 58 | ||
136 | struct mutex mount_mutex; /* serialize mount attempts */ | 59 | char *snapdir_name; /* default ".snap" */ |
137 | struct ceph_mount_args *mount_args; | 60 | }; |
138 | 61 | ||
62 | struct ceph_fs_client { | ||
139 | struct super_block *sb; | 63 | struct super_block *sb; |
140 | 64 | ||
141 | unsigned long mount_state; | 65 | struct ceph_mount_options *mount_options; |
142 | wait_queue_head_t auth_wq; | 66 | struct ceph_client *client; |
143 | |||
144 | int auth_err; | ||
145 | 67 | ||
68 | unsigned long mount_state; | ||
146 | int min_caps; /* min caps i added */ | 69 | int min_caps; /* min caps i added */ |
147 | 70 | ||
148 | struct ceph_messenger *msgr; /* messenger instance */ | 71 | struct ceph_mds_client *mdsc; |
149 | struct ceph_mon_client monc; | ||
150 | struct ceph_mds_client mdsc; | ||
151 | struct ceph_osd_client osdc; | ||
152 | 72 | ||
153 | /* writeback */ | 73 | /* writeback */ |
154 | mempool_t *wb_pagevec_pool; | 74 | mempool_t *wb_pagevec_pool; |
@@ -160,14 +80,14 @@ struct ceph_client { | |||
160 | struct backing_dev_info backing_dev_info; | 80 | struct backing_dev_info backing_dev_info; |
161 | 81 | ||
162 | #ifdef CONFIG_DEBUG_FS | 82 | #ifdef CONFIG_DEBUG_FS |
163 | struct dentry *debugfs_monmap; | 83 | struct dentry *debugfs_dentry_lru, *debugfs_caps; |
164 | struct dentry *debugfs_mdsmap, *debugfs_osdmap; | ||
165 | struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; | ||
166 | struct dentry *debugfs_congestion_kb; | 84 | struct dentry *debugfs_congestion_kb; |
167 | struct dentry *debugfs_bdi; | 85 | struct dentry *debugfs_bdi; |
86 | struct dentry *debugfs_mdsc, *debugfs_mdsmap; | ||
168 | #endif | 87 | #endif |
169 | }; | 88 | }; |
170 | 89 | ||
90 | |||
171 | /* | 91 | /* |
172 | * File i/o capability. This tracks shared state with the metadata | 92 | * File i/o capability. This tracks shared state with the metadata |
173 | * server that allows us to cache or writeback attributes or to read | 93 | * server that allows us to cache or writeback attributes or to read |
@@ -275,6 +195,20 @@ struct ceph_inode_xattr { | |||
275 | int should_free_val; | 195 | int should_free_val; |
276 | }; | 196 | }; |
277 | 197 | ||
198 | /* | ||
199 | * Ceph dentry state | ||
200 | */ | ||
201 | struct ceph_dentry_info { | ||
202 | struct ceph_mds_session *lease_session; | ||
203 | u32 lease_gen, lease_shared_gen; | ||
204 | u32 lease_seq; | ||
205 | unsigned long lease_renew_after, lease_renew_from; | ||
206 | struct list_head lru; | ||
207 | struct dentry *dentry; | ||
208 | u64 time; | ||
209 | u64 offset; | ||
210 | }; | ||
211 | |||
278 | struct ceph_inode_xattrs_info { | 212 | struct ceph_inode_xattrs_info { |
279 | /* | 213 | /* |
280 | * (still encoded) xattr blob. we avoid the overhead of parsing | 214 | * (still encoded) xattr blob. we avoid the overhead of parsing |
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info { | |||
296 | /* | 230 | /* |
297 | * Ceph inode. | 231 | * Ceph inode. |
298 | */ | 232 | */ |
299 | #define CEPH_I_COMPLETE 1 /* we have complete directory cached */ | ||
300 | #define CEPH_I_NODELAY 4 /* do not delay cap release */ | ||
301 | #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ | ||
302 | #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ | ||
303 | |||
304 | struct ceph_inode_info { | 233 | struct ceph_inode_info { |
305 | struct ceph_vino i_vino; /* ceph ino + snap */ | 234 | struct ceph_vino i_vino; /* ceph ino + snap */ |
306 | 235 | ||
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) | |||
391 | return container_of(inode, struct ceph_inode_info, vfs_inode); | 320 | return container_of(inode, struct ceph_inode_info, vfs_inode); |
392 | } | 321 | } |
393 | 322 | ||
323 | static inline struct ceph_vino ceph_vino(struct inode *inode) | ||
324 | { | ||
325 | return ceph_inode(inode)->i_vino; | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * ino_t is <64 bits on many architectures, blech. | ||
330 | * | ||
331 | * don't include snap in ino hash, at least for now. | ||
332 | */ | ||
333 | static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) | ||
334 | { | ||
335 | ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ | ||
336 | #if BITS_PER_LONG == 32 | ||
337 | ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; | ||
338 | if (!ino) | ||
339 | ino = 1; | ||
340 | #endif | ||
341 | return ino; | ||
342 | } | ||
343 | |||
344 | /* for printf-style formatting */ | ||
345 | #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap | ||
346 | |||
347 | static inline u64 ceph_ino(struct inode *inode) | ||
348 | { | ||
349 | return ceph_inode(inode)->i_vino.ino; | ||
350 | } | ||
351 | static inline u64 ceph_snap(struct inode *inode) | ||
352 | { | ||
353 | return ceph_inode(inode)->i_vino.snap; | ||
354 | } | ||
355 | |||
356 | static inline int ceph_ino_compare(struct inode *inode, void *data) | ||
357 | { | ||
358 | struct ceph_vino *pvino = (struct ceph_vino *)data; | ||
359 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
360 | return ci->i_vino.ino == pvino->ino && | ||
361 | ci->i_vino.snap == pvino->snap; | ||
362 | } | ||
363 | |||
364 | static inline struct inode *ceph_find_inode(struct super_block *sb, | ||
365 | struct ceph_vino vino) | ||
366 | { | ||
367 | ino_t t = ceph_vino_to_ino(vino); | ||
368 | return ilookup5(sb, t, ceph_ino_compare, &vino); | ||
369 | } | ||
370 | |||
371 | |||
372 | /* | ||
373 | * Ceph inode. | ||
374 | */ | ||
375 | #define CEPH_I_COMPLETE 1 /* we have complete directory cached */ | ||
376 | #define CEPH_I_NODELAY 4 /* do not delay cap release */ | ||
377 | #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ | ||
378 | #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ | ||
379 | |||
394 | static inline void ceph_i_clear(struct inode *inode, unsigned mask) | 380 | static inline void ceph_i_clear(struct inode *inode, unsigned mask) |
395 | { | 381 | { |
396 | struct ceph_inode_info *ci = ceph_inode(inode); | 382 | struct ceph_inode_info *ci = ceph_inode(inode); |
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask) | |||
414 | struct ceph_inode_info *ci = ceph_inode(inode); | 400 | struct ceph_inode_info *ci = ceph_inode(inode); |
415 | bool r; | 401 | bool r; |
416 | 402 | ||
417 | smp_mb(); | 403 | spin_lock(&inode->i_lock); |
418 | r = (ci->i_ceph_flags & mask) == mask; | 404 | r = (ci->i_ceph_flags & mask) == mask; |
405 | spin_unlock(&inode->i_lock); | ||
419 | return r; | 406 | return r; |
420 | } | 407 | } |
421 | 408 | ||
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, | |||
432 | struct ceph_inode_frag *pfrag, | 419 | struct ceph_inode_frag *pfrag, |
433 | int *found); | 420 | int *found); |
434 | 421 | ||
435 | /* | ||
436 | * Ceph dentry state | ||
437 | */ | ||
438 | struct ceph_dentry_info { | ||
439 | struct ceph_mds_session *lease_session; | ||
440 | u32 lease_gen, lease_shared_gen; | ||
441 | u32 lease_seq; | ||
442 | unsigned long lease_renew_after, lease_renew_from; | ||
443 | struct list_head lru; | ||
444 | struct dentry *dentry; | ||
445 | u64 time; | ||
446 | u64 offset; | ||
447 | }; | ||
448 | |||
449 | static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) | 422 | static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) |
450 | { | 423 | { |
451 | return (struct ceph_dentry_info *)dentry->d_fsdata; | 424 | return (struct ceph_dentry_info *)dentry->d_fsdata; |
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) | |||
456 | return ((loff_t)frag << 32) | (loff_t)off; | 429 | return ((loff_t)frag << 32) | (loff_t)off; |
457 | } | 430 | } |
458 | 431 | ||
459 | /* | ||
460 | * ino_t is <64 bits on many architectures, blech. | ||
461 | * | ||
462 | * don't include snap in ino hash, at least for now. | ||
463 | */ | ||
464 | static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) | ||
465 | { | ||
466 | ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ | ||
467 | #if BITS_PER_LONG == 32 | ||
468 | ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; | ||
469 | if (!ino) | ||
470 | ino = 1; | ||
471 | #endif | ||
472 | return ino; | ||
473 | } | ||
474 | |||
475 | static inline int ceph_set_ino_cb(struct inode *inode, void *data) | 432 | static inline int ceph_set_ino_cb(struct inode *inode, void *data) |
476 | { | 433 | { |
477 | ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; | 434 | ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; |
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data) | |||
479 | return 0; | 436 | return 0; |
480 | } | 437 | } |
481 | 438 | ||
482 | static inline struct ceph_vino ceph_vino(struct inode *inode) | ||
483 | { | ||
484 | return ceph_inode(inode)->i_vino; | ||
485 | } | ||
486 | |||
487 | /* for printf-style formatting */ | ||
488 | #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap | ||
489 | |||
490 | static inline u64 ceph_ino(struct inode *inode) | ||
491 | { | ||
492 | return ceph_inode(inode)->i_vino.ino; | ||
493 | } | ||
494 | static inline u64 ceph_snap(struct inode *inode) | ||
495 | { | ||
496 | return ceph_inode(inode)->i_vino.snap; | ||
497 | } | ||
498 | |||
499 | static inline int ceph_ino_compare(struct inode *inode, void *data) | ||
500 | { | ||
501 | struct ceph_vino *pvino = (struct ceph_vino *)data; | ||
502 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
503 | return ci->i_vino.ino == pvino->ino && | ||
504 | ci->i_vino.snap == pvino->snap; | ||
505 | } | ||
506 | |||
507 | static inline struct inode *ceph_find_inode(struct super_block *sb, | ||
508 | struct ceph_vino vino) | ||
509 | { | ||
510 | ino_t t = ceph_vino_to_ino(vino); | ||
511 | return ilookup5(sb, t, ceph_ino_compare, &vino); | ||
512 | } | ||
513 | |||
514 | |||
515 | /* | 439 | /* |
516 | * caps helpers | 440 | * caps helpers |
517 | */ | 441 | */ |
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, | |||
576 | struct ceph_cap_reservation *ctx, int need); | 500 | struct ceph_cap_reservation *ctx, int need); |
577 | extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, | 501 | extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, |
578 | struct ceph_cap_reservation *ctx); | 502 | struct ceph_cap_reservation *ctx); |
579 | extern void ceph_reservation_status(struct ceph_client *client, | 503 | extern void ceph_reservation_status(struct ceph_fs_client *client, |
580 | int *total, int *avail, int *used, | 504 | int *total, int *avail, int *used, |
581 | int *reserved, int *min); | 505 | int *reserved, int *min); |
582 | 506 | ||
583 | static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) | 507 | static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) |
584 | { | 508 | { |
585 | return (struct ceph_client *)inode->i_sb->s_fs_info; | 509 | return (struct ceph_fs_client *)inode->i_sb->s_fs_info; |
586 | } | 510 | } |
587 | 511 | ||
588 | static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) | 512 | static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) |
589 | { | 513 | { |
590 | return (struct ceph_client *)sb->s_fs_info; | 514 | return (struct ceph_fs_client *)sb->s_fs_info; |
591 | } | 515 | } |
592 | 516 | ||
593 | 517 | ||
@@ -617,51 +541,6 @@ struct ceph_file_info { | |||
617 | 541 | ||
618 | 542 | ||
619 | /* | 543 | /* |
620 | * snapshots | ||
621 | */ | ||
622 | |||
623 | /* | ||
624 | * A "snap context" is the set of existing snapshots when we | ||
625 | * write data. It is used by the OSD to guide its COW behavior. | ||
626 | * | ||
627 | * The ceph_snap_context is refcounted, and attached to each dirty | ||
628 | * page, indicating which context the dirty data belonged when it was | ||
629 | * dirtied. | ||
630 | */ | ||
631 | struct ceph_snap_context { | ||
632 | atomic_t nref; | ||
633 | u64 seq; | ||
634 | int num_snaps; | ||
635 | u64 snaps[]; | ||
636 | }; | ||
637 | |||
638 | static inline struct ceph_snap_context * | ||
639 | ceph_get_snap_context(struct ceph_snap_context *sc) | ||
640 | { | ||
641 | /* | ||
642 | printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), | ||
643 | atomic_read(&sc->nref)+1); | ||
644 | */ | ||
645 | if (sc) | ||
646 | atomic_inc(&sc->nref); | ||
647 | return sc; | ||
648 | } | ||
649 | |||
650 | static inline void ceph_put_snap_context(struct ceph_snap_context *sc) | ||
651 | { | ||
652 | if (!sc) | ||
653 | return; | ||
654 | /* | ||
655 | printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), | ||
656 | atomic_read(&sc->nref)-1); | ||
657 | */ | ||
658 | if (atomic_dec_and_test(&sc->nref)) { | ||
659 | /*printk(" deleting snap_context %p\n", sc);*/ | ||
660 | kfree(sc); | ||
661 | } | ||
662 | } | ||
663 | |||
664 | /* | ||
665 | * A "snap realm" describes a subset of the file hierarchy sharing | 544 | * A "snap realm" describes a subset of the file hierarchy sharing |
666 | * the same set of snapshots that apply to it. The realms themselves | 545 | * the same set of snapshots that apply to it. The realms themselves |
667 | * are organized into a hierarchy, such that children inherit (some of) | 546 | * are organized into a hierarchy, such that children inherit (some of) |
@@ -699,16 +578,33 @@ struct ceph_snap_realm { | |||
699 | spinlock_t inodes_with_caps_lock; | 578 | spinlock_t inodes_with_caps_lock; |
700 | }; | 579 | }; |
701 | 580 | ||
702 | 581 | static inline int default_congestion_kb(void) | |
703 | |||
704 | /* | ||
705 | * calculate the number of pages a given length and offset map onto, | ||
706 | * if we align the data. | ||
707 | */ | ||
708 | static inline int calc_pages_for(u64 off, u64 len) | ||
709 | { | 582 | { |
710 | return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - | 583 | int congestion_kb; |
711 | (off >> PAGE_CACHE_SHIFT); | 584 | |
585 | /* | ||
586 | * Copied from NFS | ||
587 | * | ||
588 | * congestion size, scale with available memory. | ||
589 | * | ||
590 | * 64MB: 8192k | ||
591 | * 128MB: 11585k | ||
592 | * 256MB: 16384k | ||
593 | * 512MB: 23170k | ||
594 | * 1GB: 32768k | ||
595 | * 2GB: 46340k | ||
596 | * 4GB: 65536k | ||
597 | * 8GB: 92681k | ||
598 | * 16GB: 131072k | ||
599 | * | ||
600 | * This allows larger machines to have larger/more transfers. | ||
601 | * Limit the default to 256M | ||
602 | */ | ||
603 | congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); | ||
604 | if (congestion_kb > 256*1024) | ||
605 | congestion_kb = 256*1024; | ||
606 | |||
607 | return congestion_kb; | ||
712 | } | 608 | } |
713 | 609 | ||
714 | 610 | ||
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) | |||
741 | ci_item)->writing; | 637 | ci_item)->writing; |
742 | } | 638 | } |
743 | 639 | ||
744 | |||
745 | /* super.c */ | ||
746 | extern struct kmem_cache *ceph_inode_cachep; | ||
747 | extern struct kmem_cache *ceph_cap_cachep; | ||
748 | extern struct kmem_cache *ceph_dentry_cachep; | ||
749 | extern struct kmem_cache *ceph_file_cachep; | ||
750 | |||
751 | extern const char *ceph_msg_type_name(int type); | ||
752 | extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); | ||
753 | |||
754 | /* inode.c */ | 640 | /* inode.c */ |
755 | extern const struct inode_operations ceph_file_iops; | 641 | extern const struct inode_operations ceph_file_iops; |
756 | 642 | ||
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); | |||
857 | /* file.c */ | 743 | /* file.c */ |
858 | extern const struct file_operations ceph_file_fops; | 744 | extern const struct file_operations ceph_file_fops; |
859 | extern const struct address_space_operations ceph_aops; | 745 | extern const struct address_space_operations ceph_aops; |
746 | extern int ceph_copy_to_page_vector(struct page **pages, | ||
747 | const char *data, | ||
748 | loff_t off, size_t len); | ||
749 | extern int ceph_copy_from_page_vector(struct page **pages, | ||
750 | char *data, | ||
751 | loff_t off, size_t len); | ||
752 | extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); | ||
860 | extern int ceph_open(struct inode *inode, struct file *file); | 753 | extern int ceph_open(struct inode *inode, struct file *file); |
861 | extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, | 754 | extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, |
862 | struct nameidata *nd, int mode, | 755 | struct nameidata *nd, int mode, |
863 | int locked_dir); | 756 | int locked_dir); |
864 | extern int ceph_release(struct inode *inode, struct file *filp); | 757 | extern int ceph_release(struct inode *inode, struct file *filp); |
865 | extern void ceph_release_page_vector(struct page **pages, int num_pages); | ||
866 | 758 | ||
867 | /* dir.c */ | 759 | /* dir.c */ |
868 | extern const struct file_operations ceph_dir_fops; | 760 | extern const struct file_operations ceph_dir_fops; |
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); | |||
892 | /* export.c */ | 784 | /* export.c */ |
893 | extern const struct export_operations ceph_export_ops; | 785 | extern const struct export_operations ceph_export_ops; |
894 | 786 | ||
895 | /* debugfs.c */ | ||
896 | extern int ceph_debugfs_init(void); | ||
897 | extern void ceph_debugfs_cleanup(void); | ||
898 | extern int ceph_debugfs_client_init(struct ceph_client *client); | ||
899 | extern void ceph_debugfs_client_cleanup(struct ceph_client *client); | ||
900 | |||
901 | /* locks.c */ | 787 | /* locks.c */ |
902 | extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); | 788 | extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); |
903 | extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); | 789 | extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); |
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) | |||
914 | return NULL; | 800 | return NULL; |
915 | } | 801 | } |
916 | 802 | ||
803 | /* debugfs.c */ | ||
804 | extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); | ||
805 | extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); | ||
806 | |||
917 | #endif /* _FS_CEPH_SUPER_H */ | 807 | #endif /* _FS_CEPH_SUPER_H */ |
diff --git a/fs/ceph/types.h b/fs/ceph/types.h deleted file mode 100644 index 28b35a005ec2..000000000000 --- a/fs/ceph/types.h +++ /dev/null | |||
@@ -1,29 +0,0 @@ | |||
1 | #ifndef _FS_CEPH_TYPES_H | ||
2 | #define _FS_CEPH_TYPES_H | ||
3 | |||
4 | /* needed before including ceph_fs.h */ | ||
5 | #include <linux/in.h> | ||
6 | #include <linux/types.h> | ||
7 | #include <linux/fcntl.h> | ||
8 | #include <linux/string.h> | ||
9 | |||
10 | #include "ceph_fs.h" | ||
11 | #include "ceph_frag.h" | ||
12 | #include "ceph_hash.h" | ||
13 | |||
14 | /* | ||
15 | * Identify inodes by both their ino AND snapshot id (a u64). | ||
16 | */ | ||
17 | struct ceph_vino { | ||
18 | u64 ino; | ||
19 | u64 snap; | ||
20 | }; | ||
21 | |||
22 | |||
23 | /* context for the caps reservation mechanism */ | ||
24 | struct ceph_cap_reservation { | ||
25 | int count; | ||
26 | }; | ||
27 | |||
28 | |||
29 | #endif | ||
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9578af610b73..6e12a6ba5f79 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
@@ -1,6 +1,9 @@ | |||
1 | #include "ceph_debug.h" | 1 | #include <linux/ceph/ceph_debug.h> |
2 | |||
2 | #include "super.h" | 3 | #include "super.h" |
3 | #include "decode.h" | 4 | #include "mds_client.h" |
5 | |||
6 | #include <linux/ceph/decode.h> | ||
4 | 7 | ||
5 | #include <linux/xattr.h> | 8 | #include <linux/xattr.h> |
6 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
@@ -620,12 +623,12 @@ out: | |||
620 | static int ceph_sync_setxattr(struct dentry *dentry, const char *name, | 623 | static int ceph_sync_setxattr(struct dentry *dentry, const char *name, |
621 | const char *value, size_t size, int flags) | 624 | const char *value, size_t size, int flags) |
622 | { | 625 | { |
623 | struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); | 626 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); |
624 | struct inode *inode = dentry->d_inode; | 627 | struct inode *inode = dentry->d_inode; |
625 | struct ceph_inode_info *ci = ceph_inode(inode); | 628 | struct ceph_inode_info *ci = ceph_inode(inode); |
626 | struct inode *parent_inode = dentry->d_parent->d_inode; | 629 | struct inode *parent_inode = dentry->d_parent->d_inode; |
627 | struct ceph_mds_request *req; | 630 | struct ceph_mds_request *req; |
628 | struct ceph_mds_client *mdsc = &client->mdsc; | 631 | struct ceph_mds_client *mdsc = fsc->mdsc; |
629 | int err; | 632 | int err; |
630 | int i, nr_pages; | 633 | int i, nr_pages; |
631 | struct page **pages = NULL; | 634 | struct page **pages = NULL; |
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, | |||
713 | 716 | ||
714 | /* preallocate memory for xattr name, value, index node */ | 717 | /* preallocate memory for xattr name, value, index node */ |
715 | err = -ENOMEM; | 718 | err = -ENOMEM; |
716 | newname = kmalloc(name_len + 1, GFP_NOFS); | 719 | newname = kmemdup(name, name_len + 1, GFP_NOFS); |
717 | if (!newname) | 720 | if (!newname) |
718 | goto out; | 721 | goto out; |
719 | memcpy(newname, name, name_len + 1); | ||
720 | 722 | ||
721 | if (val_len) { | 723 | if (val_len) { |
722 | newval = kmalloc(val_len + 1, GFP_NOFS); | 724 | newval = kmalloc(val_len + 1, GFP_NOFS); |
@@ -777,8 +779,8 @@ out: | |||
777 | 779 | ||
778 | static int ceph_send_removexattr(struct dentry *dentry, const char *name) | 780 | static int ceph_send_removexattr(struct dentry *dentry, const char *name) |
779 | { | 781 | { |
780 | struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); | 782 | struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); |
781 | struct ceph_mds_client *mdsc = &client->mdsc; | 783 | struct ceph_mds_client *mdsc = fsc->mdsc; |
782 | struct inode *inode = dentry->d_inode; | 784 | struct inode *inode = dentry->d_inode; |
783 | struct inode *parent_inode = dentry->d_parent->d_inode; | 785 | struct inode *parent_inode = dentry->d_parent->d_inode; |
784 | struct ceph_mds_request *req; | 786 | struct ceph_mds_request *req; |
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c65c3419dd37..7e83b356cc9e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c | |||
@@ -232,7 +232,7 @@ static int | |||
232 | small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | 232 | small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, |
233 | void **request_buf) | 233 | void **request_buf) |
234 | { | 234 | { |
235 | int rc = 0; | 235 | int rc; |
236 | 236 | ||
237 | rc = cifs_reconnect_tcon(tcon, smb_command); | 237 | rc = cifs_reconnect_tcon(tcon, smb_command); |
238 | if (rc) | 238 | if (rc) |
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | |||
250 | if (tcon != NULL) | 250 | if (tcon != NULL) |
251 | cifs_stats_inc(&tcon->num_smbs_sent); | 251 | cifs_stats_inc(&tcon->num_smbs_sent); |
252 | 252 | ||
253 | return rc; | 253 | return 0; |
254 | } | 254 | } |
255 | 255 | ||
256 | int | 256 | int |
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct, | |||
281 | 281 | ||
282 | /* If the return code is zero, this function must fill in request_buf pointer */ | 282 | /* If the return code is zero, this function must fill in request_buf pointer */ |
283 | static int | 283 | static int |
284 | smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | 284 | __smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, |
285 | void **request_buf /* returned */ , | 285 | void **request_buf, void **response_buf) |
286 | void **response_buf /* returned */ ) | ||
287 | { | 286 | { |
288 | int rc = 0; | ||
289 | |||
290 | rc = cifs_reconnect_tcon(tcon, smb_command); | ||
291 | if (rc) | ||
292 | return rc; | ||
293 | |||
294 | *request_buf = cifs_buf_get(); | 287 | *request_buf = cifs_buf_get(); |
295 | if (*request_buf == NULL) { | 288 | if (*request_buf == NULL) { |
296 | /* BB should we add a retry in here if not a writepage? */ | 289 | /* BB should we add a retry in here if not a writepage? */ |
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | |||
309 | if (tcon != NULL) | 302 | if (tcon != NULL) |
310 | cifs_stats_inc(&tcon->num_smbs_sent); | 303 | cifs_stats_inc(&tcon->num_smbs_sent); |
311 | 304 | ||
312 | return rc; | 305 | return 0; |
306 | } | ||
307 | |||
308 | /* If the return code is zero, this function must fill in request_buf pointer */ | ||
309 | static int | ||
310 | smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | ||
311 | void **request_buf, void **response_buf) | ||
312 | { | ||
313 | int rc; | ||
314 | |||
315 | rc = cifs_reconnect_tcon(tcon, smb_command); | ||
316 | if (rc) | ||
317 | return rc; | ||
318 | |||
319 | return __smb_init(smb_command, wct, tcon, request_buf, response_buf); | ||
320 | } | ||
321 | |||
322 | static int | ||
323 | smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon, | ||
324 | void **request_buf, void **response_buf) | ||
325 | { | ||
326 | if (tcon->ses->need_reconnect || tcon->need_reconnect) | ||
327 | return -EHOSTDOWN; | ||
328 | |||
329 | return __smb_init(smb_command, wct, tcon, request_buf, response_buf); | ||
313 | } | 330 | } |
314 | 331 | ||
315 | static int validate_t2(struct smb_t2_rsp *pSMB) | 332 | static int validate_t2(struct smb_t2_rsp *pSMB) |
@@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon) | |||
4534 | 4551 | ||
4535 | cFYI(1, "In QFSUnixInfo"); | 4552 | cFYI(1, "In QFSUnixInfo"); |
4536 | QFSUnixRetry: | 4553 | QFSUnixRetry: |
4537 | rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, | 4554 | rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, |
4538 | (void **) &pSMBr); | 4555 | (void **) &pSMB, (void **) &pSMBr); |
4539 | if (rc) | 4556 | if (rc) |
4540 | return rc; | 4557 | return rc; |
4541 | 4558 | ||
@@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap) | |||
4604 | cFYI(1, "In SETFSUnixInfo"); | 4621 | cFYI(1, "In SETFSUnixInfo"); |
4605 | SETFSUnixRetry: | 4622 | SETFSUnixRetry: |
4606 | /* BB switch to small buf init to save memory */ | 4623 | /* BB switch to small buf init to save memory */ |
4607 | rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, | 4624 | rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, |
4608 | (void **) &pSMBr); | 4625 | (void **) &pSMB, (void **) &pSMBr); |
4609 | if (rc) | 4626 | if (rc) |
4610 | return rc; | 4627 | return rc; |
4611 | 4628 | ||
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 93f77d438d3c..53cce8cc2224 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
@@ -801,6 +801,8 @@ retry_iget5_locked: | |||
801 | inode->i_flags |= S_NOATIME | S_NOCMTIME; | 801 | inode->i_flags |= S_NOATIME | S_NOCMTIME; |
802 | if (inode->i_state & I_NEW) { | 802 | if (inode->i_state & I_NEW) { |
803 | inode->i_ino = hash; | 803 | inode->i_ino = hash; |
804 | if (S_ISREG(inode->i_mode)) | ||
805 | inode->i_data.backing_dev_info = sb->s_bdi; | ||
804 | #ifdef CONFIG_CIFS_FSCACHE | 806 | #ifdef CONFIG_CIFS_FSCACHE |
805 | /* initialize per-inode cache cookie pointer */ | 807 | /* initialize per-inode cache cookie pointer */ |
806 | CIFS_I(inode)->fscache = NULL; | 808 | CIFS_I(inode)->fscache = NULL; |
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 03e59aa318eb..d0ad09d57789 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c | |||
@@ -599,69 +599,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, | |||
599 | #define HIDPGETCONNLIST _IOR('H', 210, int) | 599 | #define HIDPGETCONNLIST _IOR('H', 210, int) |
600 | #define HIDPGETCONNINFO _IOR('H', 211, int) | 600 | #define HIDPGETCONNINFO _IOR('H', 211, int) |
601 | 601 | ||
602 | #ifdef CONFIG_BLOCK | ||
603 | struct raw32_config_request | ||
604 | { | ||
605 | compat_int_t raw_minor; | ||
606 | __u64 block_major; | ||
607 | __u64 block_minor; | ||
608 | } __attribute__((packed)); | ||
609 | |||
610 | static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req) | ||
611 | { | ||
612 | int ret; | ||
613 | |||
614 | if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request))) | ||
615 | return -EFAULT; | ||
616 | |||
617 | ret = __get_user(req->raw_minor, &user_req->raw_minor); | ||
618 | ret |= __get_user(req->block_major, &user_req->block_major); | ||
619 | ret |= __get_user(req->block_minor, &user_req->block_minor); | ||
620 | |||
621 | return ret ? -EFAULT : 0; | ||
622 | } | ||
623 | |||
624 | static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req) | ||
625 | { | ||
626 | int ret; | ||
627 | |||
628 | if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request))) | ||
629 | return -EFAULT; | ||
630 | |||
631 | ret = __put_user(req->raw_minor, &user_req->raw_minor); | ||
632 | ret |= __put_user(req->block_major, &user_req->block_major); | ||
633 | ret |= __put_user(req->block_minor, &user_req->block_minor); | ||
634 | |||
635 | return ret ? -EFAULT : 0; | ||
636 | } | ||
637 | |||
638 | static int raw_ioctl(unsigned fd, unsigned cmd, | ||
639 | struct raw32_config_request __user *user_req) | ||
640 | { | ||
641 | int ret; | ||
642 | |||
643 | switch (cmd) { | ||
644 | case RAW_SETBIND: | ||
645 | default: { /* RAW_GETBIND */ | ||
646 | struct raw_config_request req; | ||
647 | mm_segment_t oldfs = get_fs(); | ||
648 | |||
649 | if ((ret = get_raw32_request(&req, user_req))) | ||
650 | return ret; | ||
651 | |||
652 | set_fs(KERNEL_DS); | ||
653 | ret = sys_ioctl(fd,cmd,(unsigned long)&req); | ||
654 | set_fs(oldfs); | ||
655 | |||
656 | if ((!ret) && (cmd == RAW_GETBIND)) { | ||
657 | ret = set_raw32_request(&req, user_req); | ||
658 | } | ||
659 | break; | ||
660 | } | ||
661 | } | ||
662 | return ret; | ||
663 | } | ||
664 | #endif /* CONFIG_BLOCK */ | ||
665 | 602 | ||
666 | struct serial_struct32 { | 603 | struct serial_struct32 { |
667 | compat_int_t type; | 604 | compat_int_t type; |
@@ -1262,9 +1199,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) | |||
1262 | COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) | 1199 | COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) |
1263 | COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) | 1200 | COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) |
1264 | COMPATIBLE_IOCTL(OSS_GETVERSION) | 1201 | COMPATIBLE_IOCTL(OSS_GETVERSION) |
1265 | /* Raw devices */ | ||
1266 | COMPATIBLE_IOCTL(RAW_SETBIND) | ||
1267 | COMPATIBLE_IOCTL(RAW_GETBIND) | ||
1268 | /* SMB ioctls which do not need any translations */ | 1202 | /* SMB ioctls which do not need any translations */ |
1269 | COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) | 1203 | COMPATIBLE_IOCTL(SMB_IOC_NEWCONN) |
1270 | /* Watchdog */ | 1204 | /* Watchdog */ |
@@ -1523,10 +1457,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, | |||
1523 | case MTIOCGET32: | 1457 | case MTIOCGET32: |
1524 | case MTIOCPOS32: | 1458 | case MTIOCPOS32: |
1525 | return mt_ioctl_trans(fd, cmd, argp); | 1459 | return mt_ioctl_trans(fd, cmd, argp); |
1526 | /* Raw devices */ | ||
1527 | case RAW_SETBIND: | ||
1528 | case RAW_GETBIND: | ||
1529 | return raw_ioctl(fd, cmd, argp); | ||
1530 | #endif | 1460 | #endif |
1531 | /* One SMB ioctl needs translations. */ | 1461 | /* One SMB ioctl needs translations. */ |
1532 | #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) | 1462 | #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) |
@@ -2014,3 +2014,43 @@ fail_creds: | |||
2014 | fail: | 2014 | fail: |
2015 | return; | 2015 | return; |
2016 | } | 2016 | } |
2017 | |||
2018 | /* | ||
2019 | * Core dumping helper functions. These are the only things you should | ||
2020 | * do on a core-file: use only these functions to write out all the | ||
2021 | * necessary info. | ||
2022 | */ | ||
2023 | int dump_write(struct file *file, const void *addr, int nr) | ||
2024 | { | ||
2025 | return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; | ||
2026 | } | ||
2027 | EXPORT_SYMBOL(dump_write); | ||
2028 | |||
2029 | int dump_seek(struct file *file, loff_t off) | ||
2030 | { | ||
2031 | int ret = 1; | ||
2032 | |||
2033 | if (file->f_op->llseek && file->f_op->llseek != no_llseek) { | ||
2034 | if (file->f_op->llseek(file, off, SEEK_CUR) < 0) | ||
2035 | return 0; | ||
2036 | } else { | ||
2037 | char *buf = (char *)get_zeroed_page(GFP_KERNEL); | ||
2038 | |||
2039 | if (!buf) | ||
2040 | return 0; | ||
2041 | while (off > 0) { | ||
2042 | unsigned long n = off; | ||
2043 | |||
2044 | if (n > PAGE_SIZE) | ||
2045 | n = PAGE_SIZE; | ||
2046 | if (!dump_write(file, buf, n)) { | ||
2047 | ret = 0; | ||
2048 | break; | ||
2049 | } | ||
2050 | off -= n; | ||
2051 | } | ||
2052 | free_page((unsigned long)buf); | ||
2053 | } | ||
2054 | return ret; | ||
2055 | } | ||
2056 | EXPORT_SYMBOL(dump_seek); | ||
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index eb7368ebd8cd..3eadd97324b1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -54,6 +54,9 @@ struct page_collect { | |||
54 | unsigned nr_pages; | 54 | unsigned nr_pages; |
55 | unsigned long length; | 55 | unsigned long length; |
56 | loff_t pg_first; /* keep 64bit also in 32-arches */ | 56 | loff_t pg_first; /* keep 64bit also in 32-arches */ |
57 | bool read_4_write; /* This means two things: that the read is sync | ||
58 | * And the pages should not be unlocked. | ||
59 | */ | ||
57 | }; | 60 | }; |
58 | 61 | ||
59 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, | 62 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, |
@@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, | |||
71 | pcol->nr_pages = 0; | 74 | pcol->nr_pages = 0; |
72 | pcol->length = 0; | 75 | pcol->length = 0; |
73 | pcol->pg_first = -1; | 76 | pcol->pg_first = -1; |
77 | pcol->read_4_write = false; | ||
74 | } | 78 | } |
75 | 79 | ||
76 | static void _pcol_reset(struct page_collect *pcol) | 80 | static void _pcol_reset(struct page_collect *pcol) |
@@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page) | |||
347 | if (PageError(page)) | 351 | if (PageError(page)) |
348 | ClearPageError(page); | 352 | ClearPageError(page); |
349 | 353 | ||
350 | unlock_page(page); | 354 | if (!pcol->read_4_write) |
355 | unlock_page(page); | ||
351 | EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," | 356 | EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," |
352 | " splitting\n", inode->i_ino, page->index); | 357 | " splitting\n", inode->i_ino, page->index); |
353 | 358 | ||
@@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync) | |||
428 | /* readpage_strip might call read_exec(,is_sync==false) at several | 433 | /* readpage_strip might call read_exec(,is_sync==false) at several |
429 | * places but not if we have a single page. | 434 | * places but not if we have a single page. |
430 | */ | 435 | */ |
436 | pcol.read_4_write = is_sync; | ||
431 | ret = readpage_strip(&pcol, page); | 437 | ret = readpage_strip(&pcol, page); |
432 | if (ret) { | 438 | if (ret) { |
433 | EXOFS_ERR("_readpage => %d\n", ret); | 439 | EXOFS_ERR("_readpage => %d\n", ret); |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2e20bd771337..377768009106 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
@@ -1842,8 +1842,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
1842 | goto failed_mount; | 1842 | goto failed_mount; |
1843 | } | 1843 | } |
1844 | 1844 | ||
1845 | if (le32_to_cpu(es->s_blocks_count) > | 1845 | if (generic_check_addressable(sb->s_blocksize_bits, |
1846 | (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { | 1846 | le32_to_cpu(es->s_blocks_count))) { |
1847 | ext3_msg(sb, KERN_ERR, | 1847 | ext3_msg(sb, KERN_ERR, |
1848 | "error: filesystem is too large to mount safely"); | 1848 | "error: filesystem is too large to mount safely"); |
1849 | if (sizeof(sector_t) < 8) | 1849 | if (sizeof(sector_t) < 8) |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 24e7699f915d..8ecc1e590303 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -2826,15 +2826,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2826 | * Test whether we have more sectors than will fit in sector_t, | 2826 | * Test whether we have more sectors than will fit in sector_t, |
2827 | * and whether the max offset is addressable by the page cache. | 2827 | * and whether the max offset is addressable by the page cache. |
2828 | */ | 2828 | */ |
2829 | if ((ext4_blocks_count(es) > | 2829 | ret = generic_check_addressable(sb->s_blocksize_bits, |
2830 | (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || | 2830 | ext4_blocks_count(es)); |
2831 | (ext4_blocks_count(es) > | 2831 | if (ret) { |
2832 | (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { | ||
2833 | ext4_msg(sb, KERN_ERR, "filesystem" | 2832 | ext4_msg(sb, KERN_ERR, "filesystem" |
2834 | " too large to mount safely on this system"); | 2833 | " too large to mount safely on this system"); |
2835 | if (sizeof(sector_t) < 8) | 2834 | if (sizeof(sector_t) < 8) |
2836 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); | 2835 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); |
2837 | ret = -EFBIG; | ||
2838 | goto failed_mount; | 2836 | goto failed_mount; |
2839 | } | 2837 | } |
2840 | 2838 | ||
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5581122bd2c0..ab38fef1c9a1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -72,22 +72,11 @@ int writeback_in_progress(struct backing_dev_info *bdi) | |||
72 | static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) | 72 | static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) |
73 | { | 73 | { |
74 | struct super_block *sb = inode->i_sb; | 74 | struct super_block *sb = inode->i_sb; |
75 | struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; | ||
76 | 75 | ||
77 | /* | 76 | if (strcmp(sb->s_type->name, "bdev") == 0) |
78 | * For inodes on standard filesystems, we use superblock's bdi. For | 77 | return inode->i_mapping->backing_dev_info; |
79 | * inodes on virtual filesystems, we want to use inode mapping's bdi | 78 | |
80 | * because they can possibly point to something useful (think about | 79 | return sb->s_bdi; |
81 | * block_dev filesystem). | ||
82 | */ | ||
83 | if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) { | ||
84 | /* Some device inodes could play dirty tricks. Catch them... */ | ||
85 | WARN(bdi != sb->s_bdi && bdi_cap_writeback_dirty(bdi), | ||
86 | "Dirtiable inode bdi %s != sb bdi %s\n", | ||
87 | bdi->name, sb->s_bdi->name); | ||
88 | return sb->s_bdi; | ||
89 | } | ||
90 | return bdi; | ||
91 | } | 80 | } |
92 | 81 | ||
93 | static void bdi_queue_work(struct backing_dev_info *bdi, | 82 | static void bdi_queue_work(struct backing_dev_info *bdi, |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d367af1514ef..cde755cca564 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | |||
1354 | loff_t file_size; | 1354 | loff_t file_size; |
1355 | unsigned int num; | 1355 | unsigned int num; |
1356 | unsigned int offset; | 1356 | unsigned int offset; |
1357 | size_t total_len; | 1357 | size_t total_len = 0; |
1358 | 1358 | ||
1359 | req = fuse_get_req(fc); | 1359 | req = fuse_get_req(fc); |
1360 | if (IS_ERR(req)) | 1360 | if (IS_ERR(req)) |
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index cc9665522148..c465ae066c62 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | config GFS2_FS | 1 | config GFS2_FS |
2 | tristate "GFS2 file system support" | 2 | tristate "GFS2 file system support" |
3 | depends on EXPERIMENTAL && (64BIT || LBDAF) | 3 | depends on (64BIT || LBDAF) |
4 | select DLM if GFS2_FS_LOCKING_DLM | 4 | select DLM if GFS2_FS_LOCKING_DLM |
5 | select CONFIGFS_FS if GFS2_FS_LOCKING_DLM | 5 | select CONFIGFS_FS if GFS2_FS_LOCKING_DLM |
6 | select SYSFS if GFS2_FS_LOCKING_DLM | 6 | select SYSFS if GFS2_FS_LOCKING_DLM |
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 194fe16d8418..6b24afb96aae 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
@@ -36,8 +36,8 @@ | |||
36 | #include "glops.h" | 36 | #include "glops.h" |
37 | 37 | ||
38 | 38 | ||
39 | static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, | 39 | void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, |
40 | unsigned int from, unsigned int to) | 40 | unsigned int from, unsigned int to) |
41 | { | 41 | { |
42 | struct buffer_head *head = page_buffers(page); | 42 | struct buffer_head *head = page_buffers(page); |
43 | unsigned int bsize = head->b_size; | 43 | unsigned int bsize = head->b_size; |
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
615 | unsigned int data_blocks = 0, ind_blocks = 0, rblocks; | 615 | unsigned int data_blocks = 0, ind_blocks = 0, rblocks; |
616 | int alloc_required; | 616 | int alloc_required; |
617 | int error = 0; | 617 | int error = 0; |
618 | struct gfs2_alloc *al; | 618 | struct gfs2_alloc *al = NULL; |
619 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 619 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
620 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 620 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
621 | unsigned to = from + len; | 621 | unsigned to = from + len; |
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
663 | rblocks += RES_STATFS + RES_QUOTA; | 663 | rblocks += RES_STATFS + RES_QUOTA; |
664 | if (&ip->i_inode == sdp->sd_rindex) | 664 | if (&ip->i_inode == sdp->sd_rindex) |
665 | rblocks += 2 * RES_STATFS; | 665 | rblocks += 2 * RES_STATFS; |
666 | if (alloc_required) | ||
667 | rblocks += gfs2_rg_blocks(al); | ||
666 | 668 | ||
667 | error = gfs2_trans_begin(sdp, rblocks, | 669 | error = gfs2_trans_begin(sdp, rblocks, |
668 | PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); | 670 | PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); |
@@ -696,13 +698,11 @@ out: | |||
696 | 698 | ||
697 | page_cache_release(page); | 699 | page_cache_release(page); |
698 | 700 | ||
699 | /* | 701 | gfs2_trans_end(sdp); |
700 | * XXX(truncate): the call below should probably be replaced with | ||
701 | * a call to the gfs2-specific truncate blocks helper to actually | ||
702 | * release disk blocks.. | ||
703 | */ | ||
704 | if (pos + len > ip->i_inode.i_size) | 702 | if (pos + len > ip->i_inode.i_size) |
705 | truncate_setsize(&ip->i_inode, ip->i_inode.i_size); | 703 | gfs2_trim_blocks(&ip->i_inode); |
704 | goto out_trans_fail; | ||
705 | |||
706 | out_endtrans: | 706 | out_endtrans: |
707 | gfs2_trans_end(sdp); | 707 | gfs2_trans_end(sdp); |
708 | out_trans_fail: | 708 | out_trans_fail: |
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, | |||
802 | page_cache_release(page); | 802 | page_cache_release(page); |
803 | 803 | ||
804 | if (copied) { | 804 | if (copied) { |
805 | if (inode->i_size < to) { | 805 | if (inode->i_size < to) |
806 | i_size_write(inode, to); | 806 | i_size_write(inode, to); |
807 | ip->i_disksize = inode->i_size; | ||
808 | } | ||
809 | gfs2_dinode_out(ip, di); | 807 | gfs2_dinode_out(ip, di); |
810 | mark_inode_dirty(inode); | 808 | mark_inode_dirty(inode); |
811 | } | 809 | } |
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
876 | 874 | ||
877 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); | 875 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); |
878 | if (ret > 0) { | 876 | if (ret > 0) { |
879 | if (inode->i_size > ip->i_disksize) | ||
880 | ip->i_disksize = inode->i_size; | ||
881 | gfs2_dinode_out(ip, dibh->b_data); | 877 | gfs2_dinode_out(ip, dibh->b_data); |
882 | mark_inode_dirty(inode); | 878 | mark_inode_dirty(inode); |
883 | } | 879 | } |
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6f482809d1a3..5476c066d4ee 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
@@ -50,7 +50,7 @@ struct strip_mine { | |||
50 | * @ip: the inode | 50 | * @ip: the inode |
51 | * @dibh: the dinode buffer | 51 | * @dibh: the dinode buffer |
52 | * @block: the block number that was allocated | 52 | * @block: the block number that was allocated |
53 | * @private: any locked page held by the caller process | 53 | * @page: The (optional) page. This is looked up if @page is NULL |
54 | * | 54 | * |
55 | * Returns: errno | 55 | * Returns: errno |
56 | */ | 56 | */ |
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
109 | /** | 109 | /** |
110 | * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big | 110 | * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big |
111 | * @ip: The GFS2 inode to unstuff | 111 | * @ip: The GFS2 inode to unstuff |
112 | * @unstuffer: the routine that handles unstuffing a non-zero length file | 112 | * @page: The (optional) page. This is looked up if the @page is NULL |
113 | * @private: private data for the unstuffer | ||
114 | * | 113 | * |
115 | * This routine unstuffs a dinode and returns it to a "normal" state such | 114 | * This routine unstuffs a dinode and returns it to a "normal" state such |
116 | * that the height can be grown in the traditional way. | 115 | * that the height can be grown in the traditional way. |
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
132 | if (error) | 131 | if (error) |
133 | goto out; | 132 | goto out; |
134 | 133 | ||
135 | if (ip->i_disksize) { | 134 | if (i_size_read(&ip->i_inode)) { |
136 | /* Get a free block, fill it with the stuffed data, | 135 | /* Get a free block, fill it with the stuffed data, |
137 | and write it out to disk */ | 136 | and write it out to disk */ |
138 | 137 | ||
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
161 | di = (struct gfs2_dinode *)dibh->b_data; | 160 | di = (struct gfs2_dinode *)dibh->b_data; |
162 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 161 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
163 | 162 | ||
164 | if (ip->i_disksize) { | 163 | if (i_size_read(&ip->i_inode)) { |
165 | *(__be64 *)(di + 1) = cpu_to_be64(block); | 164 | *(__be64 *)(di + 1) = cpu_to_be64(block); |
166 | gfs2_add_inode_blocks(&ip->i_inode, 1); | 165 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
167 | di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); | 166 | di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); |
@@ -885,83 +884,14 @@ out: | |||
885 | } | 884 | } |
886 | 885 | ||
887 | /** | 886 | /** |
888 | * do_grow - Make a file look bigger than it is | ||
889 | * @ip: the inode | ||
890 | * @size: the size to set the file to | ||
891 | * | ||
892 | * Called with an exclusive lock on @ip. | ||
893 | * | ||
894 | * Returns: errno | ||
895 | */ | ||
896 | |||
897 | static int do_grow(struct gfs2_inode *ip, u64 size) | ||
898 | { | ||
899 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
900 | struct gfs2_alloc *al; | ||
901 | struct buffer_head *dibh; | ||
902 | int error; | ||
903 | |||
904 | al = gfs2_alloc_get(ip); | ||
905 | if (!al) | ||
906 | return -ENOMEM; | ||
907 | |||
908 | error = gfs2_quota_lock_check(ip); | ||
909 | if (error) | ||
910 | goto out; | ||
911 | |||
912 | al->al_requested = sdp->sd_max_height + RES_DATA; | ||
913 | |||
914 | error = gfs2_inplace_reserve(ip); | ||
915 | if (error) | ||
916 | goto out_gunlock_q; | ||
917 | |||
918 | error = gfs2_trans_begin(sdp, | ||
919 | sdp->sd_max_height + al->al_rgd->rd_length + | ||
920 | RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); | ||
921 | if (error) | ||
922 | goto out_ipres; | ||
923 | |||
924 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
925 | if (error) | ||
926 | goto out_end_trans; | ||
927 | |||
928 | if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { | ||
929 | if (gfs2_is_stuffed(ip)) { | ||
930 | error = gfs2_unstuff_dinode(ip, NULL); | ||
931 | if (error) | ||
932 | goto out_brelse; | ||
933 | } | ||
934 | } | ||
935 | |||
936 | ip->i_disksize = size; | ||
937 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | ||
938 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
939 | gfs2_dinode_out(ip, dibh->b_data); | ||
940 | |||
941 | out_brelse: | ||
942 | brelse(dibh); | ||
943 | out_end_trans: | ||
944 | gfs2_trans_end(sdp); | ||
945 | out_ipres: | ||
946 | gfs2_inplace_release(ip); | ||
947 | out_gunlock_q: | ||
948 | gfs2_quota_unlock(ip); | ||
949 | out: | ||
950 | gfs2_alloc_put(ip); | ||
951 | return error; | ||
952 | } | ||
953 | |||
954 | |||
955 | /** | ||
956 | * gfs2_block_truncate_page - Deal with zeroing out data for truncate | 887 | * gfs2_block_truncate_page - Deal with zeroing out data for truncate |
957 | * | 888 | * |
958 | * This is partly borrowed from ext3. | 889 | * This is partly borrowed from ext3. |
959 | */ | 890 | */ |
960 | static int gfs2_block_truncate_page(struct address_space *mapping) | 891 | static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) |
961 | { | 892 | { |
962 | struct inode *inode = mapping->host; | 893 | struct inode *inode = mapping->host; |
963 | struct gfs2_inode *ip = GFS2_I(inode); | 894 | struct gfs2_inode *ip = GFS2_I(inode); |
964 | loff_t from = inode->i_size; | ||
965 | unsigned long index = from >> PAGE_CACHE_SHIFT; | 895 | unsigned long index = from >> PAGE_CACHE_SHIFT; |
966 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 896 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
967 | unsigned blocksize, iblock, length, pos; | 897 | unsigned blocksize, iblock, length, pos; |
@@ -1023,9 +953,11 @@ unlock: | |||
1023 | return err; | 953 | return err; |
1024 | } | 954 | } |
1025 | 955 | ||
1026 | static int trunc_start(struct gfs2_inode *ip, u64 size) | 956 | static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) |
1027 | { | 957 | { |
1028 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 958 | struct gfs2_inode *ip = GFS2_I(inode); |
959 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
960 | struct address_space *mapping = inode->i_mapping; | ||
1029 | struct buffer_head *dibh; | 961 | struct buffer_head *dibh; |
1030 | int journaled = gfs2_is_jdata(ip); | 962 | int journaled = gfs2_is_jdata(ip); |
1031 | int error; | 963 | int error; |
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) | |||
1039 | if (error) | 971 | if (error) |
1040 | goto out; | 972 | goto out; |
1041 | 973 | ||
974 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
975 | |||
1042 | if (gfs2_is_stuffed(ip)) { | 976 | if (gfs2_is_stuffed(ip)) { |
1043 | u64 dsize = size + sizeof(struct gfs2_dinode); | 977 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); |
1044 | ip->i_disksize = size; | ||
1045 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | ||
1046 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
1047 | gfs2_dinode_out(ip, dibh->b_data); | ||
1048 | if (dsize > dibh->b_size) | ||
1049 | dsize = dibh->b_size; | ||
1050 | gfs2_buffer_clear_tail(dibh, dsize); | ||
1051 | error = 1; | ||
1052 | } else { | 978 | } else { |
1053 | if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) | 979 | if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) { |
1054 | error = gfs2_block_truncate_page(ip->i_inode.i_mapping); | 980 | error = gfs2_block_truncate_page(mapping, newsize); |
1055 | 981 | if (error) | |
1056 | if (!error) { | 982 | goto out_brelse; |
1057 | ip->i_disksize = size; | ||
1058 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | ||
1059 | ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; | ||
1060 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
1061 | gfs2_dinode_out(ip, dibh->b_data); | ||
1062 | } | 983 | } |
984 | ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; | ||
1063 | } | 985 | } |
1064 | 986 | ||
1065 | brelse(dibh); | 987 | i_size_write(inode, newsize); |
988 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | ||
989 | gfs2_dinode_out(ip, dibh->b_data); | ||
1066 | 990 | ||
991 | truncate_pagecache(inode, oldsize, newsize); | ||
992 | out_brelse: | ||
993 | brelse(dibh); | ||
1067 | out: | 994 | out: |
1068 | gfs2_trans_end(sdp); | 995 | gfs2_trans_end(sdp); |
1069 | return error; | 996 | return error; |
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip) | |||
1123 | if (error) | 1050 | if (error) |
1124 | goto out; | 1051 | goto out; |
1125 | 1052 | ||
1126 | if (!ip->i_disksize) { | 1053 | if (!i_size_read(&ip->i_inode)) { |
1127 | ip->i_height = 0; | 1054 | ip->i_height = 0; |
1128 | ip->i_goal = ip->i_no_addr; | 1055 | ip->i_goal = ip->i_no_addr; |
1129 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 1056 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
@@ -1143,92 +1070,154 @@ out: | |||
1143 | 1070 | ||
1144 | /** | 1071 | /** |
1145 | * do_shrink - make a file smaller | 1072 | * do_shrink - make a file smaller |
1146 | * @ip: the inode | 1073 | * @inode: the inode |
1147 | * @size: the size to make the file | 1074 | * @oldsize: the current inode size |
1148 | * @truncator: function to truncate the last partial block | 1075 | * @newsize: the size to make the file |
1149 | * | 1076 | * |
1150 | * Called with an exclusive lock on @ip. | 1077 | * Called with an exclusive lock on @inode. The @newsize must |
1078 | * be equal to or smaller than the current inode size. | ||
1151 | * | 1079 | * |
1152 | * Returns: errno | 1080 | * Returns: errno |
1153 | */ | 1081 | */ |
1154 | 1082 | ||
1155 | static int do_shrink(struct gfs2_inode *ip, u64 size) | 1083 | static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize) |
1156 | { | 1084 | { |
1085 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1157 | int error; | 1086 | int error; |
1158 | 1087 | ||
1159 | error = trunc_start(ip, size); | 1088 | error = trunc_start(inode, oldsize, newsize); |
1160 | if (error < 0) | 1089 | if (error < 0) |
1161 | return error; | 1090 | return error; |
1162 | if (error > 0) | 1091 | if (gfs2_is_stuffed(ip)) |
1163 | return 0; | 1092 | return 0; |
1164 | 1093 | ||
1165 | error = trunc_dealloc(ip, size); | 1094 | error = trunc_dealloc(ip, newsize); |
1166 | if (!error) | 1095 | if (error == 0) |
1167 | error = trunc_end(ip); | 1096 | error = trunc_end(ip); |
1168 | 1097 | ||
1169 | return error; | 1098 | return error; |
1170 | } | 1099 | } |
1171 | 1100 | ||
1172 | static int do_touch(struct gfs2_inode *ip, u64 size) | 1101 | void gfs2_trim_blocks(struct inode *inode) |
1173 | { | 1102 | { |
1174 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1103 | u64 size = inode->i_size; |
1104 | int ret; | ||
1105 | |||
1106 | ret = do_shrink(inode, size, size); | ||
1107 | WARN_ON(ret != 0); | ||
1108 | } | ||
1109 | |||
1110 | /** | ||
1111 | * do_grow - Touch and update inode size | ||
1112 | * @inode: The inode | ||
1113 | * @size: The new size | ||
1114 | * | ||
1115 | * This function updates the timestamps on the inode and | ||
1116 | * may also increase the size of the inode. This function | ||
1117 | * must not be called with @size any smaller than the current | ||
1118 | * inode size. | ||
1119 | * | ||
1120 | * Although it is not strictly required to unstuff files here, | ||
1121 | * earlier versions of GFS2 have a bug in the stuffed file reading | ||
1122 | * code which will result in a buffer overrun if the size is larger | ||
1123 | * than the max stuffed file size. In order to prevent this from | ||
1124 | * occurring, such files are unstuffed, but in other cases we can | ||
1125 | * just update the inode size directly. | ||
1126 | * | ||
1127 | * Returns: 0 on success, or -ve on error | ||
1128 | */ | ||
1129 | |||
1130 | static int do_grow(struct inode *inode, u64 size) | ||
1131 | { | ||
1132 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1133 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
1175 | struct buffer_head *dibh; | 1134 | struct buffer_head *dibh; |
1135 | struct gfs2_alloc *al = NULL; | ||
1176 | int error; | 1136 | int error; |
1177 | 1137 | ||
1178 | error = gfs2_trans_begin(sdp, RES_DINODE, 0); | 1138 | if (gfs2_is_stuffed(ip) && |
1139 | (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { | ||
1140 | al = gfs2_alloc_get(ip); | ||
1141 | if (al == NULL) | ||
1142 | return -ENOMEM; | ||
1143 | |||
1144 | error = gfs2_quota_lock_check(ip); | ||
1145 | if (error) | ||
1146 | goto do_grow_alloc_put; | ||
1147 | |||
1148 | al->al_requested = 1; | ||
1149 | error = gfs2_inplace_reserve(ip); | ||
1150 | if (error) | ||
1151 | goto do_grow_qunlock; | ||
1152 | } | ||
1153 | |||
1154 | error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); | ||
1179 | if (error) | 1155 | if (error) |
1180 | return error; | 1156 | goto do_grow_release; |
1181 | 1157 | ||
1182 | down_write(&ip->i_rw_mutex); | 1158 | if (al) { |
1159 | error = gfs2_unstuff_dinode(ip, NULL); | ||
1160 | if (error) | ||
1161 | goto do_end_trans; | ||
1162 | } | ||
1183 | 1163 | ||
1184 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1164 | error = gfs2_meta_inode_buffer(ip, &dibh); |
1185 | if (error) | 1165 | if (error) |
1186 | goto do_touch_out; | 1166 | goto do_end_trans; |
1187 | 1167 | ||
1168 | i_size_write(inode, size); | ||
1188 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1169 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
1189 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1170 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
1190 | gfs2_dinode_out(ip, dibh->b_data); | 1171 | gfs2_dinode_out(ip, dibh->b_data); |
1191 | brelse(dibh); | 1172 | brelse(dibh); |
1192 | 1173 | ||
1193 | do_touch_out: | 1174 | do_end_trans: |
1194 | up_write(&ip->i_rw_mutex); | ||
1195 | gfs2_trans_end(sdp); | 1175 | gfs2_trans_end(sdp); |
1176 | do_grow_release: | ||
1177 | if (al) { | ||
1178 | gfs2_inplace_release(ip); | ||
1179 | do_grow_qunlock: | ||
1180 | gfs2_quota_unlock(ip); | ||
1181 | do_grow_alloc_put: | ||
1182 | gfs2_alloc_put(ip); | ||
1183 | } | ||
1196 | return error; | 1184 | return error; |
1197 | } | 1185 | } |
1198 | 1186 | ||
1199 | /** | 1187 | /** |
1200 | * gfs2_truncatei - make a file a given size | 1188 | * gfs2_setattr_size - make a file a given size |
1201 | * @ip: the inode | 1189 | * @inode: the inode |
1202 | * @size: the size to make the file | 1190 | * @newsize: the size to make the file |
1203 | * @truncator: function to truncate the last partial block | ||
1204 | * | 1191 | * |
1205 | * The file size can grow, shrink, or stay the same size. | 1192 | * The file size can grow, shrink, or stay the same size. This |
1193 | * is called holding i_mutex and an exclusive glock on the inode | ||
1194 | * in question. | ||
1206 | * | 1195 | * |
1207 | * Returns: errno | 1196 | * Returns: errno |
1208 | */ | 1197 | */ |
1209 | 1198 | ||
1210 | int gfs2_truncatei(struct gfs2_inode *ip, u64 size) | 1199 | int gfs2_setattr_size(struct inode *inode, u64 newsize) |
1211 | { | 1200 | { |
1212 | int error; | 1201 | int ret; |
1202 | u64 oldsize; | ||
1213 | 1203 | ||
1214 | if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) | 1204 | BUG_ON(!S_ISREG(inode->i_mode)); |
1215 | return -EINVAL; | ||
1216 | 1205 | ||
1217 | if (size > ip->i_disksize) | 1206 | ret = inode_newsize_ok(inode, newsize); |
1218 | error = do_grow(ip, size); | 1207 | if (ret) |
1219 | else if (size < ip->i_disksize) | 1208 | return ret; |
1220 | error = do_shrink(ip, size); | ||
1221 | else | ||
1222 | /* update time stamps */ | ||
1223 | error = do_touch(ip, size); | ||
1224 | 1209 | ||
1225 | return error; | 1210 | oldsize = inode->i_size; |
1211 | if (newsize >= oldsize) | ||
1212 | return do_grow(inode, newsize); | ||
1213 | |||
1214 | return do_shrink(inode, oldsize, newsize); | ||
1226 | } | 1215 | } |
1227 | 1216 | ||
1228 | int gfs2_truncatei_resume(struct gfs2_inode *ip) | 1217 | int gfs2_truncatei_resume(struct gfs2_inode *ip) |
1229 | { | 1218 | { |
1230 | int error; | 1219 | int error; |
1231 | error = trunc_dealloc(ip, ip->i_disksize); | 1220 | error = trunc_dealloc(ip, i_size_read(&ip->i_inode)); |
1232 | if (!error) | 1221 | if (!error) |
1233 | error = trunc_end(ip); | 1222 | error = trunc_end(ip); |
1234 | return error; | 1223 | return error; |
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | |||
1269 | 1258 | ||
1270 | shift = sdp->sd_sb.sb_bsize_shift; | 1259 | shift = sdp->sd_sb.sb_bsize_shift; |
1271 | BUG_ON(gfs2_is_dir(ip)); | 1260 | BUG_ON(gfs2_is_dir(ip)); |
1272 | end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; | 1261 | end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; |
1273 | lblock = offset >> shift; | 1262 | lblock = offset >> shift; |
1274 | lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; | 1263 | lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; |
1275 | if (lblock_stop > end_of_file) | 1264 | if (lblock_stop > end_of_file) |
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index a20a5213135a..42fea03e2bd9 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h | |||
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, | |||
44 | } | 44 | } |
45 | } | 45 | } |
46 | 46 | ||
47 | int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); | 47 | extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); |
48 | int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); | 48 | extern int gfs2_block_map(struct inode *inode, sector_t lblock, |
49 | int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); | 49 | struct buffer_head *bh, int create); |
50 | 50 | extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, | |
51 | int gfs2_truncatei(struct gfs2_inode *ip, u64 size); | 51 | u64 *dblock, unsigned *extlen); |
52 | int gfs2_truncatei_resume(struct gfs2_inode *ip); | 52 | extern int gfs2_setattr_size(struct inode *inode, u64 size); |
53 | int gfs2_file_dealloc(struct gfs2_inode *ip); | 53 | extern void gfs2_trim_blocks(struct inode *inode); |
54 | int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | 54 | extern int gfs2_truncatei_resume(struct gfs2_inode *ip); |
55 | unsigned int len); | 55 | extern int gfs2_file_dealloc(struct gfs2_inode *ip); |
56 | extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | ||
57 | unsigned int len); | ||
56 | 58 | ||
57 | #endif /* __BMAP_DOT_H__ */ | 59 | #endif /* __BMAP_DOT_H__ */ |
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index bb7907bde3d8..6798755b3858 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c | |||
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) | |||
49 | ip = GFS2_I(inode); | 49 | ip = GFS2_I(inode); |
50 | } | 50 | } |
51 | 51 | ||
52 | if (sdp->sd_args.ar_localcaching) | 52 | if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) |
53 | goto valid; | 53 | goto valid; |
54 | 54 | ||
55 | had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); | 55 | had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); |
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index b9dd88a78dd4..5c356d09c321 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
@@ -79,6 +79,9 @@ | |||
79 | #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) | 79 | #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) |
80 | #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) | 80 | #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) |
81 | 81 | ||
82 | struct qstr gfs2_qdot __read_mostly; | ||
83 | struct qstr gfs2_qdotdot __read_mostly; | ||
84 | |||
82 | typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, | 85 | typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, |
83 | u64 leaf_no, void *data); | 86 | u64 leaf_no, void *data); |
84 | typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, | 87 | typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, |
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, | |||
127 | 130 | ||
128 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 131 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
129 | memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); | 132 | memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); |
130 | if (ip->i_disksize < offset + size) | 133 | if (ip->i_inode.i_size < offset + size) |
131 | ip->i_disksize = offset + size; | 134 | i_size_write(&ip->i_inode, offset + size); |
132 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 135 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
133 | gfs2_dinode_out(ip, dibh->b_data); | 136 | gfs2_dinode_out(ip, dibh->b_data); |
134 | 137 | ||
@@ -225,8 +228,8 @@ out: | |||
225 | if (error) | 228 | if (error) |
226 | return error; | 229 | return error; |
227 | 230 | ||
228 | if (ip->i_disksize < offset + copied) | 231 | if (ip->i_inode.i_size < offset + copied) |
229 | ip->i_disksize = offset + copied; | 232 | i_size_write(&ip->i_inode, offset + copied); |
230 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 233 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
231 | 234 | ||
232 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 235 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, | |||
275 | unsigned int o; | 278 | unsigned int o; |
276 | int copied = 0; | 279 | int copied = 0; |
277 | int error = 0; | 280 | int error = 0; |
281 | u64 disksize = i_size_read(&ip->i_inode); | ||
278 | 282 | ||
279 | if (offset >= ip->i_disksize) | 283 | if (offset >= disksize) |
280 | return 0; | 284 | return 0; |
281 | 285 | ||
282 | if (offset + size > ip->i_disksize) | 286 | if (offset + size > disksize) |
283 | size = ip->i_disksize - offset; | 287 | size = disksize - offset; |
284 | 288 | ||
285 | if (!size) | 289 | if (!size) |
286 | return 0; | 290 | return 0; |
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, | |||
727 | unsigned hsize = 1 << ip->i_depth; | 731 | unsigned hsize = 1 << ip->i_depth; |
728 | unsigned index; | 732 | unsigned index; |
729 | u64 ln; | 733 | u64 ln; |
730 | if (hsize * sizeof(u64) != ip->i_disksize) { | 734 | if (hsize * sizeof(u64) != i_size_read(inode)) { |
731 | gfs2_consist_inode(ip); | 735 | gfs2_consist_inode(ip); |
732 | return ERR_PTR(-EIO); | 736 | return ERR_PTR(-EIO); |
733 | } | 737 | } |
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode) | |||
879 | for (x = sdp->sd_hash_ptrs; x--; lp++) | 883 | for (x = sdp->sd_hash_ptrs; x--; lp++) |
880 | *lp = cpu_to_be64(bn); | 884 | *lp = cpu_to_be64(bn); |
881 | 885 | ||
882 | dip->i_disksize = sdp->sd_sb.sb_bsize / 2; | 886 | i_size_write(inode, sdp->sd_sb.sb_bsize / 2); |
883 | gfs2_add_inode_blocks(&dip->i_inode, 1); | 887 | gfs2_add_inode_blocks(&dip->i_inode, 1); |
884 | dip->i_diskflags |= GFS2_DIF_EXHASH; | 888 | dip->i_diskflags |= GFS2_DIF_EXHASH; |
885 | 889 | ||
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
1057 | u64 *buf; | 1061 | u64 *buf; |
1058 | u64 *from, *to; | 1062 | u64 *from, *to; |
1059 | u64 block; | 1063 | u64 block; |
1064 | u64 disksize = i_size_read(&dip->i_inode); | ||
1060 | int x; | 1065 | int x; |
1061 | int error = 0; | 1066 | int error = 0; |
1062 | 1067 | ||
1063 | hsize = 1 << dip->i_depth; | 1068 | hsize = 1 << dip->i_depth; |
1064 | if (hsize * sizeof(u64) != dip->i_disksize) { | 1069 | if (hsize * sizeof(u64) != disksize) { |
1065 | gfs2_consist_inode(dip); | 1070 | gfs2_consist_inode(dip); |
1066 | return -EIO; | 1071 | return -EIO; |
1067 | } | 1072 | } |
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
1072 | if (!buf) | 1077 | if (!buf) |
1073 | return -ENOMEM; | 1078 | return -ENOMEM; |
1074 | 1079 | ||
1075 | for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { | 1080 | for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) { |
1076 | error = gfs2_dir_read_data(dip, (char *)buf, | 1081 | error = gfs2_dir_read_data(dip, (char *)buf, |
1077 | block * sdp->sd_hash_bsize, | 1082 | block * sdp->sd_hash_bsize, |
1078 | sdp->sd_hash_bsize, 1); | 1083 | sdp->sd_hash_bsize, 1); |
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | |||
1370 | unsigned depth = 0; | 1375 | unsigned depth = 0; |
1371 | 1376 | ||
1372 | hsize = 1 << dip->i_depth; | 1377 | hsize = 1 << dip->i_depth; |
1373 | if (hsize * sizeof(u64) != dip->i_disksize) { | 1378 | if (hsize * sizeof(u64) != i_size_read(inode)) { |
1374 | gfs2_consist_inode(dip); | 1379 | gfs2_consist_inode(dip); |
1375 | return -EIO; | 1380 | return -EIO; |
1376 | } | 1381 | } |
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) | |||
1784 | int error = 0; | 1789 | int error = 0; |
1785 | 1790 | ||
1786 | hsize = 1 << dip->i_depth; | 1791 | hsize = 1 << dip->i_depth; |
1787 | if (hsize * sizeof(u64) != dip->i_disksize) { | 1792 | if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { |
1788 | gfs2_consist_inode(dip); | 1793 | gfs2_consist_inode(dip); |
1789 | return -EIO; | 1794 | return -EIO; |
1790 | } | 1795 | } |
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f919440c3be..a98f644bd3df 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h | |||
@@ -17,23 +17,24 @@ struct inode; | |||
17 | struct gfs2_inode; | 17 | struct gfs2_inode; |
18 | struct gfs2_inum; | 18 | struct gfs2_inum; |
19 | 19 | ||
20 | struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); | 20 | extern struct inode *gfs2_dir_search(struct inode *dir, |
21 | int gfs2_dir_check(struct inode *dir, const struct qstr *filename, | 21 | const struct qstr *filename); |
22 | const struct gfs2_inode *ip); | 22 | extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, |
23 | int gfs2_dir_add(struct inode *inode, const struct qstr *filename, | 23 | const struct gfs2_inode *ip); |
24 | const struct gfs2_inode *ip, unsigned int type); | 24 | extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, |
25 | int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); | 25 | const struct gfs2_inode *ip, unsigned int type); |
26 | int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | 26 | extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); |
27 | filldir_t filldir); | 27 | extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, |
28 | int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, | 28 | filldir_t filldir); |
29 | const struct gfs2_inode *nip, unsigned int new_type); | 29 | extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, |
30 | const struct gfs2_inode *nip, unsigned int new_type); | ||
30 | 31 | ||
31 | int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); | 32 | extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); |
32 | 33 | ||
33 | int gfs2_diradd_alloc_required(struct inode *dir, | 34 | extern int gfs2_diradd_alloc_required(struct inode *dir, |
34 | const struct qstr *filename); | 35 | const struct qstr *filename); |
35 | int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, | 36 | extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, |
36 | struct buffer_head **bhp); | 37 | struct buffer_head **bhp); |
37 | 38 | ||
38 | static inline u32 gfs2_disk_hash(const char *data, int len) | 39 | static inline u32 gfs2_disk_hash(const char *data, int len) |
39 | { | 40 | { |
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct | |||
61 | memcpy(dent + 1, name->name, name->len); | 62 | memcpy(dent + 1, name->name, name->len); |
62 | } | 63 | } |
63 | 64 | ||
65 | extern struct qstr gfs2_qdot; | ||
66 | extern struct qstr gfs2_qdotdot; | ||
67 | |||
64 | #endif /* __DIR_DOT_H__ */ | 68 | #endif /* __DIR_DOT_H__ */ |
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index dfe237a3f8ad..06d582732d34 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c | |||
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name, | |||
126 | 126 | ||
127 | static struct dentry *gfs2_get_parent(struct dentry *child) | 127 | static struct dentry *gfs2_get_parent(struct dentry *child) |
128 | { | 128 | { |
129 | struct qstr dotdot; | ||
130 | struct dentry *dentry; | 129 | struct dentry *dentry; |
131 | 130 | ||
132 | /* | 131 | dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); |
133 | * XXX(hch): it would be a good idea to keep this around as a | ||
134 | * static variable. | ||
135 | */ | ||
136 | gfs2_str2qstr(&dotdot, ".."); | ||
137 | |||
138 | dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); | ||
139 | if (!IS_ERR(dentry)) | 132 | if (!IS_ERR(dentry)) |
140 | dentry->d_op = &gfs2_dops; | 133 | dentry->d_op = &gfs2_dops; |
141 | return dentry; | 134 | return dentry; |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 8fcfefb96077..a51079bd4af1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
382 | rblocks = RES_DINODE + ind_blocks; | 382 | rblocks = RES_DINODE + ind_blocks; |
383 | if (gfs2_is_jdata(ip)) | 383 | if (gfs2_is_jdata(ip)) |
384 | rblocks += data_blocks ? data_blocks : 1; | 384 | rblocks += data_blocks ? data_blocks : 1; |
385 | if (ind_blocks || data_blocks) | 385 | if (ind_blocks || data_blocks) { |
386 | rblocks += RES_STATFS + RES_QUOTA; | 386 | rblocks += RES_STATFS + RES_QUOTA; |
387 | rblocks += gfs2_rg_blocks(al); | ||
388 | } | ||
387 | ret = gfs2_trans_begin(sdp, rblocks, 0); | 389 | ret = gfs2_trans_begin(sdp, rblocks, 0); |
388 | if (ret) | 390 | if (ret) |
389 | goto out_trans_fail; | 391 | goto out_trans_fail; |
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file) | |||
491 | goto fail; | 493 | goto fail; |
492 | 494 | ||
493 | if (!(file->f_flags & O_LARGEFILE) && | 495 | if (!(file->f_flags & O_LARGEFILE) && |
494 | ip->i_disksize > MAX_NON_LFS) { | 496 | i_size_read(inode) > MAX_NON_LFS) { |
495 | error = -EOVERFLOW; | 497 | error = -EOVERFLOW; |
496 | goto fail_gunlock; | 498 | goto fail_gunlock; |
497 | } | 499 | } |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9adf8f924e08..87778857f099 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) | |||
441 | else | 441 | else |
442 | gfs2_glock_put_nolock(gl); | 442 | gfs2_glock_put_nolock(gl); |
443 | } | 443 | } |
444 | if (held1 && held2 && list_empty(&gl->gl_holders)) | ||
445 | clear_bit(GLF_QUEUED, &gl->gl_flags); | ||
444 | 446 | ||
445 | gl->gl_state = new_state; | 447 | gl->gl_state = new_state; |
446 | gl->gl_tchange = jiffies; | 448 | gl->gl_tchange = jiffies; |
@@ -1012,6 +1014,7 @@ fail: | |||
1012 | if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) | 1014 | if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) |
1013 | insert_pt = &gh2->gh_list; | 1015 | insert_pt = &gh2->gh_list; |
1014 | } | 1016 | } |
1017 | set_bit(GLF_QUEUED, &gl->gl_flags); | ||
1015 | if (likely(insert_pt == NULL)) { | 1018 | if (likely(insert_pt == NULL)) { |
1016 | list_add_tail(&gh->gh_list, &gl->gl_holders); | 1019 | list_add_tail(&gh->gh_list, &gl->gl_holders); |
1017 | if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) | 1020 | if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) |
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) | |||
1310 | 1313 | ||
1311 | gfs2_glock_hold(gl); | 1314 | gfs2_glock_hold(gl); |
1312 | holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; | 1315 | holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; |
1313 | if (time_before(now, holdtime)) | 1316 | if (test_bit(GLF_QUEUED, &gl->gl_flags)) { |
1314 | delay = holdtime - now; | 1317 | if (time_before(now, holdtime)) |
1315 | if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) | 1318 | delay = holdtime - now; |
1316 | delay = gl->gl_ops->go_min_hold_time; | 1319 | if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) |
1320 | delay = gl->gl_ops->go_min_hold_time; | ||
1321 | } | ||
1317 | 1322 | ||
1318 | spin_lock(&gl->gl_spin); | 1323 | spin_lock(&gl->gl_spin); |
1319 | handle_callback(gl, state, delay); | 1324 | handle_callback(gl, state, delay); |
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl) | |||
1512 | spin_unlock(&lru_lock); | 1517 | spin_unlock(&lru_lock); |
1513 | 1518 | ||
1514 | spin_lock(&gl->gl_spin); | 1519 | spin_lock(&gl->gl_spin); |
1515 | if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) | 1520 | if (gl->gl_state != LM_ST_UNLOCKED) |
1516 | handle_callback(gl, LM_ST_UNLOCKED, 0); | 1521 | handle_callback(gl, LM_ST_UNLOCKED, 0); |
1517 | spin_unlock(&gl->gl_spin); | 1522 | spin_unlock(&gl->gl_spin); |
1518 | gfs2_glock_hold(gl); | 1523 | gfs2_glock_hold(gl); |
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) | |||
1660 | *p++ = 'I'; | 1665 | *p++ = 'I'; |
1661 | if (test_bit(GLF_FROZEN, gflags)) | 1666 | if (test_bit(GLF_FROZEN, gflags)) |
1662 | *p++ = 'F'; | 1667 | *p++ = 'F'; |
1668 | if (test_bit(GLF_QUEUED, gflags)) | ||
1669 | *p++ = 'q'; | ||
1663 | *p = 0; | 1670 | *p = 0; |
1664 | return buf; | 1671 | return buf; |
1665 | } | 1672 | } |
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void) | |||
1776 | } | 1783 | } |
1777 | #endif | 1784 | #endif |
1778 | 1785 | ||
1779 | glock_workqueue = create_workqueue("glock_workqueue"); | 1786 | glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | |
1787 | WQ_HIGHPRI | WQ_FREEZEABLE, 0); | ||
1780 | if (IS_ERR(glock_workqueue)) | 1788 | if (IS_ERR(glock_workqueue)) |
1781 | return PTR_ERR(glock_workqueue); | 1789 | return PTR_ERR(glock_workqueue); |
1782 | gfs2_delete_workqueue = create_workqueue("delete_workqueue"); | 1790 | gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | |
1791 | WQ_FREEZEABLE, 0); | ||
1783 | if (IS_ERR(gfs2_delete_workqueue)) { | 1792 | if (IS_ERR(gfs2_delete_workqueue)) { |
1784 | destroy_workqueue(glock_workqueue); | 1793 | destroy_workqueue(glock_workqueue); |
1785 | return PTR_ERR(gfs2_delete_workqueue); | 1794 | return PTR_ERR(gfs2_delete_workqueue); |
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 2bda1911b156..db1c26d6d220 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h | |||
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); | |||
215 | void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); | 215 | void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); |
216 | 216 | ||
217 | /** | 217 | /** |
218 | * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock | 218 | * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock |
219 | * @gl: the glock | 219 | * @gl: the glock |
220 | * @state: the state we're requesting | 220 | * @state: the state we're requesting |
221 | * @flags: the modifier flags | 221 | * @flags: the modifier flags |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 49f97d3bb690..0d149dcc04e5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | |||
262 | const struct gfs2_inode *ip = gl->gl_object; | 262 | const struct gfs2_inode *ip = gl->gl_object; |
263 | if (ip == NULL) | 263 | if (ip == NULL) |
264 | return 0; | 264 | return 0; |
265 | gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", | 265 | gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", |
266 | (unsigned long long)ip->i_no_formal_ino, | 266 | (unsigned long long)ip->i_no_formal_ino, |
267 | (unsigned long long)ip->i_no_addr, | 267 | (unsigned long long)ip->i_no_addr, |
268 | IF2DT(ip->i_inode.i_mode), ip->i_flags, | 268 | IF2DT(ip->i_inode.i_mode), ip->i_flags, |
269 | (unsigned int)ip->i_diskflags, | 269 | (unsigned int)ip->i_diskflags, |
270 | (unsigned long long)ip->i_inode.i_size, | 270 | (unsigned long long)i_size_read(&ip->i_inode)); |
271 | (unsigned long long)ip->i_disksize); | ||
272 | return 0; | 271 | return 0; |
273 | } | 272 | } |
274 | 273 | ||
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = { | |||
453 | [LM_TYPE_META] = &gfs2_meta_glops, | 452 | [LM_TYPE_META] = &gfs2_meta_glops, |
454 | [LM_TYPE_INODE] = &gfs2_inode_glops, | 453 | [LM_TYPE_INODE] = &gfs2_inode_glops, |
455 | [LM_TYPE_RGRP] = &gfs2_rgrp_glops, | 454 | [LM_TYPE_RGRP] = &gfs2_rgrp_glops, |
456 | [LM_TYPE_NONDISK] = &gfs2_trans_glops, | ||
457 | [LM_TYPE_IOPEN] = &gfs2_iopen_glops, | 455 | [LM_TYPE_IOPEN] = &gfs2_iopen_glops, |
458 | [LM_TYPE_FLOCK] = &gfs2_flock_glops, | 456 | [LM_TYPE_FLOCK] = &gfs2_flock_glops, |
459 | [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, | 457 | [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, |
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index fdbf4b366fa5..764fbb49efc8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
@@ -196,6 +196,7 @@ enum { | |||
196 | GLF_REPLY_PENDING = 9, | 196 | GLF_REPLY_PENDING = 9, |
197 | GLF_INITIAL = 10, | 197 | GLF_INITIAL = 10, |
198 | GLF_FROZEN = 11, | 198 | GLF_FROZEN = 11, |
199 | GLF_QUEUED = 12, | ||
199 | }; | 200 | }; |
200 | 201 | ||
201 | struct gfs2_glock { | 202 | struct gfs2_glock { |
@@ -267,7 +268,6 @@ struct gfs2_inode { | |||
267 | u64 i_no_formal_ino; | 268 | u64 i_no_formal_ino; |
268 | u64 i_generation; | 269 | u64 i_generation; |
269 | u64 i_eattr; | 270 | u64 i_eattr; |
270 | loff_t i_disksize; | ||
271 | unsigned long i_flags; /* GIF_... */ | 271 | unsigned long i_flags; /* GIF_... */ |
272 | struct gfs2_glock *i_gl; /* Move into i_gh? */ | 272 | struct gfs2_glock *i_gl; /* Move into i_gh? */ |
273 | struct gfs2_holder i_iopen_gh; | 273 | struct gfs2_holder i_iopen_gh; |
@@ -416,11 +416,8 @@ struct gfs2_args { | |||
416 | char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ | 416 | char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ |
417 | char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ | 417 | char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ |
418 | unsigned int ar_spectator:1; /* Don't get a journal */ | 418 | unsigned int ar_spectator:1; /* Don't get a journal */ |
419 | unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */ | ||
420 | unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ | 419 | unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ |
421 | unsigned int ar_localcaching:1; /* Local caching */ | ||
422 | unsigned int ar_debug:1; /* Oops on errors */ | 420 | unsigned int ar_debug:1; /* Oops on errors */ |
423 | unsigned int ar_upgrade:1; /* Upgrade ondisk format */ | ||
424 | unsigned int ar_posix_acl:1; /* Enable posix acls */ | 421 | unsigned int ar_posix_acl:1; /* Enable posix acls */ |
425 | unsigned int ar_quota:2; /* off/account/on */ | 422 | unsigned int ar_quota:2; /* off/account/on */ |
426 | unsigned int ar_suiddir:1; /* suiddir support */ | 423 | unsigned int ar_suiddir:1; /* suiddir support */ |
@@ -497,7 +494,7 @@ struct gfs2_sb_host { | |||
497 | */ | 494 | */ |
498 | 495 | ||
499 | struct lm_lockstruct { | 496 | struct lm_lockstruct { |
500 | unsigned int ls_jid; | 497 | int ls_jid; |
501 | unsigned int ls_first; | 498 | unsigned int ls_first; |
502 | unsigned int ls_first_done; | 499 | unsigned int ls_first_done; |
503 | unsigned int ls_nodir; | 500 | unsigned int ls_nodir; |
@@ -572,6 +569,7 @@ struct gfs2_sbd { | |||
572 | struct list_head sd_rindex_mru_list; | 569 | struct list_head sd_rindex_mru_list; |
573 | struct gfs2_rgrpd *sd_rindex_forward; | 570 | struct gfs2_rgrpd *sd_rindex_forward; |
574 | unsigned int sd_rgrps; | 571 | unsigned int sd_rgrps; |
572 | unsigned int sd_max_rg_data; | ||
575 | 573 | ||
576 | /* Journal index stuff */ | 574 | /* Journal index stuff */ |
577 | 575 | ||
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 08140f185a37..06370f8bd8cf 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
359 | * to do that. | 359 | * to do that. |
360 | */ | 360 | */ |
361 | ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); | 361 | ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); |
362 | ip->i_disksize = be64_to_cpu(str->di_size); | 362 | i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); |
363 | i_size_write(&ip->i_inode, ip->i_disksize); | ||
364 | gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); | 363 | gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); |
365 | atime.tv_sec = be64_to_cpu(str->di_atime); | 364 | atime.tv_sec = be64_to_cpu(str->di_atime); |
366 | atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); | 365 | atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); |
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) | |||
1055 | str->di_uid = cpu_to_be32(ip->i_inode.i_uid); | 1054 | str->di_uid = cpu_to_be32(ip->i_inode.i_uid); |
1056 | str->di_gid = cpu_to_be32(ip->i_inode.i_gid); | 1055 | str->di_gid = cpu_to_be32(ip->i_inode.i_gid); |
1057 | str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); | 1056 | str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); |
1058 | str->di_size = cpu_to_be64(ip->i_disksize); | 1057 | str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); |
1059 | str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); | 1058 | str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); |
1060 | str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); | 1059 | str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); |
1061 | str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); | 1060 | str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); |
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip) | |||
1085 | (unsigned long long)ip->i_no_formal_ino); | 1084 | (unsigned long long)ip->i_no_formal_ino); |
1086 | printk(KERN_INFO " no_addr = %llu\n", | 1085 | printk(KERN_INFO " no_addr = %llu\n", |
1087 | (unsigned long long)ip->i_no_addr); | 1086 | (unsigned long long)ip->i_no_addr); |
1088 | printk(KERN_INFO " i_disksize = %llu\n", | 1087 | printk(KERN_INFO " i_size = %llu\n", |
1089 | (unsigned long long)ip->i_disksize); | 1088 | (unsigned long long)i_size_read(&ip->i_inode)); |
1090 | printk(KERN_INFO " blocks = %llu\n", | 1089 | printk(KERN_INFO " blocks = %llu\n", |
1091 | (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); | 1090 | (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); |
1092 | printk(KERN_INFO " i_goal = %llu\n", | 1091 | printk(KERN_INFO " i_goal = %llu\n", |
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 300ada3f21de..6720d7d5fbc6 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h | |||
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); | |||
19 | extern int gfs2_internal_read(struct gfs2_inode *ip, | 19 | extern int gfs2_internal_read(struct gfs2_inode *ip, |
20 | struct file_ra_state *ra_state, | 20 | struct file_ra_state *ra_state, |
21 | char *buf, loff_t *pos, unsigned size); | 21 | char *buf, loff_t *pos, unsigned size); |
22 | extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, | ||
23 | unsigned int from, unsigned int to); | ||
22 | extern void gfs2_set_aops(struct inode *inode); | 24 | extern void gfs2_set_aops(struct inode *inode); |
23 | 25 | ||
24 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) | 26 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) |
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, | |||
80 | dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); | 82 | dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); |
81 | } | 83 | } |
82 | 84 | ||
85 | static inline int gfs2_check_internal_file_size(struct inode *inode, | ||
86 | u64 minsize, u64 maxsize) | ||
87 | { | ||
88 | u64 size = i_size_read(inode); | ||
89 | if (size < minsize || size > maxsize) | ||
90 | goto err; | ||
91 | if (size & ((1 << inode->i_blkbits) - 1)) | ||
92 | goto err; | ||
93 | return 0; | ||
94 | err: | ||
95 | gfs2_consist_inode(GFS2_I(inode)); | ||
96 | return -EIO; | ||
97 | } | ||
83 | 98 | ||
84 | extern void gfs2_set_iop(struct inode *inode); | 99 | extern void gfs2_set_iop(struct inode *inode); |
85 | extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, | 100 | extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, |
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0e0470ed34c2..1c09425b45fd 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c | |||
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg) | |||
42 | ret |= LM_OUT_CANCELED; | 42 | ret |= LM_OUT_CANCELED; |
43 | goto out; | 43 | goto out; |
44 | case -EAGAIN: /* Try lock fails */ | 44 | case -EAGAIN: /* Try lock fails */ |
45 | case -EDEADLK: /* Deadlock detected */ | ||
45 | goto out; | 46 | goto out; |
46 | case -EINVAL: /* Invalid */ | 47 | case -ETIMEDOUT: /* Canceled due to timeout */ |
47 | case -ENOMEM: /* Out of memory */ | ||
48 | ret |= LM_OUT_ERROR; | 48 | ret |= LM_OUT_ERROR; |
49 | goto out; | 49 | goto out; |
50 | case 0: /* Success */ | 50 | case 0: /* Success */ |
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index b1e9630eb46a..d7eb1e209aa8 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include "glock.h" | 24 | #include "glock.h" |
25 | #include "quota.h" | 25 | #include "quota.h" |
26 | #include "recovery.h" | 26 | #include "recovery.h" |
27 | #include "dir.h" | ||
27 | 28 | ||
28 | static struct shrinker qd_shrinker = { | 29 | static struct shrinker qd_shrinker = { |
29 | .shrink = gfs2_shrink_qd_memory, | 30 | .shrink = gfs2_shrink_qd_memory, |
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void) | |||
78 | { | 79 | { |
79 | int error; | 80 | int error; |
80 | 81 | ||
82 | gfs2_str2qstr(&gfs2_qdot, "."); | ||
83 | gfs2_str2qstr(&gfs2_qdotdot, ".."); | ||
84 | |||
81 | error = gfs2_sys_init(); | 85 | error = gfs2_sys_init(); |
82 | if (error) | 86 | if (error) |
83 | return error; | 87 | return error; |
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void) | |||
140 | 144 | ||
141 | error = -ENOMEM; | 145 | error = -ENOMEM; |
142 | gfs_recovery_wq = alloc_workqueue("gfs_recovery", | 146 | gfs_recovery_wq = alloc_workqueue("gfs_recovery", |
143 | WQ_NON_REENTRANT | WQ_RESCUER, 0); | 147 | WQ_RESCUER | WQ_FREEZEABLE, 0); |
144 | if (!gfs_recovery_wq) | 148 | if (!gfs_recovery_wq) |
145 | goto fail_wq; | 149 | goto fail_wq; |
146 | 150 | ||
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4d4b1e8ac64c..aeafc233dc89 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
@@ -38,14 +38,6 @@ | |||
38 | #define DO 0 | 38 | #define DO 0 |
39 | #define UNDO 1 | 39 | #define UNDO 1 |
40 | 40 | ||
41 | static const u32 gfs2_old_fs_formats[] = { | ||
42 | 0 | ||
43 | }; | ||
44 | |||
45 | static const u32 gfs2_old_multihost_formats[] = { | ||
46 | 0 | ||
47 | }; | ||
48 | |||
49 | /** | 41 | /** |
50 | * gfs2_tune_init - Fill a gfs2_tune structure with default values | 42 | * gfs2_tune_init - Fill a gfs2_tune structure with default values |
51 | * @gt: tune | 43 | * @gt: tune |
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
135 | 127 | ||
136 | static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) | 128 | static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) |
137 | { | 129 | { |
138 | unsigned int x; | ||
139 | |||
140 | if (sb->sb_magic != GFS2_MAGIC || | 130 | if (sb->sb_magic != GFS2_MAGIC || |
141 | sb->sb_type != GFS2_METATYPE_SB) { | 131 | sb->sb_type != GFS2_METATYPE_SB) { |
142 | if (!silent) | 132 | if (!silent) |
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile | |||
150 | sb->sb_multihost_format == GFS2_FORMAT_MULTI) | 140 | sb->sb_multihost_format == GFS2_FORMAT_MULTI) |
151 | return 0; | 141 | return 0; |
152 | 142 | ||
153 | if (sb->sb_fs_format != GFS2_FORMAT_FS) { | 143 | fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); |
154 | for (x = 0; gfs2_old_fs_formats[x]; x++) | ||
155 | if (gfs2_old_fs_formats[x] == sb->sb_fs_format) | ||
156 | break; | ||
157 | 144 | ||
158 | if (!gfs2_old_fs_formats[x]) { | 145 | return -EINVAL; |
159 | printk(KERN_WARNING | ||
160 | "GFS2: code version (%u, %u) is incompatible " | ||
161 | "with ondisk format (%u, %u)\n", | ||
162 | GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, | ||
163 | sb->sb_fs_format, sb->sb_multihost_format); | ||
164 | printk(KERN_WARNING | ||
165 | "GFS2: I don't know how to upgrade this FS\n"); | ||
166 | return -EINVAL; | ||
167 | } | ||
168 | } | ||
169 | |||
170 | if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { | ||
171 | for (x = 0; gfs2_old_multihost_formats[x]; x++) | ||
172 | if (gfs2_old_multihost_formats[x] == | ||
173 | sb->sb_multihost_format) | ||
174 | break; | ||
175 | |||
176 | if (!gfs2_old_multihost_formats[x]) { | ||
177 | printk(KERN_WARNING | ||
178 | "GFS2: code version (%u, %u) is incompatible " | ||
179 | "with ondisk format (%u, %u)\n", | ||
180 | GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, | ||
181 | sb->sb_fs_format, sb->sb_multihost_format); | ||
182 | printk(KERN_WARNING | ||
183 | "GFS2: I don't know how to upgrade this FS\n"); | ||
184 | return -EINVAL; | ||
185 | } | ||
186 | } | ||
187 | |||
188 | if (!sdp->sd_args.ar_upgrade) { | ||
189 | printk(KERN_WARNING | ||
190 | "GFS2: code version (%u, %u) is incompatible " | ||
191 | "with ondisk format (%u, %u)\n", | ||
192 | GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, | ||
193 | sb->sb_fs_format, sb->sb_multihost_format); | ||
194 | printk(KERN_INFO | ||
195 | "GFS2: Use the \"upgrade\" mount option to upgrade " | ||
196 | "the FS\n"); | ||
197 | printk(KERN_INFO "GFS2: See the manual for more details\n"); | ||
198 | return -EINVAL; | ||
199 | } | ||
200 | |||
201 | return 0; | ||
202 | } | 146 | } |
203 | 147 | ||
204 | static void end_bio_io_page(struct bio *bio, int error) | 148 | static void end_bio_io_page(struct bio *bio, int error) |
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp) | |||
586 | 530 | ||
587 | prev_db = 0; | 531 | prev_db = 0; |
588 | 532 | ||
589 | for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { | 533 | for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { |
590 | bh.b_state = 0; | 534 | bh.b_state = 0; |
591 | bh.b_blocknr = 0; | 535 | bh.b_blocknr = 0; |
592 | bh.b_size = 1 << ip->i_inode.i_blkbits; | 536 | bh.b_size = 1 << ip->i_inode.i_blkbits; |
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) | |||
1022 | if (!strcmp("lock_nolock", proto)) { | 966 | if (!strcmp("lock_nolock", proto)) { |
1023 | lm = &nolock_ops; | 967 | lm = &nolock_ops; |
1024 | sdp->sd_args.ar_localflocks = 1; | 968 | sdp->sd_args.ar_localflocks = 1; |
1025 | sdp->sd_args.ar_localcaching = 1; | ||
1026 | #ifdef CONFIG_GFS2_FS_LOCKING_DLM | 969 | #ifdef CONFIG_GFS2_FS_LOCKING_DLM |
1027 | } else if (!strcmp("lock_dlm", proto)) { | 970 | } else if (!strcmp("lock_dlm", proto)) { |
1028 | lm = &gfs2_dlm_ops; | 971 | lm = &gfs2_dlm_ops; |
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word) | |||
1113 | 1056 | ||
1114 | static int wait_on_journal(struct gfs2_sbd *sdp) | 1057 | static int wait_on_journal(struct gfs2_sbd *sdp) |
1115 | { | 1058 | { |
1116 | if (sdp->sd_args.ar_spectator) | ||
1117 | return 0; | ||
1118 | if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) | 1059 | if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) |
1119 | return 0; | 1060 | return 0; |
1120 | 1061 | ||
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent | |||
1217 | if (error) | 1158 | if (error) |
1218 | goto fail_sb; | 1159 | goto fail_sb; |
1219 | 1160 | ||
1161 | /* | ||
1162 | * If user space has failed to join the cluster or some similar | ||
1163 | * failure has occurred, then the journal id will contain a | ||
1164 | * negative (error) number. This will then be returned to the | ||
1165 | * caller (of the mount syscall). We do this even for spectator | ||
1166 | * mounts (which just write a jid of 0 to indicate "ok" even though | ||
1167 | * the jid is unused in the spectator case) | ||
1168 | */ | ||
1169 | if (sdp->sd_lockstruct.ls_jid < 0) { | ||
1170 | error = sdp->sd_lockstruct.ls_jid; | ||
1171 | sdp->sd_lockstruct.ls_jid = 0; | ||
1172 | goto fail_sb; | ||
1173 | } | ||
1174 | |||
1220 | error = init_inodes(sdp, DO); | 1175 | error = init_inodes(sdp, DO); |
1221 | if (error) | 1176 | if (error) |
1222 | goto fail_sb; | 1177 | goto fail_sb; |
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1009be2c9737..0534510200d5 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c | |||
@@ -18,6 +18,8 @@ | |||
18 | #include <linux/gfs2_ondisk.h> | 18 | #include <linux/gfs2_ondisk.h> |
19 | #include <linux/crc32.h> | 19 | #include <linux/crc32.h> |
20 | #include <linux/fiemap.h> | 20 | #include <linux/fiemap.h> |
21 | #include <linux/swap.h> | ||
22 | #include <linux/falloc.h> | ||
21 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
22 | 24 | ||
23 | #include "gfs2.h" | 25 | #include "gfs2.h" |
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
217 | goto out_gunlock_q; | 219 | goto out_gunlock_q; |
218 | 220 | ||
219 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + | 221 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + |
220 | al->al_rgd->rd_length + | 222 | gfs2_rg_blocks(al) + |
221 | 2 * RES_DINODE + RES_STATFS + | 223 | 2 * RES_DINODE + RES_STATFS + |
222 | RES_QUOTA, 0); | 224 | RES_QUOTA, 0); |
223 | if (error) | 225 | if (error) |
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, | |||
406 | 408 | ||
407 | ip = ghs[1].gh_gl->gl_object; | 409 | ip = ghs[1].gh_gl->gl_object; |
408 | 410 | ||
409 | ip->i_disksize = size; | ||
410 | i_size_write(inode, size); | 411 | i_size_write(inode, size); |
411 | 412 | ||
412 | error = gfs2_meta_inode_buffer(ip, &dibh); | 413 | error = gfs2_meta_inode_buffer(ip, &dibh); |
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
461 | ip = ghs[1].gh_gl->gl_object; | 462 | ip = ghs[1].gh_gl->gl_object; |
462 | 463 | ||
463 | ip->i_inode.i_nlink = 2; | 464 | ip->i_inode.i_nlink = 2; |
464 | ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); | 465 | i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); |
465 | ip->i_diskflags |= GFS2_DIF_JDATA; | 466 | ip->i_diskflags |= GFS2_DIF_JDATA; |
466 | ip->i_entries = 2; | 467 | ip->i_entries = 2; |
467 | 468 | ||
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
470 | if (!gfs2_assert_withdraw(sdp, !error)) { | 471 | if (!gfs2_assert_withdraw(sdp, !error)) { |
471 | struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; | 472 | struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; |
472 | struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); | 473 | struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); |
473 | struct qstr str; | ||
474 | 474 | ||
475 | gfs2_str2qstr(&str, "."); | ||
476 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 475 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
477 | gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); | 476 | gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); |
478 | dent->de_inum = di->di_num; /* already GFS2 endian */ | 477 | dent->de_inum = di->di_num; /* already GFS2 endian */ |
479 | dent->de_type = cpu_to_be16(DT_DIR); | 478 | dent->de_type = cpu_to_be16(DT_DIR); |
480 | di->di_entries = cpu_to_be32(1); | 479 | di->di_entries = cpu_to_be32(1); |
481 | 480 | ||
482 | gfs2_str2qstr(&str, ".."); | ||
483 | dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); | 481 | dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); |
484 | gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); | 482 | gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); |
485 | 483 | ||
486 | gfs2_inum_out(dip, dent); | 484 | gfs2_inum_out(dip, dent); |
487 | dent->de_type = cpu_to_be16(DT_DIR); | 485 | dent->de_type = cpu_to_be16(DT_DIR); |
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
522 | static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | 520 | static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, |
523 | struct gfs2_inode *ip) | 521 | struct gfs2_inode *ip) |
524 | { | 522 | { |
525 | struct qstr dotname; | ||
526 | int error; | 523 | int error; |
527 | 524 | ||
528 | if (ip->i_entries != 2) { | 525 | if (ip->i_entries != 2) { |
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | |||
539 | if (error) | 536 | if (error) |
540 | return error; | 537 | return error; |
541 | 538 | ||
542 | gfs2_str2qstr(&dotname, "."); | 539 | error = gfs2_dir_del(ip, &gfs2_qdot); |
543 | error = gfs2_dir_del(ip, &dotname); | ||
544 | if (error) | 540 | if (error) |
545 | return error; | 541 | return error; |
546 | 542 | ||
547 | gfs2_str2qstr(&dotname, ".."); | 543 | error = gfs2_dir_del(ip, &gfs2_qdotdot); |
548 | error = gfs2_dir_del(ip, &dotname); | ||
549 | if (error) | 544 | if (error) |
550 | return error; | 545 | return error; |
551 | 546 | ||
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) | |||
694 | struct inode *dir = &to->i_inode; | 689 | struct inode *dir = &to->i_inode; |
695 | struct super_block *sb = dir->i_sb; | 690 | struct super_block *sb = dir->i_sb; |
696 | struct inode *tmp; | 691 | struct inode *tmp; |
697 | struct qstr dotdot; | ||
698 | int error = 0; | 692 | int error = 0; |
699 | 693 | ||
700 | gfs2_str2qstr(&dotdot, ".."); | ||
701 | |||
702 | igrab(dir); | 694 | igrab(dir); |
703 | 695 | ||
704 | for (;;) { | 696 | for (;;) { |
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) | |||
711 | break; | 703 | break; |
712 | } | 704 | } |
713 | 705 | ||
714 | tmp = gfs2_lookupi(dir, &dotdot, 1); | 706 | tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); |
715 | if (IS_ERR(tmp)) { | 707 | if (IS_ERR(tmp)) { |
716 | error = PTR_ERR(tmp); | 708 | error = PTR_ERR(tmp); |
717 | break; | 709 | break; |
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
744 | struct gfs2_inode *ip = GFS2_I(odentry->d_inode); | 736 | struct gfs2_inode *ip = GFS2_I(odentry->d_inode); |
745 | struct gfs2_inode *nip = NULL; | 737 | struct gfs2_inode *nip = NULL; |
746 | struct gfs2_sbd *sdp = GFS2_SB(odir); | 738 | struct gfs2_sbd *sdp = GFS2_SB(odir); |
747 | struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; | 739 | struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; |
748 | struct gfs2_rgrpd *nrgd; | 740 | struct gfs2_rgrpd *nrgd; |
749 | unsigned int num_gh; | 741 | unsigned int num_gh; |
750 | int dir_rename = 0; | 742 | int dir_rename = 0; |
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
758 | return 0; | 750 | return 0; |
759 | } | 751 | } |
760 | 752 | ||
753 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
754 | if (error) | ||
755 | return error; | ||
761 | 756 | ||
762 | if (odip != ndip) { | 757 | if (odip != ndip) { |
763 | error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, | 758 | error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, |
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
887 | 882 | ||
888 | al->al_requested = sdp->sd_max_dirres; | 883 | al->al_requested = sdp->sd_max_dirres; |
889 | 884 | ||
890 | error = gfs2_inplace_reserve(ndip); | 885 | error = gfs2_inplace_reserve_ri(ndip); |
891 | if (error) | 886 | if (error) |
892 | goto out_gunlock_q; | 887 | goto out_gunlock_q; |
893 | 888 | ||
894 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + | 889 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + |
895 | al->al_rgd->rd_length + | 890 | gfs2_rg_blocks(al) + |
896 | 4 * RES_DINODE + 4 * RES_LEAF + | 891 | 4 * RES_DINODE + 4 * RES_LEAF + |
897 | RES_STATFS + RES_QUOTA + 4, 0); | 892 | RES_STATFS + RES_QUOTA + 4, 0); |
898 | if (error) | 893 | if (error) |
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
920 | } | 915 | } |
921 | 916 | ||
922 | if (dir_rename) { | 917 | if (dir_rename) { |
923 | struct qstr name; | ||
924 | gfs2_str2qstr(&name, ".."); | ||
925 | |||
926 | error = gfs2_change_nlink(ndip, +1); | 918 | error = gfs2_change_nlink(ndip, +1); |
927 | if (error) | 919 | if (error) |
928 | goto out_end_trans; | 920 | goto out_end_trans; |
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
930 | if (error) | 922 | if (error) |
931 | goto out_end_trans; | 923 | goto out_end_trans; |
932 | 924 | ||
933 | error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); | 925 | error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); |
934 | if (error) | 926 | if (error) |
935 | goto out_end_trans; | 927 | goto out_end_trans; |
936 | } else { | 928 | } else { |
@@ -972,6 +964,7 @@ out_gunlock_r: | |||
972 | if (r_gh.gh_gl) | 964 | if (r_gh.gh_gl) |
973 | gfs2_glock_dq_uninit(&r_gh); | 965 | gfs2_glock_dq_uninit(&r_gh); |
974 | out: | 966 | out: |
967 | gfs2_glock_dq_uninit(&ri_gh); | ||
975 | return error; | 968 | return error; |
976 | } | 969 | } |
977 | 970 | ||
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
990 | struct gfs2_inode *ip = GFS2_I(dentry->d_inode); | 983 | struct gfs2_inode *ip = GFS2_I(dentry->d_inode); |
991 | struct gfs2_holder i_gh; | 984 | struct gfs2_holder i_gh; |
992 | struct buffer_head *dibh; | 985 | struct buffer_head *dibh; |
993 | unsigned int x; | 986 | unsigned int x, size; |
994 | char *buf; | 987 | char *buf; |
995 | int error; | 988 | int error; |
996 | 989 | ||
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
1002 | return NULL; | 995 | return NULL; |
1003 | } | 996 | } |
1004 | 997 | ||
1005 | if (!ip->i_disksize) { | 998 | size = (unsigned int)i_size_read(&ip->i_inode); |
999 | if (size == 0) { | ||
1006 | gfs2_consist_inode(ip); | 1000 | gfs2_consist_inode(ip); |
1007 | buf = ERR_PTR(-EIO); | 1001 | buf = ERR_PTR(-EIO); |
1008 | goto out; | 1002 | goto out; |
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
1014 | goto out; | 1008 | goto out; |
1015 | } | 1009 | } |
1016 | 1010 | ||
1017 | x = ip->i_disksize + 1; | 1011 | x = size + 1; |
1018 | buf = kmalloc(x, GFP_NOFS); | 1012 | buf = kmalloc(x, GFP_NOFS); |
1019 | if (!buf) | 1013 | if (!buf) |
1020 | buf = ERR_PTR(-ENOMEM); | 1014 | buf = ERR_PTR(-ENOMEM); |
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask) | |||
1071 | return error; | 1065 | return error; |
1072 | } | 1066 | } |
1073 | 1067 | ||
1074 | /* | ||
1075 | * XXX(truncate): the truncate_setsize calls should be moved to the end. | ||
1076 | */ | ||
1077 | static int setattr_size(struct inode *inode, struct iattr *attr) | ||
1078 | { | ||
1079 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1080 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
1081 | int error; | ||
1082 | |||
1083 | if (attr->ia_size != ip->i_disksize) { | ||
1084 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); | ||
1085 | if (error) | ||
1086 | return error; | ||
1087 | truncate_setsize(inode, attr->ia_size); | ||
1088 | gfs2_trans_end(sdp); | ||
1089 | } | ||
1090 | |||
1091 | error = gfs2_truncatei(ip, attr->ia_size); | ||
1092 | if (error && (inode->i_size != ip->i_disksize)) | ||
1093 | i_size_write(inode, ip->i_disksize); | ||
1094 | |||
1095 | return error; | ||
1096 | } | ||
1097 | |||
1098 | static int setattr_chown(struct inode *inode, struct iattr *attr) | 1068 | static int setattr_chown(struct inode *inode, struct iattr *attr) |
1099 | { | 1069 | { |
1100 | struct gfs2_inode *ip = GFS2_I(inode); | 1070 | struct gfs2_inode *ip = GFS2_I(inode); |
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
1195 | goto out; | 1165 | goto out; |
1196 | 1166 | ||
1197 | if (attr->ia_valid & ATTR_SIZE) | 1167 | if (attr->ia_valid & ATTR_SIZE) |
1198 | error = setattr_size(inode, attr); | 1168 | error = gfs2_setattr_size(inode, attr->ia_size); |
1199 | else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) | 1169 | else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) |
1200 | error = setattr_chown(inode, attr); | 1170 | error = setattr_chown(inode, attr); |
1201 | else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) | 1171 | else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) |
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) | |||
1301 | return ret; | 1271 | return ret; |
1302 | } | 1272 | } |
1303 | 1273 | ||
1274 | static void empty_write_end(struct page *page, unsigned from, | ||
1275 | unsigned to) | ||
1276 | { | ||
1277 | struct gfs2_inode *ip = GFS2_I(page->mapping->host); | ||
1278 | |||
1279 | page_zero_new_buffers(page, from, to); | ||
1280 | flush_dcache_page(page); | ||
1281 | mark_page_accessed(page); | ||
1282 | |||
1283 | if (!gfs2_is_writeback(ip)) | ||
1284 | gfs2_page_add_databufs(ip, page, from, to); | ||
1285 | |||
1286 | block_commit_write(page, from, to); | ||
1287 | } | ||
1288 | |||
1289 | |||
1290 | static int write_empty_blocks(struct page *page, unsigned from, unsigned to) | ||
1291 | { | ||
1292 | unsigned start, end, next; | ||
1293 | struct buffer_head *bh, *head; | ||
1294 | int error; | ||
1295 | |||
1296 | if (!page_has_buffers(page)) { | ||
1297 | error = block_prepare_write(page, from, to, gfs2_block_map); | ||
1298 | if (unlikely(error)) | ||
1299 | return error; | ||
1300 | |||
1301 | empty_write_end(page, from, to); | ||
1302 | return 0; | ||
1303 | } | ||
1304 | |||
1305 | bh = head = page_buffers(page); | ||
1306 | next = end = 0; | ||
1307 | while (next < from) { | ||
1308 | next += bh->b_size; | ||
1309 | bh = bh->b_this_page; | ||
1310 | } | ||
1311 | start = next; | ||
1312 | do { | ||
1313 | next += bh->b_size; | ||
1314 | if (buffer_mapped(bh)) { | ||
1315 | if (end) { | ||
1316 | error = block_prepare_write(page, start, end, | ||
1317 | gfs2_block_map); | ||
1318 | if (unlikely(error)) | ||
1319 | return error; | ||
1320 | empty_write_end(page, start, end); | ||
1321 | end = 0; | ||
1322 | } | ||
1323 | start = next; | ||
1324 | } | ||
1325 | else | ||
1326 | end = next; | ||
1327 | bh = bh->b_this_page; | ||
1328 | } while (next < to); | ||
1329 | |||
1330 | if (end) { | ||
1331 | error = block_prepare_write(page, start, end, gfs2_block_map); | ||
1332 | if (unlikely(error)) | ||
1333 | return error; | ||
1334 | empty_write_end(page, start, end); | ||
1335 | } | ||
1336 | |||
1337 | return 0; | ||
1338 | } | ||
1339 | |||
1340 | static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, | ||
1341 | int mode) | ||
1342 | { | ||
1343 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1344 | struct buffer_head *dibh; | ||
1345 | int error; | ||
1346 | u64 start = offset >> PAGE_CACHE_SHIFT; | ||
1347 | unsigned int start_offset = offset & ~PAGE_CACHE_MASK; | ||
1348 | u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; | ||
1349 | pgoff_t curr; | ||
1350 | struct page *page; | ||
1351 | unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; | ||
1352 | unsigned int from, to; | ||
1353 | |||
1354 | if (!end_offset) | ||
1355 | end_offset = PAGE_CACHE_SIZE; | ||
1356 | |||
1357 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
1358 | if (unlikely(error)) | ||
1359 | goto out; | ||
1360 | |||
1361 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
1362 | |||
1363 | if (gfs2_is_stuffed(ip)) { | ||
1364 | error = gfs2_unstuff_dinode(ip, NULL); | ||
1365 | if (unlikely(error)) | ||
1366 | goto out; | ||
1367 | } | ||
1368 | |||
1369 | curr = start; | ||
1370 | offset = start << PAGE_CACHE_SHIFT; | ||
1371 | from = start_offset; | ||
1372 | to = PAGE_CACHE_SIZE; | ||
1373 | while (curr <= end) { | ||
1374 | page = grab_cache_page_write_begin(inode->i_mapping, curr, | ||
1375 | AOP_FLAG_NOFS); | ||
1376 | if (unlikely(!page)) { | ||
1377 | error = -ENOMEM; | ||
1378 | goto out; | ||
1379 | } | ||
1380 | |||
1381 | if (curr == end) | ||
1382 | to = end_offset; | ||
1383 | error = write_empty_blocks(page, from, to); | ||
1384 | if (!error && offset + to > inode->i_size && | ||
1385 | !(mode & FALLOC_FL_KEEP_SIZE)) { | ||
1386 | i_size_write(inode, offset + to); | ||
1387 | } | ||
1388 | unlock_page(page); | ||
1389 | page_cache_release(page); | ||
1390 | if (error) | ||
1391 | goto out; | ||
1392 | curr++; | ||
1393 | offset += PAGE_CACHE_SIZE; | ||
1394 | from = 0; | ||
1395 | } | ||
1396 | |||
1397 | gfs2_dinode_out(ip, dibh->b_data); | ||
1398 | mark_inode_dirty(inode); | ||
1399 | |||
1400 | brelse(dibh); | ||
1401 | |||
1402 | out: | ||
1403 | return error; | ||
1404 | } | ||
1405 | |||
1406 | static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, | ||
1407 | unsigned int *data_blocks, unsigned int *ind_blocks) | ||
1408 | { | ||
1409 | const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
1410 | unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; | ||
1411 | unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); | ||
1412 | |||
1413 | for (tmp = max_data; tmp > sdp->sd_diptrs;) { | ||
1414 | tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); | ||
1415 | max_data -= tmp; | ||
1416 | } | ||
1417 | /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, | ||
1418 | so it might end up with fewer data blocks */ | ||
1419 | if (max_data <= *data_blocks) | ||
1420 | return; | ||
1421 | *data_blocks = max_data; | ||
1422 | *ind_blocks = max_blocks - max_data; | ||
1423 | *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; | ||
1424 | if (*len > max) { | ||
1425 | *len = max; | ||
1426 | gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); | ||
1427 | } | ||
1428 | } | ||
1429 | |||
1430 | static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, | ||
1431 | loff_t len) | ||
1432 | { | ||
1433 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
1434 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1435 | unsigned int data_blocks = 0, ind_blocks = 0, rblocks; | ||
1436 | loff_t bytes, max_bytes; | ||
1437 | struct gfs2_alloc *al; | ||
1438 | int error; | ||
1439 | loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; | ||
1440 | next = (next + 1) << sdp->sd_sb.sb_bsize_shift; | ||
1441 | |||
1442 | offset = (offset >> sdp->sd_sb.sb_bsize_shift) << | ||
1443 | sdp->sd_sb.sb_bsize_shift; | ||
1444 | |||
1445 | len = next - offset; | ||
1446 | bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; | ||
1447 | if (!bytes) | ||
1448 | bytes = UINT_MAX; | ||
1449 | |||
1450 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); | ||
1451 | error = gfs2_glock_nq(&ip->i_gh); | ||
1452 | if (unlikely(error)) | ||
1453 | goto out_uninit; | ||
1454 | |||
1455 | if (!gfs2_write_alloc_required(ip, offset, len)) | ||
1456 | goto out_unlock; | ||
1457 | |||
1458 | while (len > 0) { | ||
1459 | if (len < bytes) | ||
1460 | bytes = len; | ||
1461 | al = gfs2_alloc_get(ip); | ||
1462 | if (!al) { | ||
1463 | error = -ENOMEM; | ||
1464 | goto out_unlock; | ||
1465 | } | ||
1466 | |||
1467 | error = gfs2_quota_lock_check(ip); | ||
1468 | if (error) | ||
1469 | goto out_alloc_put; | ||
1470 | |||
1471 | retry: | ||
1472 | gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); | ||
1473 | |||
1474 | al->al_requested = data_blocks + ind_blocks; | ||
1475 | error = gfs2_inplace_reserve(ip); | ||
1476 | if (error) { | ||
1477 | if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { | ||
1478 | bytes >>= 1; | ||
1479 | goto retry; | ||
1480 | } | ||
1481 | goto out_qunlock; | ||
1482 | } | ||
1483 | max_bytes = bytes; | ||
1484 | calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); | ||
1485 | al->al_requested = data_blocks + ind_blocks; | ||
1486 | |||
1487 | rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + | ||
1488 | RES_RG_HDR + gfs2_rg_blocks(al); | ||
1489 | if (gfs2_is_jdata(ip)) | ||
1490 | rblocks += data_blocks ? data_blocks : 1; | ||
1491 | |||
1492 | error = gfs2_trans_begin(sdp, rblocks, | ||
1493 | PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); | ||
1494 | if (error) | ||
1495 | goto out_trans_fail; | ||
1496 | |||
1497 | error = fallocate_chunk(inode, offset, max_bytes, mode); | ||
1498 | gfs2_trans_end(sdp); | ||
1499 | |||
1500 | if (error) | ||
1501 | goto out_trans_fail; | ||
1502 | |||
1503 | len -= max_bytes; | ||
1504 | offset += max_bytes; | ||
1505 | gfs2_inplace_release(ip); | ||
1506 | gfs2_quota_unlock(ip); | ||
1507 | gfs2_alloc_put(ip); | ||
1508 | } | ||
1509 | goto out_unlock; | ||
1510 | |||
1511 | out_trans_fail: | ||
1512 | gfs2_inplace_release(ip); | ||
1513 | out_qunlock: | ||
1514 | gfs2_quota_unlock(ip); | ||
1515 | out_alloc_put: | ||
1516 | gfs2_alloc_put(ip); | ||
1517 | out_unlock: | ||
1518 | gfs2_glock_dq(&ip->i_gh); | ||
1519 | out_uninit: | ||
1520 | gfs2_holder_uninit(&ip->i_gh); | ||
1521 | return error; | ||
1522 | } | ||
1523 | |||
1524 | |||
1304 | static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 1525 | static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
1305 | u64 start, u64 len) | 1526 | u64 start, u64 len) |
1306 | { | 1527 | { |
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = { | |||
1351 | .getxattr = gfs2_getxattr, | 1572 | .getxattr = gfs2_getxattr, |
1352 | .listxattr = gfs2_listxattr, | 1573 | .listxattr = gfs2_listxattr, |
1353 | .removexattr = gfs2_removexattr, | 1574 | .removexattr = gfs2_removexattr, |
1575 | .fallocate = gfs2_fallocate, | ||
1354 | .fiemap = gfs2_fiemap, | 1576 | .fiemap = gfs2_fiemap, |
1355 | }; | 1577 | }; |
1356 | 1578 | ||
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 1bc6b5695e6d..58a9b9998b42 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
@@ -735,10 +735,8 @@ get_a_page: | |||
735 | goto out; | 735 | goto out; |
736 | 736 | ||
737 | size = loc + sizeof(struct gfs2_quota); | 737 | size = loc + sizeof(struct gfs2_quota); |
738 | if (size > inode->i_size) { | 738 | if (size > inode->i_size) |
739 | ip->i_disksize = size; | ||
740 | i_size_write(inode, size); | 739 | i_size_write(inode, size); |
741 | } | ||
742 | inode->i_mtime = inode->i_atime = CURRENT_TIME; | 740 | inode->i_mtime = inode->i_atime = CURRENT_TIME; |
743 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 741 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
744 | gfs2_dinode_out(ip, dibh->b_data); | 742 | gfs2_dinode_out(ip, dibh->b_data); |
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) | |||
817 | goto out_alloc; | 815 | goto out_alloc; |
818 | 816 | ||
819 | if (nalloc) | 817 | if (nalloc) |
820 | blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; | 818 | blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; |
821 | 819 | ||
822 | error = gfs2_trans_begin(sdp, blocks, 0); | 820 | error = gfs2_trans_begin(sdp, blocks, 0); |
823 | if (error) | 821 | if (error) |
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * | |||
1190 | int gfs2_quota_init(struct gfs2_sbd *sdp) | 1188 | int gfs2_quota_init(struct gfs2_sbd *sdp) |
1191 | { | 1189 | { |
1192 | struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); | 1190 | struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); |
1193 | unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; | 1191 | u64 size = i_size_read(sdp->sd_qc_inode); |
1192 | unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; | ||
1194 | unsigned int x, slot = 0; | 1193 | unsigned int x, slot = 0; |
1195 | unsigned int found = 0; | 1194 | unsigned int found = 0; |
1196 | u64 dblock; | 1195 | u64 dblock; |
1197 | u32 extlen = 0; | 1196 | u32 extlen = 0; |
1198 | int error; | 1197 | int error; |
1199 | 1198 | ||
1200 | if (!ip->i_disksize || ip->i_disksize > (64 << 20) || | 1199 | if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) |
1201 | ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) { | ||
1202 | gfs2_consist_inode(ip); | ||
1203 | return -EIO; | 1200 | return -EIO; |
1204 | } | 1201 | |
1205 | sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; | 1202 | sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; |
1206 | sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); | 1203 | sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); |
1207 | 1204 | ||
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, | |||
1589 | error = gfs2_inplace_reserve(ip); | 1586 | error = gfs2_inplace_reserve(ip); |
1590 | if (error) | 1587 | if (error) |
1591 | goto out_alloc; | 1588 | goto out_alloc; |
1589 | blocks += gfs2_rg_blocks(al); | ||
1592 | } | 1590 | } |
1593 | 1591 | ||
1594 | error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); | 1592 | error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); |
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f7f89a94a5a4..f2a02edcac8f 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c | |||
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work) | |||
455 | int ro = 0; | 455 | int ro = 0; |
456 | unsigned int pass; | 456 | unsigned int pass; |
457 | int error; | 457 | int error; |
458 | int jlocked = 0; | ||
458 | 459 | ||
459 | if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { | 460 | if (sdp->sd_args.ar_spectator || |
461 | (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { | ||
460 | fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", | 462 | fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", |
461 | jd->jd_jid); | 463 | jd->jd_jid); |
462 | 464 | jlocked = 1; | |
463 | /* Acquire the journal lock so we can do recovery */ | 465 | /* Acquire the journal lock so we can do recovery */ |
464 | 466 | ||
465 | error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, | 467 | error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, |
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work) | |||
554 | jd->jd_jid, t); | 556 | jd->jd_jid, t); |
555 | } | 557 | } |
556 | 558 | ||
557 | if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) | ||
558 | gfs2_glock_dq_uninit(&ji_gh); | ||
559 | |||
560 | gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); | 559 | gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); |
561 | 560 | ||
562 | if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) | 561 | if (jlocked) { |
562 | gfs2_glock_dq_uninit(&ji_gh); | ||
563 | gfs2_glock_dq_uninit(&j_gh); | 563 | gfs2_glock_dq_uninit(&j_gh); |
564 | } | ||
564 | 565 | ||
565 | fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); | 566 | fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); |
566 | goto done; | 567 | goto done; |
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work) | |||
568 | fail_gunlock_tr: | 569 | fail_gunlock_tr: |
569 | gfs2_glock_dq_uninit(&t_gh); | 570 | gfs2_glock_dq_uninit(&t_gh); |
570 | fail_gunlock_ji: | 571 | fail_gunlock_ji: |
571 | if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { | 572 | if (jlocked) { |
572 | gfs2_glock_dq_uninit(&ji_gh); | 573 | gfs2_glock_dq_uninit(&ji_gh); |
573 | fail_gunlock_j: | 574 | fail_gunlock_j: |
574 | gfs2_glock_dq_uninit(&j_gh); | 575 | gfs2_glock_dq_uninit(&j_gh); |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 171a744f8e45..fb67f593f408 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) | |||
500 | for (rgrps = 0;; rgrps++) { | 500 | for (rgrps = 0;; rgrps++) { |
501 | loff_t pos = rgrps * sizeof(struct gfs2_rindex); | 501 | loff_t pos = rgrps * sizeof(struct gfs2_rindex); |
502 | 502 | ||
503 | if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) | 503 | if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) |
504 | break; | 504 | break; |
505 | error = gfs2_internal_read(ip, &ra_state, buf, &pos, | 505 | error = gfs2_internal_read(ip, &ra_state, buf, &pos, |
506 | sizeof(struct gfs2_rindex)); | 506 | sizeof(struct gfs2_rindex)); |
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip) | |||
588 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 588 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
589 | struct inode *inode = &ip->i_inode; | 589 | struct inode *inode = &ip->i_inode; |
590 | struct file_ra_state ra_state; | 590 | struct file_ra_state ra_state; |
591 | u64 rgrp_count = ip->i_disksize; | 591 | u64 rgrp_count = i_size_read(inode); |
592 | struct gfs2_rgrpd *rgd; | ||
593 | unsigned int max_data = 0; | ||
592 | int error; | 594 | int error; |
593 | 595 | ||
594 | do_div(rgrp_count, sizeof(struct gfs2_rindex)); | 596 | do_div(rgrp_count, sizeof(struct gfs2_rindex)); |
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip) | |||
603 | } | 605 | } |
604 | } | 606 | } |
605 | 607 | ||
608 | list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) | ||
609 | if (rgd->rd_data > max_data) | ||
610 | max_data = rgd->rd_data; | ||
611 | sdp->sd_max_rg_data = max_data; | ||
606 | sdp->sd_rindex_uptodate = 1; | 612 | sdp->sd_rindex_uptodate = 1; |
607 | return 0; | 613 | return 0; |
608 | } | 614 | } |
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) | |||
622 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 628 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
623 | struct inode *inode = &ip->i_inode; | 629 | struct inode *inode = &ip->i_inode; |
624 | struct file_ra_state ra_state; | 630 | struct file_ra_state ra_state; |
631 | struct gfs2_rgrpd *rgd; | ||
632 | unsigned int max_data = 0; | ||
625 | int error; | 633 | int error; |
626 | 634 | ||
627 | file_ra_state_init(&ra_state, inode->i_mapping); | 635 | file_ra_state_init(&ra_state, inode->i_mapping); |
628 | for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { | 636 | for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { |
629 | /* Ignore partials */ | 637 | /* Ignore partials */ |
630 | if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > | 638 | if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > |
631 | ip->i_disksize) | 639 | i_size_read(inode)) |
632 | break; | 640 | break; |
633 | error = read_rindex_entry(ip, &ra_state); | 641 | error = read_rindex_entry(ip, &ra_state); |
634 | if (error) { | 642 | if (error) { |
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) | |||
636 | return error; | 644 | return error; |
637 | } | 645 | } |
638 | } | 646 | } |
647 | list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) | ||
648 | if (rgd->rd_data > max_data) | ||
649 | max_data = rgd->rd_data; | ||
650 | sdp->sd_max_rg_data = max_data; | ||
639 | 651 | ||
640 | sdp->sd_rindex_uptodate = 1; | 652 | sdp->sd_rindex_uptodate = 1; |
641 | return 0; | 653 | return 0; |
@@ -1188,7 +1200,8 @@ out: | |||
1188 | * Returns: errno | 1200 | * Returns: errno |
1189 | */ | 1201 | */ |
1190 | 1202 | ||
1191 | int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) | 1203 | int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, |
1204 | char *file, unsigned int line) | ||
1192 | { | 1205 | { |
1193 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1206 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
1194 | struct gfs2_alloc *al = ip->i_alloc; | 1207 | struct gfs2_alloc *al = ip->i_alloc; |
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) | |||
1199 | return -EINVAL; | 1212 | return -EINVAL; |
1200 | 1213 | ||
1201 | try_again: | 1214 | try_again: |
1202 | /* We need to hold the rindex unless the inode we're using is | 1215 | if (hold_rindex) { |
1203 | the rindex itself, in which case it's already held. */ | 1216 | /* We need to hold the rindex unless the inode we're using is |
1204 | if (ip != GFS2_I(sdp->sd_rindex)) | 1217 | the rindex itself, in which case it's already held. */ |
1205 | error = gfs2_rindex_hold(sdp, &al->al_ri_gh); | 1218 | if (ip != GFS2_I(sdp->sd_rindex)) |
1206 | else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ | 1219 | error = gfs2_rindex_hold(sdp, &al->al_ri_gh); |
1207 | error = gfs2_ri_update_special(ip); | 1220 | else if (!sdp->sd_rgrps) /* We may not have the rindex read |
1221 | in, so: */ | ||
1222 | error = gfs2_ri_update_special(ip); | ||
1223 | } | ||
1208 | 1224 | ||
1209 | if (error) | 1225 | if (error) |
1210 | return error; | 1226 | return error; |
@@ -1215,7 +1231,7 @@ try_again: | |||
1215 | try to free it, and try the allocation again. */ | 1231 | try to free it, and try the allocation again. */ |
1216 | error = get_local_rgrp(ip, &unlinked, &last_unlinked); | 1232 | error = get_local_rgrp(ip, &unlinked, &last_unlinked); |
1217 | if (error) { | 1233 | if (error) { |
1218 | if (ip != GFS2_I(sdp->sd_rindex)) | 1234 | if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) |
1219 | gfs2_glock_dq_uninit(&al->al_ri_gh); | 1235 | gfs2_glock_dq_uninit(&al->al_ri_gh); |
1220 | if (error != -EAGAIN) | 1236 | if (error != -EAGAIN) |
1221 | return error; | 1237 | return error; |
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) | |||
1257 | al->al_rgd = NULL; | 1273 | al->al_rgd = NULL; |
1258 | if (al->al_rgd_gh.gh_gl) | 1274 | if (al->al_rgd_gh.gh_gl) |
1259 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 1275 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
1260 | if (ip != GFS2_I(sdp->sd_rindex)) | 1276 | if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) |
1261 | gfs2_glock_dq_uninit(&al->al_ri_gh); | 1277 | gfs2_glock_dq_uninit(&al->al_ri_gh); |
1262 | } | 1278 | } |
1263 | 1279 | ||
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) | |||
1496 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1512 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
1497 | struct buffer_head *dibh; | 1513 | struct buffer_head *dibh; |
1498 | struct gfs2_alloc *al = ip->i_alloc; | 1514 | struct gfs2_alloc *al = ip->i_alloc; |
1499 | struct gfs2_rgrpd *rgd = al->al_rgd; | 1515 | struct gfs2_rgrpd *rgd; |
1500 | u32 goal, blk; | 1516 | u32 goal, blk; |
1501 | u64 block; | 1517 | u64 block; |
1502 | int error; | 1518 | int error; |
1503 | 1519 | ||
1520 | /* Only happens if there is a bug in gfs2, return something distinctive | ||
1521 | * to ensure that it is noticed. | ||
1522 | */ | ||
1523 | if (al == NULL) | ||
1524 | return -ECANCELED; | ||
1525 | |||
1526 | rgd = al->al_rgd; | ||
1527 | |||
1504 | if (rgrp_contains_block(rgd, ip->i_goal)) | 1528 | if (rgrp_contains_block(rgd, ip->i_goal)) |
1505 | goal = ip->i_goal - rgd->rd_data0; | 1529 | goal = ip->i_goal - rgd->rd_data0; |
1506 | else | 1530 | else |
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index f07119d89557..0e35c0466f9a 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h | |||
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) | |||
39 | ip->i_alloc = NULL; | 39 | ip->i_alloc = NULL; |
40 | } | 40 | } |
41 | 41 | ||
42 | extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, | 42 | extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, |
43 | unsigned int line); | 43 | char *file, unsigned int line); |
44 | #define gfs2_inplace_reserve(ip) \ | 44 | #define gfs2_inplace_reserve(ip) \ |
45 | gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) | 45 | gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) |
46 | #define gfs2_inplace_reserve_ri(ip) \ | ||
47 | gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) | ||
46 | 48 | ||
47 | extern void gfs2_inplace_release(struct gfs2_inode *ip); | 49 | extern void gfs2_inplace_release(struct gfs2_inode *ip); |
48 | 50 | ||
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 77cb9f830ee4..047d1176096c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
@@ -85,6 +85,7 @@ static const match_table_t tokens = { | |||
85 | {Opt_locktable, "locktable=%s"}, | 85 | {Opt_locktable, "locktable=%s"}, |
86 | {Opt_hostdata, "hostdata=%s"}, | 86 | {Opt_hostdata, "hostdata=%s"}, |
87 | {Opt_spectator, "spectator"}, | 87 | {Opt_spectator, "spectator"}, |
88 | {Opt_spectator, "norecovery"}, | ||
88 | {Opt_ignore_local_fs, "ignore_local_fs"}, | 89 | {Opt_ignore_local_fs, "ignore_local_fs"}, |
89 | {Opt_localflocks, "localflocks"}, | 90 | {Opt_localflocks, "localflocks"}, |
90 | {Opt_localcaching, "localcaching"}, | 91 | {Opt_localcaching, "localcaching"}, |
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) | |||
159 | args->ar_spectator = 1; | 160 | args->ar_spectator = 1; |
160 | break; | 161 | break; |
161 | case Opt_ignore_local_fs: | 162 | case Opt_ignore_local_fs: |
162 | args->ar_ignore_local_fs = 1; | 163 | /* Retained for backwards compat only */ |
163 | break; | 164 | break; |
164 | case Opt_localflocks: | 165 | case Opt_localflocks: |
165 | args->ar_localflocks = 1; | 166 | args->ar_localflocks = 1; |
166 | break; | 167 | break; |
167 | case Opt_localcaching: | 168 | case Opt_localcaching: |
168 | args->ar_localcaching = 1; | 169 | /* Retained for backwards compat only */ |
169 | break; | 170 | break; |
170 | case Opt_debug: | 171 | case Opt_debug: |
171 | if (args->ar_errors == GFS2_ERRORS_PANIC) { | 172 | if (args->ar_errors == GFS2_ERRORS_PANIC) { |
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) | |||
179 | args->ar_debug = 0; | 180 | args->ar_debug = 0; |
180 | break; | 181 | break; |
181 | case Opt_upgrade: | 182 | case Opt_upgrade: |
182 | args->ar_upgrade = 1; | 183 | /* Retained for backwards compat only */ |
183 | break; | 184 | break; |
184 | case Opt_acl: | 185 | case Opt_acl: |
185 | args->ar_posix_acl = 1; | 186 | args->ar_posix_acl = 1; |
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) | |||
342 | { | 343 | { |
343 | struct gfs2_inode *ip = GFS2_I(jd->jd_inode); | 344 | struct gfs2_inode *ip = GFS2_I(jd->jd_inode); |
344 | struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); | 345 | struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); |
346 | u64 size = i_size_read(jd->jd_inode); | ||
345 | 347 | ||
346 | if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || | 348 | if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) |
347 | (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { | ||
348 | gfs2_consist_inode(ip); | ||
349 | return -EIO; | 349 | return -EIO; |
350 | } | ||
351 | jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; | ||
352 | 350 | ||
353 | if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { | 351 | jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; |
352 | |||
353 | if (gfs2_write_alloc_required(ip, 0, size)) { | ||
354 | gfs2_consist_inode(ip); | 354 | gfs2_consist_inode(ip); |
355 | return -EIO; | 355 | return -EIO; |
356 | } | 356 | } |
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) | |||
1129 | 1129 | ||
1130 | /* Some flags must not be changed */ | 1130 | /* Some flags must not be changed */ |
1131 | if (args_neq(&args, &sdp->sd_args, spectator) || | 1131 | if (args_neq(&args, &sdp->sd_args, spectator) || |
1132 | args_neq(&args, &sdp->sd_args, ignore_local_fs) || | ||
1133 | args_neq(&args, &sdp->sd_args, localflocks) || | 1132 | args_neq(&args, &sdp->sd_args, localflocks) || |
1134 | args_neq(&args, &sdp->sd_args, localcaching) || | ||
1135 | args_neq(&args, &sdp->sd_args, meta)) | 1133 | args_neq(&args, &sdp->sd_args, meta)) |
1136 | return -EINVAL; | 1134 | return -EINVAL; |
1137 | 1135 | ||
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1234 | seq_printf(s, ",hostdata=%s", args->ar_hostdata); | 1232 | seq_printf(s, ",hostdata=%s", args->ar_hostdata); |
1235 | if (args->ar_spectator) | 1233 | if (args->ar_spectator) |
1236 | seq_printf(s, ",spectator"); | 1234 | seq_printf(s, ",spectator"); |
1237 | if (args->ar_ignore_local_fs) | ||
1238 | seq_printf(s, ",ignore_local_fs"); | ||
1239 | if (args->ar_localflocks) | 1235 | if (args->ar_localflocks) |
1240 | seq_printf(s, ",localflocks"); | 1236 | seq_printf(s, ",localflocks"); |
1241 | if (args->ar_localcaching) | ||
1242 | seq_printf(s, ",localcaching"); | ||
1243 | if (args->ar_debug) | 1237 | if (args->ar_debug) |
1244 | seq_printf(s, ",debug"); | 1238 | seq_printf(s, ",debug"); |
1245 | if (args->ar_upgrade) | ||
1246 | seq_printf(s, ",upgrade"); | ||
1247 | if (args->ar_posix_acl) | 1239 | if (args->ar_posix_acl) |
1248 | seq_printf(s, ",acl"); | 1240 | seq_printf(s, ",acl"); |
1249 | if (args->ar_quota != GFS2_QUOTA_DEFAULT) { | 1241 | if (args->ar_quota != GFS2_QUOTA_DEFAULT) { |
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index ccacffd2faaa..748ccb557c18 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c | |||
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len | |||
230 | 230 | ||
231 | if (gltype > LM_TYPE_JOURNAL) | 231 | if (gltype > LM_TYPE_JOURNAL) |
232 | return -EINVAL; | 232 | return -EINVAL; |
233 | glops = gfs2_glops_list[gltype]; | 233 | if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK) |
234 | glops = &gfs2_trans_glops; | ||
235 | else | ||
236 | glops = gfs2_glops_list[gltype]; | ||
234 | if (glops == NULL) | 237 | if (glops == NULL) |
235 | return -EINVAL; | 238 | return -EINVAL; |
236 | if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) | 239 | if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) |
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) | |||
399 | 402 | ||
400 | static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) | 403 | static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) |
401 | { | 404 | { |
402 | return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); | 405 | return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); |
403 | } | 406 | } |
404 | 407 | ||
405 | static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | 408 | static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) |
406 | { | 409 | { |
407 | unsigned jid; | 410 | int jid; |
408 | int rv; | 411 | int rv; |
409 | 412 | ||
410 | rv = sscanf(buf, "%u", &jid); | 413 | rv = sscanf(buf, "%d", &jid); |
411 | if (rv != 1) | 414 | if (rv != 1) |
412 | return -EINVAL; | 415 | return -EINVAL; |
413 | 416 | ||
414 | spin_lock(&sdp->sd_jindex_spin); | 417 | spin_lock(&sdp->sd_jindex_spin); |
415 | rv = -EINVAL; | 418 | rv = -EINVAL; |
416 | if (sdp->sd_args.ar_spectator) | ||
417 | goto out; | ||
418 | if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) | 419 | if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) |
419 | goto out; | 420 | goto out; |
420 | rv = -EBUSY; | 421 | rv = -EBUSY; |
421 | if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) | 422 | if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) |
422 | goto out; | 423 | goto out; |
424 | rv = 0; | ||
425 | if (sdp->sd_args.ar_spectator && jid > 0) | ||
426 | rv = jid = -EINVAL; | ||
423 | sdp->sd_lockstruct.ls_jid = jid; | 427 | sdp->sd_lockstruct.ls_jid = jid; |
428 | clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); | ||
424 | smp_mb__after_clear_bit(); | 429 | smp_mb__after_clear_bit(); |
425 | wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); | 430 | wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); |
426 | rv = 0; | ||
427 | out: | 431 | out: |
428 | spin_unlock(&sdp->sd_jindex_spin); | 432 | spin_unlock(&sdp->sd_jindex_spin); |
429 | return rv ? rv : len; | 433 | return rv ? rv : len; |
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, | |||
617 | add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); | 621 | add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); |
618 | add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); | 622 | add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); |
619 | if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) | 623 | if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) |
620 | add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); | 624 | add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); |
621 | if (gfs2_uuid_valid(uuid)) | 625 | if (gfs2_uuid_valid(uuid)) |
622 | add_uevent_var(env, "UUID=%pUB", uuid); | 626 | add_uevent_var(env, "UUID=%pUB", uuid); |
623 | return 0; | 627 | return 0; |
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 148d55c14171..cedb0bb96d96 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h | |||
@@ -39,7 +39,8 @@ | |||
39 | {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ | 39 | {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ |
40 | {(1UL << GLF_REPLY_PENDING), "r" }, \ | 40 | {(1UL << GLF_REPLY_PENDING), "r" }, \ |
41 | {(1UL << GLF_INITIAL), "I" }, \ | 41 | {(1UL << GLF_INITIAL), "I" }, \ |
42 | {(1UL << GLF_FROZEN), "F" }) | 42 | {(1UL << GLF_FROZEN), "F" }, \ |
43 | {(1UL << GLF_QUEUED), "q" }) | ||
43 | 44 | ||
44 | #ifndef NUMPTY | 45 | #ifndef NUMPTY |
45 | #define NUMPTY | 46 | #define NUMPTY |
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index edf9d4bd908e..fb56b783e028 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h | |||
@@ -20,11 +20,20 @@ struct gfs2_glock; | |||
20 | #define RES_JDATA 1 | 20 | #define RES_JDATA 1 |
21 | #define RES_DATA 1 | 21 | #define RES_DATA 1 |
22 | #define RES_LEAF 1 | 22 | #define RES_LEAF 1 |
23 | #define RES_RG_HDR 1 | ||
23 | #define RES_RG_BIT 2 | 24 | #define RES_RG_BIT 2 |
24 | #define RES_EATTR 1 | 25 | #define RES_EATTR 1 |
25 | #define RES_STATFS 1 | 26 | #define RES_STATFS 1 |
26 | #define RES_QUOTA 2 | 27 | #define RES_QUOTA 2 |
27 | 28 | ||
29 | /* reserve either the number of blocks to be allocated plus the rg header | ||
30 | * block, or all of the blocks in the rg, whichever is smaller */ | ||
31 | static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) | ||
32 | { | ||
33 | return (al->al_requested < al->al_rgd->rd_length)? | ||
34 | al->al_requested + 1 : al->al_rgd->rd_length; | ||
35 | } | ||
36 | |||
28 | int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, | 37 | int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, |
29 | unsigned int revokes); | 38 | unsigned int revokes); |
30 | 39 | ||
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 776af6eb4bcb..30b58f07c8a6 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c | |||
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
734 | goto out_gunlock_q; | 734 | goto out_gunlock_q; |
735 | 735 | ||
736 | error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), | 736 | error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), |
737 | blks + al->al_rgd->rd_length + | 737 | blks + gfs2_rg_blocks(al) + |
738 | RES_DINODE + RES_STATFS + RES_QUOTA, 0); | 738 | RES_DINODE + RES_STATFS + RES_QUOTA, 0); |
739 | if (error) | 739 | if (error) |
740 | goto out_ipres; | 740 | goto out_ipres; |
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 4129cdb3f0d8..571abe97b42a 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c | |||
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) | |||
23 | fd->search_key = ptr; | 23 | fd->search_key = ptr; |
24 | fd->key = ptr + tree->max_key_len + 2; | 24 | fd->key = ptr + tree->max_key_len + 2; |
25 | dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); | 25 | dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); |
26 | down(&tree->tree_lock); | 26 | mutex_lock(&tree->tree_lock); |
27 | return 0; | 27 | return 0; |
28 | } | 28 | } |
29 | 29 | ||
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) | |||
32 | hfs_bnode_put(fd->bnode); | 32 | hfs_bnode_put(fd->bnode); |
33 | kfree(fd->search_key); | 33 | kfree(fd->search_key); |
34 | dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); | 34 | dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); |
35 | up(&fd->tree->tree_lock); | 35 | mutex_unlock(&fd->tree->tree_lock); |
36 | fd->tree = NULL; | 36 | fd->tree = NULL; |
37 | } | 37 | } |
38 | 38 | ||
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 38a0a9917d7f..3ebc437736fe 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c | |||
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke | |||
27 | if (!tree) | 27 | if (!tree) |
28 | return NULL; | 28 | return NULL; |
29 | 29 | ||
30 | init_MUTEX(&tree->tree_lock); | 30 | mutex_init(&tree->tree_lock); |
31 | spin_lock_init(&tree->hash_lock); | 31 | spin_lock_init(&tree->hash_lock); |
32 | /* Set the correct compare function */ | 32 | /* Set the correct compare function */ |
33 | tree->sb = sb; | 33 | tree->sb = sb; |
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index cc51905ac21d..2a1d712f85dc 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h | |||
@@ -33,7 +33,7 @@ struct hfs_btree { | |||
33 | unsigned int depth; | 33 | unsigned int depth; |
34 | 34 | ||
35 | //unsigned int map1_size, map_size; | 35 | //unsigned int map1_size, map_size; |
36 | struct semaphore tree_lock; | 36 | struct mutex tree_lock; |
37 | 37 | ||
38 | unsigned int pages_per_bnode; | 38 | unsigned int pages_per_bnode; |
39 | spinlock_t hash_lock; | 39 | spinlock_t hash_lock; |
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 5007a41f1be9..d182438c7ae4 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c | |||
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) | |||
23 | fd->search_key = ptr; | 23 | fd->search_key = ptr; |
24 | fd->key = ptr + tree->max_key_len + 2; | 24 | fd->key = ptr + tree->max_key_len + 2; |
25 | dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); | 25 | dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); |
26 | down(&tree->tree_lock); | 26 | mutex_lock(&tree->tree_lock); |
27 | return 0; | 27 | return 0; |
28 | } | 28 | } |
29 | 29 | ||
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd) | |||
32 | hfs_bnode_put(fd->bnode); | 32 | hfs_bnode_put(fd->bnode); |
33 | kfree(fd->search_key); | 33 | kfree(fd->search_key); |
34 | dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); | 34 | dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); |
35 | up(&fd->tree->tree_lock); | 35 | mutex_unlock(&fd->tree->tree_lock); |
36 | fd->tree = NULL; | 36 | fd->tree = NULL; |
37 | } | 37 | } |
38 | 38 | ||
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) | |||
52 | rec = (e + b) / 2; | 52 | rec = (e + b) / 2; |
53 | len = hfs_brec_lenoff(bnode, rec, &off); | 53 | len = hfs_brec_lenoff(bnode, rec, &off); |
54 | keylen = hfs_brec_keylen(bnode, rec); | 54 | keylen = hfs_brec_keylen(bnode, rec); |
55 | if (keylen == 0) { | ||
56 | res = -EINVAL; | ||
57 | goto fail; | ||
58 | } | ||
55 | hfs_bnode_read(bnode, fd->key, off, keylen); | 59 | hfs_bnode_read(bnode, fd->key, off, keylen); |
56 | cmpval = bnode->tree->keycmp(fd->key, fd->search_key); | 60 | cmpval = bnode->tree->keycmp(fd->key, fd->search_key); |
57 | if (!cmpval) { | 61 | if (!cmpval) { |
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd) | |||
67 | if (rec != e && e >= 0) { | 71 | if (rec != e && e >= 0) { |
68 | len = hfs_brec_lenoff(bnode, e, &off); | 72 | len = hfs_brec_lenoff(bnode, e, &off); |
69 | keylen = hfs_brec_keylen(bnode, e); | 73 | keylen = hfs_brec_keylen(bnode, e); |
74 | if (keylen == 0) { | ||
75 | res = -EINVAL; | ||
76 | goto fail; | ||
77 | } | ||
70 | hfs_bnode_read(bnode, fd->key, off, keylen); | 78 | hfs_bnode_read(bnode, fd->key, off, keylen); |
71 | } | 79 | } |
72 | done: | 80 | done: |
@@ -75,6 +83,7 @@ done: | |||
75 | fd->keylength = keylen; | 83 | fd->keylength = keylen; |
76 | fd->entryoffset = off + keylen; | 84 | fd->entryoffset = off + keylen; |
77 | fd->entrylength = len - keylen; | 85 | fd->entrylength = len - keylen; |
86 | fail: | ||
78 | return res; | 87 | return res; |
79 | } | 88 | } |
80 | 89 | ||
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt) | |||
198 | 207 | ||
199 | len = hfs_brec_lenoff(bnode, fd->record, &off); | 208 | len = hfs_brec_lenoff(bnode, fd->record, &off); |
200 | keylen = hfs_brec_keylen(bnode, fd->record); | 209 | keylen = hfs_brec_keylen(bnode, fd->record); |
210 | if (keylen == 0) { | ||
211 | res = -EINVAL; | ||
212 | goto out; | ||
213 | } | ||
201 | fd->keyoffset = off; | 214 | fd->keyoffset = off; |
202 | fd->keylength = keylen; | 215 | fd->keylength = keylen; |
203 | fd->entryoffset = off + keylen; | 216 | fd->entryoffset = off + keylen; |
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index ea30afc2a03c..ad57f5991eb1 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c | |||
@@ -17,6 +17,7 @@ | |||
17 | 17 | ||
18 | int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) | 18 | int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) |
19 | { | 19 | { |
20 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
20 | struct page *page; | 21 | struct page *page; |
21 | struct address_space *mapping; | 22 | struct address_space *mapping; |
22 | __be32 *pptr, *curr, *end; | 23 | __be32 *pptr, *curr, *end; |
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma | |||
29 | return size; | 30 | return size; |
30 | 31 | ||
31 | dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); | 32 | dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); |
32 | mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); | 33 | mutex_lock(&sbi->alloc_mutex); |
33 | mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; | 34 | mapping = sbi->alloc_file->i_mapping; |
34 | page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); | 35 | page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); |
35 | if (IS_ERR(page)) { | 36 | if (IS_ERR(page)) { |
36 | start = size; | 37 | start = size; |
@@ -150,16 +151,17 @@ done: | |||
150 | set_page_dirty(page); | 151 | set_page_dirty(page); |
151 | kunmap(page); | 152 | kunmap(page); |
152 | *max = offset + (curr - pptr) * 32 + i - start; | 153 | *max = offset + (curr - pptr) * 32 + i - start; |
153 | HFSPLUS_SB(sb).free_blocks -= *max; | 154 | sbi->free_blocks -= *max; |
154 | sb->s_dirt = 1; | 155 | sb->s_dirt = 1; |
155 | dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); | 156 | dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); |
156 | out: | 157 | out: |
157 | mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); | 158 | mutex_unlock(&sbi->alloc_mutex); |
158 | return start; | 159 | return start; |
159 | } | 160 | } |
160 | 161 | ||
161 | int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) | 162 | int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) |
162 | { | 163 | { |
164 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
163 | struct page *page; | 165 | struct page *page; |
164 | struct address_space *mapping; | 166 | struct address_space *mapping; |
165 | __be32 *pptr, *curr, *end; | 167 | __be32 *pptr, *curr, *end; |
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) | |||
172 | 174 | ||
173 | dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); | 175 | dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); |
174 | /* are all of the bits in range? */ | 176 | /* are all of the bits in range? */ |
175 | if ((offset + count) > HFSPLUS_SB(sb).total_blocks) | 177 | if ((offset + count) > sbi->total_blocks) |
176 | return -2; | 178 | return -2; |
177 | 179 | ||
178 | mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); | 180 | mutex_lock(&sbi->alloc_mutex); |
179 | mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; | 181 | mapping = sbi->alloc_file->i_mapping; |
180 | pnr = offset / PAGE_CACHE_BITS; | 182 | pnr = offset / PAGE_CACHE_BITS; |
181 | page = read_mapping_page(mapping, pnr, NULL); | 183 | page = read_mapping_page(mapping, pnr, NULL); |
182 | pptr = kmap(page); | 184 | pptr = kmap(page); |
@@ -224,9 +226,9 @@ done: | |||
224 | out: | 226 | out: |
225 | set_page_dirty(page); | 227 | set_page_dirty(page); |
226 | kunmap(page); | 228 | kunmap(page); |
227 | HFSPLUS_SB(sb).free_blocks += len; | 229 | sbi->free_blocks += len; |
228 | sb->s_dirt = 1; | 230 | sb->s_dirt = 1; |
229 | mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); | 231 | mutex_unlock(&sbi->alloc_mutex); |
230 | 232 | ||
231 | return 0; | 233 | return 0; |
232 | } | 234 | } |
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index c88e5d72a402..2f39d05443e1 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c | |||
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) | |||
42 | recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); | 42 | recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); |
43 | if (!recoff) | 43 | if (!recoff) |
44 | return 0; | 44 | return 0; |
45 | if (node->tree->attributes & HFS_TREE_BIGKEYS) | 45 | |
46 | retval = hfs_bnode_read_u16(node, recoff) + 2; | 46 | retval = hfs_bnode_read_u16(node, recoff) + 2; |
47 | else | 47 | if (retval > node->tree->max_key_len + 2) { |
48 | retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; | 48 | printk(KERN_ERR "hfs: keylen %d too large\n", |
49 | retval); | ||
50 | retval = 0; | ||
51 | } | ||
49 | } | 52 | } |
50 | return retval; | 53 | return retval; |
51 | } | 54 | } |
@@ -216,7 +219,7 @@ skip: | |||
216 | static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) | 219 | static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) |
217 | { | 220 | { |
218 | struct hfs_btree *tree; | 221 | struct hfs_btree *tree; |
219 | struct hfs_bnode *node, *new_node; | 222 | struct hfs_bnode *node, *new_node, *next_node; |
220 | struct hfs_bnode_desc node_desc; | 223 | struct hfs_bnode_desc node_desc; |
221 | int num_recs, new_rec_off, new_off, old_rec_off; | 224 | int num_recs, new_rec_off, new_off, old_rec_off; |
222 | int data_start, data_end, size; | 225 | int data_start, data_end, size; |
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) | |||
235 | new_node->type = node->type; | 238 | new_node->type = node->type; |
236 | new_node->height = node->height; | 239 | new_node->height = node->height; |
237 | 240 | ||
241 | if (node->next) | ||
242 | next_node = hfs_bnode_find(tree, node->next); | ||
243 | else | ||
244 | next_node = NULL; | ||
245 | |||
246 | if (IS_ERR(next_node)) { | ||
247 | hfs_bnode_put(node); | ||
248 | hfs_bnode_put(new_node); | ||
249 | return next_node; | ||
250 | } | ||
251 | |||
238 | size = tree->node_size / 2 - node->num_recs * 2 - 14; | 252 | size = tree->node_size / 2 - node->num_recs * 2 - 14; |
239 | old_rec_off = tree->node_size - 4; | 253 | old_rec_off = tree->node_size - 4; |
240 | num_recs = 1; | 254 | num_recs = 1; |
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) | |||
248 | /* panic? */ | 262 | /* panic? */ |
249 | hfs_bnode_put(node); | 263 | hfs_bnode_put(node); |
250 | hfs_bnode_put(new_node); | 264 | hfs_bnode_put(new_node); |
265 | if (next_node) | ||
266 | hfs_bnode_put(next_node); | ||
251 | return ERR_PTR(-ENOSPC); | 267 | return ERR_PTR(-ENOSPC); |
252 | } | 268 | } |
253 | 269 | ||
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) | |||
302 | hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); | 318 | hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); |
303 | 319 | ||
304 | /* update next bnode header */ | 320 | /* update next bnode header */ |
305 | if (new_node->next) { | 321 | if (next_node) { |
306 | struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next); | ||
307 | next_node->prev = new_node->this; | 322 | next_node->prev = new_node->this; |
308 | hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); | 323 | hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); |
309 | node_desc.prev = cpu_to_be32(next_node->prev); | 324 | node_desc.prev = cpu_to_be32(next_node->prev); |
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index e49fcee1e293..22e4d4e32999 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c | |||
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) | |||
30 | if (!tree) | 30 | if (!tree) |
31 | return NULL; | 31 | return NULL; |
32 | 32 | ||
33 | init_MUTEX(&tree->tree_lock); | 33 | mutex_init(&tree->tree_lock); |
34 | spin_lock_init(&tree->hash_lock); | 34 | spin_lock_init(&tree->hash_lock); |
35 | tree->sb = sb; | 35 | tree->sb = sb; |
36 | tree->cnid = id; | 36 | tree->cnid = id; |
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) | |||
39 | goto free_tree; | 39 | goto free_tree; |
40 | tree->inode = inode; | 40 | tree->inode = inode; |
41 | 41 | ||
42 | if (!HFSPLUS_I(tree->inode)->first_blocks) { | ||
43 | printk(KERN_ERR | ||
44 | "hfs: invalid btree extent records (0 size).\n"); | ||
45 | goto free_inode; | ||
46 | } | ||
47 | |||
42 | mapping = tree->inode->i_mapping; | 48 | mapping = tree->inode->i_mapping; |
43 | page = read_mapping_page(mapping, 0, NULL); | 49 | page = read_mapping_page(mapping, 0, NULL); |
44 | if (IS_ERR(page)) | 50 | if (IS_ERR(page)) |
45 | goto free_tree; | 51 | goto free_inode; |
46 | 52 | ||
47 | /* Load the header */ | 53 | /* Load the header */ |
48 | head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); | 54 | head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); |
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) | |||
57 | tree->max_key_len = be16_to_cpu(head->max_key_len); | 63 | tree->max_key_len = be16_to_cpu(head->max_key_len); |
58 | tree->depth = be16_to_cpu(head->depth); | 64 | tree->depth = be16_to_cpu(head->depth); |
59 | 65 | ||
60 | /* Set the correct compare function */ | 66 | /* Verify the tree and set the correct compare function */ |
61 | if (id == HFSPLUS_EXT_CNID) { | 67 | switch (id) { |
68 | case HFSPLUS_EXT_CNID: | ||
69 | if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { | ||
70 | printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", | ||
71 | tree->max_key_len); | ||
72 | goto fail_page; | ||
73 | } | ||
74 | if (tree->attributes & HFS_TREE_VARIDXKEYS) { | ||
75 | printk(KERN_ERR "hfs: invalid extent btree flag\n"); | ||
76 | goto fail_page; | ||
77 | } | ||
78 | |||
62 | tree->keycmp = hfsplus_ext_cmp_key; | 79 | tree->keycmp = hfsplus_ext_cmp_key; |
63 | } else if (id == HFSPLUS_CAT_CNID) { | 80 | break; |
64 | if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && | 81 | case HFSPLUS_CAT_CNID: |
82 | if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { | ||
83 | printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", | ||
84 | tree->max_key_len); | ||
85 | goto fail_page; | ||
86 | } | ||
87 | if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { | ||
88 | printk(KERN_ERR "hfs: invalid catalog btree flag\n"); | ||
89 | goto fail_page; | ||
90 | } | ||
91 | |||
92 | if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && | ||
65 | (head->key_type == HFSPLUS_KEY_BINARY)) | 93 | (head->key_type == HFSPLUS_KEY_BINARY)) |
66 | tree->keycmp = hfsplus_cat_bin_cmp_key; | 94 | tree->keycmp = hfsplus_cat_bin_cmp_key; |
67 | else { | 95 | else { |
68 | tree->keycmp = hfsplus_cat_case_cmp_key; | 96 | tree->keycmp = hfsplus_cat_case_cmp_key; |
69 | HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; | 97 | set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); |
70 | } | 98 | } |
71 | } else { | 99 | break; |
100 | default: | ||
72 | printk(KERN_ERR "hfs: unknown B*Tree requested\n"); | 101 | printk(KERN_ERR "hfs: unknown B*Tree requested\n"); |
73 | goto fail_page; | 102 | goto fail_page; |
74 | } | 103 | } |
75 | 104 | ||
105 | if (!(tree->attributes & HFS_TREE_BIGKEYS)) { | ||
106 | printk(KERN_ERR "hfs: invalid btree flag\n"); | ||
107 | goto fail_page; | ||
108 | } | ||
109 | |||
76 | size = tree->node_size; | 110 | size = tree->node_size; |
77 | if (!is_power_of_2(size)) | 111 | if (!is_power_of_2(size)) |
78 | goto fail_page; | 112 | goto fail_page; |
79 | if (!tree->node_count) | 113 | if (!tree->node_count) |
80 | goto fail_page; | 114 | goto fail_page; |
115 | |||
81 | tree->node_size_shift = ffs(size) - 1; | 116 | tree->node_size_shift = ffs(size) - 1; |
82 | 117 | ||
83 | tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 118 | tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) | |||
87 | return tree; | 122 | return tree; |
88 | 123 | ||
89 | fail_page: | 124 | fail_page: |
90 | tree->inode->i_mapping->a_ops = &hfsplus_aops; | ||
91 | page_cache_release(page); | 125 | page_cache_release(page); |
92 | free_tree: | 126 | free_inode: |
127 | tree->inode->i_mapping->a_ops = &hfsplus_aops; | ||
93 | iput(tree->inode); | 128 | iput(tree->inode); |
129 | free_tree: | ||
94 | kfree(tree); | 130 | kfree(tree); |
95 | return NULL; | 131 | return NULL; |
96 | } | 132 | } |
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) | |||
192 | 228 | ||
193 | while (!tree->free_nodes) { | 229 | while (!tree->free_nodes) { |
194 | struct inode *inode = tree->inode; | 230 | struct inode *inode = tree->inode; |
231 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
195 | u32 count; | 232 | u32 count; |
196 | int res; | 233 | int res; |
197 | 234 | ||
198 | res = hfsplus_file_extend(inode); | 235 | res = hfsplus_file_extend(inode); |
199 | if (res) | 236 | if (res) |
200 | return ERR_PTR(res); | 237 | return ERR_PTR(res); |
201 | HFSPLUS_I(inode).phys_size = inode->i_size = | 238 | hip->phys_size = inode->i_size = |
202 | (loff_t)HFSPLUS_I(inode).alloc_blocks << | 239 | (loff_t)hip->alloc_blocks << |
203 | HFSPLUS_SB(tree->sb).alloc_blksz_shift; | 240 | HFSPLUS_SB(tree->sb)->alloc_blksz_shift; |
204 | HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << | 241 | hip->fs_blocks = |
205 | HFSPLUS_SB(tree->sb).fs_shift; | 242 | hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; |
206 | inode_set_bytes(inode, inode->i_size); | 243 | inode_set_bytes(inode, inode->i_size); |
207 | count = inode->i_size >> tree->node_size_shift; | 244 | count = inode->i_size >> tree->node_size_shift; |
208 | tree->free_nodes = count - tree->node_count; | 245 | tree->free_nodes = count - tree->node_count; |
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index f6874acb2cf2..8af45fc5b051 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c | |||
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, | |||
67 | key->key_len = cpu_to_be16(6 + ustrlen); | 67 | key->key_len = cpu_to_be16(6 + ustrlen); |
68 | } | 68 | } |
69 | 69 | ||
70 | static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) | 70 | void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms) |
71 | { | 71 | { |
72 | if (inode->i_flags & S_IMMUTABLE) | 72 | if (inode->i_flags & S_IMMUTABLE) |
73 | perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; | 73 | perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; |
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) | |||
77 | perms->rootflags |= HFSPLUS_FLG_APPEND; | 77 | perms->rootflags |= HFSPLUS_FLG_APPEND; |
78 | else | 78 | else |
79 | perms->rootflags &= ~HFSPLUS_FLG_APPEND; | 79 | perms->rootflags &= ~HFSPLUS_FLG_APPEND; |
80 | HFSPLUS_I(inode).rootflags = perms->rootflags; | 80 | |
81 | HFSPLUS_I(inode).userflags = perms->userflags; | 81 | perms->userflags = HFSPLUS_I(inode)->userflags; |
82 | perms->mode = cpu_to_be16(inode->i_mode); | 82 | perms->mode = cpu_to_be16(inode->i_mode); |
83 | perms->owner = cpu_to_be32(inode->i_uid); | 83 | perms->owner = cpu_to_be32(inode->i_uid); |
84 | perms->group = cpu_to_be32(inode->i_gid); | 84 | perms->group = cpu_to_be32(inode->i_gid); |
85 | |||
86 | if (S_ISREG(inode->i_mode)) | ||
87 | perms->dev = cpu_to_be32(inode->i_nlink); | ||
88 | else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) | ||
89 | perms->dev = cpu_to_be32(inode->i_rdev); | ||
90 | else | ||
91 | perms->dev = 0; | ||
85 | } | 92 | } |
86 | 93 | ||
87 | static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) | 94 | static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) |
88 | { | 95 | { |
96 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); | ||
97 | |||
89 | if (S_ISDIR(inode->i_mode)) { | 98 | if (S_ISDIR(inode->i_mode)) { |
90 | struct hfsplus_cat_folder *folder; | 99 | struct hfsplus_cat_folder *folder; |
91 | 100 | ||
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i | |||
93 | memset(folder, 0, sizeof(*folder)); | 102 | memset(folder, 0, sizeof(*folder)); |
94 | folder->type = cpu_to_be16(HFSPLUS_FOLDER); | 103 | folder->type = cpu_to_be16(HFSPLUS_FOLDER); |
95 | folder->id = cpu_to_be32(inode->i_ino); | 104 | folder->id = cpu_to_be32(inode->i_ino); |
96 | HFSPLUS_I(inode).create_date = | 105 | HFSPLUS_I(inode)->create_date = |
97 | folder->create_date = | 106 | folder->create_date = |
98 | folder->content_mod_date = | 107 | folder->content_mod_date = |
99 | folder->attribute_mod_date = | 108 | folder->attribute_mod_date = |
100 | folder->access_date = hfsp_now2mt(); | 109 | folder->access_date = hfsp_now2mt(); |
101 | hfsplus_set_perms(inode, &folder->permissions); | 110 | hfsplus_cat_set_perms(inode, &folder->permissions); |
102 | if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) | 111 | if (inode == sbi->hidden_dir) |
103 | /* invisible and namelocked */ | 112 | /* invisible and namelocked */ |
104 | folder->user_info.frFlags = cpu_to_be16(0x5000); | 113 | folder->user_info.frFlags = cpu_to_be16(0x5000); |
105 | return sizeof(*folder); | 114 | return sizeof(*folder); |
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i | |||
111 | file->type = cpu_to_be16(HFSPLUS_FILE); | 120 | file->type = cpu_to_be16(HFSPLUS_FILE); |
112 | file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); | 121 | file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); |
113 | file->id = cpu_to_be32(cnid); | 122 | file->id = cpu_to_be32(cnid); |
114 | HFSPLUS_I(inode).create_date = | 123 | HFSPLUS_I(inode)->create_date = |
115 | file->create_date = | 124 | file->create_date = |
116 | file->content_mod_date = | 125 | file->content_mod_date = |
117 | file->attribute_mod_date = | 126 | file->attribute_mod_date = |
118 | file->access_date = hfsp_now2mt(); | 127 | file->access_date = hfsp_now2mt(); |
119 | if (cnid == inode->i_ino) { | 128 | if (cnid == inode->i_ino) { |
120 | hfsplus_set_perms(inode, &file->permissions); | 129 | hfsplus_cat_set_perms(inode, &file->permissions); |
121 | if (S_ISLNK(inode->i_mode)) { | 130 | if (S_ISLNK(inode->i_mode)) { |
122 | file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); | 131 | file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); |
123 | file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); | 132 | file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); |
124 | } else { | 133 | } else { |
125 | file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); | 134 | file->user_info.fdType = cpu_to_be32(sbi->type); |
126 | file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); | 135 | file->user_info.fdCreator = cpu_to_be32(sbi->creator); |
127 | } | 136 | } |
128 | if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) | 137 | if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) |
129 | file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); | 138 | file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); |
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i | |||
131 | file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); | 140 | file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); |
132 | file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); | 141 | file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); |
133 | file->user_info.fdFlags = cpu_to_be16(0x100); | 142 | file->user_info.fdFlags = cpu_to_be16(0x100); |
134 | file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; | 143 | file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date; |
135 | file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); | 144 | file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid); |
136 | } | 145 | } |
137 | return sizeof(*file); | 146 | return sizeof(*file); |
138 | } | 147 | } |
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, | |||
180 | 189 | ||
181 | int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) | 190 | int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) |
182 | { | 191 | { |
192 | struct super_block *sb = dir->i_sb; | ||
183 | struct hfs_find_data fd; | 193 | struct hfs_find_data fd; |
184 | struct super_block *sb; | ||
185 | hfsplus_cat_entry entry; | 194 | hfsplus_cat_entry entry; |
186 | int entry_size; | 195 | int entry_size; |
187 | int err; | 196 | int err; |
188 | 197 | ||
189 | dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); | 198 | dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); |
190 | sb = dir->i_sb; | 199 | hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); |
191 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); | ||
192 | 200 | ||
193 | hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); | 201 | hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); |
194 | entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? | 202 | entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? |
@@ -234,7 +242,7 @@ err2: | |||
234 | 242 | ||
235 | int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) | 243 | int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) |
236 | { | 244 | { |
237 | struct super_block *sb; | 245 | struct super_block *sb = dir->i_sb; |
238 | struct hfs_find_data fd; | 246 | struct hfs_find_data fd; |
239 | struct hfsplus_fork_raw fork; | 247 | struct hfsplus_fork_raw fork; |
240 | struct list_head *pos; | 248 | struct list_head *pos; |
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) | |||
242 | u16 type; | 250 | u16 type; |
243 | 251 | ||
244 | dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); | 252 | dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); |
245 | sb = dir->i_sb; | 253 | hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); |
246 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); | ||
247 | 254 | ||
248 | if (!str) { | 255 | if (!str) { |
249 | int len; | 256 | int len; |
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) | |||
279 | hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); | 286 | hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); |
280 | } | 287 | } |
281 | 288 | ||
282 | list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { | 289 | list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) { |
283 | struct hfsplus_readdir_data *rd = | 290 | struct hfsplus_readdir_data *rd = |
284 | list_entry(pos, struct hfsplus_readdir_data, list); | 291 | list_entry(pos, struct hfsplus_readdir_data, list); |
285 | if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) | 292 | if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) |
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid, | |||
312 | struct inode *src_dir, struct qstr *src_name, | 319 | struct inode *src_dir, struct qstr *src_name, |
313 | struct inode *dst_dir, struct qstr *dst_name) | 320 | struct inode *dst_dir, struct qstr *dst_name) |
314 | { | 321 | { |
315 | struct super_block *sb; | 322 | struct super_block *sb = src_dir->i_sb; |
316 | struct hfs_find_data src_fd, dst_fd; | 323 | struct hfs_find_data src_fd, dst_fd; |
317 | hfsplus_cat_entry entry; | 324 | hfsplus_cat_entry entry; |
318 | int entry_size, type; | 325 | int entry_size, type; |
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid, | |||
320 | 327 | ||
321 | dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, | 328 | dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, |
322 | dst_dir->i_ino, dst_name->name); | 329 | dst_dir->i_ino, dst_name->name); |
323 | sb = src_dir->i_sb; | 330 | hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); |
324 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd); | ||
325 | dst_fd = src_fd; | 331 | dst_fd = src_fd; |
326 | 332 | ||
327 | /* find the old dir entry and read the data */ | 333 | /* find the old dir entry and read the data */ |
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 764fd1bdca88..d236d85ec9d7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c | |||
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, | |||
39 | 39 | ||
40 | dentry->d_op = &hfsplus_dentry_operations; | 40 | dentry->d_op = &hfsplus_dentry_operations; |
41 | dentry->d_fsdata = NULL; | 41 | dentry->d_fsdata = NULL; |
42 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); | 42 | hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); |
43 | hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); | 43 | hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); |
44 | again: | 44 | again: |
45 | err = hfs_brec_read(&fd, &entry, sizeof(entry)); | 45 | err = hfs_brec_read(&fd, &entry, sizeof(entry)); |
@@ -68,9 +68,9 @@ again: | |||
68 | cnid = be32_to_cpu(entry.file.id); | 68 | cnid = be32_to_cpu(entry.file.id); |
69 | if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && | 69 | if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && |
70 | entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && | 70 | entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && |
71 | (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || | 71 | (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date || |
72 | entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && | 72 | entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) && |
73 | HFSPLUS_SB(sb).hidden_dir) { | 73 | HFSPLUS_SB(sb)->hidden_dir) { |
74 | struct qstr str; | 74 | struct qstr str; |
75 | char name[32]; | 75 | char name[32]; |
76 | 76 | ||
@@ -86,7 +86,8 @@ again: | |||
86 | linkid = be32_to_cpu(entry.file.permissions.dev); | 86 | linkid = be32_to_cpu(entry.file.permissions.dev); |
87 | str.len = sprintf(name, "iNode%d", linkid); | 87 | str.len = sprintf(name, "iNode%d", linkid); |
88 | str.name = name; | 88 | str.name = name; |
89 | hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); | 89 | hfsplus_cat_build_key(sb, fd.search_key, |
90 | HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); | ||
90 | goto again; | 91 | goto again; |
91 | } | 92 | } |
92 | } else if (!dentry->d_fsdata) | 93 | } else if (!dentry->d_fsdata) |
@@ -101,7 +102,7 @@ again: | |||
101 | if (IS_ERR(inode)) | 102 | if (IS_ERR(inode)) |
102 | return ERR_CAST(inode); | 103 | return ERR_CAST(inode); |
103 | if (S_ISREG(inode->i_mode)) | 104 | if (S_ISREG(inode->i_mode)) |
104 | HFSPLUS_I(inode).dev = linkid; | 105 | HFSPLUS_I(inode)->linkid = linkid; |
105 | out: | 106 | out: |
106 | d_add(dentry, inode); | 107 | d_add(dentry, inode); |
107 | return NULL; | 108 | return NULL; |
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
124 | if (filp->f_pos >= inode->i_size) | 125 | if (filp->f_pos >= inode->i_size) |
125 | return 0; | 126 | return 0; |
126 | 127 | ||
127 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); | 128 | hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); |
128 | hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); | 129 | hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); |
129 | err = hfs_brec_find(&fd); | 130 | err = hfs_brec_find(&fd); |
130 | if (err) | 131 | if (err) |
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
180 | err = -EIO; | 181 | err = -EIO; |
181 | goto out; | 182 | goto out; |
182 | } | 183 | } |
183 | if (HFSPLUS_SB(sb).hidden_dir && | 184 | if (HFSPLUS_SB(sb)->hidden_dir && |
184 | HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) | 185 | HFSPLUS_SB(sb)->hidden_dir->i_ino == |
186 | be32_to_cpu(entry.folder.id)) | ||
185 | goto next; | 187 | goto next; |
186 | if (filldir(dirent, strbuf, len, filp->f_pos, | 188 | if (filldir(dirent, strbuf, len, filp->f_pos, |
187 | be32_to_cpu(entry.folder.id), DT_DIR)) | 189 | be32_to_cpu(entry.folder.id), DT_DIR)) |
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
217 | } | 219 | } |
218 | filp->private_data = rd; | 220 | filp->private_data = rd; |
219 | rd->file = filp; | 221 | rd->file = filp; |
220 | list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); | 222 | list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); |
221 | } | 223 | } |
222 | memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); | 224 | memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); |
223 | out: | 225 | out: |
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) | |||
229 | { | 231 | { |
230 | struct hfsplus_readdir_data *rd = file->private_data; | 232 | struct hfsplus_readdir_data *rd = file->private_data; |
231 | if (rd) { | 233 | if (rd) { |
234 | mutex_lock(&inode->i_mutex); | ||
232 | list_del(&rd->list); | 235 | list_del(&rd->list); |
236 | mutex_unlock(&inode->i_mutex); | ||
233 | kfree(rd); | 237 | kfree(rd); |
234 | } | 238 | } |
235 | return 0; | 239 | return 0; |
236 | } | 240 | } |
237 | 241 | ||
238 | static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, | ||
239 | struct nameidata *nd) | ||
240 | { | ||
241 | struct inode *inode; | ||
242 | int res; | ||
243 | |||
244 | inode = hfsplus_new_inode(dir->i_sb, mode); | ||
245 | if (!inode) | ||
246 | return -ENOSPC; | ||
247 | |||
248 | res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); | ||
249 | if (res) { | ||
250 | inode->i_nlink = 0; | ||
251 | hfsplus_delete_inode(inode); | ||
252 | iput(inode); | ||
253 | return res; | ||
254 | } | ||
255 | hfsplus_instantiate(dentry, inode, inode->i_ino); | ||
256 | mark_inode_dirty(inode); | ||
257 | return 0; | ||
258 | } | ||
259 | |||
260 | static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, | 242 | static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, |
261 | struct dentry *dst_dentry) | 243 | struct dentry *dst_dentry) |
262 | { | 244 | { |
263 | struct super_block *sb = dst_dir->i_sb; | 245 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); |
264 | struct inode *inode = src_dentry->d_inode; | 246 | struct inode *inode = src_dentry->d_inode; |
265 | struct inode *src_dir = src_dentry->d_parent->d_inode; | 247 | struct inode *src_dir = src_dentry->d_parent->d_inode; |
266 | struct qstr str; | 248 | struct qstr str; |
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, | |||
270 | 252 | ||
271 | if (HFSPLUS_IS_RSRC(inode)) | 253 | if (HFSPLUS_IS_RSRC(inode)) |
272 | return -EPERM; | 254 | return -EPERM; |
255 | if (!S_ISREG(inode->i_mode)) | ||
256 | return -EPERM; | ||
273 | 257 | ||
258 | mutex_lock(&sbi->vh_mutex); | ||
274 | if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { | 259 | if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { |
275 | for (;;) { | 260 | for (;;) { |
276 | get_random_bytes(&id, sizeof(cnid)); | 261 | get_random_bytes(&id, sizeof(cnid)); |
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, | |||
279 | str.len = sprintf(name, "iNode%d", id); | 264 | str.len = sprintf(name, "iNode%d", id); |
280 | res = hfsplus_rename_cat(inode->i_ino, | 265 | res = hfsplus_rename_cat(inode->i_ino, |
281 | src_dir, &src_dentry->d_name, | 266 | src_dir, &src_dentry->d_name, |
282 | HFSPLUS_SB(sb).hidden_dir, &str); | 267 | sbi->hidden_dir, &str); |
283 | if (!res) | 268 | if (!res) |
284 | break; | 269 | break; |
285 | if (res != -EEXIST) | 270 | if (res != -EEXIST) |
286 | return res; | 271 | goto out; |
287 | } | 272 | } |
288 | HFSPLUS_I(inode).dev = id; | 273 | HFSPLUS_I(inode)->linkid = id; |
289 | cnid = HFSPLUS_SB(sb).next_cnid++; | 274 | cnid = sbi->next_cnid++; |
290 | src_dentry->d_fsdata = (void *)(unsigned long)cnid; | 275 | src_dentry->d_fsdata = (void *)(unsigned long)cnid; |
291 | res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); | 276 | res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); |
292 | if (res) | 277 | if (res) |
293 | /* panic? */ | 278 | /* panic? */ |
294 | return res; | 279 | goto out; |
295 | HFSPLUS_SB(sb).file_count++; | 280 | sbi->file_count++; |
296 | } | 281 | } |
297 | cnid = HFSPLUS_SB(sb).next_cnid++; | 282 | cnid = sbi->next_cnid++; |
298 | res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); | 283 | res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); |
299 | if (res) | 284 | if (res) |
300 | return res; | 285 | goto out; |
301 | 286 | ||
302 | inc_nlink(inode); | 287 | inc_nlink(inode); |
303 | hfsplus_instantiate(dst_dentry, inode, cnid); | 288 | hfsplus_instantiate(dst_dentry, inode, cnid); |
304 | atomic_inc(&inode->i_count); | 289 | atomic_inc(&inode->i_count); |
305 | inode->i_ctime = CURRENT_TIME_SEC; | 290 | inode->i_ctime = CURRENT_TIME_SEC; |
306 | mark_inode_dirty(inode); | 291 | mark_inode_dirty(inode); |
307 | HFSPLUS_SB(sb).file_count++; | 292 | sbi->file_count++; |
308 | sb->s_dirt = 1; | 293 | dst_dir->i_sb->s_dirt = 1; |
309 | 294 | out: | |
310 | return 0; | 295 | mutex_unlock(&sbi->vh_mutex); |
296 | return res; | ||
311 | } | 297 | } |
312 | 298 | ||
313 | static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) | 299 | static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) |
314 | { | 300 | { |
315 | struct super_block *sb = dir->i_sb; | 301 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); |
316 | struct inode *inode = dentry->d_inode; | 302 | struct inode *inode = dentry->d_inode; |
317 | struct qstr str; | 303 | struct qstr str; |
318 | char name[32]; | 304 | char name[32]; |
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) | |||
322 | if (HFSPLUS_IS_RSRC(inode)) | 308 | if (HFSPLUS_IS_RSRC(inode)) |
323 | return -EPERM; | 309 | return -EPERM; |
324 | 310 | ||
311 | mutex_lock(&sbi->vh_mutex); | ||
325 | cnid = (u32)(unsigned long)dentry->d_fsdata; | 312 | cnid = (u32)(unsigned long)dentry->d_fsdata; |
326 | if (inode->i_ino == cnid && | 313 | if (inode->i_ino == cnid && |
327 | atomic_read(&HFSPLUS_I(inode).opencnt)) { | 314 | atomic_read(&HFSPLUS_I(inode)->opencnt)) { |
328 | str.name = name; | 315 | str.name = name; |
329 | str.len = sprintf(name, "temp%lu", inode->i_ino); | 316 | str.len = sprintf(name, "temp%lu", inode->i_ino); |
330 | res = hfsplus_rename_cat(inode->i_ino, | 317 | res = hfsplus_rename_cat(inode->i_ino, |
331 | dir, &dentry->d_name, | 318 | dir, &dentry->d_name, |
332 | HFSPLUS_SB(sb).hidden_dir, &str); | 319 | sbi->hidden_dir, &str); |
333 | if (!res) | 320 | if (!res) |
334 | inode->i_flags |= S_DEAD; | 321 | inode->i_flags |= S_DEAD; |
335 | return res; | 322 | goto out; |
336 | } | 323 | } |
337 | res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); | 324 | res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); |
338 | if (res) | 325 | if (res) |
339 | return res; | 326 | goto out; |
340 | 327 | ||
341 | if (inode->i_nlink > 0) | 328 | if (inode->i_nlink > 0) |
342 | drop_nlink(inode); | 329 | drop_nlink(inode); |
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) | |||
344 | clear_nlink(inode); | 331 | clear_nlink(inode); |
345 | if (!inode->i_nlink) { | 332 | if (!inode->i_nlink) { |
346 | if (inode->i_ino != cnid) { | 333 | if (inode->i_ino != cnid) { |
347 | HFSPLUS_SB(sb).file_count--; | 334 | sbi->file_count--; |
348 | if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { | 335 | if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) { |
349 | res = hfsplus_delete_cat(inode->i_ino, | 336 | res = hfsplus_delete_cat(inode->i_ino, |
350 | HFSPLUS_SB(sb).hidden_dir, | 337 | sbi->hidden_dir, |
351 | NULL); | 338 | NULL); |
352 | if (!res) | 339 | if (!res) |
353 | hfsplus_delete_inode(inode); | 340 | hfsplus_delete_inode(inode); |
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) | |||
356 | } else | 343 | } else |
357 | hfsplus_delete_inode(inode); | 344 | hfsplus_delete_inode(inode); |
358 | } else | 345 | } else |
359 | HFSPLUS_SB(sb).file_count--; | 346 | sbi->file_count--; |
360 | inode->i_ctime = CURRENT_TIME_SEC; | 347 | inode->i_ctime = CURRENT_TIME_SEC; |
361 | mark_inode_dirty(inode); | 348 | mark_inode_dirty(inode); |
362 | 349 | out: | |
350 | mutex_unlock(&sbi->vh_mutex); | ||
363 | return res; | 351 | return res; |
364 | } | 352 | } |
365 | 353 | ||
366 | static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
367 | { | ||
368 | struct inode *inode; | ||
369 | int res; | ||
370 | |||
371 | inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode); | ||
372 | if (!inode) | ||
373 | return -ENOSPC; | ||
374 | |||
375 | res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); | ||
376 | if (res) { | ||
377 | inode->i_nlink = 0; | ||
378 | hfsplus_delete_inode(inode); | ||
379 | iput(inode); | ||
380 | return res; | ||
381 | } | ||
382 | hfsplus_instantiate(dentry, inode, inode->i_ino); | ||
383 | mark_inode_dirty(inode); | ||
384 | return 0; | ||
385 | } | ||
386 | |||
387 | static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) | 354 | static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) |
388 | { | 355 | { |
389 | struct inode *inode; | 356 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); |
357 | struct inode *inode = dentry->d_inode; | ||
390 | int res; | 358 | int res; |
391 | 359 | ||
392 | inode = dentry->d_inode; | ||
393 | if (inode->i_size != 2) | 360 | if (inode->i_size != 2) |
394 | return -ENOTEMPTY; | 361 | return -ENOTEMPTY; |
362 | |||
363 | mutex_lock(&sbi->vh_mutex); | ||
395 | res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); | 364 | res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); |
396 | if (res) | 365 | if (res) |
397 | return res; | 366 | goto out; |
398 | clear_nlink(inode); | 367 | clear_nlink(inode); |
399 | inode->i_ctime = CURRENT_TIME_SEC; | 368 | inode->i_ctime = CURRENT_TIME_SEC; |
400 | hfsplus_delete_inode(inode); | 369 | hfsplus_delete_inode(inode); |
401 | mark_inode_dirty(inode); | 370 | mark_inode_dirty(inode); |
402 | return 0; | 371 | out: |
372 | mutex_unlock(&sbi->vh_mutex); | ||
373 | return res; | ||
403 | } | 374 | } |
404 | 375 | ||
405 | static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, | 376 | static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, |
406 | const char *symname) | 377 | const char *symname) |
407 | { | 378 | { |
408 | struct super_block *sb; | 379 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); |
409 | struct inode *inode; | 380 | struct inode *inode; |
410 | int res; | 381 | int res = -ENOSPC; |
411 | 382 | ||
412 | sb = dir->i_sb; | 383 | mutex_lock(&sbi->vh_mutex); |
413 | inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); | 384 | inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); |
414 | if (!inode) | 385 | if (!inode) |
415 | return -ENOSPC; | 386 | goto out; |
416 | 387 | ||
417 | res = page_symlink(inode, symname, strlen(symname) + 1); | 388 | res = page_symlink(inode, symname, strlen(symname) + 1); |
418 | if (res) { | 389 | if (res) |
419 | inode->i_nlink = 0; | 390 | goto out_err; |
420 | hfsplus_delete_inode(inode); | ||
421 | iput(inode); | ||
422 | return res; | ||
423 | } | ||
424 | 391 | ||
425 | mark_inode_dirty(inode); | ||
426 | res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); | 392 | res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); |
393 | if (res) | ||
394 | goto out_err; | ||
427 | 395 | ||
428 | if (!res) { | 396 | hfsplus_instantiate(dentry, inode, inode->i_ino); |
429 | hfsplus_instantiate(dentry, inode, inode->i_ino); | 397 | mark_inode_dirty(inode); |
430 | mark_inode_dirty(inode); | 398 | goto out; |
431 | } | ||
432 | 399 | ||
400 | out_err: | ||
401 | inode->i_nlink = 0; | ||
402 | hfsplus_delete_inode(inode); | ||
403 | iput(inode); | ||
404 | out: | ||
405 | mutex_unlock(&sbi->vh_mutex); | ||
433 | return res; | 406 | return res; |
434 | } | 407 | } |
435 | 408 | ||
436 | static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, | 409 | static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, |
437 | int mode, dev_t rdev) | 410 | int mode, dev_t rdev) |
438 | { | 411 | { |
439 | struct super_block *sb; | 412 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); |
440 | struct inode *inode; | 413 | struct inode *inode; |
441 | int res; | 414 | int res = -ENOSPC; |
442 | 415 | ||
443 | sb = dir->i_sb; | 416 | mutex_lock(&sbi->vh_mutex); |
444 | inode = hfsplus_new_inode(sb, mode); | 417 | inode = hfsplus_new_inode(dir->i_sb, mode); |
445 | if (!inode) | 418 | if (!inode) |
446 | return -ENOSPC; | 419 | goto out; |
420 | |||
421 | if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) | ||
422 | init_special_inode(inode, mode, rdev); | ||
447 | 423 | ||
448 | res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); | 424 | res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); |
449 | if (res) { | 425 | if (res) { |
450 | inode->i_nlink = 0; | 426 | inode->i_nlink = 0; |
451 | hfsplus_delete_inode(inode); | 427 | hfsplus_delete_inode(inode); |
452 | iput(inode); | 428 | iput(inode); |
453 | return res; | 429 | goto out; |
454 | } | 430 | } |
455 | init_special_inode(inode, mode, rdev); | 431 | |
456 | hfsplus_instantiate(dentry, inode, inode->i_ino); | 432 | hfsplus_instantiate(dentry, inode, inode->i_ino); |
457 | mark_inode_dirty(inode); | 433 | mark_inode_dirty(inode); |
434 | out: | ||
435 | mutex_unlock(&sbi->vh_mutex); | ||
436 | return res; | ||
437 | } | ||
458 | 438 | ||
459 | return 0; | 439 | static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode, |
440 | struct nameidata *nd) | ||
441 | { | ||
442 | return hfsplus_mknod(dir, dentry, mode, 0); | ||
443 | } | ||
444 | |||
445 | static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
446 | { | ||
447 | return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); | ||
460 | } | 448 | } |
461 | 449 | ||
462 | static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, | 450 | static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, |
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
466 | 454 | ||
467 | /* Unlink destination if it already exists */ | 455 | /* Unlink destination if it already exists */ |
468 | if (new_dentry->d_inode) { | 456 | if (new_dentry->d_inode) { |
469 | res = hfsplus_unlink(new_dir, new_dentry); | 457 | if (S_ISDIR(new_dentry->d_inode->i_mode)) |
458 | res = hfsplus_rmdir(new_dir, new_dentry); | ||
459 | else | ||
460 | res = hfsplus_unlink(new_dir, new_dentry); | ||
470 | if (res) | 461 | if (res) |
471 | return res; | 462 | return res; |
472 | } | 463 | } |
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 0022eec63cda..0c9cb1820a52 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c | |||
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) | |||
85 | 85 | ||
86 | static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) | 86 | static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) |
87 | { | 87 | { |
88 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
88 | int res; | 89 | int res; |
89 | 90 | ||
90 | hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, | 91 | WARN_ON(!mutex_is_locked(&hip->extents_lock)); |
91 | HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); | 92 | |
93 | hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start, | ||
94 | HFSPLUS_IS_RSRC(inode) ? | ||
95 | HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); | ||
96 | |||
92 | res = hfs_brec_find(fd); | 97 | res = hfs_brec_find(fd); |
93 | if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { | 98 | if (hip->flags & HFSPLUS_FLG_EXT_NEW) { |
94 | if (res != -ENOENT) | 99 | if (res != -ENOENT) |
95 | return; | 100 | return; |
96 | hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); | 101 | hfs_brec_insert(fd, hip->cached_extents, |
97 | HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); | 102 | sizeof(hfsplus_extent_rec)); |
103 | hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); | ||
98 | } else { | 104 | } else { |
99 | if (res) | 105 | if (res) |
100 | return; | 106 | return; |
101 | hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); | 107 | hfs_bnode_write(fd->bnode, hip->cached_extents, |
102 | HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; | 108 | fd->entryoffset, fd->entrylength); |
109 | hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY; | ||
103 | } | 110 | } |
104 | } | 111 | } |
105 | 112 | ||
106 | void hfsplus_ext_write_extent(struct inode *inode) | 113 | static void hfsplus_ext_write_extent_locked(struct inode *inode) |
107 | { | 114 | { |
108 | if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { | 115 | if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) { |
109 | struct hfs_find_data fd; | 116 | struct hfs_find_data fd; |
110 | 117 | ||
111 | hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); | 118 | hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); |
112 | __hfsplus_ext_write_extent(inode, &fd); | 119 | __hfsplus_ext_write_extent(inode, &fd); |
113 | hfs_find_exit(&fd); | 120 | hfs_find_exit(&fd); |
114 | } | 121 | } |
115 | } | 122 | } |
116 | 123 | ||
124 | void hfsplus_ext_write_extent(struct inode *inode) | ||
125 | { | ||
126 | mutex_lock(&HFSPLUS_I(inode)->extents_lock); | ||
127 | hfsplus_ext_write_extent_locked(inode); | ||
128 | mutex_unlock(&HFSPLUS_I(inode)->extents_lock); | ||
129 | } | ||
130 | |||
117 | static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, | 131 | static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, |
118 | struct hfsplus_extent *extent, | 132 | struct hfsplus_extent *extent, |
119 | u32 cnid, u32 block, u8 type) | 133 | u32 cnid, u32 block, u8 type) |
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, | |||
136 | 150 | ||
137 | static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) | 151 | static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) |
138 | { | 152 | { |
153 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
139 | int res; | 154 | int res; |
140 | 155 | ||
141 | if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) | 156 | WARN_ON(!mutex_is_locked(&hip->extents_lock)); |
157 | |||
158 | if (hip->flags & HFSPLUS_FLG_EXT_DIRTY) | ||
142 | __hfsplus_ext_write_extent(inode, fd); | 159 | __hfsplus_ext_write_extent(inode, fd); |
143 | 160 | ||
144 | res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, | 161 | res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, |
145 | block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); | 162 | block, HFSPLUS_IS_RSRC(inode) ? |
163 | HFSPLUS_TYPE_RSRC : | ||
164 | HFSPLUS_TYPE_DATA); | ||
146 | if (!res) { | 165 | if (!res) { |
147 | HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); | 166 | hip->cached_start = be32_to_cpu(fd->key->ext.start_block); |
148 | HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); | 167 | hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents); |
149 | } else { | 168 | } else { |
150 | HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; | 169 | hip->cached_start = hip->cached_blocks = 0; |
151 | HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); | 170 | hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); |
152 | } | 171 | } |
153 | return res; | 172 | return res; |
154 | } | 173 | } |
155 | 174 | ||
156 | static int hfsplus_ext_read_extent(struct inode *inode, u32 block) | 175 | static int hfsplus_ext_read_extent(struct inode *inode, u32 block) |
157 | { | 176 | { |
177 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
158 | struct hfs_find_data fd; | 178 | struct hfs_find_data fd; |
159 | int res; | 179 | int res; |
160 | 180 | ||
161 | if (block >= HFSPLUS_I(inode).cached_start && | 181 | if (block >= hip->cached_start && |
162 | block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) | 182 | block < hip->cached_start + hip->cached_blocks) |
163 | return 0; | 183 | return 0; |
164 | 184 | ||
165 | hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); | 185 | hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); |
166 | res = __hfsplus_ext_cache_extent(&fd, inode, block); | 186 | res = __hfsplus_ext_cache_extent(&fd, inode, block); |
167 | hfs_find_exit(&fd); | 187 | hfs_find_exit(&fd); |
168 | return res; | 188 | return res; |
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block) | |||
172 | int hfsplus_get_block(struct inode *inode, sector_t iblock, | 192 | int hfsplus_get_block(struct inode *inode, sector_t iblock, |
173 | struct buffer_head *bh_result, int create) | 193 | struct buffer_head *bh_result, int create) |
174 | { | 194 | { |
175 | struct super_block *sb; | 195 | struct super_block *sb = inode->i_sb; |
196 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
197 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
176 | int res = -EIO; | 198 | int res = -EIO; |
177 | u32 ablock, dblock, mask; | 199 | u32 ablock, dblock, mask; |
178 | int shift; | 200 | int shift; |
179 | 201 | ||
180 | sb = inode->i_sb; | ||
181 | |||
182 | /* Convert inode block to disk allocation block */ | 202 | /* Convert inode block to disk allocation block */ |
183 | shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; | 203 | shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; |
184 | ablock = iblock >> HFSPLUS_SB(sb).fs_shift; | 204 | ablock = iblock >> sbi->fs_shift; |
185 | 205 | ||
186 | if (iblock >= HFSPLUS_I(inode).fs_blocks) { | 206 | if (iblock >= hip->fs_blocks) { |
187 | if (iblock > HFSPLUS_I(inode).fs_blocks || !create) | 207 | if (iblock > hip->fs_blocks || !create) |
188 | return -EIO; | 208 | return -EIO; |
189 | if (ablock >= HFSPLUS_I(inode).alloc_blocks) { | 209 | if (ablock >= hip->alloc_blocks) { |
190 | res = hfsplus_file_extend(inode); | 210 | res = hfsplus_file_extend(inode); |
191 | if (res) | 211 | if (res) |
192 | return res; | 212 | return res; |
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, | |||
194 | } else | 214 | } else |
195 | create = 0; | 215 | create = 0; |
196 | 216 | ||
197 | if (ablock < HFSPLUS_I(inode).first_blocks) { | 217 | if (ablock < hip->first_blocks) { |
198 | dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); | 218 | dblock = hfsplus_ext_find_block(hip->first_extents, ablock); |
199 | goto done; | 219 | goto done; |
200 | } | 220 | } |
201 | 221 | ||
202 | if (inode->i_ino == HFSPLUS_EXT_CNID) | 222 | if (inode->i_ino == HFSPLUS_EXT_CNID) |
203 | return -EIO; | 223 | return -EIO; |
204 | 224 | ||
205 | mutex_lock(&HFSPLUS_I(inode).extents_lock); | 225 | mutex_lock(&hip->extents_lock); |
206 | res = hfsplus_ext_read_extent(inode, ablock); | 226 | res = hfsplus_ext_read_extent(inode, ablock); |
207 | if (!res) { | 227 | if (!res) { |
208 | dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - | 228 | dblock = hfsplus_ext_find_block(hip->cached_extents, |
209 | HFSPLUS_I(inode).cached_start); | 229 | ablock - hip->cached_start); |
210 | } else { | 230 | } else { |
211 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); | 231 | mutex_unlock(&hip->extents_lock); |
212 | return -EIO; | 232 | return -EIO; |
213 | } | 233 | } |
214 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); | 234 | mutex_unlock(&hip->extents_lock); |
215 | 235 | ||
216 | done: | 236 | done: |
217 | dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); | 237 | dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); |
218 | mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; | 238 | mask = (1 << sbi->fs_shift) - 1; |
219 | map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); | 239 | map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask)); |
220 | if (create) { | 240 | if (create) { |
221 | set_buffer_new(bh_result); | 241 | set_buffer_new(bh_result); |
222 | HFSPLUS_I(inode).phys_size += sb->s_blocksize; | 242 | hip->phys_size += sb->s_blocksize; |
223 | HFSPLUS_I(inode).fs_blocks++; | 243 | hip->fs_blocks++; |
224 | inode_add_bytes(inode, sb->s_blocksize); | 244 | inode_add_bytes(inode, sb->s_blocksize); |
225 | mark_inode_dirty(inode); | 245 | mark_inode_dirty(inode); |
226 | } | 246 | } |
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw | |||
327 | if (total_blocks == blocks) | 347 | if (total_blocks == blocks) |
328 | return 0; | 348 | return 0; |
329 | 349 | ||
330 | hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); | 350 | hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); |
331 | do { | 351 | do { |
332 | res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, | 352 | res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, |
333 | total_blocks, type); | 353 | total_blocks, type); |
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw | |||
348 | int hfsplus_file_extend(struct inode *inode) | 368 | int hfsplus_file_extend(struct inode *inode) |
349 | { | 369 | { |
350 | struct super_block *sb = inode->i_sb; | 370 | struct super_block *sb = inode->i_sb; |
371 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
372 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
351 | u32 start, len, goal; | 373 | u32 start, len, goal; |
352 | int res; | 374 | int res; |
353 | 375 | ||
354 | if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { | 376 | if (sbi->alloc_file->i_size * 8 < |
377 | sbi->total_blocks - sbi->free_blocks + 8) { | ||
355 | // extend alloc file | 378 | // extend alloc file |
356 | printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, | 379 | printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", |
357 | HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); | 380 | sbi->alloc_file->i_size * 8, |
381 | sbi->total_blocks, sbi->free_blocks); | ||
358 | return -ENOSPC; | 382 | return -ENOSPC; |
359 | } | 383 | } |
360 | 384 | ||
361 | mutex_lock(&HFSPLUS_I(inode).extents_lock); | 385 | mutex_lock(&hip->extents_lock); |
362 | if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) | 386 | if (hip->alloc_blocks == hip->first_blocks) |
363 | goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); | 387 | goal = hfsplus_ext_lastblock(hip->first_extents); |
364 | else { | 388 | else { |
365 | res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); | 389 | res = hfsplus_ext_read_extent(inode, hip->alloc_blocks); |
366 | if (res) | 390 | if (res) |
367 | goto out; | 391 | goto out; |
368 | goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); | 392 | goal = hfsplus_ext_lastblock(hip->cached_extents); |
369 | } | 393 | } |
370 | 394 | ||
371 | len = HFSPLUS_I(inode).clump_blocks; | 395 | len = hip->clump_blocks; |
372 | start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); | 396 | start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len); |
373 | if (start >= HFSPLUS_SB(sb).total_blocks) { | 397 | if (start >= sbi->total_blocks) { |
374 | start = hfsplus_block_allocate(sb, goal, 0, &len); | 398 | start = hfsplus_block_allocate(sb, goal, 0, &len); |
375 | if (start >= goal) { | 399 | if (start >= goal) { |
376 | res = -ENOSPC; | 400 | res = -ENOSPC; |
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode) | |||
379 | } | 403 | } |
380 | 404 | ||
381 | dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); | 405 | dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); |
382 | if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { | 406 | |
383 | if (!HFSPLUS_I(inode).first_blocks) { | 407 | if (hip->alloc_blocks <= hip->first_blocks) { |
408 | if (!hip->first_blocks) { | ||
384 | dprint(DBG_EXTENT, "first extents\n"); | 409 | dprint(DBG_EXTENT, "first extents\n"); |
385 | /* no extents yet */ | 410 | /* no extents yet */ |
386 | HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); | 411 | hip->first_extents[0].start_block = cpu_to_be32(start); |
387 | HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); | 412 | hip->first_extents[0].block_count = cpu_to_be32(len); |
388 | res = 0; | 413 | res = 0; |
389 | } else { | 414 | } else { |
390 | /* try to append to extents in inode */ | 415 | /* try to append to extents in inode */ |
391 | res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, | 416 | res = hfsplus_add_extent(hip->first_extents, |
392 | HFSPLUS_I(inode).alloc_blocks, | 417 | hip->alloc_blocks, |
393 | start, len); | 418 | start, len); |
394 | if (res == -ENOSPC) | 419 | if (res == -ENOSPC) |
395 | goto insert_extent; | 420 | goto insert_extent; |
396 | } | 421 | } |
397 | if (!res) { | 422 | if (!res) { |
398 | hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); | 423 | hfsplus_dump_extent(hip->first_extents); |
399 | HFSPLUS_I(inode).first_blocks += len; | 424 | hip->first_blocks += len; |
400 | } | 425 | } |
401 | } else { | 426 | } else { |
402 | res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, | 427 | res = hfsplus_add_extent(hip->cached_extents, |
403 | HFSPLUS_I(inode).alloc_blocks - | 428 | hip->alloc_blocks - hip->cached_start, |
404 | HFSPLUS_I(inode).cached_start, | ||
405 | start, len); | 429 | start, len); |
406 | if (!res) { | 430 | if (!res) { |
407 | hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); | 431 | hfsplus_dump_extent(hip->cached_extents); |
408 | HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; | 432 | hip->flags |= HFSPLUS_FLG_EXT_DIRTY; |
409 | HFSPLUS_I(inode).cached_blocks += len; | 433 | hip->cached_blocks += len; |
410 | } else if (res == -ENOSPC) | 434 | } else if (res == -ENOSPC) |
411 | goto insert_extent; | 435 | goto insert_extent; |
412 | } | 436 | } |
413 | out: | 437 | out: |
414 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); | 438 | mutex_unlock(&hip->extents_lock); |
415 | if (!res) { | 439 | if (!res) { |
416 | HFSPLUS_I(inode).alloc_blocks += len; | 440 | hip->alloc_blocks += len; |
417 | mark_inode_dirty(inode); | 441 | mark_inode_dirty(inode); |
418 | } | 442 | } |
419 | return res; | 443 | return res; |
420 | 444 | ||
421 | insert_extent: | 445 | insert_extent: |
422 | dprint(DBG_EXTENT, "insert new extent\n"); | 446 | dprint(DBG_EXTENT, "insert new extent\n"); |
423 | hfsplus_ext_write_extent(inode); | 447 | hfsplus_ext_write_extent_locked(inode); |
424 | 448 | ||
425 | memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); | 449 | memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); |
426 | HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); | 450 | hip->cached_extents[0].start_block = cpu_to_be32(start); |
427 | HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); | 451 | hip->cached_extents[0].block_count = cpu_to_be32(len); |
428 | hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); | 452 | hfsplus_dump_extent(hip->cached_extents); |
429 | HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; | 453 | hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; |
430 | HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; | 454 | hip->cached_start = hip->alloc_blocks; |
431 | HFSPLUS_I(inode).cached_blocks = len; | 455 | hip->cached_blocks = len; |
432 | 456 | ||
433 | res = 0; | 457 | res = 0; |
434 | goto out; | 458 | goto out; |
@@ -437,13 +461,15 @@ insert_extent: | |||
437 | void hfsplus_file_truncate(struct inode *inode) | 461 | void hfsplus_file_truncate(struct inode *inode) |
438 | { | 462 | { |
439 | struct super_block *sb = inode->i_sb; | 463 | struct super_block *sb = inode->i_sb; |
464 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
440 | struct hfs_find_data fd; | 465 | struct hfs_find_data fd; |
441 | u32 alloc_cnt, blk_cnt, start; | 466 | u32 alloc_cnt, blk_cnt, start; |
442 | int res; | 467 | int res; |
443 | 468 | ||
444 | dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, | 469 | dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", |
445 | (long long)HFSPLUS_I(inode).phys_size, inode->i_size); | 470 | inode->i_ino, (long long)hip->phys_size, inode->i_size); |
446 | if (inode->i_size > HFSPLUS_I(inode).phys_size) { | 471 | |
472 | if (inode->i_size > hip->phys_size) { | ||
447 | struct address_space *mapping = inode->i_mapping; | 473 | struct address_space *mapping = inode->i_mapping; |
448 | struct page *page; | 474 | struct page *page; |
449 | void *fsdata; | 475 | void *fsdata; |
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode) | |||
460 | return; | 486 | return; |
461 | mark_inode_dirty(inode); | 487 | mark_inode_dirty(inode); |
462 | return; | 488 | return; |
463 | } else if (inode->i_size == HFSPLUS_I(inode).phys_size) | 489 | } else if (inode->i_size == hip->phys_size) |
464 | return; | 490 | return; |
465 | 491 | ||
466 | blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; | 492 | blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> |
467 | alloc_cnt = HFSPLUS_I(inode).alloc_blocks; | 493 | HFSPLUS_SB(sb)->alloc_blksz_shift; |
494 | alloc_cnt = hip->alloc_blocks; | ||
468 | if (blk_cnt == alloc_cnt) | 495 | if (blk_cnt == alloc_cnt) |
469 | goto out; | 496 | goto out; |
470 | 497 | ||
471 | mutex_lock(&HFSPLUS_I(inode).extents_lock); | 498 | mutex_lock(&hip->extents_lock); |
472 | hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); | 499 | hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); |
473 | while (1) { | 500 | while (1) { |
474 | if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { | 501 | if (alloc_cnt == hip->first_blocks) { |
475 | hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, | 502 | hfsplus_free_extents(sb, hip->first_extents, |
476 | alloc_cnt, alloc_cnt - blk_cnt); | 503 | alloc_cnt, alloc_cnt - blk_cnt); |
477 | hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); | 504 | hfsplus_dump_extent(hip->first_extents); |
478 | HFSPLUS_I(inode).first_blocks = blk_cnt; | 505 | hip->first_blocks = blk_cnt; |
479 | break; | 506 | break; |
480 | } | 507 | } |
481 | res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); | 508 | res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); |
482 | if (res) | 509 | if (res) |
483 | break; | 510 | break; |
484 | start = HFSPLUS_I(inode).cached_start; | 511 | start = hip->cached_start; |
485 | hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, | 512 | hfsplus_free_extents(sb, hip->cached_extents, |
486 | alloc_cnt - start, alloc_cnt - blk_cnt); | 513 | alloc_cnt - start, alloc_cnt - blk_cnt); |
487 | hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); | 514 | hfsplus_dump_extent(hip->cached_extents); |
488 | if (blk_cnt > start) { | 515 | if (blk_cnt > start) { |
489 | HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; | 516 | hip->flags |= HFSPLUS_FLG_EXT_DIRTY; |
490 | break; | 517 | break; |
491 | } | 518 | } |
492 | alloc_cnt = start; | 519 | alloc_cnt = start; |
493 | HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; | 520 | hip->cached_start = hip->cached_blocks = 0; |
494 | HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); | 521 | hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); |
495 | hfs_brec_remove(&fd); | 522 | hfs_brec_remove(&fd); |
496 | } | 523 | } |
497 | hfs_find_exit(&fd); | 524 | hfs_find_exit(&fd); |
498 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); | 525 | mutex_unlock(&hip->extents_lock); |
499 | 526 | ||
500 | HFSPLUS_I(inode).alloc_blocks = blk_cnt; | 527 | hip->alloc_blocks = blk_cnt; |
501 | out: | 528 | out: |
502 | HFSPLUS_I(inode).phys_size = inode->i_size; | 529 | hip->phys_size = inode->i_size; |
503 | HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; | 530 | hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; |
504 | inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); | 531 | inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); |
505 | mark_inode_dirty(inode); | 532 | mark_inode_dirty(inode); |
506 | } | 533 | } |
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index dc856be3c2b0..cb3653efb57a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h | |||
@@ -62,7 +62,7 @@ struct hfs_btree { | |||
62 | unsigned int depth; | 62 | unsigned int depth; |
63 | 63 | ||
64 | //unsigned int map1_size, map_size; | 64 | //unsigned int map1_size, map_size; |
65 | struct semaphore tree_lock; | 65 | struct mutex tree_lock; |
66 | 66 | ||
67 | unsigned int pages_per_bnode; | 67 | unsigned int pages_per_bnode; |
68 | spinlock_t hash_lock; | 68 | spinlock_t hash_lock; |
@@ -121,16 +121,21 @@ struct hfsplus_sb_info { | |||
121 | u32 sect_count; | 121 | u32 sect_count; |
122 | int fs_shift; | 122 | int fs_shift; |
123 | 123 | ||
124 | /* Stuff in host order from Vol Header */ | 124 | /* immutable data from the volume header */ |
125 | u32 alloc_blksz; | 125 | u32 alloc_blksz; |
126 | int alloc_blksz_shift; | 126 | int alloc_blksz_shift; |
127 | u32 total_blocks; | 127 | u32 total_blocks; |
128 | u32 data_clump_blocks, rsrc_clump_blocks; | ||
129 | |||
130 | /* mutable data from the volume header, protected by alloc_mutex */ | ||
128 | u32 free_blocks; | 131 | u32 free_blocks; |
129 | u32 next_alloc; | 132 | struct mutex alloc_mutex; |
133 | |||
134 | /* mutable data from the volume header, protected by vh_mutex */ | ||
130 | u32 next_cnid; | 135 | u32 next_cnid; |
131 | u32 file_count; | 136 | u32 file_count; |
132 | u32 folder_count; | 137 | u32 folder_count; |
133 | u32 data_clump_blocks, rsrc_clump_blocks; | 138 | struct mutex vh_mutex; |
134 | 139 | ||
135 | /* Config options */ | 140 | /* Config options */ |
136 | u32 creator; | 141 | u32 creator; |
@@ -143,40 +148,50 @@ struct hfsplus_sb_info { | |||
143 | int part, session; | 148 | int part, session; |
144 | 149 | ||
145 | unsigned long flags; | 150 | unsigned long flags; |
146 | |||
147 | struct hlist_head rsrc_inodes; | ||
148 | }; | 151 | }; |
149 | 152 | ||
150 | #define HFSPLUS_SB_WRITEBACKUP 0x0001 | 153 | #define HFSPLUS_SB_WRITEBACKUP 0 |
151 | #define HFSPLUS_SB_NODECOMPOSE 0x0002 | 154 | #define HFSPLUS_SB_NODECOMPOSE 1 |
152 | #define HFSPLUS_SB_FORCE 0x0004 | 155 | #define HFSPLUS_SB_FORCE 2 |
153 | #define HFSPLUS_SB_HFSX 0x0008 | 156 | #define HFSPLUS_SB_HFSX 3 |
154 | #define HFSPLUS_SB_CASEFOLD 0x0010 | 157 | #define HFSPLUS_SB_CASEFOLD 4 |
155 | 158 | ||
156 | 159 | ||
157 | struct hfsplus_inode_info { | 160 | struct hfsplus_inode_info { |
158 | struct mutex extents_lock; | ||
159 | u32 clump_blocks, alloc_blocks; | ||
160 | sector_t fs_blocks; | ||
161 | /* Allocation extents from catalog record or volume header */ | ||
162 | hfsplus_extent_rec first_extents; | ||
163 | u32 first_blocks; | ||
164 | hfsplus_extent_rec cached_extents; | ||
165 | u32 cached_start, cached_blocks; | ||
166 | atomic_t opencnt; | 161 | atomic_t opencnt; |
167 | 162 | ||
168 | struct inode *rsrc_inode; | 163 | /* |
164 | * Extent allocation information, protected by extents_lock. | ||
165 | */ | ||
166 | u32 first_blocks; | ||
167 | u32 clump_blocks; | ||
168 | u32 alloc_blocks; | ||
169 | u32 cached_start; | ||
170 | u32 cached_blocks; | ||
171 | hfsplus_extent_rec first_extents; | ||
172 | hfsplus_extent_rec cached_extents; | ||
169 | unsigned long flags; | 173 | unsigned long flags; |
174 | struct mutex extents_lock; | ||
170 | 175 | ||
176 | /* | ||
177 | * Immutable data. | ||
178 | */ | ||
179 | struct inode *rsrc_inode; | ||
171 | __be32 create_date; | 180 | __be32 create_date; |
172 | /* Device number in hfsplus_permissions in catalog */ | ||
173 | u32 dev; | ||
174 | /* BSD system and user file flags */ | ||
175 | u8 rootflags; | ||
176 | u8 userflags; | ||
177 | 181 | ||
182 | /* | ||
183 | * Protected by sbi->vh_mutex. | ||
184 | */ | ||
185 | u32 linkid; | ||
186 | |||
187 | /* | ||
188 | * Protected by i_mutex. | ||
189 | */ | ||
190 | sector_t fs_blocks; | ||
191 | u8 userflags; /* BSD user file flags */ | ||
178 | struct list_head open_dir_list; | 192 | struct list_head open_dir_list; |
179 | loff_t phys_size; | 193 | loff_t phys_size; |
194 | |||
180 | struct inode vfs_inode; | 195 | struct inode vfs_inode; |
181 | }; | 196 | }; |
182 | 197 | ||
@@ -184,8 +199,8 @@ struct hfsplus_inode_info { | |||
184 | #define HFSPLUS_FLG_EXT_DIRTY 0x0002 | 199 | #define HFSPLUS_FLG_EXT_DIRTY 0x0002 |
185 | #define HFSPLUS_FLG_EXT_NEW 0x0004 | 200 | #define HFSPLUS_FLG_EXT_NEW 0x0004 |
186 | 201 | ||
187 | #define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) | 202 | #define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)) |
188 | #define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) | 203 | #define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC) |
189 | 204 | ||
190 | struct hfs_find_data { | 205 | struct hfs_find_data { |
191 | /* filled by caller */ | 206 | /* filled by caller */ |
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); | |||
311 | int hfsplus_delete_cat(u32, struct inode *, struct qstr *); | 326 | int hfsplus_delete_cat(u32, struct inode *, struct qstr *); |
312 | int hfsplus_rename_cat(u32, struct inode *, struct qstr *, | 327 | int hfsplus_rename_cat(u32, struct inode *, struct qstr *, |
313 | struct inode *, struct qstr *); | 328 | struct inode *, struct qstr *); |
329 | void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); | ||
314 | 330 | ||
315 | /* dir.c */ | 331 | /* dir.c */ |
316 | extern const struct inode_operations hfsplus_dir_inode_operations; | 332 | extern const struct inode_operations hfsplus_dir_inode_operations; |
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *); | |||
372 | int hfs_part_find(struct super_block *, sector_t *, sector_t *); | 388 | int hfs_part_find(struct super_block *, sector_t *, sector_t *); |
373 | 389 | ||
374 | /* access macros */ | 390 | /* access macros */ |
375 | /* | ||
376 | static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) | 391 | static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) |
377 | { | 392 | { |
378 | return sb->s_fs_info; | 393 | return sb->s_fs_info; |
379 | } | 394 | } |
395 | |||
380 | static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) | 396 | static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) |
381 | { | 397 | { |
382 | return list_entry(inode, struct hfsplus_inode_info, vfs_inode); | 398 | return list_entry(inode, struct hfsplus_inode_info, vfs_inode); |
383 | } | 399 | } |
384 | */ | ||
385 | #define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info) | ||
386 | #define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode)) | ||
387 | |||
388 | #if 1 | ||
389 | #define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); }) | ||
390 | #define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; }) | ||
391 | #else | ||
392 | #define hfsplus_kmap(p) kmap(p) | ||
393 | #define hfsplus_kunmap(p) kunmap(p) | ||
394 | #endif | ||
395 | 400 | ||
396 | #define sb_bread512(sb, sec, data) ({ \ | 401 | #define sb_bread512(sb, sec, data) ({ \ |
397 | struct buffer_head *__bh; \ | 402 | struct buffer_head *__bh; \ |
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) | |||
419 | #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) | 424 | #define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) |
420 | #define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) | 425 | #define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) |
421 | 426 | ||
422 | #define kdev_t_to_nr(x) (x) | ||
423 | |||
424 | #endif | 427 | #endif |
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index fe99fe8db61a..6892899fd6fb 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h | |||
@@ -200,6 +200,7 @@ struct hfsplus_cat_key { | |||
200 | struct hfsplus_unistr name; | 200 | struct hfsplus_unistr name; |
201 | } __packed; | 201 | } __packed; |
202 | 202 | ||
203 | #define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) | ||
203 | 204 | ||
204 | /* Structs from hfs.h */ | 205 | /* Structs from hfs.h */ |
205 | struct hfsp_point { | 206 | struct hfsp_point { |
@@ -323,7 +324,7 @@ struct hfsplus_ext_key { | |||
323 | __be32 start_block; | 324 | __be32 start_block; |
324 | } __packed; | 325 | } __packed; |
325 | 326 | ||
326 | #define HFSPLUS_EXT_KEYLEN 12 | 327 | #define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) |
327 | 328 | ||
328 | /* HFS+ generic BTree key */ | 329 | /* HFS+ generic BTree key */ |
329 | typedef union { | 330 | typedef union { |
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index c5a979d62c65..78449280dae0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c | |||
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping, | |||
36 | *pagep = NULL; | 36 | *pagep = NULL; |
37 | ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 37 | ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, |
38 | hfsplus_get_block, | 38 | hfsplus_get_block, |
39 | &HFSPLUS_I(mapping->host).phys_size); | 39 | &HFSPLUS_I(mapping->host)->phys_size); |
40 | if (unlikely(ret)) { | 40 | if (unlikely(ret)) { |
41 | loff_t isize = mapping->host->i_size; | 41 | loff_t isize = mapping->host->i_size; |
42 | if (pos + len > isize) | 42 | if (pos + len > isize) |
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) | |||
62 | 62 | ||
63 | switch (inode->i_ino) { | 63 | switch (inode->i_ino) { |
64 | case HFSPLUS_EXT_CNID: | 64 | case HFSPLUS_EXT_CNID: |
65 | tree = HFSPLUS_SB(sb).ext_tree; | 65 | tree = HFSPLUS_SB(sb)->ext_tree; |
66 | break; | 66 | break; |
67 | case HFSPLUS_CAT_CNID: | 67 | case HFSPLUS_CAT_CNID: |
68 | tree = HFSPLUS_SB(sb).cat_tree; | 68 | tree = HFSPLUS_SB(sb)->cat_tree; |
69 | break; | 69 | break; |
70 | case HFSPLUS_ATTR_CNID: | 70 | case HFSPLUS_ATTR_CNID: |
71 | tree = HFSPLUS_SB(sb).attr_tree; | 71 | tree = HFSPLUS_SB(sb)->attr_tree; |
72 | break; | 72 | break; |
73 | default: | 73 | default: |
74 | BUG(); | 74 | BUG(); |
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent | |||
172 | struct hfs_find_data fd; | 172 | struct hfs_find_data fd; |
173 | struct super_block *sb = dir->i_sb; | 173 | struct super_block *sb = dir->i_sb; |
174 | struct inode *inode = NULL; | 174 | struct inode *inode = NULL; |
175 | struct hfsplus_inode_info *hip; | ||
175 | int err; | 176 | int err; |
176 | 177 | ||
177 | if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) | 178 | if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) |
178 | goto out; | 179 | goto out; |
179 | 180 | ||
180 | inode = HFSPLUS_I(dir).rsrc_inode; | 181 | inode = HFSPLUS_I(dir)->rsrc_inode; |
181 | if (inode) | 182 | if (inode) |
182 | goto out; | 183 | goto out; |
183 | 184 | ||
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent | |||
185 | if (!inode) | 186 | if (!inode) |
186 | return ERR_PTR(-ENOMEM); | 187 | return ERR_PTR(-ENOMEM); |
187 | 188 | ||
189 | hip = HFSPLUS_I(inode); | ||
188 | inode->i_ino = dir->i_ino; | 190 | inode->i_ino = dir->i_ino; |
189 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); | 191 | INIT_LIST_HEAD(&hip->open_dir_list); |
190 | mutex_init(&HFSPLUS_I(inode).extents_lock); | 192 | mutex_init(&hip->extents_lock); |
191 | HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; | 193 | hip->flags = HFSPLUS_FLG_RSRC; |
192 | 194 | ||
193 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); | 195 | hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); |
194 | err = hfsplus_find_cat(sb, dir->i_ino, &fd); | 196 | err = hfsplus_find_cat(sb, dir->i_ino, &fd); |
195 | if (!err) | 197 | if (!err) |
196 | err = hfsplus_cat_read_inode(inode, &fd); | 198 | err = hfsplus_cat_read_inode(inode, &fd); |
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent | |||
199 | iput(inode); | 201 | iput(inode); |
200 | return ERR_PTR(err); | 202 | return ERR_PTR(err); |
201 | } | 203 | } |
202 | HFSPLUS_I(inode).rsrc_inode = dir; | 204 | hip->rsrc_inode = dir; |
203 | HFSPLUS_I(dir).rsrc_inode = inode; | 205 | HFSPLUS_I(dir)->rsrc_inode = inode; |
204 | igrab(dir); | 206 | igrab(dir); |
205 | hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); | 207 | |
208 | /* | ||
209 | * __mark_inode_dirty expects inodes to be hashed. Since we don't | ||
210 | * want resource fork inodes in the regular inode space, we make them | ||
211 | * appear hashed, but do not put on any lists. hlist_del() | ||
212 | * will work fine and require no locking. | ||
213 | */ | ||
214 | inode->i_hash.pprev = &inode->i_hash.next; | ||
215 | |||
206 | mark_inode_dirty(inode); | 216 | mark_inode_dirty(inode); |
207 | out: | 217 | out: |
208 | d_add(dentry, inode); | 218 | d_add(dentry, inode); |
@@ -211,30 +221,27 @@ out: | |||
211 | 221 | ||
212 | static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) | 222 | static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) |
213 | { | 223 | { |
214 | struct super_block *sb = inode->i_sb; | 224 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); |
215 | u16 mode; | 225 | u16 mode; |
216 | 226 | ||
217 | mode = be16_to_cpu(perms->mode); | 227 | mode = be16_to_cpu(perms->mode); |
218 | 228 | ||
219 | inode->i_uid = be32_to_cpu(perms->owner); | 229 | inode->i_uid = be32_to_cpu(perms->owner); |
220 | if (!inode->i_uid && !mode) | 230 | if (!inode->i_uid && !mode) |
221 | inode->i_uid = HFSPLUS_SB(sb).uid; | 231 | inode->i_uid = sbi->uid; |
222 | 232 | ||
223 | inode->i_gid = be32_to_cpu(perms->group); | 233 | inode->i_gid = be32_to_cpu(perms->group); |
224 | if (!inode->i_gid && !mode) | 234 | if (!inode->i_gid && !mode) |
225 | inode->i_gid = HFSPLUS_SB(sb).gid; | 235 | inode->i_gid = sbi->gid; |
226 | 236 | ||
227 | if (dir) { | 237 | if (dir) { |
228 | mode = mode ? (mode & S_IALLUGO) : | 238 | mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); |
229 | (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask)); | ||
230 | mode |= S_IFDIR; | 239 | mode |= S_IFDIR; |
231 | } else if (!mode) | 240 | } else if (!mode) |
232 | mode = S_IFREG | ((S_IRUGO|S_IWUGO) & | 241 | mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); |
233 | ~(HFSPLUS_SB(sb).umask)); | ||
234 | inode->i_mode = mode; | 242 | inode->i_mode = mode; |
235 | 243 | ||
236 | HFSPLUS_I(inode).rootflags = perms->rootflags; | 244 | HFSPLUS_I(inode)->userflags = perms->userflags; |
237 | HFSPLUS_I(inode).userflags = perms->userflags; | ||
238 | if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) | 245 | if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) |
239 | inode->i_flags |= S_IMMUTABLE; | 246 | inode->i_flags |= S_IMMUTABLE; |
240 | else | 247 | else |
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i | |||
245 | inode->i_flags &= ~S_APPEND; | 252 | inode->i_flags &= ~S_APPEND; |
246 | } | 253 | } |
247 | 254 | ||
248 | static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) | ||
249 | { | ||
250 | if (inode->i_flags & S_IMMUTABLE) | ||
251 | perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; | ||
252 | else | ||
253 | perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE; | ||
254 | if (inode->i_flags & S_APPEND) | ||
255 | perms->rootflags |= HFSPLUS_FLG_APPEND; | ||
256 | else | ||
257 | perms->rootflags &= ~HFSPLUS_FLG_APPEND; | ||
258 | perms->userflags = HFSPLUS_I(inode).userflags; | ||
259 | perms->mode = cpu_to_be16(inode->i_mode); | ||
260 | perms->owner = cpu_to_be32(inode->i_uid); | ||
261 | perms->group = cpu_to_be32(inode->i_gid); | ||
262 | perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); | ||
263 | } | ||
264 | |||
265 | static int hfsplus_file_open(struct inode *inode, struct file *file) | 255 | static int hfsplus_file_open(struct inode *inode, struct file *file) |
266 | { | 256 | { |
267 | if (HFSPLUS_IS_RSRC(inode)) | 257 | if (HFSPLUS_IS_RSRC(inode)) |
268 | inode = HFSPLUS_I(inode).rsrc_inode; | 258 | inode = HFSPLUS_I(inode)->rsrc_inode; |
269 | if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) | 259 | if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) |
270 | return -EOVERFLOW; | 260 | return -EOVERFLOW; |
271 | atomic_inc(&HFSPLUS_I(inode).opencnt); | 261 | atomic_inc(&HFSPLUS_I(inode)->opencnt); |
272 | return 0; | 262 | return 0; |
273 | } | 263 | } |
274 | 264 | ||
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) | |||
277 | struct super_block *sb = inode->i_sb; | 267 | struct super_block *sb = inode->i_sb; |
278 | 268 | ||
279 | if (HFSPLUS_IS_RSRC(inode)) | 269 | if (HFSPLUS_IS_RSRC(inode)) |
280 | inode = HFSPLUS_I(inode).rsrc_inode; | 270 | inode = HFSPLUS_I(inode)->rsrc_inode; |
281 | if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { | 271 | if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { |
282 | mutex_lock(&inode->i_mutex); | 272 | mutex_lock(&inode->i_mutex); |
283 | hfsplus_file_truncate(inode); | 273 | hfsplus_file_truncate(inode); |
284 | if (inode->i_flags & S_DEAD) { | 274 | if (inode->i_flags & S_DEAD) { |
285 | hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); | 275 | hfsplus_delete_cat(inode->i_ino, |
276 | HFSPLUS_SB(sb)->hidden_dir, NULL); | ||
286 | hfsplus_delete_inode(inode); | 277 | hfsplus_delete_inode(inode); |
287 | } | 278 | } |
288 | mutex_unlock(&inode->i_mutex); | 279 | mutex_unlock(&inode->i_mutex); |
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = { | |||
361 | 352 | ||
362 | struct inode *hfsplus_new_inode(struct super_block *sb, int mode) | 353 | struct inode *hfsplus_new_inode(struct super_block *sb, int mode) |
363 | { | 354 | { |
355 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
364 | struct inode *inode = new_inode(sb); | 356 | struct inode *inode = new_inode(sb); |
357 | struct hfsplus_inode_info *hip; | ||
358 | |||
365 | if (!inode) | 359 | if (!inode) |
366 | return NULL; | 360 | return NULL; |
367 | 361 | ||
368 | inode->i_ino = HFSPLUS_SB(sb).next_cnid++; | 362 | inode->i_ino = sbi->next_cnid++; |
369 | inode->i_mode = mode; | 363 | inode->i_mode = mode; |
370 | inode->i_uid = current_fsuid(); | 364 | inode->i_uid = current_fsuid(); |
371 | inode->i_gid = current_fsgid(); | 365 | inode->i_gid = current_fsgid(); |
372 | inode->i_nlink = 1; | 366 | inode->i_nlink = 1; |
373 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; | 367 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; |
374 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); | 368 | |
375 | mutex_init(&HFSPLUS_I(inode).extents_lock); | 369 | hip = HFSPLUS_I(inode); |
376 | atomic_set(&HFSPLUS_I(inode).opencnt, 0); | 370 | INIT_LIST_HEAD(&hip->open_dir_list); |
377 | HFSPLUS_I(inode).flags = 0; | 371 | mutex_init(&hip->extents_lock); |
378 | memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); | 372 | atomic_set(&hip->opencnt, 0); |
379 | memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); | 373 | hip->flags = 0; |
380 | HFSPLUS_I(inode).alloc_blocks = 0; | 374 | memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); |
381 | HFSPLUS_I(inode).first_blocks = 0; | 375 | memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); |
382 | HFSPLUS_I(inode).cached_start = 0; | 376 | hip->alloc_blocks = 0; |
383 | HFSPLUS_I(inode).cached_blocks = 0; | 377 | hip->first_blocks = 0; |
384 | HFSPLUS_I(inode).phys_size = 0; | 378 | hip->cached_start = 0; |
385 | HFSPLUS_I(inode).fs_blocks = 0; | 379 | hip->cached_blocks = 0; |
386 | HFSPLUS_I(inode).rsrc_inode = NULL; | 380 | hip->phys_size = 0; |
381 | hip->fs_blocks = 0; | ||
382 | hip->rsrc_inode = NULL; | ||
387 | if (S_ISDIR(inode->i_mode)) { | 383 | if (S_ISDIR(inode->i_mode)) { |
388 | inode->i_size = 2; | 384 | inode->i_size = 2; |
389 | HFSPLUS_SB(sb).folder_count++; | 385 | sbi->folder_count++; |
390 | inode->i_op = &hfsplus_dir_inode_operations; | 386 | inode->i_op = &hfsplus_dir_inode_operations; |
391 | inode->i_fop = &hfsplus_dir_operations; | 387 | inode->i_fop = &hfsplus_dir_operations; |
392 | } else if (S_ISREG(inode->i_mode)) { | 388 | } else if (S_ISREG(inode->i_mode)) { |
393 | HFSPLUS_SB(sb).file_count++; | 389 | sbi->file_count++; |
394 | inode->i_op = &hfsplus_file_inode_operations; | 390 | inode->i_op = &hfsplus_file_inode_operations; |
395 | inode->i_fop = &hfsplus_file_operations; | 391 | inode->i_fop = &hfsplus_file_operations; |
396 | inode->i_mapping->a_ops = &hfsplus_aops; | 392 | inode->i_mapping->a_ops = &hfsplus_aops; |
397 | HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; | 393 | hip->clump_blocks = sbi->data_clump_blocks; |
398 | } else if (S_ISLNK(inode->i_mode)) { | 394 | } else if (S_ISLNK(inode->i_mode)) { |
399 | HFSPLUS_SB(sb).file_count++; | 395 | sbi->file_count++; |
400 | inode->i_op = &page_symlink_inode_operations; | 396 | inode->i_op = &page_symlink_inode_operations; |
401 | inode->i_mapping->a_ops = &hfsplus_aops; | 397 | inode->i_mapping->a_ops = &hfsplus_aops; |
402 | HFSPLUS_I(inode).clump_blocks = 1; | 398 | hip->clump_blocks = 1; |
403 | } else | 399 | } else |
404 | HFSPLUS_SB(sb).file_count++; | 400 | sbi->file_count++; |
405 | insert_inode_hash(inode); | 401 | insert_inode_hash(inode); |
406 | mark_inode_dirty(inode); | 402 | mark_inode_dirty(inode); |
407 | sb->s_dirt = 1; | 403 | sb->s_dirt = 1; |
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode) | |||
414 | struct super_block *sb = inode->i_sb; | 410 | struct super_block *sb = inode->i_sb; |
415 | 411 | ||
416 | if (S_ISDIR(inode->i_mode)) { | 412 | if (S_ISDIR(inode->i_mode)) { |
417 | HFSPLUS_SB(sb).folder_count--; | 413 | HFSPLUS_SB(sb)->folder_count--; |
418 | sb->s_dirt = 1; | 414 | sb->s_dirt = 1; |
419 | return; | 415 | return; |
420 | } | 416 | } |
421 | HFSPLUS_SB(sb).file_count--; | 417 | HFSPLUS_SB(sb)->file_count--; |
422 | if (S_ISREG(inode->i_mode)) { | 418 | if (S_ISREG(inode->i_mode)) { |
423 | if (!inode->i_nlink) { | 419 | if (!inode->i_nlink) { |
424 | inode->i_size = 0; | 420 | inode->i_size = 0; |
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode) | |||
434 | void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) | 430 | void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) |
435 | { | 431 | { |
436 | struct super_block *sb = inode->i_sb; | 432 | struct super_block *sb = inode->i_sb; |
433 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
434 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
437 | u32 count; | 435 | u32 count; |
438 | int i; | 436 | int i; |
439 | 437 | ||
440 | memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, | 438 | memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); |
441 | sizeof(hfsplus_extent_rec)); | ||
442 | for (count = 0, i = 0; i < 8; i++) | 439 | for (count = 0, i = 0; i < 8; i++) |
443 | count += be32_to_cpu(fork->extents[i].block_count); | 440 | count += be32_to_cpu(fork->extents[i].block_count); |
444 | HFSPLUS_I(inode).first_blocks = count; | 441 | hip->first_blocks = count; |
445 | memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); | 442 | memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); |
446 | HFSPLUS_I(inode).cached_start = 0; | 443 | hip->cached_start = 0; |
447 | HFSPLUS_I(inode).cached_blocks = 0; | 444 | hip->cached_blocks = 0; |
448 | 445 | ||
449 | HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); | 446 | hip->alloc_blocks = be32_to_cpu(fork->total_blocks); |
450 | inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); | 447 | hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); |
451 | HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; | 448 | hip->fs_blocks = |
452 | inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); | 449 | (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; |
453 | HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; | 450 | inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); |
454 | if (!HFSPLUS_I(inode).clump_blocks) | 451 | hip->clump_blocks = |
455 | HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : | 452 | be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; |
456 | HFSPLUS_SB(sb).data_clump_blocks; | 453 | if (!hip->clump_blocks) { |
454 | hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? | ||
455 | sbi->rsrc_clump_blocks : | ||
456 | sbi->data_clump_blocks; | ||
457 | } | ||
457 | } | 458 | } |
458 | 459 | ||
459 | void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) | 460 | void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) |
460 | { | 461 | { |
461 | memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, | 462 | memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, |
462 | sizeof(hfsplus_extent_rec)); | 463 | sizeof(hfsplus_extent_rec)); |
463 | fork->total_size = cpu_to_be64(inode->i_size); | 464 | fork->total_size = cpu_to_be64(inode->i_size); |
464 | fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); | 465 | fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); |
465 | } | 466 | } |
466 | 467 | ||
467 | int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) | 468 | int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) |
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) | |||
472 | 473 | ||
473 | type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); | 474 | type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); |
474 | 475 | ||
475 | HFSPLUS_I(inode).dev = 0; | 476 | HFSPLUS_I(inode)->linkid = 0; |
476 | if (type == HFSPLUS_FOLDER) { | 477 | if (type == HFSPLUS_FOLDER) { |
477 | struct hfsplus_cat_folder *folder = &entry.folder; | 478 | struct hfsplus_cat_folder *folder = &entry.folder; |
478 | 479 | ||
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) | |||
486 | inode->i_atime = hfsp_mt2ut(folder->access_date); | 487 | inode->i_atime = hfsp_mt2ut(folder->access_date); |
487 | inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); | 488 | inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); |
488 | inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); | 489 | inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); |
489 | HFSPLUS_I(inode).create_date = folder->create_date; | 490 | HFSPLUS_I(inode)->create_date = folder->create_date; |
490 | HFSPLUS_I(inode).fs_blocks = 0; | 491 | HFSPLUS_I(inode)->fs_blocks = 0; |
491 | inode->i_op = &hfsplus_dir_inode_operations; | 492 | inode->i_op = &hfsplus_dir_inode_operations; |
492 | inode->i_fop = &hfsplus_dir_operations; | 493 | inode->i_fop = &hfsplus_dir_operations; |
493 | } else if (type == HFSPLUS_FILE) { | 494 | } else if (type == HFSPLUS_FILE) { |
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) | |||
518 | inode->i_atime = hfsp_mt2ut(file->access_date); | 519 | inode->i_atime = hfsp_mt2ut(file->access_date); |
519 | inode->i_mtime = hfsp_mt2ut(file->content_mod_date); | 520 | inode->i_mtime = hfsp_mt2ut(file->content_mod_date); |
520 | inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); | 521 | inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); |
521 | HFSPLUS_I(inode).create_date = file->create_date; | 522 | HFSPLUS_I(inode)->create_date = file->create_date; |
522 | } else { | 523 | } else { |
523 | printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); | 524 | printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); |
524 | res = -EIO; | 525 | res = -EIO; |
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode) | |||
533 | hfsplus_cat_entry entry; | 534 | hfsplus_cat_entry entry; |
534 | 535 | ||
535 | if (HFSPLUS_IS_RSRC(inode)) | 536 | if (HFSPLUS_IS_RSRC(inode)) |
536 | main_inode = HFSPLUS_I(inode).rsrc_inode; | 537 | main_inode = HFSPLUS_I(inode)->rsrc_inode; |
537 | 538 | ||
538 | if (!main_inode->i_nlink) | 539 | if (!main_inode->i_nlink) |
539 | return 0; | 540 | return 0; |
540 | 541 | ||
541 | if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) | 542 | if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) |
542 | /* panic? */ | 543 | /* panic? */ |
543 | return -EIO; | 544 | return -EIO; |
544 | 545 | ||
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode) | |||
554 | hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, | 555 | hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, |
555 | sizeof(struct hfsplus_cat_folder)); | 556 | sizeof(struct hfsplus_cat_folder)); |
556 | /* simple node checks? */ | 557 | /* simple node checks? */ |
557 | hfsplus_set_perms(inode, &folder->permissions); | 558 | hfsplus_cat_set_perms(inode, &folder->permissions); |
558 | folder->access_date = hfsp_ut2mt(inode->i_atime); | 559 | folder->access_date = hfsp_ut2mt(inode->i_atime); |
559 | folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); | 560 | folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); |
560 | folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); | 561 | folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); |
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode) | |||
576 | hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, | 577 | hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, |
577 | sizeof(struct hfsplus_cat_file)); | 578 | sizeof(struct hfsplus_cat_file)); |
578 | hfsplus_inode_write_fork(inode, &file->data_fork); | 579 | hfsplus_inode_write_fork(inode, &file->data_fork); |
579 | if (S_ISREG(inode->i_mode)) | 580 | hfsplus_cat_set_perms(inode, &file->permissions); |
580 | HFSPLUS_I(inode).dev = inode->i_nlink; | ||
581 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | ||
582 | HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev); | ||
583 | hfsplus_set_perms(inode, &file->permissions); | ||
584 | if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) | 581 | if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) |
585 | file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); | 582 | file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); |
586 | else | 583 | else |
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index ac405f099026..5b4667e08ef7 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c | |||
@@ -17,83 +17,98 @@ | |||
17 | #include <linux/mount.h> | 17 | #include <linux/mount.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/xattr.h> | 19 | #include <linux/xattr.h> |
20 | #include <linux/smp_lock.h> | ||
21 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> |
22 | #include "hfsplus_fs.h" | 21 | #include "hfsplus_fs.h" |
23 | 22 | ||
24 | long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | 23 | static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) |
25 | { | 24 | { |
26 | struct inode *inode = filp->f_path.dentry->d_inode; | 25 | struct inode *inode = file->f_path.dentry->d_inode; |
26 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
27 | unsigned int flags = 0; | ||
28 | |||
29 | if (inode->i_flags & S_IMMUTABLE) | ||
30 | flags |= FS_IMMUTABLE_FL; | ||
31 | if (inode->i_flags |= S_APPEND) | ||
32 | flags |= FS_APPEND_FL; | ||
33 | if (hip->userflags & HFSPLUS_FLG_NODUMP) | ||
34 | flags |= FS_NODUMP_FL; | ||
35 | |||
36 | return put_user(flags, user_flags); | ||
37 | } | ||
38 | |||
39 | static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) | ||
40 | { | ||
41 | struct inode *inode = file->f_path.dentry->d_inode; | ||
42 | struct hfsplus_inode_info *hip = HFSPLUS_I(inode); | ||
27 | unsigned int flags; | 43 | unsigned int flags; |
44 | int err = 0; | ||
28 | 45 | ||
29 | lock_kernel(); | 46 | err = mnt_want_write(file->f_path.mnt); |
30 | switch (cmd) { | 47 | if (err) |
31 | case HFSPLUS_IOC_EXT2_GETFLAGS: | 48 | goto out; |
32 | flags = 0; | ||
33 | if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE) | ||
34 | flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */ | ||
35 | if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND) | ||
36 | flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */ | ||
37 | if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP) | ||
38 | flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */ | ||
39 | return put_user(flags, (int __user *)arg); | ||
40 | case HFSPLUS_IOC_EXT2_SETFLAGS: { | ||
41 | int err = 0; | ||
42 | err = mnt_want_write(filp->f_path.mnt); | ||
43 | if (err) { | ||
44 | unlock_kernel(); | ||
45 | return err; | ||
46 | } | ||
47 | 49 | ||
48 | if (!is_owner_or_cap(inode)) { | 50 | if (!is_owner_or_cap(inode)) { |
49 | err = -EACCES; | 51 | err = -EACCES; |
50 | goto setflags_out; | 52 | goto out_drop_write; |
51 | } | 53 | } |
52 | if (get_user(flags, (int __user *)arg)) { | ||
53 | err = -EFAULT; | ||
54 | goto setflags_out; | ||
55 | } | ||
56 | if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) || | ||
57 | HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) { | ||
58 | if (!capable(CAP_LINUX_IMMUTABLE)) { | ||
59 | err = -EPERM; | ||
60 | goto setflags_out; | ||
61 | } | ||
62 | } | ||
63 | 54 | ||
64 | /* don't silently ignore unsupported ext2 flags */ | 55 | if (get_user(flags, user_flags)) { |
65 | if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { | 56 | err = -EFAULT; |
66 | err = -EOPNOTSUPP; | 57 | goto out_drop_write; |
67 | goto setflags_out; | 58 | } |
68 | } | 59 | |
69 | if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ | 60 | mutex_lock(&inode->i_mutex); |
70 | inode->i_flags |= S_IMMUTABLE; | 61 | |
71 | HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; | 62 | if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || |
72 | } else { | 63 | inode->i_flags & (S_IMMUTABLE|S_APPEND)) { |
73 | inode->i_flags &= ~S_IMMUTABLE; | 64 | if (!capable(CAP_LINUX_IMMUTABLE)) { |
74 | HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; | 65 | err = -EPERM; |
75 | } | 66 | goto out_unlock_inode; |
76 | if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */ | ||
77 | inode->i_flags |= S_APPEND; | ||
78 | HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND; | ||
79 | } else { | ||
80 | inode->i_flags &= ~S_APPEND; | ||
81 | HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND; | ||
82 | } | 67 | } |
83 | if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */ | ||
84 | HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP; | ||
85 | else | ||
86 | HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP; | ||
87 | |||
88 | inode->i_ctime = CURRENT_TIME_SEC; | ||
89 | mark_inode_dirty(inode); | ||
90 | setflags_out: | ||
91 | mnt_drop_write(filp->f_path.mnt); | ||
92 | unlock_kernel(); | ||
93 | return err; | ||
94 | } | 68 | } |
69 | |||
70 | /* don't silently ignore unsupported ext2 flags */ | ||
71 | if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { | ||
72 | err = -EOPNOTSUPP; | ||
73 | goto out_unlock_inode; | ||
74 | } | ||
75 | |||
76 | if (flags & FS_IMMUTABLE_FL) | ||
77 | inode->i_flags |= S_IMMUTABLE; | ||
78 | else | ||
79 | inode->i_flags &= ~S_IMMUTABLE; | ||
80 | |||
81 | if (flags & FS_APPEND_FL) | ||
82 | inode->i_flags |= S_APPEND; | ||
83 | else | ||
84 | inode->i_flags &= ~S_APPEND; | ||
85 | |||
86 | if (flags & FS_NODUMP_FL) | ||
87 | hip->userflags |= HFSPLUS_FLG_NODUMP; | ||
88 | else | ||
89 | hip->userflags &= ~HFSPLUS_FLG_NODUMP; | ||
90 | |||
91 | inode->i_ctime = CURRENT_TIME_SEC; | ||
92 | mark_inode_dirty(inode); | ||
93 | |||
94 | out_unlock_inode: | ||
95 | mutex_lock(&inode->i_mutex); | ||
96 | out_drop_write: | ||
97 | mnt_drop_write(file->f_path.mnt); | ||
98 | out: | ||
99 | return err; | ||
100 | } | ||
101 | |||
102 | long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
103 | { | ||
104 | void __user *argp = (void __user *)arg; | ||
105 | |||
106 | switch (cmd) { | ||
107 | case HFSPLUS_IOC_EXT2_GETFLAGS: | ||
108 | return hfsplus_ioctl_getflags(file, argp); | ||
109 | case HFSPLUS_IOC_EXT2_SETFLAGS: | ||
110 | return hfsplus_ioctl_setflags(file, argp); | ||
95 | default: | 111 | default: |
96 | unlock_kernel(); | ||
97 | return -ENOTTY; | 112 | return -ENOTTY; |
98 | } | 113 | } |
99 | } | 114 | } |
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, | |||
110 | if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) | 125 | if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) |
111 | return -EOPNOTSUPP; | 126 | return -EOPNOTSUPP; |
112 | 127 | ||
113 | res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); | 128 | res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); |
114 | if (res) | 129 | if (res) |
115 | return res; | 130 | return res; |
116 | res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); | 131 | res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); |
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, | |||
153 | return -EOPNOTSUPP; | 168 | return -EOPNOTSUPP; |
154 | 169 | ||
155 | if (size) { | 170 | if (size) { |
156 | res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); | 171 | res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); |
157 | if (res) | 172 | if (res) |
158 | return res; | 173 | return res; |
159 | res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); | 174 | res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); |
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, | |||
177 | } else | 192 | } else |
178 | res = size ? -ERANGE : 4; | 193 | res = size ? -ERANGE : 4; |
179 | } else | 194 | } else |
180 | res = -ENODATA; | 195 | res = -EOPNOTSUPP; |
181 | out: | 196 | out: |
182 | if (size) | 197 | if (size) |
183 | hfs_find_exit(&fd); | 198 | hfs_find_exit(&fd); |
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 572628b4b07d..f9ab276a4d8d 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c | |||
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) | |||
143 | kfree(p); | 143 | kfree(p); |
144 | break; | 144 | break; |
145 | case opt_decompose: | 145 | case opt_decompose: |
146 | sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; | 146 | clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); |
147 | break; | 147 | break; |
148 | case opt_nodecompose: | 148 | case opt_nodecompose: |
149 | sbi->flags |= HFSPLUS_SB_NODECOMPOSE; | 149 | set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); |
150 | break; | 150 | break; |
151 | case opt_force: | 151 | case opt_force: |
152 | sbi->flags |= HFSPLUS_SB_FORCE; | 152 | set_bit(HFSPLUS_SB_FORCE, &sbi->flags); |
153 | break; | 153 | break; |
154 | default: | 154 | default: |
155 | return 0; | 155 | return 0; |
@@ -171,7 +171,7 @@ done: | |||
171 | 171 | ||
172 | int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) | 172 | int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) |
173 | { | 173 | { |
174 | struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); | 174 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb); |
175 | 175 | ||
176 | if (sbi->creator != HFSPLUS_DEF_CR_TYPE) | 176 | if (sbi->creator != HFSPLUS_DEF_CR_TYPE) |
177 | seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); | 177 | seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); |
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) | |||
184 | seq_printf(seq, ",session=%u", sbi->session); | 184 | seq_printf(seq, ",session=%u", sbi->session); |
185 | if (sbi->nls) | 185 | if (sbi->nls) |
186 | seq_printf(seq, ",nls=%s", sbi->nls->charset); | 186 | seq_printf(seq, ",nls=%s", sbi->nls->charset); |
187 | if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) | 187 | if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) |
188 | seq_printf(seq, ",nodecompose"); | 188 | seq_printf(seq, ",nodecompose"); |
189 | return 0; | 189 | return 0; |
190 | } | 190 | } |
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 1528a6fd0299..208b16c645cc 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c | |||
@@ -74,6 +74,7 @@ struct old_pmap { | |||
74 | int hfs_part_find(struct super_block *sb, | 74 | int hfs_part_find(struct super_block *sb, |
75 | sector_t *part_start, sector_t *part_size) | 75 | sector_t *part_start, sector_t *part_size) |
76 | { | 76 | { |
77 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
77 | struct buffer_head *bh; | 78 | struct buffer_head *bh; |
78 | __be16 *data; | 79 | __be16 *data; |
79 | int i, size, res; | 80 | int i, size, res; |
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb, | |||
95 | for (i = 0; i < size; p++, i++) { | 96 | for (i = 0; i < size; p++, i++) { |
96 | if (p->pdStart && p->pdSize && | 97 | if (p->pdStart && p->pdSize && |
97 | p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && | 98 | p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && |
98 | (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { | 99 | (sbi->part < 0 || sbi->part == i)) { |
99 | *part_start += be32_to_cpu(p->pdStart); | 100 | *part_start += be32_to_cpu(p->pdStart); |
100 | *part_size = be32_to_cpu(p->pdSize); | 101 | *part_size = be32_to_cpu(p->pdSize); |
101 | res = 0; | 102 | res = 0; |
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb, | |||
111 | size = be32_to_cpu(pm->pmMapBlkCnt); | 112 | size = be32_to_cpu(pm->pmMapBlkCnt); |
112 | for (i = 0; i < size;) { | 113 | for (i = 0; i < size;) { |
113 | if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && | 114 | if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && |
114 | (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { | 115 | (sbi->part < 0 || sbi->part == i)) { |
115 | *part_start += be32_to_cpu(pm->pmPyPartStart); | 116 | *part_start += be32_to_cpu(pm->pmPyPartStart); |
116 | *part_size = be32_to_cpu(pm->pmPartBlkCnt); | 117 | *part_size = be32_to_cpu(pm->pmPartBlkCnt); |
117 | res = 0; | 118 | res = 0; |
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 3b55c050c742..9a88d7536103 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c | |||
@@ -12,7 +12,6 @@ | |||
12 | #include <linux/pagemap.h> | 12 | #include <linux/pagemap.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/smp_lock.h> | ||
16 | #include <linux/vfs.h> | 15 | #include <linux/vfs.h> |
17 | #include <linux/nls.h> | 16 | #include <linux/nls.h> |
18 | 17 | ||
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode); | |||
21 | 20 | ||
22 | #include "hfsplus_fs.h" | 21 | #include "hfsplus_fs.h" |
23 | 22 | ||
24 | struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) | 23 | static int hfsplus_system_read_inode(struct inode *inode) |
25 | { | 24 | { |
26 | struct hfs_find_data fd; | 25 | struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr; |
27 | struct hfsplus_vh *vhdr; | ||
28 | struct inode *inode; | ||
29 | long err = -EIO; | ||
30 | |||
31 | inode = iget_locked(sb, ino); | ||
32 | if (!inode) | ||
33 | return ERR_PTR(-ENOMEM); | ||
34 | if (!(inode->i_state & I_NEW)) | ||
35 | return inode; | ||
36 | 26 | ||
37 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); | 27 | switch (inode->i_ino) { |
38 | mutex_init(&HFSPLUS_I(inode).extents_lock); | ||
39 | HFSPLUS_I(inode).flags = 0; | ||
40 | HFSPLUS_I(inode).rsrc_inode = NULL; | ||
41 | atomic_set(&HFSPLUS_I(inode).opencnt, 0); | ||
42 | |||
43 | if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) { | ||
44 | read_inode: | ||
45 | hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); | ||
46 | err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); | ||
47 | if (!err) | ||
48 | err = hfsplus_cat_read_inode(inode, &fd); | ||
49 | hfs_find_exit(&fd); | ||
50 | if (err) | ||
51 | goto bad_inode; | ||
52 | goto done; | ||
53 | } | ||
54 | vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr; | ||
55 | switch(inode->i_ino) { | ||
56 | case HFSPLUS_ROOT_CNID: | ||
57 | goto read_inode; | ||
58 | case HFSPLUS_EXT_CNID: | 28 | case HFSPLUS_EXT_CNID: |
59 | hfsplus_inode_read_fork(inode, &vhdr->ext_file); | 29 | hfsplus_inode_read_fork(inode, &vhdr->ext_file); |
60 | inode->i_mapping->a_ops = &hfsplus_btree_aops; | 30 | inode->i_mapping->a_ops = &hfsplus_btree_aops; |
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) | |||
75 | inode->i_mapping->a_ops = &hfsplus_btree_aops; | 45 | inode->i_mapping->a_ops = &hfsplus_btree_aops; |
76 | break; | 46 | break; |
77 | default: | 47 | default: |
78 | goto bad_inode; | 48 | return -EIO; |
49 | } | ||
50 | |||
51 | return 0; | ||
52 | } | ||
53 | |||
54 | struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) | ||
55 | { | ||
56 | struct hfs_find_data fd; | ||
57 | struct inode *inode; | ||
58 | int err; | ||
59 | |||
60 | inode = iget_locked(sb, ino); | ||
61 | if (!inode) | ||
62 | return ERR_PTR(-ENOMEM); | ||
63 | if (!(inode->i_state & I_NEW)) | ||
64 | return inode; | ||
65 | |||
66 | INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list); | ||
67 | mutex_init(&HFSPLUS_I(inode)->extents_lock); | ||
68 | HFSPLUS_I(inode)->flags = 0; | ||
69 | HFSPLUS_I(inode)->rsrc_inode = NULL; | ||
70 | atomic_set(&HFSPLUS_I(inode)->opencnt, 0); | ||
71 | |||
72 | if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || | ||
73 | inode->i_ino == HFSPLUS_ROOT_CNID) { | ||
74 | hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); | ||
75 | err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); | ||
76 | if (!err) | ||
77 | err = hfsplus_cat_read_inode(inode, &fd); | ||
78 | hfs_find_exit(&fd); | ||
79 | } else { | ||
80 | err = hfsplus_system_read_inode(inode); | ||
81 | } | ||
82 | |||
83 | if (err) { | ||
84 | iget_failed(inode); | ||
85 | return ERR_PTR(err); | ||
79 | } | 86 | } |
80 | 87 | ||
81 | done: | ||
82 | unlock_new_inode(inode); | 88 | unlock_new_inode(inode); |
83 | return inode; | 89 | return inode; |
84 | |||
85 | bad_inode: | ||
86 | iget_failed(inode); | ||
87 | return ERR_PTR(err); | ||
88 | } | 90 | } |
89 | 91 | ||
90 | static int hfsplus_write_inode(struct inode *inode, | 92 | static int hfsplus_system_write_inode(struct inode *inode) |
91 | struct writeback_control *wbc) | ||
92 | { | 93 | { |
93 | struct hfsplus_vh *vhdr; | 94 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); |
94 | int ret = 0; | 95 | struct hfsplus_vh *vhdr = sbi->s_vhdr; |
96 | struct hfsplus_fork_raw *fork; | ||
97 | struct hfs_btree *tree = NULL; | ||
95 | 98 | ||
96 | dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); | ||
97 | hfsplus_ext_write_extent(inode); | ||
98 | if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) { | ||
99 | return hfsplus_cat_write_inode(inode); | ||
100 | } | ||
101 | vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr; | ||
102 | switch (inode->i_ino) { | 99 | switch (inode->i_ino) { |
103 | case HFSPLUS_ROOT_CNID: | ||
104 | ret = hfsplus_cat_write_inode(inode); | ||
105 | break; | ||
106 | case HFSPLUS_EXT_CNID: | 100 | case HFSPLUS_EXT_CNID: |
107 | if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { | 101 | fork = &vhdr->ext_file; |
108 | HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; | 102 | tree = sbi->ext_tree; |
109 | inode->i_sb->s_dirt = 1; | ||
110 | } | ||
111 | hfsplus_inode_write_fork(inode, &vhdr->ext_file); | ||
112 | hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree); | ||
113 | break; | 103 | break; |
114 | case HFSPLUS_CAT_CNID: | 104 | case HFSPLUS_CAT_CNID: |
115 | if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { | 105 | fork = &vhdr->cat_file; |
116 | HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; | 106 | tree = sbi->cat_tree; |
117 | inode->i_sb->s_dirt = 1; | ||
118 | } | ||
119 | hfsplus_inode_write_fork(inode, &vhdr->cat_file); | ||
120 | hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree); | ||
121 | break; | 107 | break; |
122 | case HFSPLUS_ALLOC_CNID: | 108 | case HFSPLUS_ALLOC_CNID: |
123 | if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { | 109 | fork = &vhdr->alloc_file; |
124 | HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; | ||
125 | inode->i_sb->s_dirt = 1; | ||
126 | } | ||
127 | hfsplus_inode_write_fork(inode, &vhdr->alloc_file); | ||
128 | break; | 110 | break; |
129 | case HFSPLUS_START_CNID: | 111 | case HFSPLUS_START_CNID: |
130 | if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { | 112 | fork = &vhdr->start_file; |
131 | HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; | ||
132 | inode->i_sb->s_dirt = 1; | ||
133 | } | ||
134 | hfsplus_inode_write_fork(inode, &vhdr->start_file); | ||
135 | break; | 113 | break; |
136 | case HFSPLUS_ATTR_CNID: | 114 | case HFSPLUS_ATTR_CNID: |
137 | if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { | 115 | fork = &vhdr->attr_file; |
138 | HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; | 116 | tree = sbi->attr_tree; |
139 | inode->i_sb->s_dirt = 1; | 117 | default: |
140 | } | 118 | return -EIO; |
141 | hfsplus_inode_write_fork(inode, &vhdr->attr_file); | 119 | } |
142 | hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); | 120 | |
143 | break; | 121 | if (fork->total_size != cpu_to_be64(inode->i_size)) { |
122 | set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags); | ||
123 | inode->i_sb->s_dirt = 1; | ||
144 | } | 124 | } |
145 | return ret; | 125 | hfsplus_inode_write_fork(inode, fork); |
126 | if (tree) | ||
127 | hfs_btree_write(tree); | ||
128 | return 0; | ||
129 | } | ||
130 | |||
131 | static int hfsplus_write_inode(struct inode *inode, | ||
132 | struct writeback_control *wbc) | ||
133 | { | ||
134 | dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); | ||
135 | |||
136 | hfsplus_ext_write_extent(inode); | ||
137 | |||
138 | if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID || | ||
139 | inode->i_ino == HFSPLUS_ROOT_CNID) | ||
140 | return hfsplus_cat_write_inode(inode); | ||
141 | else | ||
142 | return hfsplus_system_write_inode(inode); | ||
146 | } | 143 | } |
147 | 144 | ||
148 | static void hfsplus_evict_inode(struct inode *inode) | 145 | static void hfsplus_evict_inode(struct inode *inode) |
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode) | |||
151 | truncate_inode_pages(&inode->i_data, 0); | 148 | truncate_inode_pages(&inode->i_data, 0); |
152 | end_writeback(inode); | 149 | end_writeback(inode); |
153 | if (HFSPLUS_IS_RSRC(inode)) { | 150 | if (HFSPLUS_IS_RSRC(inode)) { |
154 | HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; | 151 | HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; |
155 | iput(HFSPLUS_I(inode).rsrc_inode); | 152 | iput(HFSPLUS_I(inode)->rsrc_inode); |
156 | } | 153 | } |
157 | } | 154 | } |
158 | 155 | ||
159 | int hfsplus_sync_fs(struct super_block *sb, int wait) | 156 | int hfsplus_sync_fs(struct super_block *sb, int wait) |
160 | { | 157 | { |
161 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; | 158 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); |
159 | struct hfsplus_vh *vhdr = sbi->s_vhdr; | ||
162 | 160 | ||
163 | dprint(DBG_SUPER, "hfsplus_write_super\n"); | 161 | dprint(DBG_SUPER, "hfsplus_write_super\n"); |
164 | 162 | ||
165 | lock_super(sb); | 163 | mutex_lock(&sbi->vh_mutex); |
164 | mutex_lock(&sbi->alloc_mutex); | ||
166 | sb->s_dirt = 0; | 165 | sb->s_dirt = 0; |
167 | 166 | ||
168 | vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); | 167 | vhdr->free_blocks = cpu_to_be32(sbi->free_blocks); |
169 | vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); | 168 | vhdr->next_cnid = cpu_to_be32(sbi->next_cnid); |
170 | vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); | 169 | vhdr->folder_count = cpu_to_be32(sbi->folder_count); |
171 | vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); | 170 | vhdr->file_count = cpu_to_be32(sbi->file_count); |
172 | vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count); | ||
173 | 171 | ||
174 | mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); | 172 | mark_buffer_dirty(sbi->s_vhbh); |
175 | if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { | 173 | if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { |
176 | if (HFSPLUS_SB(sb).sect_count) { | 174 | if (sbi->sect_count) { |
177 | struct buffer_head *bh; | 175 | struct buffer_head *bh; |
178 | u32 block, offset; | 176 | u32 block, offset; |
179 | 177 | ||
180 | block = HFSPLUS_SB(sb).blockoffset; | 178 | block = sbi->blockoffset; |
181 | block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); | 179 | block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9); |
182 | offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); | 180 | offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1); |
183 | printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, | 181 | printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", |
184 | HFSPLUS_SB(sb).sect_count, block, offset); | 182 | sbi->blockoffset, sbi->sect_count, |
183 | block, offset); | ||
185 | bh = sb_bread(sb, block); | 184 | bh = sb_bread(sb, block); |
186 | if (bh) { | 185 | if (bh) { |
187 | vhdr = (struct hfsplus_vh *)(bh->b_data + offset); | 186 | vhdr = (struct hfsplus_vh *)(bh->b_data + offset); |
188 | if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { | 187 | if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { |
189 | memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); | 188 | memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr)); |
190 | mark_buffer_dirty(bh); | 189 | mark_buffer_dirty(bh); |
191 | brelse(bh); | 190 | brelse(bh); |
192 | } else | 191 | } else |
193 | printk(KERN_WARNING "hfs: backup not found!\n"); | 192 | printk(KERN_WARNING "hfs: backup not found!\n"); |
194 | } | 193 | } |
195 | } | 194 | } |
196 | HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; | ||
197 | } | 195 | } |
198 | unlock_super(sb); | 196 | mutex_unlock(&sbi->alloc_mutex); |
197 | mutex_unlock(&sbi->vh_mutex); | ||
199 | return 0; | 198 | return 0; |
200 | } | 199 | } |
201 | 200 | ||
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb) | |||
209 | 208 | ||
210 | static void hfsplus_put_super(struct super_block *sb) | 209 | static void hfsplus_put_super(struct super_block *sb) |
211 | { | 210 | { |
211 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
212 | |||
212 | dprint(DBG_SUPER, "hfsplus_put_super\n"); | 213 | dprint(DBG_SUPER, "hfsplus_put_super\n"); |
214 | |||
213 | if (!sb->s_fs_info) | 215 | if (!sb->s_fs_info) |
214 | return; | 216 | return; |
215 | 217 | ||
216 | lock_kernel(); | ||
217 | |||
218 | if (sb->s_dirt) | 218 | if (sb->s_dirt) |
219 | hfsplus_write_super(sb); | 219 | hfsplus_write_super(sb); |
220 | if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { | 220 | if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) { |
221 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; | 221 | struct hfsplus_vh *vhdr = sbi->s_vhdr; |
222 | 222 | ||
223 | vhdr->modify_date = hfsp_now2mt(); | 223 | vhdr->modify_date = hfsp_now2mt(); |
224 | vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); | 224 | vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); |
225 | vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); | 225 | vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); |
226 | mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); | 226 | mark_buffer_dirty(sbi->s_vhbh); |
227 | sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); | 227 | sync_dirty_buffer(sbi->s_vhbh); |
228 | } | 228 | } |
229 | 229 | ||
230 | hfs_btree_close(HFSPLUS_SB(sb).cat_tree); | 230 | hfs_btree_close(sbi->cat_tree); |
231 | hfs_btree_close(HFSPLUS_SB(sb).ext_tree); | 231 | hfs_btree_close(sbi->ext_tree); |
232 | iput(HFSPLUS_SB(sb).alloc_file); | 232 | iput(sbi->alloc_file); |
233 | iput(HFSPLUS_SB(sb).hidden_dir); | 233 | iput(sbi->hidden_dir); |
234 | brelse(HFSPLUS_SB(sb).s_vhbh); | 234 | brelse(sbi->s_vhbh); |
235 | unload_nls(HFSPLUS_SB(sb).nls); | 235 | unload_nls(sbi->nls); |
236 | kfree(sb->s_fs_info); | 236 | kfree(sb->s_fs_info); |
237 | sb->s_fs_info = NULL; | 237 | sb->s_fs_info = NULL; |
238 | |||
239 | unlock_kernel(); | ||
240 | } | 238 | } |
241 | 239 | ||
242 | static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) | 240 | static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) |
243 | { | 241 | { |
244 | struct super_block *sb = dentry->d_sb; | 242 | struct super_block *sb = dentry->d_sb; |
243 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
245 | u64 id = huge_encode_dev(sb->s_bdev->bd_dev); | 244 | u64 id = huge_encode_dev(sb->s_bdev->bd_dev); |
246 | 245 | ||
247 | buf->f_type = HFSPLUS_SUPER_MAGIC; | 246 | buf->f_type = HFSPLUS_SUPER_MAGIC; |
248 | buf->f_bsize = sb->s_blocksize; | 247 | buf->f_bsize = sb->s_blocksize; |
249 | buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; | 248 | buf->f_blocks = sbi->total_blocks << sbi->fs_shift; |
250 | buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; | 249 | buf->f_bfree = sbi->free_blocks << sbi->fs_shift; |
251 | buf->f_bavail = buf->f_bfree; | 250 | buf->f_bavail = buf->f_bfree; |
252 | buf->f_files = 0xFFFFFFFF; | 251 | buf->f_files = 0xFFFFFFFF; |
253 | buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; | 252 | buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; |
254 | buf->f_fsid.val[0] = (u32)id; | 253 | buf->f_fsid.val[0] = (u32)id; |
255 | buf->f_fsid.val[1] = (u32)(id >> 32); | 254 | buf->f_fsid.val[1] = (u32)(id >> 32); |
256 | buf->f_namelen = HFSPLUS_MAX_STRLEN; | 255 | buf->f_namelen = HFSPLUS_MAX_STRLEN; |
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) | |||
263 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 262 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) |
264 | return 0; | 263 | return 0; |
265 | if (!(*flags & MS_RDONLY)) { | 264 | if (!(*flags & MS_RDONLY)) { |
266 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; | 265 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; |
267 | struct hfsplus_sb_info sbi; | 266 | struct hfsplus_sb_info sbi; |
268 | 267 | ||
269 | memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); | 268 | memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); |
270 | sbi.nls = HFSPLUS_SB(sb).nls; | 269 | sbi.nls = HFSPLUS_SB(sb)->nls; |
271 | if (!hfsplus_parse_options(data, &sbi)) | 270 | if (!hfsplus_parse_options(data, &sbi)) |
272 | return -EINVAL; | 271 | return -EINVAL; |
273 | 272 | ||
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) | |||
276 | "running fsck.hfsplus is recommended. leaving read-only.\n"); | 275 | "running fsck.hfsplus is recommended. leaving read-only.\n"); |
277 | sb->s_flags |= MS_RDONLY; | 276 | sb->s_flags |= MS_RDONLY; |
278 | *flags |= MS_RDONLY; | 277 | *flags |= MS_RDONLY; |
279 | } else if (sbi.flags & HFSPLUS_SB_FORCE) { | 278 | } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) { |
280 | /* nothing */ | 279 | /* nothing */ |
281 | } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { | 280 | } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { |
282 | printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); | 281 | printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); |
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
320 | return -ENOMEM; | 319 | return -ENOMEM; |
321 | 320 | ||
322 | sb->s_fs_info = sbi; | 321 | sb->s_fs_info = sbi; |
323 | INIT_HLIST_HEAD(&sbi->rsrc_inodes); | 322 | mutex_init(&sbi->alloc_mutex); |
323 | mutex_init(&sbi->vh_mutex); | ||
324 | hfsplus_fill_defaults(sbi); | 324 | hfsplus_fill_defaults(sbi); |
325 | if (!hfsplus_parse_options(data, sbi)) { | 325 | if (!hfsplus_parse_options(data, sbi)) { |
326 | printk(KERN_ERR "hfs: unable to parse mount options\n"); | 326 | printk(KERN_ERR "hfs: unable to parse mount options\n"); |
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
344 | err = -EINVAL; | 344 | err = -EINVAL; |
345 | goto cleanup; | 345 | goto cleanup; |
346 | } | 346 | } |
347 | vhdr = HFSPLUS_SB(sb).s_vhdr; | 347 | vhdr = sbi->s_vhdr; |
348 | 348 | ||
349 | /* Copy parts of the volume header into the superblock */ | 349 | /* Copy parts of the volume header into the superblock */ |
350 | sb->s_magic = HFSPLUS_VOLHEAD_SIG; | 350 | sb->s_magic = HFSPLUS_VOLHEAD_SIG; |
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
353 | printk(KERN_ERR "hfs: wrong filesystem version\n"); | 353 | printk(KERN_ERR "hfs: wrong filesystem version\n"); |
354 | goto cleanup; | 354 | goto cleanup; |
355 | } | 355 | } |
356 | HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); | 356 | sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); |
357 | HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); | 357 | sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); |
358 | HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); | 358 | sbi->next_cnid = be32_to_cpu(vhdr->next_cnid); |
359 | HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); | 359 | sbi->file_count = be32_to_cpu(vhdr->file_count); |
360 | HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); | 360 | sbi->folder_count = be32_to_cpu(vhdr->folder_count); |
361 | HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); | 361 | sbi->data_clump_blocks = |
362 | HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; | 362 | be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift; |
363 | if (!HFSPLUS_SB(sb).data_clump_blocks) | 363 | if (!sbi->data_clump_blocks) |
364 | HFSPLUS_SB(sb).data_clump_blocks = 1; | 364 | sbi->data_clump_blocks = 1; |
365 | HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; | 365 | sbi->rsrc_clump_blocks = |
366 | if (!HFSPLUS_SB(sb).rsrc_clump_blocks) | 366 | be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift; |
367 | HFSPLUS_SB(sb).rsrc_clump_blocks = 1; | 367 | if (!sbi->rsrc_clump_blocks) |
368 | sbi->rsrc_clump_blocks = 1; | ||
368 | 369 | ||
369 | /* Set up operations so we can load metadata */ | 370 | /* Set up operations so we can load metadata */ |
370 | sb->s_op = &hfsplus_sops; | 371 | sb->s_op = &hfsplus_sops; |
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
374 | printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " | 375 | printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " |
375 | "running fsck.hfsplus is recommended. mounting read-only.\n"); | 376 | "running fsck.hfsplus is recommended. mounting read-only.\n"); |
376 | sb->s_flags |= MS_RDONLY; | 377 | sb->s_flags |= MS_RDONLY; |
377 | } else if (sbi->flags & HFSPLUS_SB_FORCE) { | 378 | } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { |
378 | /* nothing */ | 379 | /* nothing */ |
379 | } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { | 380 | } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { |
380 | printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); | 381 | printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); |
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
384 | "use the force option at your own risk, mounting read-only.\n"); | 385 | "use the force option at your own risk, mounting read-only.\n"); |
385 | sb->s_flags |= MS_RDONLY; | 386 | sb->s_flags |= MS_RDONLY; |
386 | } | 387 | } |
387 | sbi->flags &= ~HFSPLUS_SB_FORCE; | ||
388 | 388 | ||
389 | /* Load metadata objects (B*Trees) */ | 389 | /* Load metadata objects (B*Trees) */ |
390 | HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); | 390 | sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); |
391 | if (!HFSPLUS_SB(sb).ext_tree) { | 391 | if (!sbi->ext_tree) { |
392 | printk(KERN_ERR "hfs: failed to load extents file\n"); | 392 | printk(KERN_ERR "hfs: failed to load extents file\n"); |
393 | goto cleanup; | 393 | goto cleanup; |
394 | } | 394 | } |
395 | HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); | 395 | sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); |
396 | if (!HFSPLUS_SB(sb).cat_tree) { | 396 | if (!sbi->cat_tree) { |
397 | printk(KERN_ERR "hfs: failed to load catalog file\n"); | 397 | printk(KERN_ERR "hfs: failed to load catalog file\n"); |
398 | goto cleanup; | 398 | goto cleanup; |
399 | } | 399 | } |
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
404 | err = PTR_ERR(inode); | 404 | err = PTR_ERR(inode); |
405 | goto cleanup; | 405 | goto cleanup; |
406 | } | 406 | } |
407 | HFSPLUS_SB(sb).alloc_file = inode; | 407 | sbi->alloc_file = inode; |
408 | 408 | ||
409 | /* Load the root directory */ | 409 | /* Load the root directory */ |
410 | root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); | 410 | root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); |
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
423 | 423 | ||
424 | str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; | 424 | str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; |
425 | str.name = HFSP_HIDDENDIR_NAME; | 425 | str.name = HFSP_HIDDENDIR_NAME; |
426 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); | 426 | hfs_find_init(sbi->cat_tree, &fd); |
427 | hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); | 427 | hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); |
428 | if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { | 428 | if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { |
429 | hfs_find_exit(&fd); | 429 | hfs_find_exit(&fd); |
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
434 | err = PTR_ERR(inode); | 434 | err = PTR_ERR(inode); |
435 | goto cleanup; | 435 | goto cleanup; |
436 | } | 436 | } |
437 | HFSPLUS_SB(sb).hidden_dir = inode; | 437 | sbi->hidden_dir = inode; |
438 | } else | 438 | } else |
439 | hfs_find_exit(&fd); | 439 | hfs_find_exit(&fd); |
440 | 440 | ||
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
449 | be32_add_cpu(&vhdr->write_count, 1); | 449 | be32_add_cpu(&vhdr->write_count, 1); |
450 | vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); | 450 | vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); |
451 | vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); | 451 | vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); |
452 | mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); | 452 | mark_buffer_dirty(sbi->s_vhbh); |
453 | sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); | 453 | sync_dirty_buffer(sbi->s_vhbh); |
454 | 454 | ||
455 | if (!HFSPLUS_SB(sb).hidden_dir) { | 455 | if (!sbi->hidden_dir) { |
456 | printk(KERN_DEBUG "hfs: create hidden dir...\n"); | 456 | printk(KERN_DEBUG "hfs: create hidden dir...\n"); |
457 | HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR); | 457 | |
458 | hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode, | 458 | mutex_lock(&sbi->vh_mutex); |
459 | &str, HFSPLUS_SB(sb).hidden_dir); | 459 | sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); |
460 | mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir); | 460 | hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode, |
461 | &str, sbi->hidden_dir); | ||
462 | mutex_unlock(&sbi->vh_mutex); | ||
463 | |||
464 | mark_inode_dirty(sbi->hidden_dir); | ||
461 | } | 465 | } |
462 | out: | 466 | out: |
463 | unload_nls(sbi->nls); | 467 | unload_nls(sbi->nls); |
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) | |||
486 | 490 | ||
487 | static void hfsplus_destroy_inode(struct inode *inode) | 491 | static void hfsplus_destroy_inode(struct inode *inode) |
488 | { | 492 | { |
489 | kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); | 493 | kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); |
490 | } | 494 | } |
491 | 495 | ||
492 | #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) | 496 | #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) |
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 628ccf6fa402..b66d67de882c 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c | |||
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) | |||
121 | int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) | 121 | int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) |
122 | { | 122 | { |
123 | const hfsplus_unichr *ip; | 123 | const hfsplus_unichr *ip; |
124 | struct nls_table *nls = HFSPLUS_SB(sb).nls; | 124 | struct nls_table *nls = HFSPLUS_SB(sb)->nls; |
125 | u8 *op; | 125 | u8 *op; |
126 | u16 cc, c0, c1; | 126 | u16 cc, c0, c1; |
127 | u16 *ce1, *ce2; | 127 | u16 *ce1, *ce2; |
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c | |||
132 | ustrlen = be16_to_cpu(ustr->length); | 132 | ustrlen = be16_to_cpu(ustr->length); |
133 | len = *len_p; | 133 | len = *len_p; |
134 | ce1 = NULL; | 134 | ce1 = NULL; |
135 | compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); | 135 | compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); |
136 | 136 | ||
137 | while (ustrlen > 0) { | 137 | while (ustrlen > 0) { |
138 | c0 = be16_to_cpu(*ip++); | 138 | c0 = be16_to_cpu(*ip++); |
@@ -246,7 +246,7 @@ out: | |||
246 | static inline int asc2unichar(struct super_block *sb, const char *astr, int len, | 246 | static inline int asc2unichar(struct super_block *sb, const char *astr, int len, |
247 | wchar_t *uc) | 247 | wchar_t *uc) |
248 | { | 248 | { |
249 | int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); | 249 | int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); |
250 | if (size <= 0) { | 250 | if (size <= 0) { |
251 | *uc = '?'; | 251 | *uc = '?'; |
252 | size = 1; | 252 | size = 1; |
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, | |||
293 | u16 *dstr, outlen = 0; | 293 | u16 *dstr, outlen = 0; |
294 | wchar_t c; | 294 | wchar_t c; |
295 | 295 | ||
296 | decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); | 296 | decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); |
297 | while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { | 297 | while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { |
298 | size = asc2unichar(sb, astr, len, &c); | 298 | size = asc2unichar(sb, astr, len, &c); |
299 | 299 | ||
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str) | |||
330 | wchar_t c; | 330 | wchar_t c; |
331 | u16 c2; | 331 | u16 c2; |
332 | 332 | ||
333 | casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); | 333 | casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); |
334 | decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); | 334 | decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); |
335 | hash = init_name_hash(); | 335 | hash = init_name_hash(); |
336 | astr = str->name; | 336 | astr = str->name; |
337 | len = str->len; | 337 | len = str->len; |
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr * | |||
373 | u16 c1, c2; | 373 | u16 c1, c2; |
374 | wchar_t c; | 374 | wchar_t c; |
375 | 375 | ||
376 | casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); | 376 | casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); |
377 | decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); | 377 | decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); |
378 | astr1 = s1->name; | 378 | astr1 = s1->name; |
379 | len1 = s1->len; | 379 | len1 = s1->len; |
380 | astr2 = s2->name; | 380 | astr2 = s2->name; |
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index bed78ac8f6d1..8972c20b3216 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c | |||
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb, | |||
65 | *start = 0; | 65 | *start = 0; |
66 | *size = sb->s_bdev->bd_inode->i_size >> 9; | 66 | *size = sb->s_bdev->bd_inode->i_size >> 9; |
67 | 67 | ||
68 | if (HFSPLUS_SB(sb).session >= 0) { | 68 | if (HFSPLUS_SB(sb)->session >= 0) { |
69 | te.cdte_track = HFSPLUS_SB(sb).session; | 69 | te.cdte_track = HFSPLUS_SB(sb)->session; |
70 | te.cdte_format = CDROM_LBA; | 70 | te.cdte_format = CDROM_LBA; |
71 | res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); | 71 | res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); |
72 | if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { | 72 | if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { |
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb, | |||
87 | /* Takes in super block, returns true if good data read */ | 87 | /* Takes in super block, returns true if good data read */ |
88 | int hfsplus_read_wrapper(struct super_block *sb) | 88 | int hfsplus_read_wrapper(struct super_block *sb) |
89 | { | 89 | { |
90 | struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); | ||
90 | struct buffer_head *bh; | 91 | struct buffer_head *bh; |
91 | struct hfsplus_vh *vhdr; | 92 | struct hfsplus_vh *vhdr; |
92 | struct hfsplus_wd wd; | 93 | struct hfsplus_wd wd; |
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb) | |||
122 | if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) | 123 | if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) |
123 | break; | 124 | break; |
124 | if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { | 125 | if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { |
125 | HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX; | 126 | set_bit(HFSPLUS_SB_HFSX, &sbi->flags); |
126 | break; | 127 | break; |
127 | } | 128 | } |
128 | brelse(bh); | 129 | brelse(bh); |
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb) | |||
143 | if (blocksize < HFSPLUS_SECTOR_SIZE || | 144 | if (blocksize < HFSPLUS_SECTOR_SIZE || |
144 | ((blocksize - 1) & blocksize)) | 145 | ((blocksize - 1) & blocksize)) |
145 | return -EINVAL; | 146 | return -EINVAL; |
146 | HFSPLUS_SB(sb).alloc_blksz = blocksize; | 147 | sbi->alloc_blksz = blocksize; |
147 | HFSPLUS_SB(sb).alloc_blksz_shift = 0; | 148 | sbi->alloc_blksz_shift = 0; |
148 | while ((blocksize >>= 1) != 0) | 149 | while ((blocksize >>= 1) != 0) |
149 | HFSPLUS_SB(sb).alloc_blksz_shift++; | 150 | sbi->alloc_blksz_shift++; |
150 | blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE); | 151 | blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); |
151 | 152 | ||
152 | /* align block size to block offset */ | 153 | /* align block size to block offset */ |
153 | while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) | 154 | while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) |
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb) | |||
158 | return -EINVAL; | 159 | return -EINVAL; |
159 | } | 160 | } |
160 | 161 | ||
161 | HFSPLUS_SB(sb).blockoffset = part_start >> | 162 | sbi->blockoffset = |
162 | (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); | 163 | part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); |
163 | HFSPLUS_SB(sb).sect_count = part_size; | 164 | sbi->sect_count = part_size; |
164 | HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift - | 165 | sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; |
165 | sb->s_blocksize_bits; | ||
166 | 166 | ||
167 | bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); | 167 | bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); |
168 | if (!bh) | 168 | if (!bh) |
169 | return -EIO; | 169 | return -EIO; |
170 | 170 | ||
171 | /* should still be the same... */ | 171 | /* should still be the same... */ |
172 | if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ? | 172 | if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) { |
173 | cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) : | 173 | if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) |
174 | cpu_to_be16(HFSPLUS_VOLHEAD_SIG))) | 174 | goto error; |
175 | goto error; | 175 | } else { |
176 | HFSPLUS_SB(sb).s_vhbh = bh; | 176 | if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) |
177 | HFSPLUS_SB(sb).s_vhdr = vhdr; | 177 | goto error; |
178 | } | ||
179 | |||
180 | sbi->s_vhbh = bh; | ||
181 | sbi->s_vhdr = vhdr; | ||
178 | 182 | ||
179 | return 0; | 183 | return 0; |
180 | error: | 184 | error: |
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 56bd15c5bf6c..63b6f5632318 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig | |||
@@ -1,6 +1,7 @@ | |||
1 | config HPFS_FS | 1 | config HPFS_FS |
2 | tristate "OS/2 HPFS file system support" | 2 | tristate "OS/2 HPFS file system support" |
3 | depends on BLOCK | 3 | depends on BLOCK |
4 | depends on BKL # nontrivial to fix | ||
4 | help | 5 | help |
5 | OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS | 6 | OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS |
6 | is the file system used for organizing files on OS/2 hard disk | 7 | is the file system used for organizing files on OS/2 hard disk |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0e8014ea6b94..262419f83d80 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, | |||
1371 | 1371 | ||
1372 | if (!compat && !ro && !incompat) | 1372 | if (!compat && !ro && !incompat) |
1373 | return 1; | 1373 | return 1; |
1374 | /* Load journal superblock if it is not loaded yet. */ | ||
1375 | if (journal->j_format_version == 0 && | ||
1376 | journal_get_superblock(journal) != 0) | ||
1377 | return 0; | ||
1374 | if (journal->j_format_version == 1) | 1378 | if (journal->j_format_version == 1) |
1375 | return 0; | 1379 | return 0; |
1376 | 1380 | ||
diff --git a/fs/libfs.c b/fs/libfs.c index 0a9da95317f7..62baa0387d6e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
@@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync) | |||
913 | } | 913 | } |
914 | EXPORT_SYMBOL(generic_file_fsync); | 914 | EXPORT_SYMBOL(generic_file_fsync); |
915 | 915 | ||
916 | /** | ||
917 | * generic_check_addressable - Check addressability of file system | ||
918 | * @blocksize_bits: log of file system block size | ||
919 | * @num_blocks: number of blocks in file system | ||
920 | * | ||
921 | * Determine whether a file system with @num_blocks blocks (and a | ||
922 | * block size of 2**@blocksize_bits) is addressable by the sector_t | ||
923 | * and page cache of the system. Return 0 if so and -EFBIG otherwise. | ||
924 | */ | ||
925 | int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) | ||
926 | { | ||
927 | u64 last_fs_block = num_blocks - 1; | ||
928 | u64 last_fs_page = | ||
929 | last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits); | ||
930 | |||
931 | if (unlikely(num_blocks == 0)) | ||
932 | return 0; | ||
933 | |||
934 | if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT)) | ||
935 | return -EINVAL; | ||
936 | |||
937 | if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || | ||
938 | (last_fs_page > (pgoff_t)(~0ULL))) { | ||
939 | return -EFBIG; | ||
940 | } | ||
941 | return 0; | ||
942 | } | ||
943 | EXPORT_SYMBOL(generic_check_addressable); | ||
944 | |||
916 | /* | 945 | /* |
917 | * No-op implementation of ->fsync for in-memory filesystems. | 946 | * No-op implementation of ->fsync for in-memory filesystems. |
918 | */ | 947 | */ |
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f7e13db613cb..b950415d7c43 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig | |||
@@ -1,6 +1,7 @@ | |||
1 | config NFS_FS | 1 | config NFS_FS |
2 | tristate "NFS client support" | 2 | tristate "NFS client support" |
3 | depends on INET && FILE_LOCKING | 3 | depends on INET && FILE_LOCKING |
4 | depends on BKL # fix as soon as lockd is done | ||
4 | select LOCKD | 5 | select LOCKD |
5 | select SUNRPC | 6 | select SUNRPC |
6 | select NFS_ACL_SUPPORT if NFS_V3_ACL | 7 | select NFS_ACL_SUPPORT if NFS_V3_ACL |
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 4264377552e2..7cf4ddafb4ab 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig | |||
@@ -2,6 +2,7 @@ config NFSD | |||
2 | tristate "NFS server support" | 2 | tristate "NFS server support" |
3 | depends on INET | 3 | depends on INET |
4 | depends on FILE_LOCKING | 4 | depends on FILE_LOCKING |
5 | depends on BKL # fix as soon as lockd is done | ||
5 | select LOCKD | 6 | select LOCKD |
6 | select SUNRPC | 7 | select SUNRPC |
7 | select EXPORTFS | 8 | select EXPORTFS |
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index cdfb8c6a4206..c16f8d8331b5 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h | |||
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp) | |||
196 | static inline void | 196 | static inline void |
197 | fh_unlock(struct svc_fh *fhp) | 197 | fh_unlock(struct svc_fh *fhp) |
198 | { | 198 | { |
199 | BUG_ON(!fhp->fh_dentry); | ||
200 | |||
201 | if (fhp->fh_locked) { | 199 | if (fhp->fh_locked) { |
202 | fill_post_wcc(fhp); | 200 | fill_post_wcc(fhp); |
203 | mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); | 201 | mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); |
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 22c629eedd82..b388443c3a09 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig | |||
@@ -3,4 +3,4 @@ config FSNOTIFY | |||
3 | 3 | ||
4 | source "fs/notify/dnotify/Kconfig" | 4 | source "fs/notify/dnotify/Kconfig" |
5 | source "fs/notify/inotify/Kconfig" | 5 | source "fs/notify/inotify/Kconfig" |
6 | source "fs/notify/fanotify/Kconfig" | 6 | #source "fs/notify/fanotify/Kconfig" |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0de69c9a08be..5cfeee118158 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt { | |||
883 | * out in so that future reads from that region will get | 883 | * out in so that future reads from that region will get |
884 | * zero's. | 884 | * zero's. |
885 | */ | 885 | */ |
886 | struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; | ||
887 | unsigned int w_num_pages; | 886 | unsigned int w_num_pages; |
887 | struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; | ||
888 | struct page *w_target_page; | 888 | struct page *w_target_page; |
889 | 889 | ||
890 | /* | 890 | /* |
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, | |||
1642 | return ret; | 1642 | return ret; |
1643 | } | 1643 | } |
1644 | 1644 | ||
1645 | int ocfs2_write_begin_nolock(struct address_space *mapping, | 1645 | int ocfs2_write_begin_nolock(struct file *filp, |
1646 | struct address_space *mapping, | ||
1646 | loff_t pos, unsigned len, unsigned flags, | 1647 | loff_t pos, unsigned len, unsigned flags, |
1647 | struct page **pagep, void **fsdata, | 1648 | struct page **pagep, void **fsdata, |
1648 | struct buffer_head *di_bh, struct page *mmap_page) | 1649 | struct buffer_head *di_bh, struct page *mmap_page) |
@@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1692 | mlog_errno(ret); | 1693 | mlog_errno(ret); |
1693 | goto out; | 1694 | goto out; |
1694 | } else if (ret == 1) { | 1695 | } else if (ret == 1) { |
1695 | ret = ocfs2_refcount_cow(inode, di_bh, | 1696 | ret = ocfs2_refcount_cow(inode, filp, di_bh, |
1696 | wc->w_cpos, wc->w_clen, UINT_MAX); | 1697 | wc->w_cpos, wc->w_clen, UINT_MAX); |
1697 | if (ret) { | 1698 | if (ret) { |
1698 | mlog_errno(ret); | 1699 | mlog_errno(ret); |
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, | |||
1854 | */ | 1855 | */ |
1855 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 1856 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
1856 | 1857 | ||
1857 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, | 1858 | ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, |
1858 | fsdata, di_bh, NULL); | 1859 | fsdata, di_bh, NULL); |
1859 | if (ret) { | 1860 | if (ret) { |
1860 | mlog_errno(ret); | 1861 | mlog_errno(ret); |
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index c48e93ffc513..7606f663da6d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
48 | loff_t pos, unsigned len, unsigned copied, | 48 | loff_t pos, unsigned len, unsigned copied, |
49 | struct page *page, void *fsdata); | 49 | struct page *page, void *fsdata); |
50 | 50 | ||
51 | int ocfs2_write_begin_nolock(struct address_space *mapping, | 51 | int ocfs2_write_begin_nolock(struct file *filp, |
52 | struct address_space *mapping, | ||
52 | loff_t pos, unsigned len, unsigned flags, | 53 | loff_t pos, unsigned len, unsigned flags, |
53 | struct page **pagep, void **fsdata, | 54 | struct page **pagep, void **fsdata, |
54 | struct buffer_head *di_bh, struct page *mmap_page); | 55 | struct buffer_head *di_bh, struct page *mmap_page); |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 41d5f1f92d56..52c7557f3e25 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | |||
62 | static LIST_HEAD(o2hb_node_events); | 62 | static LIST_HEAD(o2hb_node_events); |
63 | static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); | 63 | static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); |
64 | 64 | ||
65 | /* | ||
66 | * In global heartbeat, we maintain a series of region bitmaps. | ||
67 | * - o2hb_region_bitmap allows us to limit the region number to max region. | ||
68 | * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). | ||
69 | * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes | ||
70 | * heartbeat on it. | ||
71 | * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. | ||
72 | */ | ||
73 | static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | ||
74 | static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | ||
75 | static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | ||
76 | static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | ||
77 | |||
78 | #define O2HB_DB_TYPE_LIVENODES 0 | ||
79 | #define O2HB_DB_TYPE_LIVEREGIONS 1 | ||
80 | #define O2HB_DB_TYPE_QUORUMREGIONS 2 | ||
81 | #define O2HB_DB_TYPE_FAILEDREGIONS 3 | ||
82 | #define O2HB_DB_TYPE_REGION_LIVENODES 4 | ||
83 | #define O2HB_DB_TYPE_REGION_NUMBER 5 | ||
84 | #define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6 | ||
85 | struct o2hb_debug_buf { | ||
86 | int db_type; | ||
87 | int db_size; | ||
88 | int db_len; | ||
89 | void *db_data; | ||
90 | }; | ||
91 | |||
92 | static struct o2hb_debug_buf *o2hb_db_livenodes; | ||
93 | static struct o2hb_debug_buf *o2hb_db_liveregions; | ||
94 | static struct o2hb_debug_buf *o2hb_db_quorumregions; | ||
95 | static struct o2hb_debug_buf *o2hb_db_failedregions; | ||
96 | |||
65 | #define O2HB_DEBUG_DIR "o2hb" | 97 | #define O2HB_DEBUG_DIR "o2hb" |
66 | #define O2HB_DEBUG_LIVENODES "livenodes" | 98 | #define O2HB_DEBUG_LIVENODES "livenodes" |
99 | #define O2HB_DEBUG_LIVEREGIONS "live_regions" | ||
100 | #define O2HB_DEBUG_QUORUMREGIONS "quorum_regions" | ||
101 | #define O2HB_DEBUG_FAILEDREGIONS "failed_regions" | ||
102 | #define O2HB_DEBUG_REGION_NUMBER "num" | ||
103 | #define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms" | ||
104 | |||
67 | static struct dentry *o2hb_debug_dir; | 105 | static struct dentry *o2hb_debug_dir; |
68 | static struct dentry *o2hb_debug_livenodes; | 106 | static struct dentry *o2hb_debug_livenodes; |
107 | static struct dentry *o2hb_debug_liveregions; | ||
108 | static struct dentry *o2hb_debug_quorumregions; | ||
109 | static struct dentry *o2hb_debug_failedregions; | ||
69 | 110 | ||
70 | static LIST_HEAD(o2hb_all_regions); | 111 | static LIST_HEAD(o2hb_all_regions); |
71 | 112 | ||
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); | |||
77 | 118 | ||
78 | #define O2HB_DEFAULT_BLOCK_BITS 9 | 119 | #define O2HB_DEFAULT_BLOCK_BITS 9 |
79 | 120 | ||
121 | enum o2hb_heartbeat_modes { | ||
122 | O2HB_HEARTBEAT_LOCAL = 0, | ||
123 | O2HB_HEARTBEAT_GLOBAL, | ||
124 | O2HB_HEARTBEAT_NUM_MODES, | ||
125 | }; | ||
126 | |||
127 | char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { | ||
128 | "local", /* O2HB_HEARTBEAT_LOCAL */ | ||
129 | "global", /* O2HB_HEARTBEAT_GLOBAL */ | ||
130 | }; | ||
131 | |||
80 | unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; | 132 | unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; |
133 | unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; | ||
81 | 134 | ||
82 | /* Only sets a new threshold if there are no active regions. | 135 | /* Only sets a new threshold if there are no active regions. |
83 | * | 136 | * |
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold) | |||
94 | } | 147 | } |
95 | } | 148 | } |
96 | 149 | ||
150 | static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) | ||
151 | { | ||
152 | int ret = -1; | ||
153 | |||
154 | if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) { | ||
155 | spin_lock(&o2hb_live_lock); | ||
156 | if (list_empty(&o2hb_all_regions)) { | ||
157 | o2hb_heartbeat_mode = hb_mode; | ||
158 | ret = 0; | ||
159 | } | ||
160 | spin_unlock(&o2hb_live_lock); | ||
161 | } | ||
162 | |||
163 | return ret; | ||
164 | } | ||
165 | |||
97 | struct o2hb_node_event { | 166 | struct o2hb_node_event { |
98 | struct list_head hn_item; | 167 | struct list_head hn_item; |
99 | enum o2hb_callback_type hn_event_type; | 168 | enum o2hb_callback_type hn_event_type; |
@@ -135,6 +204,18 @@ struct o2hb_region { | |||
135 | struct block_device *hr_bdev; | 204 | struct block_device *hr_bdev; |
136 | struct o2hb_disk_slot *hr_slots; | 205 | struct o2hb_disk_slot *hr_slots; |
137 | 206 | ||
207 | /* live node map of this region */ | ||
208 | unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
209 | unsigned int hr_region_num; | ||
210 | |||
211 | struct dentry *hr_debug_dir; | ||
212 | struct dentry *hr_debug_livenodes; | ||
213 | struct dentry *hr_debug_regnum; | ||
214 | struct dentry *hr_debug_elapsed_time; | ||
215 | struct o2hb_debug_buf *hr_db_livenodes; | ||
216 | struct o2hb_debug_buf *hr_db_regnum; | ||
217 | struct o2hb_debug_buf *hr_db_elapsed_time; | ||
218 | |||
138 | /* let the person setting up hb wait for it to return until it | 219 | /* let the person setting up hb wait for it to return until it |
139 | * has reached a 'steady' state. This will be fixed when we have | 220 | * has reached a 'steady' state. This will be fixed when we have |
140 | * a more complete api that doesn't lead to this sort of fragility. */ | 221 | * a more complete api that doesn't lead to this sort of fragility. */ |
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt { | |||
163 | int wc_error; | 244 | int wc_error; |
164 | }; | 245 | }; |
165 | 246 | ||
247 | static int o2hb_pop_count(void *map, int count) | ||
248 | { | ||
249 | int i = -1, pop = 0; | ||
250 | |||
251 | while ((i = find_next_bit(map, count, i + 1)) < count) | ||
252 | pop++; | ||
253 | return pop; | ||
254 | } | ||
255 | |||
166 | static void o2hb_write_timeout(struct work_struct *work) | 256 | static void o2hb_write_timeout(struct work_struct *work) |
167 | { | 257 | { |
258 | int failed, quorum; | ||
259 | unsigned long flags; | ||
168 | struct o2hb_region *reg = | 260 | struct o2hb_region *reg = |
169 | container_of(work, struct o2hb_region, | 261 | container_of(work, struct o2hb_region, |
170 | hr_write_timeout_work.work); | 262 | hr_write_timeout_work.work); |
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work) | |||
172 | mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " | 264 | mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " |
173 | "milliseconds\n", reg->hr_dev_name, | 265 | "milliseconds\n", reg->hr_dev_name, |
174 | jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); | 266 | jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); |
267 | |||
268 | if (o2hb_global_heartbeat_active()) { | ||
269 | spin_lock_irqsave(&o2hb_live_lock, flags); | ||
270 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
271 | set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); | ||
272 | failed = o2hb_pop_count(&o2hb_failed_region_bitmap, | ||
273 | O2NM_MAX_REGIONS); | ||
274 | quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, | ||
275 | O2NM_MAX_REGIONS); | ||
276 | spin_unlock_irqrestore(&o2hb_live_lock, flags); | ||
277 | |||
278 | mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", | ||
279 | quorum, failed); | ||
280 | |||
281 | /* | ||
282 | * Fence if the number of failed regions >= half the number | ||
283 | * of quorum regions | ||
284 | */ | ||
285 | if ((failed << 1) < quorum) | ||
286 | return; | ||
287 | } | ||
288 | |||
175 | o2quo_disk_timeout(); | 289 | o2quo_disk_timeout(); |
176 | } | 290 | } |
177 | 291 | ||
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) | |||
180 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", | 294 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", |
181 | O2HB_MAX_WRITE_TIMEOUT_MS); | 295 | O2HB_MAX_WRITE_TIMEOUT_MS); |
182 | 296 | ||
297 | if (o2hb_global_heartbeat_active()) { | ||
298 | spin_lock(&o2hb_live_lock); | ||
299 | clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); | ||
300 | spin_unlock(&o2hb_live_lock); | ||
301 | } | ||
183 | cancel_delayed_work(®->hr_write_timeout_work); | 302 | cancel_delayed_work(®->hr_write_timeout_work); |
184 | reg->hr_last_timeout_start = jiffies; | 303 | reg->hr_last_timeout_start = jiffies; |
185 | schedule_delayed_work(®->hr_write_timeout_work, | 304 | schedule_delayed_work(®->hr_write_timeout_work, |
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event, | |||
513 | { | 632 | { |
514 | assert_spin_locked(&o2hb_live_lock); | 633 | assert_spin_locked(&o2hb_live_lock); |
515 | 634 | ||
635 | BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); | ||
636 | |||
516 | event->hn_event_type = type; | 637 | event->hn_event_type = type; |
517 | event->hn_node = node; | 638 | event->hn_node = node; |
518 | event->hn_node_num = node_num; | 639 | event->hn_node_num = node_num; |
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) | |||
554 | o2nm_node_put(node); | 675 | o2nm_node_put(node); |
555 | } | 676 | } |
556 | 677 | ||
678 | static void o2hb_set_quorum_device(struct o2hb_region *reg, | ||
679 | struct o2hb_disk_slot *slot) | ||
680 | { | ||
681 | assert_spin_locked(&o2hb_live_lock); | ||
682 | |||
683 | if (!o2hb_global_heartbeat_active()) | ||
684 | return; | ||
685 | |||
686 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
687 | return; | ||
688 | |||
689 | /* | ||
690 | * A region can be added to the quorum only when it sees all | ||
691 | * live nodes heartbeat on it. In other words, the region has been | ||
692 | * added to all nodes. | ||
693 | */ | ||
694 | if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, | ||
695 | sizeof(o2hb_live_node_bitmap))) | ||
696 | return; | ||
697 | |||
698 | if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) | ||
699 | return; | ||
700 | |||
701 | printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", | ||
702 | config_item_name(®->hr_item)); | ||
703 | |||
704 | set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); | ||
705 | } | ||
706 | |||
557 | static int o2hb_check_slot(struct o2hb_region *reg, | 707 | static int o2hb_check_slot(struct o2hb_region *reg, |
558 | struct o2hb_disk_slot *slot) | 708 | struct o2hb_disk_slot *slot) |
559 | { | 709 | { |
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg, | |||
565 | u64 cputime; | 715 | u64 cputime; |
566 | unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; | 716 | unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; |
567 | unsigned int slot_dead_ms; | 717 | unsigned int slot_dead_ms; |
718 | int tmp; | ||
568 | 719 | ||
569 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); | 720 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); |
570 | 721 | ||
571 | /* Is this correct? Do we assume that the node doesn't exist | 722 | /* |
572 | * if we're not configured for him? */ | 723 | * If a node is no longer configured but is still in the livemap, we |
724 | * may need to clear that bit from the livemap. | ||
725 | */ | ||
573 | node = o2nm_get_node_by_num(slot->ds_node_num); | 726 | node = o2nm_get_node_by_num(slot->ds_node_num); |
574 | if (!node) | 727 | if (!node) { |
575 | return 0; | 728 | spin_lock(&o2hb_live_lock); |
729 | tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); | ||
730 | spin_unlock(&o2hb_live_lock); | ||
731 | if (!tmp) | ||
732 | return 0; | ||
733 | } | ||
576 | 734 | ||
577 | if (!o2hb_verify_crc(reg, hb_block)) { | 735 | if (!o2hb_verify_crc(reg, hb_block)) { |
578 | /* all paths from here will drop o2hb_live_lock for | 736 | /* all paths from here will drop o2hb_live_lock for |
@@ -639,8 +797,12 @@ fire_callbacks: | |||
639 | mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", | 797 | mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n", |
640 | slot->ds_node_num, (long long)slot->ds_last_generation); | 798 | slot->ds_node_num, (long long)slot->ds_last_generation); |
641 | 799 | ||
800 | set_bit(slot->ds_node_num, reg->hr_live_node_bitmap); | ||
801 | |||
642 | /* first on the list generates a callback */ | 802 | /* first on the list generates a callback */ |
643 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { | 803 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { |
804 | mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes " | ||
805 | "bitmap\n", slot->ds_node_num); | ||
644 | set_bit(slot->ds_node_num, o2hb_live_node_bitmap); | 806 | set_bit(slot->ds_node_num, o2hb_live_node_bitmap); |
645 | 807 | ||
646 | o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, | 808 | o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, |
@@ -684,13 +846,18 @@ fire_callbacks: | |||
684 | mlog(ML_HEARTBEAT, "Node %d left my region\n", | 846 | mlog(ML_HEARTBEAT, "Node %d left my region\n", |
685 | slot->ds_node_num); | 847 | slot->ds_node_num); |
686 | 848 | ||
849 | clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap); | ||
850 | |||
687 | /* last off the live_slot generates a callback */ | 851 | /* last off the live_slot generates a callback */ |
688 | list_del_init(&slot->ds_live_item); | 852 | list_del_init(&slot->ds_live_item); |
689 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { | 853 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { |
854 | mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live " | ||
855 | "nodes bitmap\n", slot->ds_node_num); | ||
690 | clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); | 856 | clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); |
691 | 857 | ||
692 | o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, | 858 | /* node can be null */ |
693 | slot->ds_node_num); | 859 | o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, |
860 | node, slot->ds_node_num); | ||
694 | 861 | ||
695 | changed = 1; | 862 | changed = 1; |
696 | } | 863 | } |
@@ -706,11 +873,14 @@ fire_callbacks: | |||
706 | slot->ds_equal_samples = 0; | 873 | slot->ds_equal_samples = 0; |
707 | } | 874 | } |
708 | out: | 875 | out: |
876 | o2hb_set_quorum_device(reg, slot); | ||
877 | |||
709 | spin_unlock(&o2hb_live_lock); | 878 | spin_unlock(&o2hb_live_lock); |
710 | 879 | ||
711 | o2hb_run_event_list(&event); | 880 | o2hb_run_event_list(&event); |
712 | 881 | ||
713 | o2nm_node_put(node); | 882 | if (node) |
883 | o2nm_node_put(node); | ||
714 | return changed; | 884 | return changed; |
715 | } | 885 | } |
716 | 886 | ||
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
737 | { | 907 | { |
738 | int i, ret, highest_node, change = 0; | 908 | int i, ret, highest_node, change = 0; |
739 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 909 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
910 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
740 | struct o2hb_bio_wait_ctxt write_wc; | 911 | struct o2hb_bio_wait_ctxt write_wc; |
741 | 912 | ||
742 | ret = o2nm_configured_node_map(configured_nodes, | 913 | ret = o2nm_configured_node_map(configured_nodes, |
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
746 | return ret; | 917 | return ret; |
747 | } | 918 | } |
748 | 919 | ||
920 | /* | ||
921 | * If a node is not configured but is in the livemap, we still need | ||
922 | * to read the slot so as to be able to remove it from the livemap. | ||
923 | */ | ||
924 | o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); | ||
925 | i = -1; | ||
926 | while ((i = find_next_bit(live_node_bitmap, | ||
927 | O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { | ||
928 | set_bit(i, configured_nodes); | ||
929 | } | ||
930 | |||
749 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); | 931 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); |
750 | if (highest_node >= O2NM_MAX_NODES) { | 932 | if (highest_node >= O2NM_MAX_NODES) { |
751 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); | 933 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); |
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data) | |||
917 | #ifdef CONFIG_DEBUG_FS | 1099 | #ifdef CONFIG_DEBUG_FS |
918 | static int o2hb_debug_open(struct inode *inode, struct file *file) | 1100 | static int o2hb_debug_open(struct inode *inode, struct file *file) |
919 | { | 1101 | { |
1102 | struct o2hb_debug_buf *db = inode->i_private; | ||
1103 | struct o2hb_region *reg; | ||
920 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 1104 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
921 | char *buf = NULL; | 1105 | char *buf = NULL; |
922 | int i = -1; | 1106 | int i = -1; |
923 | int out = 0; | 1107 | int out = 0; |
924 | 1108 | ||
1109 | /* max_nodes should be the largest bitmap we pass here */ | ||
1110 | BUG_ON(sizeof(map) < db->db_size); | ||
1111 | |||
925 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 1112 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
926 | if (!buf) | 1113 | if (!buf) |
927 | goto bail; | 1114 | goto bail; |
928 | 1115 | ||
929 | o2hb_fill_node_map(map, sizeof(map)); | 1116 | switch (db->db_type) { |
1117 | case O2HB_DB_TYPE_LIVENODES: | ||
1118 | case O2HB_DB_TYPE_LIVEREGIONS: | ||
1119 | case O2HB_DB_TYPE_QUORUMREGIONS: | ||
1120 | case O2HB_DB_TYPE_FAILEDREGIONS: | ||
1121 | spin_lock(&o2hb_live_lock); | ||
1122 | memcpy(map, db->db_data, db->db_size); | ||
1123 | spin_unlock(&o2hb_live_lock); | ||
1124 | break; | ||
1125 | |||
1126 | case O2HB_DB_TYPE_REGION_LIVENODES: | ||
1127 | spin_lock(&o2hb_live_lock); | ||
1128 | reg = (struct o2hb_region *)db->db_data; | ||
1129 | memcpy(map, reg->hr_live_node_bitmap, db->db_size); | ||
1130 | spin_unlock(&o2hb_live_lock); | ||
1131 | break; | ||
1132 | |||
1133 | case O2HB_DB_TYPE_REGION_NUMBER: | ||
1134 | reg = (struct o2hb_region *)db->db_data; | ||
1135 | out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", | ||
1136 | reg->hr_region_num); | ||
1137 | goto done; | ||
1138 | |||
1139 | case O2HB_DB_TYPE_REGION_ELAPSED_TIME: | ||
1140 | reg = (struct o2hb_region *)db->db_data; | ||
1141 | out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", | ||
1142 | jiffies_to_msecs(jiffies - | ||
1143 | reg->hr_last_timeout_start)); | ||
1144 | goto done; | ||
1145 | |||
1146 | default: | ||
1147 | goto done; | ||
1148 | } | ||
930 | 1149 | ||
931 | while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) | 1150 | while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) |
932 | out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); | 1151 | out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); |
933 | out += snprintf(buf + out, PAGE_SIZE - out, "\n"); | 1152 | out += snprintf(buf + out, PAGE_SIZE - out, "\n"); |
934 | 1153 | ||
1154 | done: | ||
935 | i_size_write(inode, out); | 1155 | i_size_write(inode, out); |
936 | 1156 | ||
937 | file->private_data = buf; | 1157 | file->private_data = buf; |
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = { | |||
978 | 1198 | ||
979 | void o2hb_exit(void) | 1199 | void o2hb_exit(void) |
980 | { | 1200 | { |
981 | if (o2hb_debug_livenodes) | 1201 | kfree(o2hb_db_livenodes); |
982 | debugfs_remove(o2hb_debug_livenodes); | 1202 | kfree(o2hb_db_liveregions); |
983 | if (o2hb_debug_dir) | 1203 | kfree(o2hb_db_quorumregions); |
984 | debugfs_remove(o2hb_debug_dir); | 1204 | kfree(o2hb_db_failedregions); |
1205 | debugfs_remove(o2hb_debug_failedregions); | ||
1206 | debugfs_remove(o2hb_debug_quorumregions); | ||
1207 | debugfs_remove(o2hb_debug_liveregions); | ||
1208 | debugfs_remove(o2hb_debug_livenodes); | ||
1209 | debugfs_remove(o2hb_debug_dir); | ||
1210 | } | ||
1211 | |||
1212 | static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, | ||
1213 | struct o2hb_debug_buf **db, int db_len, | ||
1214 | int type, int size, int len, void *data) | ||
1215 | { | ||
1216 | *db = kmalloc(db_len, GFP_KERNEL); | ||
1217 | if (!*db) | ||
1218 | return NULL; | ||
1219 | |||
1220 | (*db)->db_type = type; | ||
1221 | (*db)->db_size = size; | ||
1222 | (*db)->db_len = len; | ||
1223 | (*db)->db_data = data; | ||
1224 | |||
1225 | return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, | ||
1226 | &o2hb_debug_fops); | ||
1227 | } | ||
1228 | |||
1229 | static int o2hb_debug_init(void) | ||
1230 | { | ||
1231 | int ret = -ENOMEM; | ||
1232 | |||
1233 | o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); | ||
1234 | if (!o2hb_debug_dir) { | ||
1235 | mlog_errno(ret); | ||
1236 | goto bail; | ||
1237 | } | ||
1238 | |||
1239 | o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, | ||
1240 | o2hb_debug_dir, | ||
1241 | &o2hb_db_livenodes, | ||
1242 | sizeof(*o2hb_db_livenodes), | ||
1243 | O2HB_DB_TYPE_LIVENODES, | ||
1244 | sizeof(o2hb_live_node_bitmap), | ||
1245 | O2NM_MAX_NODES, | ||
1246 | o2hb_live_node_bitmap); | ||
1247 | if (!o2hb_debug_livenodes) { | ||
1248 | mlog_errno(ret); | ||
1249 | goto bail; | ||
1250 | } | ||
1251 | |||
1252 | o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, | ||
1253 | o2hb_debug_dir, | ||
1254 | &o2hb_db_liveregions, | ||
1255 | sizeof(*o2hb_db_liveregions), | ||
1256 | O2HB_DB_TYPE_LIVEREGIONS, | ||
1257 | sizeof(o2hb_live_region_bitmap), | ||
1258 | O2NM_MAX_REGIONS, | ||
1259 | o2hb_live_region_bitmap); | ||
1260 | if (!o2hb_debug_liveregions) { | ||
1261 | mlog_errno(ret); | ||
1262 | goto bail; | ||
1263 | } | ||
1264 | |||
1265 | o2hb_debug_quorumregions = | ||
1266 | o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, | ||
1267 | o2hb_debug_dir, | ||
1268 | &o2hb_db_quorumregions, | ||
1269 | sizeof(*o2hb_db_quorumregions), | ||
1270 | O2HB_DB_TYPE_QUORUMREGIONS, | ||
1271 | sizeof(o2hb_quorum_region_bitmap), | ||
1272 | O2NM_MAX_REGIONS, | ||
1273 | o2hb_quorum_region_bitmap); | ||
1274 | if (!o2hb_debug_quorumregions) { | ||
1275 | mlog_errno(ret); | ||
1276 | goto bail; | ||
1277 | } | ||
1278 | |||
1279 | o2hb_debug_failedregions = | ||
1280 | o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, | ||
1281 | o2hb_debug_dir, | ||
1282 | &o2hb_db_failedregions, | ||
1283 | sizeof(*o2hb_db_failedregions), | ||
1284 | O2HB_DB_TYPE_FAILEDREGIONS, | ||
1285 | sizeof(o2hb_failed_region_bitmap), | ||
1286 | O2NM_MAX_REGIONS, | ||
1287 | o2hb_failed_region_bitmap); | ||
1288 | if (!o2hb_debug_failedregions) { | ||
1289 | mlog_errno(ret); | ||
1290 | goto bail; | ||
1291 | } | ||
1292 | |||
1293 | ret = 0; | ||
1294 | bail: | ||
1295 | if (ret) | ||
1296 | o2hb_exit(); | ||
1297 | |||
1298 | return ret; | ||
985 | } | 1299 | } |
986 | 1300 | ||
987 | int o2hb_init(void) | 1301 | int o2hb_init(void) |
@@ -997,24 +1311,12 @@ int o2hb_init(void) | |||
997 | INIT_LIST_HEAD(&o2hb_node_events); | 1311 | INIT_LIST_HEAD(&o2hb_node_events); |
998 | 1312 | ||
999 | memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); | 1313 | memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); |
1314 | memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); | ||
1315 | memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); | ||
1316 | memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); | ||
1317 | memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); | ||
1000 | 1318 | ||
1001 | o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); | 1319 | return o2hb_debug_init(); |
1002 | if (!o2hb_debug_dir) { | ||
1003 | mlog_errno(-ENOMEM); | ||
1004 | return -ENOMEM; | ||
1005 | } | ||
1006 | |||
1007 | o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES, | ||
1008 | S_IFREG|S_IRUSR, | ||
1009 | o2hb_debug_dir, NULL, | ||
1010 | &o2hb_debug_fops); | ||
1011 | if (!o2hb_debug_livenodes) { | ||
1012 | mlog_errno(-ENOMEM); | ||
1013 | debugfs_remove(o2hb_debug_dir); | ||
1014 | return -ENOMEM; | ||
1015 | } | ||
1016 | |||
1017 | return 0; | ||
1018 | } | 1320 | } |
1019 | 1321 | ||
1020 | /* if we're already in a callback then we're already serialized by the sem */ | 1322 | /* if we're already in a callback then we're already serialized by the sem */ |
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item) | |||
1078 | if (reg->hr_slots) | 1380 | if (reg->hr_slots) |
1079 | kfree(reg->hr_slots); | 1381 | kfree(reg->hr_slots); |
1080 | 1382 | ||
1383 | kfree(reg->hr_db_regnum); | ||
1384 | kfree(reg->hr_db_livenodes); | ||
1385 | debugfs_remove(reg->hr_debug_livenodes); | ||
1386 | debugfs_remove(reg->hr_debug_regnum); | ||
1387 | debugfs_remove(reg->hr_debug_elapsed_time); | ||
1388 | debugfs_remove(reg->hr_debug_dir); | ||
1389 | |||
1081 | spin_lock(&o2hb_live_lock); | 1390 | spin_lock(&o2hb_live_lock); |
1082 | list_del(®->hr_all_item); | 1391 | list_del(®->hr_all_item); |
1083 | spin_unlock(&o2hb_live_lock); | 1392 | spin_unlock(&o2hb_live_lock); |
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1441 | /* Ok, we were woken. Make sure it wasn't by drop_item() */ | 1750 | /* Ok, we were woken. Make sure it wasn't by drop_item() */ |
1442 | spin_lock(&o2hb_live_lock); | 1751 | spin_lock(&o2hb_live_lock); |
1443 | hb_task = reg->hr_task; | 1752 | hb_task = reg->hr_task; |
1753 | if (o2hb_global_heartbeat_active()) | ||
1754 | set_bit(reg->hr_region_num, o2hb_live_region_bitmap); | ||
1444 | spin_unlock(&o2hb_live_lock); | 1755 | spin_unlock(&o2hb_live_lock); |
1445 | 1756 | ||
1446 | if (hb_task) | 1757 | if (hb_task) |
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1448 | else | 1759 | else |
1449 | ret = -EIO; | 1760 | ret = -EIO; |
1450 | 1761 | ||
1762 | if (hb_task && o2hb_global_heartbeat_active()) | ||
1763 | printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", | ||
1764 | config_item_name(®->hr_item)); | ||
1765 | |||
1451 | out: | 1766 | out: |
1452 | if (filp) | 1767 | if (filp) |
1453 | fput(filp); | 1768 | fput(filp); |
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group | |||
1586 | : NULL; | 1901 | : NULL; |
1587 | } | 1902 | } |
1588 | 1903 | ||
1904 | static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) | ||
1905 | { | ||
1906 | int ret = -ENOMEM; | ||
1907 | |||
1908 | reg->hr_debug_dir = | ||
1909 | debugfs_create_dir(config_item_name(®->hr_item), dir); | ||
1910 | if (!reg->hr_debug_dir) { | ||
1911 | mlog_errno(ret); | ||
1912 | goto bail; | ||
1913 | } | ||
1914 | |||
1915 | reg->hr_debug_livenodes = | ||
1916 | o2hb_debug_create(O2HB_DEBUG_LIVENODES, | ||
1917 | reg->hr_debug_dir, | ||
1918 | &(reg->hr_db_livenodes), | ||
1919 | sizeof(*(reg->hr_db_livenodes)), | ||
1920 | O2HB_DB_TYPE_REGION_LIVENODES, | ||
1921 | sizeof(reg->hr_live_node_bitmap), | ||
1922 | O2NM_MAX_NODES, reg); | ||
1923 | if (!reg->hr_debug_livenodes) { | ||
1924 | mlog_errno(ret); | ||
1925 | goto bail; | ||
1926 | } | ||
1927 | |||
1928 | reg->hr_debug_regnum = | ||
1929 | o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, | ||
1930 | reg->hr_debug_dir, | ||
1931 | &(reg->hr_db_regnum), | ||
1932 | sizeof(*(reg->hr_db_regnum)), | ||
1933 | O2HB_DB_TYPE_REGION_NUMBER, | ||
1934 | 0, O2NM_MAX_NODES, reg); | ||
1935 | if (!reg->hr_debug_regnum) { | ||
1936 | mlog_errno(ret); | ||
1937 | goto bail; | ||
1938 | } | ||
1939 | |||
1940 | reg->hr_debug_elapsed_time = | ||
1941 | o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, | ||
1942 | reg->hr_debug_dir, | ||
1943 | &(reg->hr_db_elapsed_time), | ||
1944 | sizeof(*(reg->hr_db_elapsed_time)), | ||
1945 | O2HB_DB_TYPE_REGION_ELAPSED_TIME, | ||
1946 | 0, 0, reg); | ||
1947 | if (!reg->hr_debug_elapsed_time) { | ||
1948 | mlog_errno(ret); | ||
1949 | goto bail; | ||
1950 | } | ||
1951 | |||
1952 | ret = 0; | ||
1953 | bail: | ||
1954 | return ret; | ||
1955 | } | ||
1956 | |||
1589 | static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, | 1957 | static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, |
1590 | const char *name) | 1958 | const char *name) |
1591 | { | 1959 | { |
1592 | struct o2hb_region *reg = NULL; | 1960 | struct o2hb_region *reg = NULL; |
1961 | int ret; | ||
1593 | 1962 | ||
1594 | reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); | 1963 | reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL); |
1595 | if (reg == NULL) | 1964 | if (reg == NULL) |
1596 | return ERR_PTR(-ENOMEM); | 1965 | return ERR_PTR(-ENOMEM); |
1597 | 1966 | ||
1598 | config_item_init_type_name(®->hr_item, name, &o2hb_region_type); | 1967 | if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) |
1968 | return ERR_PTR(-ENAMETOOLONG); | ||
1599 | 1969 | ||
1600 | spin_lock(&o2hb_live_lock); | 1970 | spin_lock(&o2hb_live_lock); |
1971 | reg->hr_region_num = 0; | ||
1972 | if (o2hb_global_heartbeat_active()) { | ||
1973 | reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap, | ||
1974 | O2NM_MAX_REGIONS); | ||
1975 | if (reg->hr_region_num >= O2NM_MAX_REGIONS) { | ||
1976 | spin_unlock(&o2hb_live_lock); | ||
1977 | return ERR_PTR(-EFBIG); | ||
1978 | } | ||
1979 | set_bit(reg->hr_region_num, o2hb_region_bitmap); | ||
1980 | } | ||
1601 | list_add_tail(®->hr_all_item, &o2hb_all_regions); | 1981 | list_add_tail(®->hr_all_item, &o2hb_all_regions); |
1602 | spin_unlock(&o2hb_live_lock); | 1982 | spin_unlock(&o2hb_live_lock); |
1603 | 1983 | ||
1984 | config_item_init_type_name(®->hr_item, name, &o2hb_region_type); | ||
1985 | |||
1986 | ret = o2hb_debug_region_init(reg, o2hb_debug_dir); | ||
1987 | if (ret) { | ||
1988 | config_item_put(®->hr_item); | ||
1989 | return ERR_PTR(ret); | ||
1990 | } | ||
1991 | |||
1604 | return ®->hr_item; | 1992 | return ®->hr_item; |
1605 | } | 1993 | } |
1606 | 1994 | ||
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, | |||
1612 | 2000 | ||
1613 | /* stop the thread when the user removes the region dir */ | 2001 | /* stop the thread when the user removes the region dir */ |
1614 | spin_lock(&o2hb_live_lock); | 2002 | spin_lock(&o2hb_live_lock); |
2003 | if (o2hb_global_heartbeat_active()) { | ||
2004 | clear_bit(reg->hr_region_num, o2hb_region_bitmap); | ||
2005 | clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); | ||
2006 | } | ||
1615 | hb_task = reg->hr_task; | 2007 | hb_task = reg->hr_task; |
1616 | reg->hr_task = NULL; | 2008 | reg->hr_task = NULL; |
1617 | spin_unlock(&o2hb_live_lock); | 2009 | spin_unlock(&o2hb_live_lock); |
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, | |||
1628 | wake_up(&o2hb_steady_queue); | 2020 | wake_up(&o2hb_steady_queue); |
1629 | } | 2021 | } |
1630 | 2022 | ||
2023 | if (o2hb_global_heartbeat_active()) | ||
2024 | printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", | ||
2025 | config_item_name(®->hr_item)); | ||
1631 | config_item_put(item); | 2026 | config_item_put(item); |
1632 | } | 2027 | } |
1633 | 2028 | ||
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group | |||
1688 | return count; | 2083 | return count; |
1689 | } | 2084 | } |
1690 | 2085 | ||
2086 | static | ||
2087 | ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, | ||
2088 | char *page) | ||
2089 | { | ||
2090 | return sprintf(page, "%s\n", | ||
2091 | o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); | ||
2092 | } | ||
2093 | |||
2094 | static | ||
2095 | ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, | ||
2096 | const char *page, size_t count) | ||
2097 | { | ||
2098 | unsigned int i; | ||
2099 | int ret; | ||
2100 | size_t len; | ||
2101 | |||
2102 | len = (page[count - 1] == '\n') ? count - 1 : count; | ||
2103 | if (!len) | ||
2104 | return -EINVAL; | ||
2105 | |||
2106 | for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { | ||
2107 | if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) | ||
2108 | continue; | ||
2109 | |||
2110 | ret = o2hb_global_hearbeat_mode_set(i); | ||
2111 | if (!ret) | ||
2112 | printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", | ||
2113 | o2hb_heartbeat_mode_desc[i]); | ||
2114 | return count; | ||
2115 | } | ||
2116 | |||
2117 | return -EINVAL; | ||
2118 | |||
2119 | } | ||
2120 | |||
1691 | static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { | 2121 | static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { |
1692 | .attr = { .ca_owner = THIS_MODULE, | 2122 | .attr = { .ca_owner = THIS_MODULE, |
1693 | .ca_name = "dead_threshold", | 2123 | .ca_name = "dead_threshold", |
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold | |||
1696 | .store = o2hb_heartbeat_group_threshold_store, | 2126 | .store = o2hb_heartbeat_group_threshold_store, |
1697 | }; | 2127 | }; |
1698 | 2128 | ||
2129 | static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { | ||
2130 | .attr = { .ca_owner = THIS_MODULE, | ||
2131 | .ca_name = "mode", | ||
2132 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
2133 | .show = o2hb_heartbeat_group_mode_show, | ||
2134 | .store = o2hb_heartbeat_group_mode_store, | ||
2135 | }; | ||
2136 | |||
1699 | static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { | 2137 | static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { |
1700 | &o2hb_heartbeat_group_attr_threshold.attr, | 2138 | &o2hb_heartbeat_group_attr_threshold.attr, |
2139 | &o2hb_heartbeat_group_attr_mode.attr, | ||
1701 | NULL, | 2140 | NULL, |
1702 | }; | 2141 | }; |
1703 | 2142 | ||
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void) | |||
1963 | spin_unlock(&o2hb_live_lock); | 2402 | spin_unlock(&o2hb_live_lock); |
1964 | } | 2403 | } |
1965 | EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); | 2404 | EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); |
2405 | |||
2406 | int o2hb_get_all_regions(char *region_uuids, u8 max_regions) | ||
2407 | { | ||
2408 | struct o2hb_region *reg; | ||
2409 | int numregs = 0; | ||
2410 | char *p; | ||
2411 | |||
2412 | spin_lock(&o2hb_live_lock); | ||
2413 | |||
2414 | p = region_uuids; | ||
2415 | list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { | ||
2416 | mlog(0, "Region: %s\n", config_item_name(®->hr_item)); | ||
2417 | if (numregs < max_regions) { | ||
2418 | memcpy(p, config_item_name(®->hr_item), | ||
2419 | O2HB_MAX_REGION_NAME_LEN); | ||
2420 | p += O2HB_MAX_REGION_NAME_LEN; | ||
2421 | } | ||
2422 | numregs++; | ||
2423 | } | ||
2424 | |||
2425 | spin_unlock(&o2hb_live_lock); | ||
2426 | |||
2427 | return numregs; | ||
2428 | } | ||
2429 | EXPORT_SYMBOL_GPL(o2hb_get_all_regions); | ||
2430 | |||
2431 | int o2hb_global_heartbeat_active(void) | ||
2432 | { | ||
2433 | return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL); | ||
2434 | } | ||
2435 | EXPORT_SYMBOL(o2hb_global_heartbeat_active); | ||
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 2f1649253b49..00ad8e8fea51 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h | |||
@@ -31,6 +31,8 @@ | |||
31 | 31 | ||
32 | #define O2HB_REGION_TIMEOUT_MS 2000 | 32 | #define O2HB_REGION_TIMEOUT_MS 2000 |
33 | 33 | ||
34 | #define O2HB_MAX_REGION_NAME_LEN 32 | ||
35 | |||
34 | /* number of changes to be seen as live */ | 36 | /* number of changes to be seen as live */ |
35 | #define O2HB_LIVE_THRESHOLD 2 | 37 | #define O2HB_LIVE_THRESHOLD 2 |
36 | /* number of equal samples to be seen as dead */ | 38 | /* number of equal samples to be seen as dead */ |
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num); | |||
81 | int o2hb_check_node_heartbeating_from_callback(u8 node_num); | 83 | int o2hb_check_node_heartbeating_from_callback(u8 node_num); |
82 | int o2hb_check_local_node_heartbeating(void); | 84 | int o2hb_check_local_node_heartbeating(void); |
83 | void o2hb_stop_all_regions(void); | 85 | void o2hb_stop_all_regions(void); |
86 | int o2hb_get_all_regions(char *region_uuids, u8 numregions); | ||
87 | int o2hb_global_heartbeat_active(void); | ||
84 | 88 | ||
85 | #endif /* O2CLUSTER_HEARTBEAT_H */ | 89 | #endif /* O2CLUSTER_HEARTBEAT_H */ |
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index fd96e2a2fa56..ea2ed9f56c94 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h | |||
@@ -119,7 +119,8 @@ | |||
119 | #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ | 119 | #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ |
120 | #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ | 120 | #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ |
121 | #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ | 121 | #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ |
122 | #define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ | 122 | #define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ |
123 | #define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */ | ||
123 | 124 | ||
124 | #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) | 125 | #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) |
125 | #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) | 126 | #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) |
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index ed0c9f367fed..bb240647ca5f 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c | |||
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group, | |||
711 | config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); | 711 | config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); |
712 | spin_lock_init(&node->nd_lock); | 712 | spin_lock_init(&node->nd_lock); |
713 | 713 | ||
714 | mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name); | ||
715 | |||
714 | return &node->nd_item; | 716 | return &node->nd_item; |
715 | } | 717 | } |
716 | 718 | ||
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group, | |||
744 | } | 746 | } |
745 | write_unlock(&cluster->cl_nodes_lock); | 747 | write_unlock(&cluster->cl_nodes_lock); |
746 | 748 | ||
749 | mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n", | ||
750 | config_item_name(&node->nd_item)); | ||
751 | |||
747 | config_item_put(item); | 752 | config_item_put(item); |
748 | } | 753 | } |
749 | 754 | ||
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h index 5b9854bad571..49b594325bec 100644 --- a/fs/ocfs2/cluster/ocfs2_nodemanager.h +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h | |||
@@ -36,4 +36,10 @@ | |||
36 | /* host name, group name, cluster name all 64 bytes */ | 36 | /* host name, group name, cluster name all 64 bytes */ |
37 | #define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN | 37 | #define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN |
38 | 38 | ||
39 | /* | ||
40 | * Maximum number of global heartbeat regions allowed. | ||
41 | * **CAUTION** Changing this number will break dlm compatibility. | ||
42 | */ | ||
43 | #define O2NM_MAX_REGIONS 32 | ||
44 | |||
39 | #endif /* _OCFS2_NODEMANAGER_H */ | 45 | #endif /* _OCFS2_NODEMANAGER_H */ |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index cbe2f057cc28..9aa426e42123 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, | |||
1696 | { | 1696 | { |
1697 | o2quo_hb_down(node_num); | 1697 | o2quo_hb_down(node_num); |
1698 | 1698 | ||
1699 | if (!node) | ||
1700 | return; | ||
1701 | |||
1699 | if (node_num != o2nm_this_node()) | 1702 | if (node_num != o2nm_this_node()) |
1700 | o2net_disconnect_node(node); | 1703 | o2net_disconnect_node(node); |
1701 | 1704 | ||
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, | |||
1709 | 1712 | ||
1710 | o2quo_hb_up(node_num); | 1713 | o2quo_hb_up(node_num); |
1711 | 1714 | ||
1715 | BUG_ON(!node); | ||
1716 | |||
1712 | /* ensure an immediate connect attempt */ | 1717 | /* ensure an immediate connect attempt */ |
1713 | nn->nn_last_connect_attempt = jiffies - | 1718 | nn->nn_last_connect_attempt = jiffies - |
1714 | (msecs_to_jiffies(o2net_reconnect_delay()) + 1); | 1719 | (msecs_to_jiffies(o2net_reconnect_delay()) + 1); |
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b4957c7d9fe2..edaded48e7e9 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c | |||
@@ -40,6 +40,14 @@ | |||
40 | #include "inode.h" | 40 | #include "inode.h" |
41 | #include "super.h" | 41 | #include "super.h" |
42 | 42 | ||
43 | void ocfs2_dentry_attach_gen(struct dentry *dentry) | ||
44 | { | ||
45 | unsigned long gen = | ||
46 | OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; | ||
47 | BUG_ON(dentry->d_inode); | ||
48 | dentry->d_fsdata = (void *)gen; | ||
49 | } | ||
50 | |||
43 | 51 | ||
44 | static int ocfs2_dentry_revalidate(struct dentry *dentry, | 52 | static int ocfs2_dentry_revalidate(struct dentry *dentry, |
45 | struct nameidata *nd) | 53 | struct nameidata *nd) |
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, | |||
51 | mlog_entry("(0x%p, '%.*s')\n", dentry, | 59 | mlog_entry("(0x%p, '%.*s')\n", dentry, |
52 | dentry->d_name.len, dentry->d_name.name); | 60 | dentry->d_name.len, dentry->d_name.name); |
53 | 61 | ||
54 | /* Never trust a negative dentry - force a new lookup. */ | 62 | /* For a negative dentry - |
63 | * check the generation number of the parent and compare with the | ||
64 | * one stored in the inode. | ||
65 | */ | ||
55 | if (inode == NULL) { | 66 | if (inode == NULL) { |
56 | mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, | 67 | unsigned long gen = (unsigned long) dentry->d_fsdata; |
57 | dentry->d_name.name); | 68 | unsigned long pgen = |
58 | goto bail; | 69 | OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; |
70 | mlog(0, "negative dentry: %.*s parent gen: %lu " | ||
71 | "dentry gen: %lu\n", | ||
72 | dentry->d_name.len, dentry->d_name.name, pgen, gen); | ||
73 | if (gen != pgen) | ||
74 | goto bail; | ||
75 | goto valid; | ||
59 | } | 76 | } |
60 | 77 | ||
61 | BUG_ON(!osb); | 78 | BUG_ON(!osb); |
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, | |||
96 | goto bail; | 113 | goto bail; |
97 | } | 114 | } |
98 | 115 | ||
116 | valid: | ||
99 | ret = 1; | 117 | ret = 1; |
100 | 118 | ||
101 | bail: | 119 | bail: |
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, | |||
227 | if (!inode) | 245 | if (!inode) |
228 | return 0; | 246 | return 0; |
229 | 247 | ||
248 | if (!dentry->d_inode && dentry->d_fsdata) { | ||
249 | /* Converting a negative dentry to positive | ||
250 | Clear dentry->d_fsdata */ | ||
251 | dentry->d_fsdata = dl = NULL; | ||
252 | } | ||
253 | |||
230 | if (dl) { | 254 | if (dl) { |
231 | mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, | 255 | mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, |
232 | " \"%.*s\": old parent: %llu, new: %llu\n", | 256 | " \"%.*s\": old parent: %llu, new: %llu\n", |
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) | |||
452 | 476 | ||
453 | out: | 477 | out: |
454 | iput(inode); | 478 | iput(inode); |
479 | ocfs2_dentry_attach_gen(dentry); | ||
455 | } | 480 | } |
456 | 481 | ||
457 | /* | 482 | /* |
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index f5dd1789acf1..b79eff709958 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h | |||
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, | |||
64 | struct inode *old_dir, struct inode *new_dir); | 64 | struct inode *old_dir, struct inode *new_dir); |
65 | 65 | ||
66 | extern spinlock_t dentry_attach_lock; | 66 | extern spinlock_t dentry_attach_lock; |
67 | void ocfs2_dentry_attach_gen(struct dentry *dentry); | ||
67 | 68 | ||
68 | #endif /* OCFS2_DCACHE_H */ | 69 | #endif /* OCFS2_DCACHE_H */ |
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 765298908f1d..b36d0bf77a5a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -445,7 +445,9 @@ enum { | |||
445 | DLM_LOCK_REQUEST_MSG, /* 515 */ | 445 | DLM_LOCK_REQUEST_MSG, /* 515 */ |
446 | DLM_RECO_DATA_DONE_MSG, /* 516 */ | 446 | DLM_RECO_DATA_DONE_MSG, /* 516 */ |
447 | DLM_BEGIN_RECO_MSG, /* 517 */ | 447 | DLM_BEGIN_RECO_MSG, /* 517 */ |
448 | DLM_FINALIZE_RECO_MSG /* 518 */ | 448 | DLM_FINALIZE_RECO_MSG, /* 518 */ |
449 | DLM_QUERY_REGION, /* 519 */ | ||
450 | DLM_QUERY_NODEINFO, /* 520 */ | ||
449 | }; | 451 | }; |
450 | 452 | ||
451 | struct dlm_reco_node_data | 453 | struct dlm_reco_node_data |
@@ -727,6 +729,31 @@ struct dlm_cancel_join | |||
727 | u8 domain[O2NM_MAX_NAME_LEN]; | 729 | u8 domain[O2NM_MAX_NAME_LEN]; |
728 | }; | 730 | }; |
729 | 731 | ||
732 | struct dlm_query_region { | ||
733 | u8 qr_node; | ||
734 | u8 qr_numregions; | ||
735 | u8 qr_namelen; | ||
736 | u8 pad1; | ||
737 | u8 qr_domain[O2NM_MAX_NAME_LEN]; | ||
738 | u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS]; | ||
739 | }; | ||
740 | |||
741 | struct dlm_node_info { | ||
742 | u8 ni_nodenum; | ||
743 | u8 pad1; | ||
744 | u16 ni_ipv4_port; | ||
745 | u32 ni_ipv4_address; | ||
746 | }; | ||
747 | |||
748 | struct dlm_query_nodeinfo { | ||
749 | u8 qn_nodenum; | ||
750 | u8 qn_numnodes; | ||
751 | u8 qn_namelen; | ||
752 | u8 pad1; | ||
753 | u8 qn_domain[O2NM_MAX_NAME_LEN]; | ||
754 | struct dlm_node_info qn_nodes[O2NM_MAX_NODES]; | ||
755 | }; | ||
756 | |||
730 | struct dlm_exit_domain | 757 | struct dlm_exit_domain |
731 | { | 758 | { |
732 | u8 node_idx; | 759 | u8 node_idx; |
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 901ca52bf86b..272ec8631a51 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | |||
493 | struct hlist_head *bucket; | 493 | struct hlist_head *bucket; |
494 | struct hlist_node *list; | 494 | struct hlist_node *list; |
495 | int i, out = 0; | 495 | int i, out = 0; |
496 | unsigned long total = 0, longest = 0, bktcnt; | 496 | unsigned long total = 0, longest = 0, bucket_count = 0; |
497 | 497 | ||
498 | out += snprintf(db->buf + out, db->len - out, | 498 | out += snprintf(db->buf + out, db->len - out, |
499 | "Dumping MLEs for Domain: %s\n", dlm->name); | 499 | "Dumping MLEs for Domain: %s\n", dlm->name); |
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | |||
505 | mle = hlist_entry(list, struct dlm_master_list_entry, | 505 | mle = hlist_entry(list, struct dlm_master_list_entry, |
506 | master_hash_node); | 506 | master_hash_node); |
507 | ++total; | 507 | ++total; |
508 | ++bktcnt; | 508 | ++bucket_count; |
509 | if (db->len - out < 200) | 509 | if (db->len - out < 200) |
510 | continue; | 510 | continue; |
511 | out += dump_mle(mle, db->buf + out, db->len - out); | 511 | out += dump_mle(mle, db->buf + out, db->len - out); |
512 | } | 512 | } |
513 | longest = max(longest, bktcnt); | 513 | longest = max(longest, bucket_count); |
514 | bktcnt = 0; | 514 | bucket_count = 0; |
515 | } | 515 | } |
516 | spin_unlock(&dlm->master_lock); | 516 | spin_unlock(&dlm->master_lock); |
517 | 517 | ||
@@ -782,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | |||
782 | 782 | ||
783 | /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ | 783 | /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ |
784 | out += snprintf(db->buf + out, db->len - out, | 784 | out += snprintf(db->buf + out, db->len - out, |
785 | "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); | 785 | "Domain: %s Key: 0x%08x Protocol: %d.%d\n", |
786 | dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, | ||
787 | dlm->dlm_locking_proto.pv_minor); | ||
786 | 788 | ||
787 | /* Thread Pid: xxx Node: xxx State: xxxxx */ | 789 | /* Thread Pid: xxx Node: xxx State: xxxxx */ |
788 | out += snprintf(db->buf + out, db->len - out, | 790 | out += snprintf(db->buf + out, db->len - out, |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 11a5c87fd7f7..58a93b953735 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); | |||
128 | * will have a negotiated version with the same major number and a minor | 128 | * will have a negotiated version with the same major number and a minor |
129 | * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should | 129 | * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should |
130 | * be used to determine what a running domain is actually using. | 130 | * be used to determine what a running domain is actually using. |
131 | * | ||
132 | * New in version 1.1: | ||
133 | * - Message DLM_QUERY_REGION added to support global heartbeat | ||
134 | * - Message DLM_QUERY_NODEINFO added to allow online node removes | ||
131 | */ | 135 | */ |
132 | static const struct dlm_protocol_version dlm_protocol = { | 136 | static const struct dlm_protocol_version dlm_protocol = { |
133 | .pv_major = 1, | 137 | .pv_major = 1, |
134 | .pv_minor = 0, | 138 | .pv_minor = 1, |
135 | }; | 139 | }; |
136 | 140 | ||
137 | #define DLM_DOMAIN_BACKOFF_MS 200 | 141 | #define DLM_DOMAIN_BACKOFF_MS 200 |
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, | |||
142 | void **ret_data); | 146 | void **ret_data); |
143 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, | 147 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, |
144 | void **ret_data); | 148 | void **ret_data); |
149 | static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, | ||
150 | void *data, void **ret_data); | ||
145 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, | 151 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, |
146 | void **ret_data); | 152 | void **ret_data); |
147 | static int dlm_protocol_compare(struct dlm_protocol_version *existing, | 153 | static int dlm_protocol_compare(struct dlm_protocol_version *existing, |
@@ -921,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, | |||
921 | return 0; | 927 | return 0; |
922 | } | 928 | } |
923 | 929 | ||
930 | static int dlm_match_regions(struct dlm_ctxt *dlm, | ||
931 | struct dlm_query_region *qr) | ||
932 | { | ||
933 | char *local = NULL, *remote = qr->qr_regions; | ||
934 | char *l, *r; | ||
935 | int localnr, i, j, foundit; | ||
936 | int status = 0; | ||
937 | |||
938 | if (!o2hb_global_heartbeat_active()) { | ||
939 | if (qr->qr_numregions) { | ||
940 | mlog(ML_ERROR, "Domain %s: Joining node %d has global " | ||
941 | "heartbeat enabled but local node %d does not\n", | ||
942 | qr->qr_domain, qr->qr_node, dlm->node_num); | ||
943 | status = -EINVAL; | ||
944 | } | ||
945 | goto bail; | ||
946 | } | ||
947 | |||
948 | if (o2hb_global_heartbeat_active() && !qr->qr_numregions) { | ||
949 | mlog(ML_ERROR, "Domain %s: Local node %d has global " | ||
950 | "heartbeat enabled but joining node %d does not\n", | ||
951 | qr->qr_domain, dlm->node_num, qr->qr_node); | ||
952 | status = -EINVAL; | ||
953 | goto bail; | ||
954 | } | ||
955 | |||
956 | r = remote; | ||
957 | for (i = 0; i < qr->qr_numregions; ++i) { | ||
958 | mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r); | ||
959 | r += O2HB_MAX_REGION_NAME_LEN; | ||
960 | } | ||
961 | |||
962 | local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); | ||
963 | if (!local) { | ||
964 | status = -ENOMEM; | ||
965 | goto bail; | ||
966 | } | ||
967 | |||
968 | localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS); | ||
969 | |||
970 | /* compare local regions with remote */ | ||
971 | l = local; | ||
972 | for (i = 0; i < localnr; ++i) { | ||
973 | foundit = 0; | ||
974 | r = remote; | ||
975 | for (j = 0; j <= qr->qr_numregions; ++j) { | ||
976 | if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) { | ||
977 | foundit = 1; | ||
978 | break; | ||
979 | } | ||
980 | r += O2HB_MAX_REGION_NAME_LEN; | ||
981 | } | ||
982 | if (!foundit) { | ||
983 | status = -EINVAL; | ||
984 | mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " | ||
985 | "in local node %d but not in joining node %d\n", | ||
986 | qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l, | ||
987 | dlm->node_num, qr->qr_node); | ||
988 | goto bail; | ||
989 | } | ||
990 | l += O2HB_MAX_REGION_NAME_LEN; | ||
991 | } | ||
992 | |||
993 | /* compare remote with local regions */ | ||
994 | r = remote; | ||
995 | for (i = 0; i < qr->qr_numregions; ++i) { | ||
996 | foundit = 0; | ||
997 | l = local; | ||
998 | for (j = 0; j < localnr; ++j) { | ||
999 | if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) { | ||
1000 | foundit = 1; | ||
1001 | break; | ||
1002 | } | ||
1003 | l += O2HB_MAX_REGION_NAME_LEN; | ||
1004 | } | ||
1005 | if (!foundit) { | ||
1006 | status = -EINVAL; | ||
1007 | mlog(ML_ERROR, "Domain %s: Region '%.*s' registered " | ||
1008 | "in joining node %d but not in local node %d\n", | ||
1009 | qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r, | ||
1010 | qr->qr_node, dlm->node_num); | ||
1011 | goto bail; | ||
1012 | } | ||
1013 | r += O2HB_MAX_REGION_NAME_LEN; | ||
1014 | } | ||
1015 | |||
1016 | bail: | ||
1017 | kfree(local); | ||
1018 | |||
1019 | return status; | ||
1020 | } | ||
1021 | |||
1022 | static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) | ||
1023 | { | ||
1024 | struct dlm_query_region *qr = NULL; | ||
1025 | int status, ret = 0, i; | ||
1026 | char *p; | ||
1027 | |||
1028 | if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) | ||
1029 | goto bail; | ||
1030 | |||
1031 | qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); | ||
1032 | if (!qr) { | ||
1033 | ret = -ENOMEM; | ||
1034 | mlog_errno(ret); | ||
1035 | goto bail; | ||
1036 | } | ||
1037 | |||
1038 | qr->qr_node = dlm->node_num; | ||
1039 | qr->qr_namelen = strlen(dlm->name); | ||
1040 | memcpy(qr->qr_domain, dlm->name, qr->qr_namelen); | ||
1041 | /* if local hb, the numregions will be zero */ | ||
1042 | if (o2hb_global_heartbeat_active()) | ||
1043 | qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions, | ||
1044 | O2NM_MAX_REGIONS); | ||
1045 | |||
1046 | p = qr->qr_regions; | ||
1047 | for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN) | ||
1048 | mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p); | ||
1049 | |||
1050 | i = -1; | ||
1051 | while ((i = find_next_bit(node_map, O2NM_MAX_NODES, | ||
1052 | i + 1)) < O2NM_MAX_NODES) { | ||
1053 | if (i == dlm->node_num) | ||
1054 | continue; | ||
1055 | |||
1056 | mlog(0, "Sending regions to node %d\n", i); | ||
1057 | |||
1058 | ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr, | ||
1059 | sizeof(struct dlm_query_region), | ||
1060 | i, &status); | ||
1061 | if (ret >= 0) | ||
1062 | ret = status; | ||
1063 | if (ret) { | ||
1064 | mlog(ML_ERROR, "Region mismatch %d, node %d\n", | ||
1065 | ret, i); | ||
1066 | break; | ||
1067 | } | ||
1068 | } | ||
1069 | |||
1070 | bail: | ||
1071 | kfree(qr); | ||
1072 | return ret; | ||
1073 | } | ||
1074 | |||
1075 | static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, | ||
1076 | void *data, void **ret_data) | ||
1077 | { | ||
1078 | struct dlm_query_region *qr; | ||
1079 | struct dlm_ctxt *dlm = NULL; | ||
1080 | int status = 0; | ||
1081 | int locked = 0; | ||
1082 | |||
1083 | qr = (struct dlm_query_region *) msg->buf; | ||
1084 | |||
1085 | mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, | ||
1086 | qr->qr_domain); | ||
1087 | |||
1088 | status = -EINVAL; | ||
1089 | |||
1090 | spin_lock(&dlm_domain_lock); | ||
1091 | dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen); | ||
1092 | if (!dlm) { | ||
1093 | mlog(ML_ERROR, "Node %d queried hb regions on domain %s " | ||
1094 | "before join domain\n", qr->qr_node, qr->qr_domain); | ||
1095 | goto bail; | ||
1096 | } | ||
1097 | |||
1098 | spin_lock(&dlm->spinlock); | ||
1099 | locked = 1; | ||
1100 | if (dlm->joining_node != qr->qr_node) { | ||
1101 | mlog(ML_ERROR, "Node %d queried hb regions on domain %s " | ||
1102 | "but joining node is %d\n", qr->qr_node, qr->qr_domain, | ||
1103 | dlm->joining_node); | ||
1104 | goto bail; | ||
1105 | } | ||
1106 | |||
1107 | /* Support for global heartbeat was added in 1.1 */ | ||
1108 | if (dlm->dlm_locking_proto.pv_major == 1 && | ||
1109 | dlm->dlm_locking_proto.pv_minor == 0) { | ||
1110 | mlog(ML_ERROR, "Node %d queried hb regions on domain %s " | ||
1111 | "but active dlm protocol is %d.%d\n", qr->qr_node, | ||
1112 | qr->qr_domain, dlm->dlm_locking_proto.pv_major, | ||
1113 | dlm->dlm_locking_proto.pv_minor); | ||
1114 | goto bail; | ||
1115 | } | ||
1116 | |||
1117 | status = dlm_match_regions(dlm, qr); | ||
1118 | |||
1119 | bail: | ||
1120 | if (locked) | ||
1121 | spin_unlock(&dlm->spinlock); | ||
1122 | spin_unlock(&dlm_domain_lock); | ||
1123 | |||
1124 | return status; | ||
1125 | } | ||
1126 | |||
1127 | static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn) | ||
1128 | { | ||
1129 | struct o2nm_node *local; | ||
1130 | struct dlm_node_info *remote; | ||
1131 | int i, j; | ||
1132 | int status = 0; | ||
1133 | |||
1134 | for (j = 0; j < qn->qn_numnodes; ++j) | ||
1135 | mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum, | ||
1136 | &(qn->qn_nodes[j].ni_ipv4_address), | ||
1137 | ntohs(qn->qn_nodes[j].ni_ipv4_port)); | ||
1138 | |||
1139 | for (i = 0; i < O2NM_MAX_NODES && !status; ++i) { | ||
1140 | local = o2nm_get_node_by_num(i); | ||
1141 | remote = NULL; | ||
1142 | for (j = 0; j < qn->qn_numnodes; ++j) { | ||
1143 | if (qn->qn_nodes[j].ni_nodenum == i) { | ||
1144 | remote = &(qn->qn_nodes[j]); | ||
1145 | break; | ||
1146 | } | ||
1147 | } | ||
1148 | |||
1149 | if (!local && !remote) | ||
1150 | continue; | ||
1151 | |||
1152 | if ((local && !remote) || (!local && remote)) | ||
1153 | status = -EINVAL; | ||
1154 | |||
1155 | if (!status && | ||
1156 | ((remote->ni_nodenum != local->nd_num) || | ||
1157 | (remote->ni_ipv4_port != local->nd_ipv4_port) || | ||
1158 | (remote->ni_ipv4_address != local->nd_ipv4_address))) | ||
1159 | status = -EINVAL; | ||
1160 | |||
1161 | if (status) { | ||
1162 | if (remote && !local) | ||
1163 | mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " | ||
1164 | "registered in joining node %d but not in " | ||
1165 | "local node %d\n", qn->qn_domain, | ||
1166 | remote->ni_nodenum, | ||
1167 | &(remote->ni_ipv4_address), | ||
1168 | ntohs(remote->ni_ipv4_port), | ||
1169 | qn->qn_nodenum, dlm->node_num); | ||
1170 | if (local && !remote) | ||
1171 | mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) " | ||
1172 | "registered in local node %d but not in " | ||
1173 | "joining node %d\n", qn->qn_domain, | ||
1174 | local->nd_num, &(local->nd_ipv4_address), | ||
1175 | ntohs(local->nd_ipv4_port), | ||
1176 | dlm->node_num, qn->qn_nodenum); | ||
1177 | BUG_ON((!local && !remote)); | ||
1178 | } | ||
1179 | |||
1180 | if (local) | ||
1181 | o2nm_node_put(local); | ||
1182 | } | ||
1183 | |||
1184 | return status; | ||
1185 | } | ||
1186 | |||
1187 | static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) | ||
1188 | { | ||
1189 | struct dlm_query_nodeinfo *qn = NULL; | ||
1190 | struct o2nm_node *node; | ||
1191 | int ret = 0, status, count, i; | ||
1192 | |||
1193 | if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) | ||
1194 | goto bail; | ||
1195 | |||
1196 | qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); | ||
1197 | if (!qn) { | ||
1198 | ret = -ENOMEM; | ||
1199 | mlog_errno(ret); | ||
1200 | goto bail; | ||
1201 | } | ||
1202 | |||
1203 | for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) { | ||
1204 | node = o2nm_get_node_by_num(i); | ||
1205 | if (!node) | ||
1206 | continue; | ||
1207 | qn->qn_nodes[count].ni_nodenum = node->nd_num; | ||
1208 | qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port; | ||
1209 | qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address; | ||
1210 | mlog(0, "Node %3d, %pI4:%u\n", node->nd_num, | ||
1211 | &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port)); | ||
1212 | ++count; | ||
1213 | o2nm_node_put(node); | ||
1214 | } | ||
1215 | |||
1216 | qn->qn_nodenum = dlm->node_num; | ||
1217 | qn->qn_numnodes = count; | ||
1218 | qn->qn_namelen = strlen(dlm->name); | ||
1219 | memcpy(qn->qn_domain, dlm->name, qn->qn_namelen); | ||
1220 | |||
1221 | i = -1; | ||
1222 | while ((i = find_next_bit(node_map, O2NM_MAX_NODES, | ||
1223 | i + 1)) < O2NM_MAX_NODES) { | ||
1224 | if (i == dlm->node_num) | ||
1225 | continue; | ||
1226 | |||
1227 | mlog(0, "Sending nodeinfo to node %d\n", i); | ||
1228 | |||
1229 | ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY, | ||
1230 | qn, sizeof(struct dlm_query_nodeinfo), | ||
1231 | i, &status); | ||
1232 | if (ret >= 0) | ||
1233 | ret = status; | ||
1234 | if (ret) { | ||
1235 | mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i); | ||
1236 | break; | ||
1237 | } | ||
1238 | } | ||
1239 | |||
1240 | bail: | ||
1241 | kfree(qn); | ||
1242 | return ret; | ||
1243 | } | ||
1244 | |||
1245 | static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, | ||
1246 | void *data, void **ret_data) | ||
1247 | { | ||
1248 | struct dlm_query_nodeinfo *qn; | ||
1249 | struct dlm_ctxt *dlm = NULL; | ||
1250 | int locked = 0, status = -EINVAL; | ||
1251 | |||
1252 | qn = (struct dlm_query_nodeinfo *) msg->buf; | ||
1253 | |||
1254 | mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum, | ||
1255 | qn->qn_domain); | ||
1256 | |||
1257 | spin_lock(&dlm_domain_lock); | ||
1258 | dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen); | ||
1259 | if (!dlm) { | ||
1260 | mlog(ML_ERROR, "Node %d queried nodes on domain %s before " | ||
1261 | "join domain\n", qn->qn_nodenum, qn->qn_domain); | ||
1262 | goto bail; | ||
1263 | } | ||
1264 | |||
1265 | spin_lock(&dlm->spinlock); | ||
1266 | locked = 1; | ||
1267 | if (dlm->joining_node != qn->qn_nodenum) { | ||
1268 | mlog(ML_ERROR, "Node %d queried nodes on domain %s but " | ||
1269 | "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, | ||
1270 | dlm->joining_node); | ||
1271 | goto bail; | ||
1272 | } | ||
1273 | |||
1274 | /* Support for node query was added in 1.1 */ | ||
1275 | if (dlm->dlm_locking_proto.pv_major == 1 && | ||
1276 | dlm->dlm_locking_proto.pv_minor == 0) { | ||
1277 | mlog(ML_ERROR, "Node %d queried nodes on domain %s " | ||
1278 | "but active dlm protocol is %d.%d\n", qn->qn_nodenum, | ||
1279 | qn->qn_domain, dlm->dlm_locking_proto.pv_major, | ||
1280 | dlm->dlm_locking_proto.pv_minor); | ||
1281 | goto bail; | ||
1282 | } | ||
1283 | |||
1284 | status = dlm_match_nodes(dlm, qn); | ||
1285 | |||
1286 | bail: | ||
1287 | if (locked) | ||
1288 | spin_unlock(&dlm->spinlock); | ||
1289 | spin_unlock(&dlm_domain_lock); | ||
1290 | |||
1291 | return status; | ||
1292 | } | ||
1293 | |||
924 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, | 1294 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data, |
925 | void **ret_data) | 1295 | void **ret_data) |
926 | { | 1296 | { |
@@ -1241,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | |||
1241 | set_bit(dlm->node_num, dlm->domain_map); | 1611 | set_bit(dlm->node_num, dlm->domain_map); |
1242 | spin_unlock(&dlm->spinlock); | 1612 | spin_unlock(&dlm->spinlock); |
1243 | 1613 | ||
1614 | /* Support for global heartbeat and node info was added in 1.1 */ | ||
1615 | if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) { | ||
1616 | status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map); | ||
1617 | if (status) { | ||
1618 | mlog_errno(status); | ||
1619 | goto bail; | ||
1620 | } | ||
1621 | status = dlm_send_regions(dlm, ctxt->yes_resp_map); | ||
1622 | if (status) { | ||
1623 | mlog_errno(status); | ||
1624 | goto bail; | ||
1625 | } | ||
1626 | } | ||
1627 | |||
1244 | dlm_send_join_asserts(dlm, ctxt->yes_resp_map); | 1628 | dlm_send_join_asserts(dlm, ctxt->yes_resp_map); |
1245 | 1629 | ||
1246 | /* Joined state *must* be set before the joining node | 1630 | /* Joined state *must* be set before the joining node |
@@ -1807,7 +2191,21 @@ static int dlm_register_net_handlers(void) | |||
1807 | sizeof(struct dlm_cancel_join), | 2191 | sizeof(struct dlm_cancel_join), |
1808 | dlm_cancel_join_handler, | 2192 | dlm_cancel_join_handler, |
1809 | NULL, NULL, &dlm_join_handlers); | 2193 | NULL, NULL, &dlm_join_handlers); |
2194 | if (status) | ||
2195 | goto bail; | ||
2196 | |||
2197 | status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY, | ||
2198 | sizeof(struct dlm_query_region), | ||
2199 | dlm_query_region_handler, | ||
2200 | NULL, NULL, &dlm_join_handlers); | ||
1810 | 2201 | ||
2202 | if (status) | ||
2203 | goto bail; | ||
2204 | |||
2205 | status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY, | ||
2206 | sizeof(struct dlm_query_nodeinfo), | ||
2207 | dlm_query_nodeinfo_handler, | ||
2208 | NULL, NULL, &dlm_join_handlers); | ||
1811 | bail: | 2209 | bail: |
1812 | if (status < 0) | 2210 | if (status < 0) |
1813 | dlm_unregister_net_handlers(); | 2211 | dlm_unregister_net_handlers(); |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 5e02a893f46e..e8d94d722ecb 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, | |||
3635 | { | 3635 | { |
3636 | struct inode *inode; | 3636 | struct inode *inode; |
3637 | struct address_space *mapping; | 3637 | struct address_space *mapping; |
3638 | struct ocfs2_inode_info *oi; | ||
3638 | 3639 | ||
3639 | inode = ocfs2_lock_res_inode(lockres); | 3640 | inode = ocfs2_lock_res_inode(lockres); |
3640 | mapping = inode->i_mapping; | 3641 | mapping = inode->i_mapping; |
3641 | 3642 | ||
3643 | if (S_ISDIR(inode->i_mode)) { | ||
3644 | oi = OCFS2_I(inode); | ||
3645 | oi->ip_dir_lock_gen++; | ||
3646 | mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); | ||
3647 | goto out; | ||
3648 | } | ||
3649 | |||
3642 | if (!S_ISREG(inode->i_mode)) | 3650 | if (!S_ISREG(inode->i_mode)) |
3643 | goto out; | 3651 | goto out; |
3644 | 3652 | ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9a03c151b5ce..9e8cc4346b76 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -64,12 +64,6 @@ | |||
64 | 64 | ||
65 | #include "buffer_head_io.h" | 65 | #include "buffer_head_io.h" |
66 | 66 | ||
67 | static int ocfs2_sync_inode(struct inode *inode) | ||
68 | { | ||
69 | filemap_fdatawrite(inode->i_mapping); | ||
70 | return sync_mapping_buffers(inode->i_mapping); | ||
71 | } | ||
72 | |||
73 | static int ocfs2_init_file_private(struct inode *inode, struct file *file) | 67 | static int ocfs2_init_file_private(struct inode *inode, struct file *file) |
74 | { | 68 | { |
75 | struct ocfs2_file_private *fp; | 69 | struct ocfs2_file_private *fp; |
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync) | |||
180 | { | 174 | { |
181 | int err = 0; | 175 | int err = 0; |
182 | journal_t *journal; | 176 | journal_t *journal; |
183 | struct dentry *dentry = file->f_path.dentry; | ||
184 | struct inode *inode = file->f_mapping->host; | 177 | struct inode *inode = file->f_mapping->host; |
185 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 178 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
186 | 179 | ||
187 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | 180 | mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync, |
188 | dentry->d_name.len, dentry->d_name.name); | 181 | file->f_path.dentry, file->f_path.dentry->d_name.len, |
189 | 182 | file->f_path.dentry->d_name.name); | |
190 | err = ocfs2_sync_inode(dentry->d_inode); | ||
191 | if (err) | ||
192 | goto bail; | ||
193 | 183 | ||
194 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { | 184 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { |
195 | /* | 185 | /* |
@@ -370,7 +360,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, | |||
370 | if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) | 360 | if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) |
371 | goto out; | 361 | goto out; |
372 | 362 | ||
373 | return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); | 363 | return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); |
374 | 364 | ||
375 | out: | 365 | out: |
376 | return status; | 366 | return status; |
@@ -913,8 +903,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode, | |||
913 | zero_clusters = last_cpos - zero_cpos; | 903 | zero_clusters = last_cpos - zero_cpos; |
914 | 904 | ||
915 | if (needs_cow) { | 905 | if (needs_cow) { |
916 | rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, | 906 | rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, |
917 | UINT_MAX); | 907 | zero_clusters, UINT_MAX); |
918 | if (rc) { | 908 | if (rc) { |
919 | mlog_errno(rc); | 909 | mlog_errno(rc); |
920 | goto out; | 910 | goto out; |
@@ -2062,6 +2052,7 @@ out: | |||
2062 | } | 2052 | } |
2063 | 2053 | ||
2064 | static int ocfs2_prepare_inode_for_refcount(struct inode *inode, | 2054 | static int ocfs2_prepare_inode_for_refcount(struct inode *inode, |
2055 | struct file *file, | ||
2065 | loff_t pos, size_t count, | 2056 | loff_t pos, size_t count, |
2066 | int *meta_level) | 2057 | int *meta_level) |
2067 | { | 2058 | { |
@@ -2079,7 +2070,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, | |||
2079 | 2070 | ||
2080 | *meta_level = 1; | 2071 | *meta_level = 1; |
2081 | 2072 | ||
2082 | ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); | 2073 | ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); |
2083 | if (ret) | 2074 | if (ret) |
2084 | mlog_errno(ret); | 2075 | mlog_errno(ret); |
2085 | out: | 2076 | out: |
@@ -2087,7 +2078,7 @@ out: | |||
2087 | return ret; | 2078 | return ret; |
2088 | } | 2079 | } |
2089 | 2080 | ||
2090 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 2081 | static int ocfs2_prepare_inode_for_write(struct file *file, |
2091 | loff_t *ppos, | 2082 | loff_t *ppos, |
2092 | size_t count, | 2083 | size_t count, |
2093 | int appending, | 2084 | int appending, |
@@ -2095,6 +2086,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | |||
2095 | int *has_refcount) | 2086 | int *has_refcount) |
2096 | { | 2087 | { |
2097 | int ret = 0, meta_level = 0; | 2088 | int ret = 0, meta_level = 0; |
2089 | struct dentry *dentry = file->f_path.dentry; | ||
2098 | struct inode *inode = dentry->d_inode; | 2090 | struct inode *inode = dentry->d_inode; |
2099 | loff_t saved_pos, end; | 2091 | loff_t saved_pos, end; |
2100 | 2092 | ||
@@ -2150,6 +2142,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | |||
2150 | meta_level = -1; | 2142 | meta_level = -1; |
2151 | 2143 | ||
2152 | ret = ocfs2_prepare_inode_for_refcount(inode, | 2144 | ret = ocfs2_prepare_inode_for_refcount(inode, |
2145 | file, | ||
2153 | saved_pos, | 2146 | saved_pos, |
2154 | count, | 2147 | count, |
2155 | &meta_level); | 2148 | &meta_level); |
@@ -2232,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
2232 | struct file *file = iocb->ki_filp; | 2225 | struct file *file = iocb->ki_filp; |
2233 | struct inode *inode = file->f_path.dentry->d_inode; | 2226 | struct inode *inode = file->f_path.dentry->d_inode; |
2234 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2227 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2228 | int full_coherency = !(osb->s_mount_opt & | ||
2229 | OCFS2_MOUNT_COHERENCY_BUFFERED); | ||
2235 | 2230 | ||
2236 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | 2231 | mlog_entry("(0x%p, %u, '%.*s')\n", file, |
2237 | (unsigned int)nr_segs, | 2232 | (unsigned int)nr_segs, |
@@ -2255,16 +2250,39 @@ relock: | |||
2255 | have_alloc_sem = 1; | 2250 | have_alloc_sem = 1; |
2256 | } | 2251 | } |
2257 | 2252 | ||
2258 | /* concurrent O_DIRECT writes are allowed */ | 2253 | /* |
2259 | rw_level = !direct_io; | 2254 | * Concurrent O_DIRECT writes are allowed with |
2255 | * mount_option "coherency=buffered". | ||
2256 | */ | ||
2257 | rw_level = (!direct_io || full_coherency); | ||
2258 | |||
2260 | ret = ocfs2_rw_lock(inode, rw_level); | 2259 | ret = ocfs2_rw_lock(inode, rw_level); |
2261 | if (ret < 0) { | 2260 | if (ret < 0) { |
2262 | mlog_errno(ret); | 2261 | mlog_errno(ret); |
2263 | goto out_sems; | 2262 | goto out_sems; |
2264 | } | 2263 | } |
2265 | 2264 | ||
2265 | /* | ||
2266 | * O_DIRECT writes with "coherency=full" need to take EX cluster | ||
2267 | * inode_lock to guarantee coherency. | ||
2268 | */ | ||
2269 | if (direct_io && full_coherency) { | ||
2270 | /* | ||
2271 | * We need to take and drop the inode lock to force | ||
2272 | * other nodes to drop their caches. Buffered I/O | ||
2273 | * already does this in write_begin(). | ||
2274 | */ | ||
2275 | ret = ocfs2_inode_lock(inode, NULL, 1); | ||
2276 | if (ret < 0) { | ||
2277 | mlog_errno(ret); | ||
2278 | goto out_sems; | ||
2279 | } | ||
2280 | |||
2281 | ocfs2_inode_unlock(inode, 1); | ||
2282 | } | ||
2283 | |||
2266 | can_do_direct = direct_io; | 2284 | can_do_direct = direct_io; |
2267 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, | 2285 | ret = ocfs2_prepare_inode_for_write(file, ppos, |
2268 | iocb->ki_left, appending, | 2286 | iocb->ki_left, appending, |
2269 | &can_do_direct, &has_refcount); | 2287 | &can_do_direct, &has_refcount); |
2270 | if (ret < 0) { | 2288 | if (ret < 0) { |
@@ -2312,17 +2330,6 @@ relock: | |||
2312 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | 2330 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, |
2313 | ppos, count, ocount); | 2331 | ppos, count, ocount); |
2314 | if (written < 0) { | 2332 | if (written < 0) { |
2315 | /* | ||
2316 | * direct write may have instantiated a few | ||
2317 | * blocks outside i_size. Trim these off again. | ||
2318 | * Don't need i_size_read because we hold i_mutex. | ||
2319 | * | ||
2320 | * XXX(truncate): this looks buggy because ocfs2 did not | ||
2321 | * actually implement ->truncate. Take a look at | ||
2322 | * the new truncate sequence and update this accordingly | ||
2323 | */ | ||
2324 | if (*ppos + count > inode->i_size) | ||
2325 | truncate_setsize(inode, inode->i_size); | ||
2326 | ret = written; | 2333 | ret = written; |
2327 | goto out_dio; | 2334 | goto out_dio; |
2328 | } | 2335 | } |
@@ -2394,7 +2401,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, | |||
2394 | { | 2401 | { |
2395 | int ret; | 2402 | int ret; |
2396 | 2403 | ||
2397 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, | 2404 | ret = ocfs2_prepare_inode_for_write(out, &sd->pos, |
2398 | sd->total_len, 0, NULL, NULL); | 2405 | sd->total_len, 0, NULL, NULL); |
2399 | if (ret < 0) { | 2406 | if (ret < 0) { |
2400 | mlog_errno(ret); | 2407 | mlog_errno(ret); |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index eece3e05d9d0..f935fd6600dd 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
335 | else | 335 | else |
336 | inode->i_fop = &ocfs2_dops_no_plocks; | 336 | inode->i_fop = &ocfs2_dops_no_plocks; |
337 | i_size_write(inode, le64_to_cpu(fe->i_size)); | 337 | i_size_write(inode, le64_to_cpu(fe->i_size)); |
338 | OCFS2_I(inode)->ip_dir_lock_gen = 1; | ||
338 | break; | 339 | break; |
339 | case S_IFLNK: | 340 | case S_IFLNK: |
340 | if (ocfs2_inode_is_fast_symlink(inode)) | 341 | if (ocfs2_inode_is_fast_symlink(inode)) |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 6de5a869db30..1c508b149b3a 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -46,30 +46,28 @@ struct ocfs2_inode_info | |||
46 | /* These fields are protected by ip_lock */ | 46 | /* These fields are protected by ip_lock */ |
47 | spinlock_t ip_lock; | 47 | spinlock_t ip_lock; |
48 | u32 ip_open_count; | 48 | u32 ip_open_count; |
49 | u32 ip_clusters; | ||
50 | struct list_head ip_io_markers; | 49 | struct list_head ip_io_markers; |
50 | u32 ip_clusters; | ||
51 | 51 | ||
52 | u16 ip_dyn_features; | ||
52 | struct mutex ip_io_mutex; | 53 | struct mutex ip_io_mutex; |
53 | |||
54 | u32 ip_flags; /* see below */ | 54 | u32 ip_flags; /* see below */ |
55 | u32 ip_attr; /* inode attributes */ | 55 | u32 ip_attr; /* inode attributes */ |
56 | u16 ip_dyn_features; | ||
57 | 56 | ||
58 | /* protected by recovery_lock. */ | 57 | /* protected by recovery_lock. */ |
59 | struct inode *ip_next_orphan; | 58 | struct inode *ip_next_orphan; |
60 | 59 | ||
61 | u32 ip_dir_start_lookup; | ||
62 | |||
63 | struct ocfs2_caching_info ip_metadata_cache; | 60 | struct ocfs2_caching_info ip_metadata_cache; |
64 | |||
65 | struct ocfs2_extent_map ip_extent_map; | 61 | struct ocfs2_extent_map ip_extent_map; |
66 | |||
67 | struct inode vfs_inode; | 62 | struct inode vfs_inode; |
68 | struct jbd2_inode ip_jinode; | 63 | struct jbd2_inode ip_jinode; |
69 | 64 | ||
65 | u32 ip_dir_start_lookup; | ||
66 | |||
70 | /* Only valid if the inode is the dir. */ | 67 | /* Only valid if the inode is the dir. */ |
71 | u32 ip_last_used_slot; | 68 | u32 ip_last_used_slot; |
72 | u64 ip_last_used_group; | 69 | u64 ip_last_used_group; |
70 | u32 ip_dir_lock_gen; | ||
73 | 71 | ||
74 | struct ocfs2_alloc_reservation ip_la_data_resv; | 72 | struct ocfs2_alloc_reservation ip_la_data_resv; |
75 | }; | 73 | }; |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7d9d9c132cef..7a4868196152 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -26,6 +26,26 @@ | |||
26 | 26 | ||
27 | #include <linux/ext2_fs.h> | 27 | #include <linux/ext2_fs.h> |
28 | 28 | ||
29 | #define o2info_from_user(a, b) \ | ||
30 | copy_from_user(&(a), (b), sizeof(a)) | ||
31 | #define o2info_to_user(a, b) \ | ||
32 | copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) | ||
33 | |||
34 | /* | ||
35 | * This call is void because we are already reporting an error that may | ||
36 | * be -EFAULT. The error will be returned from the ioctl(2) call. It's | ||
37 | * just a best-effort to tell userspace that this request caused the error. | ||
38 | */ | ||
39 | static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, | ||
40 | struct ocfs2_info_request __user *req) | ||
41 | { | ||
42 | kreq->ir_flags |= OCFS2_INFO_FL_ERROR; | ||
43 | (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); | ||
44 | } | ||
45 | |||
46 | #define o2info_set_request_error(a, b) \ | ||
47 | __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) | ||
48 | |||
29 | static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) | 49 | static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) |
30 | { | 50 | { |
31 | int status; | 51 | int status; |
@@ -109,6 +129,328 @@ bail: | |||
109 | return status; | 129 | return status; |
110 | } | 130 | } |
111 | 131 | ||
132 | int ocfs2_info_handle_blocksize(struct inode *inode, | ||
133 | struct ocfs2_info_request __user *req) | ||
134 | { | ||
135 | int status = -EFAULT; | ||
136 | struct ocfs2_info_blocksize oib; | ||
137 | |||
138 | if (o2info_from_user(oib, req)) | ||
139 | goto bail; | ||
140 | |||
141 | oib.ib_blocksize = inode->i_sb->s_blocksize; | ||
142 | oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED; | ||
143 | |||
144 | if (o2info_to_user(oib, req)) | ||
145 | goto bail; | ||
146 | |||
147 | status = 0; | ||
148 | bail: | ||
149 | if (status) | ||
150 | o2info_set_request_error(oib, req); | ||
151 | |||
152 | return status; | ||
153 | } | ||
154 | |||
155 | int ocfs2_info_handle_clustersize(struct inode *inode, | ||
156 | struct ocfs2_info_request __user *req) | ||
157 | { | ||
158 | int status = -EFAULT; | ||
159 | struct ocfs2_info_clustersize oic; | ||
160 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
161 | |||
162 | if (o2info_from_user(oic, req)) | ||
163 | goto bail; | ||
164 | |||
165 | oic.ic_clustersize = osb->s_clustersize; | ||
166 | oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED; | ||
167 | |||
168 | if (o2info_to_user(oic, req)) | ||
169 | goto bail; | ||
170 | |||
171 | status = 0; | ||
172 | bail: | ||
173 | if (status) | ||
174 | o2info_set_request_error(oic, req); | ||
175 | |||
176 | return status; | ||
177 | } | ||
178 | |||
179 | int ocfs2_info_handle_maxslots(struct inode *inode, | ||
180 | struct ocfs2_info_request __user *req) | ||
181 | { | ||
182 | int status = -EFAULT; | ||
183 | struct ocfs2_info_maxslots oim; | ||
184 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
185 | |||
186 | if (o2info_from_user(oim, req)) | ||
187 | goto bail; | ||
188 | |||
189 | oim.im_max_slots = osb->max_slots; | ||
190 | oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED; | ||
191 | |||
192 | if (o2info_to_user(oim, req)) | ||
193 | goto bail; | ||
194 | |||
195 | status = 0; | ||
196 | bail: | ||
197 | if (status) | ||
198 | o2info_set_request_error(oim, req); | ||
199 | |||
200 | return status; | ||
201 | } | ||
202 | |||
203 | int ocfs2_info_handle_label(struct inode *inode, | ||
204 | struct ocfs2_info_request __user *req) | ||
205 | { | ||
206 | int status = -EFAULT; | ||
207 | struct ocfs2_info_label oil; | ||
208 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
209 | |||
210 | if (o2info_from_user(oil, req)) | ||
211 | goto bail; | ||
212 | |||
213 | memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); | ||
214 | oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED; | ||
215 | |||
216 | if (o2info_to_user(oil, req)) | ||
217 | goto bail; | ||
218 | |||
219 | status = 0; | ||
220 | bail: | ||
221 | if (status) | ||
222 | o2info_set_request_error(oil, req); | ||
223 | |||
224 | return status; | ||
225 | } | ||
226 | |||
227 | int ocfs2_info_handle_uuid(struct inode *inode, | ||
228 | struct ocfs2_info_request __user *req) | ||
229 | { | ||
230 | int status = -EFAULT; | ||
231 | struct ocfs2_info_uuid oiu; | ||
232 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
233 | |||
234 | if (o2info_from_user(oiu, req)) | ||
235 | goto bail; | ||
236 | |||
237 | memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); | ||
238 | oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED; | ||
239 | |||
240 | if (o2info_to_user(oiu, req)) | ||
241 | goto bail; | ||
242 | |||
243 | status = 0; | ||
244 | bail: | ||
245 | if (status) | ||
246 | o2info_set_request_error(oiu, req); | ||
247 | |||
248 | return status; | ||
249 | } | ||
250 | |||
251 | int ocfs2_info_handle_fs_features(struct inode *inode, | ||
252 | struct ocfs2_info_request __user *req) | ||
253 | { | ||
254 | int status = -EFAULT; | ||
255 | struct ocfs2_info_fs_features oif; | ||
256 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
257 | |||
258 | if (o2info_from_user(oif, req)) | ||
259 | goto bail; | ||
260 | |||
261 | oif.if_compat_features = osb->s_feature_compat; | ||
262 | oif.if_incompat_features = osb->s_feature_incompat; | ||
263 | oif.if_ro_compat_features = osb->s_feature_ro_compat; | ||
264 | oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED; | ||
265 | |||
266 | if (o2info_to_user(oif, req)) | ||
267 | goto bail; | ||
268 | |||
269 | status = 0; | ||
270 | bail: | ||
271 | if (status) | ||
272 | o2info_set_request_error(oif, req); | ||
273 | |||
274 | return status; | ||
275 | } | ||
276 | |||
277 | int ocfs2_info_handle_journal_size(struct inode *inode, | ||
278 | struct ocfs2_info_request __user *req) | ||
279 | { | ||
280 | int status = -EFAULT; | ||
281 | struct ocfs2_info_journal_size oij; | ||
282 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
283 | |||
284 | if (o2info_from_user(oij, req)) | ||
285 | goto bail; | ||
286 | |||
287 | oij.ij_journal_size = osb->journal->j_inode->i_size; | ||
288 | |||
289 | oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED; | ||
290 | |||
291 | if (o2info_to_user(oij, req)) | ||
292 | goto bail; | ||
293 | |||
294 | status = 0; | ||
295 | bail: | ||
296 | if (status) | ||
297 | o2info_set_request_error(oij, req); | ||
298 | |||
299 | return status; | ||
300 | } | ||
301 | |||
302 | int ocfs2_info_handle_unknown(struct inode *inode, | ||
303 | struct ocfs2_info_request __user *req) | ||
304 | { | ||
305 | int status = -EFAULT; | ||
306 | struct ocfs2_info_request oir; | ||
307 | |||
308 | if (o2info_from_user(oir, req)) | ||
309 | goto bail; | ||
310 | |||
311 | oir.ir_flags &= ~OCFS2_INFO_FL_FILLED; | ||
312 | |||
313 | if (o2info_to_user(oir, req)) | ||
314 | goto bail; | ||
315 | |||
316 | status = 0; | ||
317 | bail: | ||
318 | if (status) | ||
319 | o2info_set_request_error(oir, req); | ||
320 | |||
321 | return status; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * Validate and distinguish OCFS2_IOC_INFO requests. | ||
326 | * | ||
327 | * - validate the magic number. | ||
328 | * - distinguish different requests. | ||
329 | * - validate size of different requests. | ||
330 | */ | ||
331 | int ocfs2_info_handle_request(struct inode *inode, | ||
332 | struct ocfs2_info_request __user *req) | ||
333 | { | ||
334 | int status = -EFAULT; | ||
335 | struct ocfs2_info_request oir; | ||
336 | |||
337 | if (o2info_from_user(oir, req)) | ||
338 | goto bail; | ||
339 | |||
340 | status = -EINVAL; | ||
341 | if (oir.ir_magic != OCFS2_INFO_MAGIC) | ||
342 | goto bail; | ||
343 | |||
344 | switch (oir.ir_code) { | ||
345 | case OCFS2_INFO_BLOCKSIZE: | ||
346 | if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) | ||
347 | status = ocfs2_info_handle_blocksize(inode, req); | ||
348 | break; | ||
349 | case OCFS2_INFO_CLUSTERSIZE: | ||
350 | if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) | ||
351 | status = ocfs2_info_handle_clustersize(inode, req); | ||
352 | break; | ||
353 | case OCFS2_INFO_MAXSLOTS: | ||
354 | if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) | ||
355 | status = ocfs2_info_handle_maxslots(inode, req); | ||
356 | break; | ||
357 | case OCFS2_INFO_LABEL: | ||
358 | if (oir.ir_size == sizeof(struct ocfs2_info_label)) | ||
359 | status = ocfs2_info_handle_label(inode, req); | ||
360 | break; | ||
361 | case OCFS2_INFO_UUID: | ||
362 | if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) | ||
363 | status = ocfs2_info_handle_uuid(inode, req); | ||
364 | break; | ||
365 | case OCFS2_INFO_FS_FEATURES: | ||
366 | if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) | ||
367 | status = ocfs2_info_handle_fs_features(inode, req); | ||
368 | break; | ||
369 | case OCFS2_INFO_JOURNAL_SIZE: | ||
370 | if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) | ||
371 | status = ocfs2_info_handle_journal_size(inode, req); | ||
372 | break; | ||
373 | default: | ||
374 | status = ocfs2_info_handle_unknown(inode, req); | ||
375 | break; | ||
376 | } | ||
377 | |||
378 | bail: | ||
379 | return status; | ||
380 | } | ||
381 | |||
382 | int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, | ||
383 | u64 *req_addr, int compat_flag) | ||
384 | { | ||
385 | int status = -EFAULT; | ||
386 | u64 __user *bp = NULL; | ||
387 | |||
388 | if (compat_flag) { | ||
389 | #ifdef CONFIG_COMPAT | ||
390 | /* | ||
391 | * pointer bp stores the base address of a pointers array, | ||
392 | * which collects all addresses of separate request. | ||
393 | */ | ||
394 | bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); | ||
395 | #else | ||
396 | BUG(); | ||
397 | #endif | ||
398 | } else | ||
399 | bp = (u64 __user *)(unsigned long)(info->oi_requests); | ||
400 | |||
401 | if (o2info_from_user(*req_addr, bp + idx)) | ||
402 | goto bail; | ||
403 | |||
404 | status = 0; | ||
405 | bail: | ||
406 | return status; | ||
407 | } | ||
408 | |||
409 | /* | ||
410 | * OCFS2_IOC_INFO handles an array of requests passed from userspace. | ||
411 | * | ||
412 | * ocfs2_info_handle() recevies a large info aggregation, grab and | ||
413 | * validate the request count from header, then break it into small | ||
414 | * pieces, later specific handlers can handle them one by one. | ||
415 | * | ||
416 | * Idea here is to make each separate request small enough to ensure | ||
417 | * a better backward&forward compatibility, since a small piece of | ||
418 | * request will be less likely to be broken if disk layout get changed. | ||
419 | */ | ||
420 | int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, | ||
421 | int compat_flag) | ||
422 | { | ||
423 | int i, status = 0; | ||
424 | u64 req_addr; | ||
425 | struct ocfs2_info_request __user *reqp; | ||
426 | |||
427 | if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || | ||
428 | (!info->oi_requests)) { | ||
429 | status = -EINVAL; | ||
430 | goto bail; | ||
431 | } | ||
432 | |||
433 | for (i = 0; i < info->oi_count; i++) { | ||
434 | |||
435 | status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); | ||
436 | if (status) | ||
437 | break; | ||
438 | |||
439 | reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; | ||
440 | if (!reqp) { | ||
441 | status = -EINVAL; | ||
442 | goto bail; | ||
443 | } | ||
444 | |||
445 | status = ocfs2_info_handle_request(inode, reqp); | ||
446 | if (status) | ||
447 | break; | ||
448 | } | ||
449 | |||
450 | bail: | ||
451 | return status; | ||
452 | } | ||
453 | |||
112 | long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | 454 | long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
113 | { | 455 | { |
114 | struct inode *inode = filp->f_path.dentry->d_inode; | 456 | struct inode *inode = filp->f_path.dentry->d_inode; |
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
120 | struct reflink_arguments args; | 462 | struct reflink_arguments args; |
121 | const char *old_path, *new_path; | 463 | const char *old_path, *new_path; |
122 | bool preserve; | 464 | bool preserve; |
465 | struct ocfs2_info info; | ||
123 | 466 | ||
124 | switch (cmd) { | 467 | switch (cmd) { |
125 | case OCFS2_IOC_GETFLAGS: | 468 | case OCFS2_IOC_GETFLAGS: |
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
174 | preserve = (args.preserve != 0); | 517 | preserve = (args.preserve != 0); |
175 | 518 | ||
176 | return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); | 519 | return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); |
520 | case OCFS2_IOC_INFO: | ||
521 | if (copy_from_user(&info, (struct ocfs2_info __user *)arg, | ||
522 | sizeof(struct ocfs2_info))) | ||
523 | return -EFAULT; | ||
524 | |||
525 | return ocfs2_info_handle(inode, &info, 0); | ||
177 | default: | 526 | default: |
178 | return -ENOTTY; | 527 | return -ENOTTY; |
179 | } | 528 | } |
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
185 | bool preserve; | 534 | bool preserve; |
186 | struct reflink_arguments args; | 535 | struct reflink_arguments args; |
187 | struct inode *inode = file->f_path.dentry->d_inode; | 536 | struct inode *inode = file->f_path.dentry->d_inode; |
537 | struct ocfs2_info info; | ||
188 | 538 | ||
189 | switch (cmd) { | 539 | switch (cmd) { |
190 | case OCFS2_IOC32_GETFLAGS: | 540 | case OCFS2_IOC32_GETFLAGS: |
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
209 | 559 | ||
210 | return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), | 560 | return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), |
211 | compat_ptr(args.new_path), preserve); | 561 | compat_ptr(args.new_path), preserve); |
562 | case OCFS2_IOC_INFO: | ||
563 | if (copy_from_user(&info, (struct ocfs2_info __user *)arg, | ||
564 | sizeof(struct ocfs2_info))) | ||
565 | return -EFAULT; | ||
566 | |||
567 | return ocfs2_info_handle(inode, &info, 1); | ||
212 | default: | 568 | default: |
213 | return -ENOIOCTLCMD; | 569 | return -ENOIOCTLCMD; |
214 | } | 570 | } |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9b57c0350ff9..faa2303dbf0a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) | |||
301 | { | 301 | { |
302 | int status = 0; | 302 | int status = 0; |
303 | unsigned int flushed; | 303 | unsigned int flushed; |
304 | unsigned long old_id; | ||
305 | struct ocfs2_journal *journal = NULL; | 304 | struct ocfs2_journal *journal = NULL; |
306 | 305 | ||
307 | mlog_entry_void(); | 306 | mlog_entry_void(); |
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) | |||
326 | goto finally; | 325 | goto finally; |
327 | } | 326 | } |
328 | 327 | ||
329 | old_id = ocfs2_inc_trans_id(journal); | 328 | ocfs2_inc_trans_id(journal); |
330 | 329 | ||
331 | flushed = atomic_read(&journal->j_num_trans); | 330 | flushed = atomic_read(&journal->j_num_trans); |
332 | atomic_set(&journal->j_num_trans, 0); | 331 | atomic_set(&journal->j_num_trans, 0); |
@@ -342,9 +341,6 @@ finally: | |||
342 | return status; | 341 | return status; |
343 | } | 342 | } |
344 | 343 | ||
345 | /* pass it NULL and it will allocate a new handle object for you. If | ||
346 | * you pass it a handle however, it may still return error, in which | ||
347 | * case it has free'd the passed handle for you. */ | ||
348 | handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) | 344 | handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) |
349 | { | 345 | { |
350 | journal_t *journal = osb->journal->j_journal; | 346 | journal_t *journal = osb->journal->j_journal; |
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) | |||
1888 | 1884 | ||
1889 | os = &osb->osb_orphan_scan; | 1885 | os = &osb->osb_orphan_scan; |
1890 | 1886 | ||
1887 | mlog(0, "Begin orphan scan\n"); | ||
1888 | |||
1891 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) | 1889 | if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) |
1892 | goto out; | 1890 | goto out; |
1893 | 1891 | ||
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) | |||
1920 | unlock: | 1918 | unlock: |
1921 | ocfs2_orphan_scan_unlock(osb, seqno); | 1919 | ocfs2_orphan_scan_unlock(osb, seqno); |
1922 | out: | 1920 | out: |
1921 | mlog(0, "Orphan scan completed\n"); | ||
1923 | return; | 1922 | return; |
1924 | } | 1923 | } |
1925 | 1924 | ||
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index b5baaa8e710f..43e56b97f9c0 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -67,11 +67,12 @@ struct ocfs2_journal { | |||
67 | struct buffer_head *j_bh; /* Journal disk inode block */ | 67 | struct buffer_head *j_bh; /* Journal disk inode block */ |
68 | atomic_t j_num_trans; /* Number of transactions | 68 | atomic_t j_num_trans; /* Number of transactions |
69 | * currently in the system. */ | 69 | * currently in the system. */ |
70 | spinlock_t j_lock; | ||
70 | unsigned long j_trans_id; | 71 | unsigned long j_trans_id; |
71 | struct rw_semaphore j_trans_barrier; | 72 | struct rw_semaphore j_trans_barrier; |
72 | wait_queue_head_t j_checkpointed; | 73 | wait_queue_head_t j_checkpointed; |
73 | 74 | ||
74 | spinlock_t j_lock; | 75 | /* both fields protected by j_lock*/ |
75 | struct list_head j_la_cleanups; | 76 | struct list_head j_la_cleanups; |
76 | struct work_struct j_recovery_work; | 77 | struct work_struct j_recovery_work; |
77 | }; | 78 | }; |
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 4c18f4ad93b4..7e32db9c2c99 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) | |||
59 | return ret; | 59 | return ret; |
60 | } | 60 | } |
61 | 61 | ||
62 | static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, | 62 | static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, |
63 | struct page *page) | 63 | struct page *page) |
64 | { | 64 | { |
65 | int ret; | 65 | int ret; |
66 | struct inode *inode = file->f_path.dentry->d_inode; | ||
66 | struct address_space *mapping = inode->i_mapping; | 67 | struct address_space *mapping = inode->i_mapping; |
67 | loff_t pos = page_offset(page); | 68 | loff_t pos = page_offset(page); |
68 | unsigned int len = PAGE_CACHE_SIZE; | 69 | unsigned int len = PAGE_CACHE_SIZE; |
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, | |||
111 | if (page->index == last_index) | 112 | if (page->index == last_index) |
112 | len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; | 113 | len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; |
113 | 114 | ||
114 | ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, | 115 | ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, |
115 | &fsdata, di_bh, page); | 116 | &fsdata, di_bh, page); |
116 | if (ret) { | 117 | if (ret) { |
117 | if (ret != -ENOSPC) | 118 | if (ret != -ENOSPC) |
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
159 | */ | 160 | */ |
160 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 161 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
161 | 162 | ||
162 | ret = __ocfs2_page_mkwrite(inode, di_bh, page); | 163 | ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); |
163 | 164 | ||
164 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 165 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
165 | 166 | ||
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a00dda2e4f16..e7bde21149ae 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -171,7 +171,8 @@ bail_add: | |||
171 | ret = ERR_PTR(status); | 171 | ret = ERR_PTR(status); |
172 | goto bail_unlock; | 172 | goto bail_unlock; |
173 | } | 173 | } |
174 | } | 174 | } else |
175 | ocfs2_dentry_attach_gen(dentry); | ||
175 | 176 | ||
176 | bail_unlock: | 177 | bail_unlock: |
177 | /* Don't drop the cluster lock until *after* the d_add -- | 178 | /* Don't drop the cluster lock until *after* the d_add -- |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c67003b6b5a2..d8408217e3bd 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data); | |||
150 | struct ocfs2_lock_res { | 150 | struct ocfs2_lock_res { |
151 | void *l_priv; | 151 | void *l_priv; |
152 | struct ocfs2_lock_res_ops *l_ops; | 152 | struct ocfs2_lock_res_ops *l_ops; |
153 | spinlock_t l_lock; | 153 | |
154 | 154 | ||
155 | struct list_head l_blocked_list; | 155 | struct list_head l_blocked_list; |
156 | struct list_head l_mask_waiters; | 156 | struct list_head l_mask_waiters; |
157 | 157 | ||
158 | enum ocfs2_lock_type l_type; | ||
159 | unsigned long l_flags; | 158 | unsigned long l_flags; |
160 | char l_name[OCFS2_LOCK_ID_MAX_LEN]; | 159 | char l_name[OCFS2_LOCK_ID_MAX_LEN]; |
161 | int l_level; | ||
162 | unsigned int l_ro_holders; | 160 | unsigned int l_ro_holders; |
163 | unsigned int l_ex_holders; | 161 | unsigned int l_ex_holders; |
164 | struct ocfs2_dlm_lksb l_lksb; | 162 | unsigned char l_level; |
163 | |||
164 | /* Data packed - type enum ocfs2_lock_type */ | ||
165 | unsigned char l_type; | ||
165 | 166 | ||
166 | /* used from AST/BAST funcs. */ | 167 | /* used from AST/BAST funcs. */ |
167 | enum ocfs2_ast_action l_action; | 168 | /* Data packed - enum type ocfs2_ast_action */ |
168 | enum ocfs2_unlock_action l_unlock_action; | 169 | unsigned char l_action; |
169 | int l_requested; | 170 | /* Data packed - enum type ocfs2_unlock_action */ |
170 | int l_blocking; | 171 | unsigned char l_unlock_action; |
172 | unsigned char l_requested; | ||
173 | unsigned char l_blocking; | ||
171 | unsigned int l_pending_gen; | 174 | unsigned int l_pending_gen; |
172 | 175 | ||
176 | spinlock_t l_lock; | ||
177 | |||
178 | struct ocfs2_dlm_lksb l_lksb; | ||
179 | |||
173 | wait_queue_head_t l_event; | 180 | wait_queue_head_t l_event; |
174 | 181 | ||
175 | struct list_head l_debug_list; | 182 | struct list_head l_debug_list; |
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state | |||
243 | 250 | ||
244 | enum ocfs2_mount_options | 251 | enum ocfs2_mount_options |
245 | { | 252 | { |
246 | OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ | 253 | OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */ |
247 | OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ | 254 | OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ |
248 | OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ | 255 | OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ |
249 | OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ | 256 | OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ |
@@ -256,6 +263,10 @@ enum ocfs2_mount_options | |||
256 | control lists */ | 263 | control lists */ |
257 | OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ | 264 | OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ |
258 | OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ | 265 | OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ |
266 | OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT | ||
267 | writes */ | ||
268 | OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ | ||
269 | OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ | ||
259 | }; | 270 | }; |
260 | 271 | ||
261 | #define OCFS2_OSB_SOFT_RO 0x0001 | 272 | #define OCFS2_OSB_SOFT_RO 0x0001 |
@@ -277,7 +288,8 @@ struct ocfs2_super | |||
277 | struct super_block *sb; | 288 | struct super_block *sb; |
278 | struct inode *root_inode; | 289 | struct inode *root_inode; |
279 | struct inode *sys_root_inode; | 290 | struct inode *sys_root_inode; |
280 | struct inode *system_inodes[NUM_SYSTEM_INODES]; | 291 | struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES]; |
292 | struct inode **local_system_inodes; | ||
281 | 293 | ||
282 | struct ocfs2_slot_info *slot_info; | 294 | struct ocfs2_slot_info *slot_info; |
283 | 295 | ||
@@ -368,6 +380,8 @@ struct ocfs2_super | |||
368 | struct ocfs2_alloc_stats alloc_stats; | 380 | struct ocfs2_alloc_stats alloc_stats; |
369 | char dev_str[20]; /* "major,minor" of the device */ | 381 | char dev_str[20]; /* "major,minor" of the device */ |
370 | 382 | ||
383 | u8 osb_stackflags; | ||
384 | |||
371 | char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; | 385 | char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; |
372 | struct ocfs2_cluster_connection *cconn; | 386 | struct ocfs2_cluster_connection *cconn; |
373 | struct ocfs2_lock_res osb_super_lockres; | 387 | struct ocfs2_lock_res osb_super_lockres; |
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) | |||
601 | return ret; | 615 | return ret; |
602 | } | 616 | } |
603 | 617 | ||
604 | static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) | 618 | static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb) |
605 | { | 619 | { |
606 | return (osb->s_feature_incompat & | 620 | return (osb->s_feature_incompat & |
607 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); | 621 | (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK | |
622 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)); | ||
623 | } | ||
624 | |||
625 | static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) | ||
626 | { | ||
627 | if (ocfs2_clusterinfo_valid(osb) && | ||
628 | memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, | ||
629 | OCFS2_STACK_LABEL_LEN)) | ||
630 | return 1; | ||
631 | return 0; | ||
632 | } | ||
633 | |||
634 | static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb) | ||
635 | { | ||
636 | if (ocfs2_clusterinfo_valid(osb) && | ||
637 | !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, | ||
638 | OCFS2_STACK_LABEL_LEN)) | ||
639 | return 1; | ||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) | ||
644 | { | ||
645 | return ocfs2_o2cb_stack(osb) && | ||
646 | (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT); | ||
608 | } | 647 | } |
609 | 648 | ||
610 | static inline int ocfs2_mount_local(struct ocfs2_super *osb) | 649 | static inline int ocfs2_mount_local(struct ocfs2_super *osb) |
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index fa31d05e41b7..c2e4f8222e2f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
@@ -101,7 +101,8 @@ | |||
101 | | OCFS2_FEATURE_INCOMPAT_META_ECC \ | 101 | | OCFS2_FEATURE_INCOMPAT_META_ECC \ |
102 | | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | 102 | | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ |
103 | | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | 103 | | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ |
104 | | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) | 104 | | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ |
105 | | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) | ||
105 | #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | 106 | #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ |
106 | | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | 107 | | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ |
107 | | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) | 108 | | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) |
@@ -170,6 +171,13 @@ | |||
170 | #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 | 171 | #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 |
171 | 172 | ||
172 | /* | 173 | /* |
174 | * Incompat bit to indicate useable clusterinfo with stackflags for all | ||
175 | * cluster stacks (userspace and o2cb). If this bit is set, | ||
176 | * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. | ||
177 | */ | ||
178 | #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 | ||
179 | |||
180 | /* | ||
173 | * backup superblock flag is used to indicate that this volume | 181 | * backup superblock flag is used to indicate that this volume |
174 | * has backup superblocks. | 182 | * has backup superblocks. |
175 | */ | 183 | */ |
@@ -292,10 +300,13 @@ | |||
292 | #define OCFS2_VOL_UUID_LEN 16 | 300 | #define OCFS2_VOL_UUID_LEN 16 |
293 | #define OCFS2_MAX_VOL_LABEL_LEN 64 | 301 | #define OCFS2_MAX_VOL_LABEL_LEN 64 |
294 | 302 | ||
295 | /* The alternate, userspace stack fields */ | 303 | /* The cluster stack fields */ |
296 | #define OCFS2_STACK_LABEL_LEN 4 | 304 | #define OCFS2_STACK_LABEL_LEN 4 |
297 | #define OCFS2_CLUSTER_NAME_LEN 16 | 305 | #define OCFS2_CLUSTER_NAME_LEN 16 |
298 | 306 | ||
307 | /* Classic (historically speaking) cluster stack */ | ||
308 | #define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" | ||
309 | |||
299 | /* Journal limits (in bytes) */ | 310 | /* Journal limits (in bytes) */ |
300 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) | 311 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) |
301 | 312 | ||
@@ -305,6 +316,11 @@ | |||
305 | */ | 316 | */ |
306 | #define OCFS2_MIN_XATTR_INLINE_SIZE 256 | 317 | #define OCFS2_MIN_XATTR_INLINE_SIZE 256 |
307 | 318 | ||
319 | /* | ||
320 | * Cluster info flags (ocfs2_cluster_info.ci_stackflags) | ||
321 | */ | ||
322 | #define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) | ||
323 | |||
308 | struct ocfs2_system_inode_info { | 324 | struct ocfs2_system_inode_info { |
309 | char *si_name; | 325 | char *si_name; |
310 | int si_iflags; | 326 | int si_iflags; |
@@ -322,6 +338,7 @@ enum { | |||
322 | USER_QUOTA_SYSTEM_INODE, | 338 | USER_QUOTA_SYSTEM_INODE, |
323 | GROUP_QUOTA_SYSTEM_INODE, | 339 | GROUP_QUOTA_SYSTEM_INODE, |
324 | #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE | 340 | #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE |
341 | #define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE | ||
325 | ORPHAN_DIR_SYSTEM_INODE, | 342 | ORPHAN_DIR_SYSTEM_INODE, |
326 | EXTENT_ALLOC_SYSTEM_INODE, | 343 | EXTENT_ALLOC_SYSTEM_INODE, |
327 | INODE_ALLOC_SYSTEM_INODE, | 344 | INODE_ALLOC_SYSTEM_INODE, |
@@ -330,8 +347,12 @@ enum { | |||
330 | TRUNCATE_LOG_SYSTEM_INODE, | 347 | TRUNCATE_LOG_SYSTEM_INODE, |
331 | LOCAL_USER_QUOTA_SYSTEM_INODE, | 348 | LOCAL_USER_QUOTA_SYSTEM_INODE, |
332 | LOCAL_GROUP_QUOTA_SYSTEM_INODE, | 349 | LOCAL_GROUP_QUOTA_SYSTEM_INODE, |
350 | #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE | ||
333 | NUM_SYSTEM_INODES | 351 | NUM_SYSTEM_INODES |
334 | }; | 352 | }; |
353 | #define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE | ||
354 | #define NUM_LOCAL_SYSTEM_INODES \ | ||
355 | (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) | ||
335 | 356 | ||
336 | static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { | 357 | static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { |
337 | /* Global system inodes (single copy) */ | 358 | /* Global system inodes (single copy) */ |
@@ -360,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { | |||
360 | /* Parameter passed from mount.ocfs2 to module */ | 381 | /* Parameter passed from mount.ocfs2 to module */ |
361 | #define OCFS2_HB_NONE "heartbeat=none" | 382 | #define OCFS2_HB_NONE "heartbeat=none" |
362 | #define OCFS2_HB_LOCAL "heartbeat=local" | 383 | #define OCFS2_HB_LOCAL "heartbeat=local" |
384 | #define OCFS2_HB_GLOBAL "heartbeat=global" | ||
363 | 385 | ||
364 | /* | 386 | /* |
365 | * OCFS2 directory file types. Only the low 3 bits are used. The | 387 | * OCFS2 directory file types. Only the low 3 bits are used. The |
@@ -566,9 +588,21 @@ struct ocfs2_slot_map_extended { | |||
566 | */ | 588 | */ |
567 | }; | 589 | }; |
568 | 590 | ||
591 | /* | ||
592 | * ci_stackflags is only valid if the incompat bit | ||
593 | * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. | ||
594 | */ | ||
569 | struct ocfs2_cluster_info { | 595 | struct ocfs2_cluster_info { |
570 | /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; | 596 | /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; |
571 | __le32 ci_reserved; | 597 | union { |
598 | __le32 ci_reserved; | ||
599 | struct { | ||
600 | __u8 ci_stackflags; | ||
601 | __u8 ci_reserved1; | ||
602 | __u8 ci_reserved2; | ||
603 | __u8 ci_reserved3; | ||
604 | }; | ||
605 | }; | ||
572 | /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; | 606 | /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; |
573 | /*18*/ | 607 | /*18*/ |
574 | }; | 608 | }; |
@@ -605,9 +639,9 @@ struct ocfs2_super_block { | |||
605 | * group header */ | 639 | * group header */ |
606 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ | 640 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ |
607 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ | 641 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ |
608 | /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace | 642 | /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either |
609 | stack. Only valid | 643 | userspace or clusterinfo |
610 | with INCOMPAT flag. */ | 644 | INCOMPAT flag set. */ |
611 | /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size | 645 | /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size |
612 | for this fs*/ | 646 | for this fs*/ |
613 | __le16 s_reserved0; | 647 | __le16 s_reserved0; |
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 5d241505690b..b46f39bf7438 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h | |||
@@ -76,4 +76,99 @@ struct reflink_arguments { | |||
76 | }; | 76 | }; |
77 | #define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) | 77 | #define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) |
78 | 78 | ||
79 | /* Following definitions dedicated for ocfs2_info_request ioctls. */ | ||
80 | #define OCFS2_INFO_MAX_REQUEST (50) | ||
81 | #define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2) | ||
82 | |||
83 | /* Magic number of all requests */ | ||
84 | #define OCFS2_INFO_MAGIC (0x4F32494E) | ||
85 | |||
86 | /* | ||
87 | * Always try to separate info request into small pieces to | ||
88 | * guarantee the backward&forward compatibility. | ||
89 | */ | ||
90 | struct ocfs2_info { | ||
91 | __u64 oi_requests; /* Array of __u64 pointers to requests */ | ||
92 | __u32 oi_count; /* Number of requests in info_requests */ | ||
93 | __u32 oi_pad; | ||
94 | }; | ||
95 | |||
96 | struct ocfs2_info_request { | ||
97 | /*00*/ __u32 ir_magic; /* Magic number */ | ||
98 | __u32 ir_code; /* Info request code */ | ||
99 | __u32 ir_size; /* Size of request */ | ||
100 | __u32 ir_flags; /* Request flags */ | ||
101 | /*10*/ /* Request specific fields */ | ||
102 | }; | ||
103 | |||
104 | struct ocfs2_info_clustersize { | ||
105 | struct ocfs2_info_request ic_req; | ||
106 | __u32 ic_clustersize; | ||
107 | __u32 ic_pad; | ||
108 | }; | ||
109 | |||
110 | struct ocfs2_info_blocksize { | ||
111 | struct ocfs2_info_request ib_req; | ||
112 | __u32 ib_blocksize; | ||
113 | __u32 ib_pad; | ||
114 | }; | ||
115 | |||
116 | struct ocfs2_info_maxslots { | ||
117 | struct ocfs2_info_request im_req; | ||
118 | __u32 im_max_slots; | ||
119 | __u32 im_pad; | ||
120 | }; | ||
121 | |||
122 | struct ocfs2_info_label { | ||
123 | struct ocfs2_info_request il_req; | ||
124 | __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN]; | ||
125 | } __attribute__ ((packed)); | ||
126 | |||
127 | struct ocfs2_info_uuid { | ||
128 | struct ocfs2_info_request iu_req; | ||
129 | __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1]; | ||
130 | } __attribute__ ((packed)); | ||
131 | |||
132 | struct ocfs2_info_fs_features { | ||
133 | struct ocfs2_info_request if_req; | ||
134 | __u32 if_compat_features; | ||
135 | __u32 if_incompat_features; | ||
136 | __u32 if_ro_compat_features; | ||
137 | __u32 if_pad; | ||
138 | }; | ||
139 | |||
140 | struct ocfs2_info_journal_size { | ||
141 | struct ocfs2_info_request ij_req; | ||
142 | __u64 ij_journal_size; | ||
143 | }; | ||
144 | |||
145 | /* Codes for ocfs2_info_request */ | ||
146 | enum ocfs2_info_type { | ||
147 | OCFS2_INFO_CLUSTERSIZE = 1, | ||
148 | OCFS2_INFO_BLOCKSIZE, | ||
149 | OCFS2_INFO_MAXSLOTS, | ||
150 | OCFS2_INFO_LABEL, | ||
151 | OCFS2_INFO_UUID, | ||
152 | OCFS2_INFO_FS_FEATURES, | ||
153 | OCFS2_INFO_JOURNAL_SIZE, | ||
154 | OCFS2_INFO_NUM_TYPES | ||
155 | }; | ||
156 | |||
157 | /* Flags for struct ocfs2_info_request */ | ||
158 | /* Filled by the caller */ | ||
159 | #define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not | ||
160 | required. This is a hint. | ||
161 | It is up to ocfs2 whether | ||
162 | the request can be fulfilled | ||
163 | without locking. */ | ||
164 | /* Filled by ocfs2 */ | ||
165 | #define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood | ||
166 | this request and | ||
167 | filled in the answer */ | ||
168 | |||
169 | #define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during | ||
170 | request handling. */ | ||
171 | |||
172 | #define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) | ||
173 | |||
79 | #endif /* OCFS2_IOCTL_H */ | 174 | #endif /* OCFS2_IOCTL_H */ |
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index efdd75607406..b5f9160e93e9 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c | |||
@@ -49,6 +49,7 @@ | |||
49 | 49 | ||
50 | struct ocfs2_cow_context { | 50 | struct ocfs2_cow_context { |
51 | struct inode *inode; | 51 | struct inode *inode; |
52 | struct file *file; | ||
52 | u32 cow_start; | 53 | u32 cow_start; |
53 | u32 cow_len; | 54 | u32 cow_len; |
54 | struct ocfs2_extent_tree data_et; | 55 | struct ocfs2_extent_tree data_et; |
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, | |||
2932 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); | 2933 | u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); |
2933 | struct page *page; | 2934 | struct page *page; |
2934 | pgoff_t page_index; | 2935 | pgoff_t page_index; |
2935 | unsigned int from, to; | 2936 | unsigned int from, to, readahead_pages; |
2936 | loff_t offset, end, map_end; | 2937 | loff_t offset, end, map_end; |
2937 | struct address_space *mapping = context->inode->i_mapping; | 2938 | struct address_space *mapping = context->inode->i_mapping; |
2938 | 2939 | ||
2939 | mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, | 2940 | mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, |
2940 | new_cluster, new_len, cpos); | 2941 | new_cluster, new_len, cpos); |
2941 | 2942 | ||
2943 | readahead_pages = | ||
2944 | (ocfs2_cow_contig_clusters(sb) << | ||
2945 | OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; | ||
2942 | offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; | 2946 | offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; |
2943 | end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); | 2947 | end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); |
2944 | /* | 2948 | /* |
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, | |||
2969 | if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) | 2973 | if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) |
2970 | BUG_ON(PageDirty(page)); | 2974 | BUG_ON(PageDirty(page)); |
2971 | 2975 | ||
2976 | if (PageReadahead(page) && context->file) { | ||
2977 | page_cache_async_readahead(mapping, | ||
2978 | &context->file->f_ra, | ||
2979 | context->file, | ||
2980 | page, page_index, | ||
2981 | readahead_pages); | ||
2982 | } | ||
2983 | |||
2972 | if (!PageUptodate(page)) { | 2984 | if (!PageUptodate(page)) { |
2973 | ret = block_read_full_page(page, ocfs2_get_block); | 2985 | ret = block_read_full_page(page, ocfs2_get_block); |
2974 | if (ret) { | 2986 | if (ret) { |
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) | |||
3409 | return ret; | 3421 | return ret; |
3410 | } | 3422 | } |
3411 | 3423 | ||
3424 | static void ocfs2_readahead_for_cow(struct inode *inode, | ||
3425 | struct file *file, | ||
3426 | u32 start, u32 len) | ||
3427 | { | ||
3428 | struct address_space *mapping; | ||
3429 | pgoff_t index; | ||
3430 | unsigned long num_pages; | ||
3431 | int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
3432 | |||
3433 | if (!file) | ||
3434 | return; | ||
3435 | |||
3436 | mapping = file->f_mapping; | ||
3437 | num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; | ||
3438 | if (!num_pages) | ||
3439 | num_pages = 1; | ||
3440 | |||
3441 | index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; | ||
3442 | page_cache_sync_readahead(mapping, &file->f_ra, file, | ||
3443 | index, num_pages); | ||
3444 | } | ||
3445 | |||
3412 | /* | 3446 | /* |
3413 | * Starting at cpos, try to CoW write_len clusters. Don't CoW | 3447 | * Starting at cpos, try to CoW write_len clusters. Don't CoW |
3414 | * past max_cpos. This will stop when it runs into a hole or an | 3448 | * past max_cpos. This will stop when it runs into a hole or an |
3415 | * unrefcounted extent. | 3449 | * unrefcounted extent. |
3416 | */ | 3450 | */ |
3417 | static int ocfs2_refcount_cow_hunk(struct inode *inode, | 3451 | static int ocfs2_refcount_cow_hunk(struct inode *inode, |
3452 | struct file *file, | ||
3418 | struct buffer_head *di_bh, | 3453 | struct buffer_head *di_bh, |
3419 | u32 cpos, u32 write_len, u32 max_cpos) | 3454 | u32 cpos, u32 write_len, u32 max_cpos) |
3420 | { | 3455 | { |
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, | |||
3443 | 3478 | ||
3444 | BUG_ON(cow_len == 0); | 3479 | BUG_ON(cow_len == 0); |
3445 | 3480 | ||
3481 | ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); | ||
3482 | |||
3446 | context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); | 3483 | context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); |
3447 | if (!context) { | 3484 | if (!context) { |
3448 | ret = -ENOMEM; | 3485 | ret = -ENOMEM; |
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, | |||
3464 | context->ref_root_bh = ref_root_bh; | 3501 | context->ref_root_bh = ref_root_bh; |
3465 | context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; | 3502 | context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; |
3466 | context->get_clusters = ocfs2_di_get_clusters; | 3503 | context->get_clusters = ocfs2_di_get_clusters; |
3504 | context->file = file; | ||
3467 | 3505 | ||
3468 | ocfs2_init_dinode_extent_tree(&context->data_et, | 3506 | ocfs2_init_dinode_extent_tree(&context->data_et, |
3469 | INODE_CACHE(inode), di_bh); | 3507 | INODE_CACHE(inode), di_bh); |
@@ -3492,6 +3530,7 @@ out: | |||
3492 | * clusters between cpos and cpos+write_len are safe to modify. | 3530 | * clusters between cpos and cpos+write_len are safe to modify. |
3493 | */ | 3531 | */ |
3494 | int ocfs2_refcount_cow(struct inode *inode, | 3532 | int ocfs2_refcount_cow(struct inode *inode, |
3533 | struct file *file, | ||
3495 | struct buffer_head *di_bh, | 3534 | struct buffer_head *di_bh, |
3496 | u32 cpos, u32 write_len, u32 max_cpos) | 3535 | u32 cpos, u32 write_len, u32 max_cpos) |
3497 | { | 3536 | { |
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode, | |||
3511 | num_clusters = write_len; | 3550 | num_clusters = write_len; |
3512 | 3551 | ||
3513 | if (ext_flags & OCFS2_EXT_REFCOUNTED) { | 3552 | if (ext_flags & OCFS2_EXT_REFCOUNTED) { |
3514 | ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, | 3553 | ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, |
3515 | num_clusters, max_cpos); | 3554 | num_clusters, max_cpos); |
3516 | if (ret) { | 3555 | if (ret) { |
3517 | mlog_errno(ret); | 3556 | mlog_errno(ret); |
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9983ba1570e2..c8ce46f7d8e3 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h | |||
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree { | |||
21 | struct rb_node rf_node; | 21 | struct rb_node rf_node; |
22 | u64 rf_blkno; | 22 | u64 rf_blkno; |
23 | u32 rf_generation; | 23 | u32 rf_generation; |
24 | struct kref rf_getcnt; | ||
24 | struct rw_semaphore rf_sem; | 25 | struct rw_semaphore rf_sem; |
25 | struct ocfs2_lock_res rf_lockres; | 26 | struct ocfs2_lock_res rf_lockres; |
26 | struct kref rf_getcnt; | ||
27 | int rf_removed; | 27 | int rf_removed; |
28 | 28 | ||
29 | /* the following 4 fields are used by caching_info. */ | 29 | /* the following 4 fields are used by caching_info. */ |
30 | struct ocfs2_caching_info rf_ci; | ||
31 | spinlock_t rf_lock; | 30 | spinlock_t rf_lock; |
31 | struct ocfs2_caching_info rf_ci; | ||
32 | struct mutex rf_io_mutex; | 32 | struct mutex rf_io_mutex; |
33 | struct super_block *rf_sb; | 33 | struct super_block *rf_sb; |
34 | }; | 34 | }; |
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, | |||
52 | u32 clusters, | 52 | u32 clusters, |
53 | int *credits, | 53 | int *credits, |
54 | int *ref_blocks); | 54 | int *ref_blocks); |
55 | int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, | 55 | int ocfs2_refcount_cow(struct inode *inode, |
56 | struct file *filep, struct buffer_head *di_bh, | ||
56 | u32 cpos, u32 write_len, u32 max_cpos); | 57 | u32 cpos, u32 write_len, u32 max_cpos); |
57 | 58 | ||
58 | typedef int (ocfs2_post_refcount_func)(struct inode *inode, | 59 | typedef int (ocfs2_post_refcount_func)(struct inode *inode, |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bfbd7e9e949f..ab4e0172cc1d 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, | |||
357 | { | 357 | { |
358 | int status = 0; | 358 | int status = 0; |
359 | u64 blkno; | 359 | u64 blkno; |
360 | unsigned long long blocks, bytes; | 360 | unsigned long long blocks, bytes = 0; |
361 | unsigned int i; | 361 | unsigned int i; |
362 | struct buffer_head *bh; | 362 | struct buffer_head *bh; |
363 | 363 | ||
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 0d3049f696c5..19965b00c43c 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) | |||
283 | /* for now we only have one cluster/node, make sure we see it | 283 | /* for now we only have one cluster/node, make sure we see it |
284 | * in the heartbeat universe */ | 284 | * in the heartbeat universe */ |
285 | if (!o2hb_check_local_node_heartbeating()) { | 285 | if (!o2hb_check_local_node_heartbeating()) { |
286 | if (o2hb_global_heartbeat_active()) | ||
287 | mlog(ML_ERROR, "Global heartbeat not started\n"); | ||
286 | rc = -EINVAL; | 288 | rc = -EINVAL; |
287 | goto out; | 289 | goto out; |
288 | } | 290 | } |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 849c2f0e0a0e..5fed60de7630 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, | |||
1380 | } | 1380 | } |
1381 | 1381 | ||
1382 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); | 1382 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); |
1383 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { | ||
1384 | ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" | ||
1385 | " count %u but claims %u are freed. num_bits %d", | ||
1386 | (unsigned long long)le64_to_cpu(bg->bg_blkno), | ||
1387 | le16_to_cpu(bg->bg_bits), | ||
1388 | le16_to_cpu(bg->bg_free_bits_count), num_bits); | ||
1389 | return -EROFS; | ||
1390 | } | ||
1383 | while(num_bits--) | 1391 | while(num_bits--) |
1384 | ocfs2_set_bit(bit_off++, bitmap); | 1392 | ocfs2_set_bit(bit_off++, bitmap); |
1385 | 1393 | ||
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, | |||
2419 | (unsigned long *) undo_bg->bg_bitmap); | 2427 | (unsigned long *) undo_bg->bg_bitmap); |
2420 | } | 2428 | } |
2421 | le16_add_cpu(&bg->bg_free_bits_count, num_bits); | 2429 | le16_add_cpu(&bg->bg_free_bits_count, num_bits); |
2430 | if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { | ||
2431 | ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" | ||
2432 | " count %u but claims %u are freed. num_bits %d", | ||
2433 | (unsigned long long)le64_to_cpu(bg->bg_blkno), | ||
2434 | le16_to_cpu(bg->bg_bits), | ||
2435 | le16_to_cpu(bg->bg_free_bits_count), num_bits); | ||
2436 | return -EROFS; | ||
2437 | } | ||
2422 | 2438 | ||
2423 | if (undo_fn) | 2439 | if (undo_fn) |
2424 | jbd_unlock_bh_state(group_bh); | 2440 | jbd_unlock_bh_state(group_bh); |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b7d724393b5a..56f0cb395820 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -162,6 +162,7 @@ enum { | |||
162 | Opt_nointr, | 162 | Opt_nointr, |
163 | Opt_hb_none, | 163 | Opt_hb_none, |
164 | Opt_hb_local, | 164 | Opt_hb_local, |
165 | Opt_hb_global, | ||
165 | Opt_data_ordered, | 166 | Opt_data_ordered, |
166 | Opt_data_writeback, | 167 | Opt_data_writeback, |
167 | Opt_atime_quantum, | 168 | Opt_atime_quantum, |
@@ -177,6 +178,8 @@ enum { | |||
177 | Opt_noacl, | 178 | Opt_noacl, |
178 | Opt_usrquota, | 179 | Opt_usrquota, |
179 | Opt_grpquota, | 180 | Opt_grpquota, |
181 | Opt_coherency_buffered, | ||
182 | Opt_coherency_full, | ||
180 | Opt_resv_level, | 183 | Opt_resv_level, |
181 | Opt_dir_resv_level, | 184 | Opt_dir_resv_level, |
182 | Opt_err, | 185 | Opt_err, |
@@ -190,6 +193,7 @@ static const match_table_t tokens = { | |||
190 | {Opt_nointr, "nointr"}, | 193 | {Opt_nointr, "nointr"}, |
191 | {Opt_hb_none, OCFS2_HB_NONE}, | 194 | {Opt_hb_none, OCFS2_HB_NONE}, |
192 | {Opt_hb_local, OCFS2_HB_LOCAL}, | 195 | {Opt_hb_local, OCFS2_HB_LOCAL}, |
196 | {Opt_hb_global, OCFS2_HB_GLOBAL}, | ||
193 | {Opt_data_ordered, "data=ordered"}, | 197 | {Opt_data_ordered, "data=ordered"}, |
194 | {Opt_data_writeback, "data=writeback"}, | 198 | {Opt_data_writeback, "data=writeback"}, |
195 | {Opt_atime_quantum, "atime_quantum=%u"}, | 199 | {Opt_atime_quantum, "atime_quantum=%u"}, |
@@ -205,6 +209,8 @@ static const match_table_t tokens = { | |||
205 | {Opt_noacl, "noacl"}, | 209 | {Opt_noacl, "noacl"}, |
206 | {Opt_usrquota, "usrquota"}, | 210 | {Opt_usrquota, "usrquota"}, |
207 | {Opt_grpquota, "grpquota"}, | 211 | {Opt_grpquota, "grpquota"}, |
212 | {Opt_coherency_buffered, "coherency=buffered"}, | ||
213 | {Opt_coherency_full, "coherency=full"}, | ||
208 | {Opt_resv_level, "resv_level=%u"}, | 214 | {Opt_resv_level, "resv_level=%u"}, |
209 | {Opt_dir_resv_level, "dir_resv_level=%u"}, | 215 | {Opt_dir_resv_level, "dir_resv_level=%u"}, |
210 | {Opt_err, NULL} | 216 | {Opt_err, NULL} |
@@ -514,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) | |||
514 | 520 | ||
515 | mlog_entry_void(); | 521 | mlog_entry_void(); |
516 | 522 | ||
517 | for (i = 0; i < NUM_SYSTEM_INODES; i++) { | 523 | for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { |
518 | inode = osb->system_inodes[i]; | 524 | inode = osb->global_system_inodes[i]; |
519 | if (inode) { | 525 | if (inode) { |
520 | iput(inode); | 526 | iput(inode); |
521 | osb->system_inodes[i] = NULL; | 527 | osb->global_system_inodes[i] = NULL; |
522 | } | 528 | } |
523 | } | 529 | } |
524 | 530 | ||
@@ -534,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb) | |||
534 | osb->root_inode = NULL; | 540 | osb->root_inode = NULL; |
535 | } | 541 | } |
536 | 542 | ||
543 | if (!osb->local_system_inodes) | ||
544 | goto out; | ||
545 | |||
546 | for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { | ||
547 | if (osb->local_system_inodes[i]) { | ||
548 | iput(osb->local_system_inodes[i]); | ||
549 | osb->local_system_inodes[i] = NULL; | ||
550 | } | ||
551 | } | ||
552 | |||
553 | kfree(osb->local_system_inodes); | ||
554 | osb->local_system_inodes = NULL; | ||
555 | |||
556 | out: | ||
537 | mlog_exit(0); | 557 | mlog_exit(0); |
538 | } | 558 | } |
539 | 559 | ||
@@ -608,6 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | |||
608 | int ret = 0; | 628 | int ret = 0; |
609 | struct mount_options parsed_options; | 629 | struct mount_options parsed_options; |
610 | struct ocfs2_super *osb = OCFS2_SB(sb); | 630 | struct ocfs2_super *osb = OCFS2_SB(sb); |
631 | u32 tmp; | ||
611 | 632 | ||
612 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || | 633 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || |
613 | !ocfs2_check_set_options(sb, &parsed_options)) { | 634 | !ocfs2_check_set_options(sb, &parsed_options)) { |
@@ -615,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | |||
615 | goto out; | 636 | goto out; |
616 | } | 637 | } |
617 | 638 | ||
618 | if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != | 639 | tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | |
619 | (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { | 640 | OCFS2_MOUNT_HB_NONE; |
641 | if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { | ||
620 | ret = -EINVAL; | 642 | ret = -EINVAL; |
621 | mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); | 643 | mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); |
622 | goto out; | 644 | goto out; |
@@ -806,23 +828,29 @@ bail: | |||
806 | 828 | ||
807 | static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) | 829 | static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) |
808 | { | 830 | { |
809 | if (ocfs2_mount_local(osb)) { | 831 | u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; |
810 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { | 832 | |
833 | if (osb->s_mount_opt & hb_enabled) { | ||
834 | if (ocfs2_mount_local(osb)) { | ||
811 | mlog(ML_ERROR, "Cannot heartbeat on a locally " | 835 | mlog(ML_ERROR, "Cannot heartbeat on a locally " |
812 | "mounted device.\n"); | 836 | "mounted device.\n"); |
813 | return -EINVAL; | 837 | return -EINVAL; |
814 | } | 838 | } |
815 | } | 839 | if (ocfs2_userspace_stack(osb)) { |
816 | |||
817 | if (ocfs2_userspace_stack(osb)) { | ||
818 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { | ||
819 | mlog(ML_ERROR, "Userspace stack expected, but " | 840 | mlog(ML_ERROR, "Userspace stack expected, but " |
820 | "o2cb heartbeat arguments passed to mount\n"); | 841 | "o2cb heartbeat arguments passed to mount\n"); |
821 | return -EINVAL; | 842 | return -EINVAL; |
822 | } | 843 | } |
844 | if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && | ||
845 | !ocfs2_cluster_o2cb_global_heartbeat(osb)) || | ||
846 | ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && | ||
847 | ocfs2_cluster_o2cb_global_heartbeat(osb))) { | ||
848 | mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); | ||
849 | return -EINVAL; | ||
850 | } | ||
823 | } | 851 | } |
824 | 852 | ||
825 | if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { | 853 | if (!(osb->s_mount_opt & hb_enabled)) { |
826 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && | 854 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && |
827 | !ocfs2_userspace_stack(osb)) { | 855 | !ocfs2_userspace_stack(osb)) { |
828 | mlog(ML_ERROR, "Heartbeat has to be started to mount " | 856 | mlog(ML_ERROR, "Heartbeat has to be started to mount " |
@@ -1288,6 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
1288 | { | 1316 | { |
1289 | int status; | 1317 | int status; |
1290 | char *p; | 1318 | char *p; |
1319 | u32 tmp; | ||
1291 | 1320 | ||
1292 | mlog_entry("remount: %d, options: \"%s\"\n", is_remount, | 1321 | mlog_entry("remount: %d, options: \"%s\"\n", is_remount, |
1293 | options ? options : "(none)"); | 1322 | options ? options : "(none)"); |
@@ -1319,7 +1348,10 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
1319 | mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; | 1348 | mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL; |
1320 | break; | 1349 | break; |
1321 | case Opt_hb_none: | 1350 | case Opt_hb_none: |
1322 | mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; | 1351 | mopt->mount_opt |= OCFS2_MOUNT_HB_NONE; |
1352 | break; | ||
1353 | case Opt_hb_global: | ||
1354 | mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL; | ||
1323 | break; | 1355 | break; |
1324 | case Opt_barrier: | 1356 | case Opt_barrier: |
1325 | if (match_int(&args[0], &option)) { | 1357 | if (match_int(&args[0], &option)) { |
@@ -1435,6 +1467,12 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
1435 | case Opt_grpquota: | 1467 | case Opt_grpquota: |
1436 | mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; | 1468 | mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; |
1437 | break; | 1469 | break; |
1470 | case Opt_coherency_buffered: | ||
1471 | mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; | ||
1472 | break; | ||
1473 | case Opt_coherency_full: | ||
1474 | mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; | ||
1475 | break; | ||
1438 | case Opt_acl: | 1476 | case Opt_acl: |
1439 | mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; | 1477 | mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; |
1440 | mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; | 1478 | mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; |
@@ -1474,6 +1512,15 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
1474 | } | 1512 | } |
1475 | } | 1513 | } |
1476 | 1514 | ||
1515 | /* Ensure only one heartbeat mode */ | ||
1516 | tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | | ||
1517 | OCFS2_MOUNT_HB_NONE); | ||
1518 | if (hweight32(tmp) != 1) { | ||
1519 | mlog(ML_ERROR, "Invalid heartbeat mount options\n"); | ||
1520 | status = 0; | ||
1521 | goto bail; | ||
1522 | } | ||
1523 | |||
1477 | status = 1; | 1524 | status = 1; |
1478 | 1525 | ||
1479 | bail: | 1526 | bail: |
@@ -1487,10 +1534,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1487 | unsigned long opts = osb->s_mount_opt; | 1534 | unsigned long opts = osb->s_mount_opt; |
1488 | unsigned int local_alloc_megs; | 1535 | unsigned int local_alloc_megs; |
1489 | 1536 | ||
1490 | if (opts & OCFS2_MOUNT_HB_LOCAL) | 1537 | if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { |
1491 | seq_printf(s, ",_netdev,heartbeat=local"); | 1538 | seq_printf(s, ",_netdev"); |
1492 | else | 1539 | if (opts & OCFS2_MOUNT_HB_LOCAL) |
1493 | seq_printf(s, ",heartbeat=none"); | 1540 | seq_printf(s, ",%s", OCFS2_HB_LOCAL); |
1541 | else | ||
1542 | seq_printf(s, ",%s", OCFS2_HB_GLOBAL); | ||
1543 | } else | ||
1544 | seq_printf(s, ",%s", OCFS2_HB_NONE); | ||
1494 | 1545 | ||
1495 | if (opts & OCFS2_MOUNT_NOINTR) | 1546 | if (opts & OCFS2_MOUNT_NOINTR) |
1496 | seq_printf(s, ",nointr"); | 1547 | seq_printf(s, ",nointr"); |
@@ -1533,6 +1584,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1533 | if (opts & OCFS2_MOUNT_GRPQUOTA) | 1584 | if (opts & OCFS2_MOUNT_GRPQUOTA) |
1534 | seq_printf(s, ",grpquota"); | 1585 | seq_printf(s, ",grpquota"); |
1535 | 1586 | ||
1587 | if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) | ||
1588 | seq_printf(s, ",coherency=buffered"); | ||
1589 | else | ||
1590 | seq_printf(s, ",coherency=full"); | ||
1591 | |||
1536 | if (opts & OCFS2_MOUNT_NOUSERXATTR) | 1592 | if (opts & OCFS2_MOUNT_NOUSERXATTR) |
1537 | seq_printf(s, ",nouser_xattr"); | 1593 | seq_printf(s, ",nouser_xattr"); |
1538 | else | 1594 | else |
@@ -1983,6 +2039,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu | |||
1983 | return 0; | 2039 | return 0; |
1984 | } | 2040 | } |
1985 | 2041 | ||
2042 | /* Make sure entire volume is addressable by our journal. Requires | ||
2043 | osb_clusters_at_boot to be valid and for the journal to have been | ||
2044 | initialized by ocfs2_journal_init(). */ | ||
2045 | static int ocfs2_journal_addressable(struct ocfs2_super *osb) | ||
2046 | { | ||
2047 | int status = 0; | ||
2048 | u64 max_block = | ||
2049 | ocfs2_clusters_to_blocks(osb->sb, | ||
2050 | osb->osb_clusters_at_boot) - 1; | ||
2051 | |||
2052 | /* 32-bit block number is always OK. */ | ||
2053 | if (max_block <= (u32)~0ULL) | ||
2054 | goto out; | ||
2055 | |||
2056 | /* Volume is "huge", so see if our journal is new enough to | ||
2057 | support it. */ | ||
2058 | if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, | ||
2059 | OCFS2_FEATURE_COMPAT_JBD2_SB) && | ||
2060 | jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, | ||
2061 | JBD2_FEATURE_INCOMPAT_64BIT))) { | ||
2062 | mlog(ML_ERROR, "The journal cannot address the entire volume. " | ||
2063 | "Enable the 'block64' journal option with tunefs.ocfs2"); | ||
2064 | status = -EFBIG; | ||
2065 | goto out; | ||
2066 | } | ||
2067 | |||
2068 | out: | ||
2069 | return status; | ||
2070 | } | ||
2071 | |||
1986 | static int ocfs2_initialize_super(struct super_block *sb, | 2072 | static int ocfs2_initialize_super(struct super_block *sb, |
1987 | struct buffer_head *bh, | 2073 | struct buffer_head *bh, |
1988 | int sector_size, | 2074 | int sector_size, |
@@ -1995,6 +2081,7 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1995 | struct ocfs2_journal *journal; | 2081 | struct ocfs2_journal *journal; |
1996 | __le32 uuid_net_key; | 2082 | __le32 uuid_net_key; |
1997 | struct ocfs2_super *osb; | 2083 | struct ocfs2_super *osb; |
2084 | u64 total_blocks; | ||
1998 | 2085 | ||
1999 | mlog_entry_void(); | 2086 | mlog_entry_void(); |
2000 | 2087 | ||
@@ -2053,6 +2140,15 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2053 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", | 2140 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", |
2054 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 2141 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); |
2055 | 2142 | ||
2143 | osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); | ||
2144 | if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { | ||
2145 | mlog(ML_ERROR, "Invalid number of node slots (%u)\n", | ||
2146 | osb->max_slots); | ||
2147 | status = -EINVAL; | ||
2148 | goto bail; | ||
2149 | } | ||
2150 | mlog(0, "max_slots for this device: %u\n", osb->max_slots); | ||
2151 | |||
2056 | ocfs2_orphan_scan_init(osb); | 2152 | ocfs2_orphan_scan_init(osb); |
2057 | 2153 | ||
2058 | status = ocfs2_recovery_init(osb); | 2154 | status = ocfs2_recovery_init(osb); |
@@ -2091,15 +2187,6 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2091 | goto bail; | 2187 | goto bail; |
2092 | } | 2188 | } |
2093 | 2189 | ||
2094 | osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); | ||
2095 | if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { | ||
2096 | mlog(ML_ERROR, "Invalid number of node slots (%u)\n", | ||
2097 | osb->max_slots); | ||
2098 | status = -EINVAL; | ||
2099 | goto bail; | ||
2100 | } | ||
2101 | mlog(0, "max_slots for this device: %u\n", osb->max_slots); | ||
2102 | |||
2103 | osb->slot_recovery_generations = | 2190 | osb->slot_recovery_generations = |
2104 | kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), | 2191 | kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), |
2105 | GFP_KERNEL); | 2192 | GFP_KERNEL); |
@@ -2142,7 +2229,9 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2142 | goto bail; | 2229 | goto bail; |
2143 | } | 2230 | } |
2144 | 2231 | ||
2145 | if (ocfs2_userspace_stack(osb)) { | 2232 | if (ocfs2_clusterinfo_valid(osb)) { |
2233 | osb->osb_stackflags = | ||
2234 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; | ||
2146 | memcpy(osb->osb_cluster_stack, | 2235 | memcpy(osb->osb_cluster_stack, |
2147 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, | 2236 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, |
2148 | OCFS2_STACK_LABEL_LEN); | 2237 | OCFS2_STACK_LABEL_LEN); |
@@ -2207,11 +2296,15 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2207 | goto bail; | 2296 | goto bail; |
2208 | } | 2297 | } |
2209 | 2298 | ||
2210 | if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) | 2299 | total_blocks = ocfs2_clusters_to_blocks(osb->sb, |
2211 | > (u32)~0UL) { | 2300 | le32_to_cpu(di->i_clusters)); |
2212 | mlog(ML_ERROR, "Volume might try to write to blocks beyond " | 2301 | |
2213 | "what jbd can address in 32 bits.\n"); | 2302 | status = generic_check_addressable(osb->sb->s_blocksize_bits, |
2214 | status = -EINVAL; | 2303 | total_blocks); |
2304 | if (status) { | ||
2305 | mlog(ML_ERROR, "Volume too large " | ||
2306 | "to mount safely on this system"); | ||
2307 | status = -EFBIG; | ||
2215 | goto bail; | 2308 | goto bail; |
2216 | } | 2309 | } |
2217 | 2310 | ||
@@ -2373,6 +2466,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) | |||
2373 | goto finally; | 2466 | goto finally; |
2374 | } | 2467 | } |
2375 | 2468 | ||
2469 | /* Now that journal has been initialized, check to make sure | ||
2470 | entire volume is addressable. */ | ||
2471 | status = ocfs2_journal_addressable(osb); | ||
2472 | if (status) | ||
2473 | goto finally; | ||
2474 | |||
2376 | /* If the journal was unmounted cleanly then we don't want to | 2475 | /* If the journal was unmounted cleanly then we don't want to |
2377 | * recover anything. Otherwise, journal_load will do that | 2476 | * recover anything. Otherwise, journal_load will do that |
2378 | * dirty work for us :) */ | 2477 | * dirty work for us :) */ |
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 32499d213fc4..9975457c981f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c | |||
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, | |||
128 | } | 128 | } |
129 | 129 | ||
130 | /* Fast symlinks can't be large */ | 130 | /* Fast symlinks can't be large */ |
131 | len = strlen(target); | 131 | len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); |
132 | link = kzalloc(len + 1, GFP_NOFS); | 132 | link = kzalloc(len + 1, GFP_NOFS); |
133 | if (!link) { | 133 | if (!link) { |
134 | status = -ENOMEM; | 134 | status = -ENOMEM; |
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index bfe7190cdbf1..902efb23b6a6 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c | |||
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, | |||
44 | int type, | 44 | int type, |
45 | u32 slot); | 45 | u32 slot); |
46 | 46 | ||
47 | static inline int is_global_system_inode(int type); | ||
48 | static inline int is_in_system_inode_array(struct ocfs2_super *osb, | ||
49 | int type, | ||
50 | u32 slot); | ||
51 | |||
52 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 47 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
53 | static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; | 48 | static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; |
54 | #endif | 49 | #endif |
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type) | |||
59 | type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; | 54 | type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; |
60 | } | 55 | } |
61 | 56 | ||
62 | static inline int is_in_system_inode_array(struct ocfs2_super *osb, | 57 | static struct inode **get_local_system_inode(struct ocfs2_super *osb, |
63 | int type, | 58 | int type, |
64 | u32 slot) | 59 | u32 slot) |
65 | { | 60 | { |
66 | return slot == osb->slot_num || is_global_system_inode(type); | 61 | int index; |
62 | struct inode **local_system_inodes, **free = NULL; | ||
63 | |||
64 | BUG_ON(slot == OCFS2_INVALID_SLOT); | ||
65 | BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE || | ||
66 | type > OCFS2_LAST_LOCAL_SYSTEM_INODE); | ||
67 | |||
68 | spin_lock(&osb->osb_lock); | ||
69 | local_system_inodes = osb->local_system_inodes; | ||
70 | spin_unlock(&osb->osb_lock); | ||
71 | |||
72 | if (unlikely(!local_system_inodes)) { | ||
73 | local_system_inodes = kzalloc(sizeof(struct inode *) * | ||
74 | NUM_LOCAL_SYSTEM_INODES * | ||
75 | osb->max_slots, | ||
76 | GFP_NOFS); | ||
77 | if (!local_system_inodes) { | ||
78 | mlog_errno(-ENOMEM); | ||
79 | /* | ||
80 | * return NULL here so that ocfs2_get_sytem_file_inodes | ||
81 | * will try to create an inode and use it. We will try | ||
82 | * to initialize local_system_inodes next time. | ||
83 | */ | ||
84 | return NULL; | ||
85 | } | ||
86 | |||
87 | spin_lock(&osb->osb_lock); | ||
88 | if (osb->local_system_inodes) { | ||
89 | /* Someone has initialized it for us. */ | ||
90 | free = local_system_inodes; | ||
91 | local_system_inodes = osb->local_system_inodes; | ||
92 | } else | ||
93 | osb->local_system_inodes = local_system_inodes; | ||
94 | spin_unlock(&osb->osb_lock); | ||
95 | if (unlikely(free)) | ||
96 | kfree(free); | ||
97 | } | ||
98 | |||
99 | index = (slot * NUM_LOCAL_SYSTEM_INODES) + | ||
100 | (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE); | ||
101 | |||
102 | return &local_system_inodes[index]; | ||
67 | } | 103 | } |
68 | 104 | ||
69 | struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, | 105 | struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, |
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, | |||
74 | struct inode **arr = NULL; | 110 | struct inode **arr = NULL; |
75 | 111 | ||
76 | /* avoid the lookup if cached in local system file array */ | 112 | /* avoid the lookup if cached in local system file array */ |
77 | if (is_in_system_inode_array(osb, type, slot)) | 113 | if (is_global_system_inode(type)) { |
78 | arr = &(osb->system_inodes[type]); | 114 | arr = &(osb->global_system_inodes[type]); |
115 | } else | ||
116 | arr = get_local_system_inode(osb, type, slot); | ||
79 | 117 | ||
80 | if (arr && ((inode = *arr) != NULL)) { | 118 | if (arr && ((inode = *arr) != NULL)) { |
81 | /* get a ref in addition to the array ref */ | 119 | /* get a ref in addition to the array ref */ |
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 06fa5e77c40e..67cd43914641 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, | |||
7081 | goto out; | 7081 | goto out; |
7082 | } | 7082 | } |
7083 | 7083 | ||
7084 | if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) | 7084 | if (!indexed) |
7085 | ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); | 7085 | ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); |
7086 | else | 7086 | else |
7087 | ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); | 7087 | ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); |
diff --git a/fs/proc/base.c b/fs/proc/base.c index a1c43e7c8a7b..8e4addaa5424 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
2675 | INF("auxv", S_IRUSR, proc_pid_auxv), | 2675 | INF("auxv", S_IRUSR, proc_pid_auxv), |
2676 | ONE("status", S_IRUGO, proc_pid_status), | 2676 | ONE("status", S_IRUGO, proc_pid_status), |
2677 | ONE("personality", S_IRUSR, proc_pid_personality), | 2677 | ONE("personality", S_IRUSR, proc_pid_personality), |
2678 | INF("limits", S_IRUSR, proc_pid_limits), | 2678 | INF("limits", S_IRUGO, proc_pid_limits), |
2679 | #ifdef CONFIG_SCHED_DEBUG | 2679 | #ifdef CONFIG_SCHED_DEBUG |
2680 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), | 2680 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
2681 | #endif | 2681 | #endif |
@@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = { | |||
3011 | INF("auxv", S_IRUSR, proc_pid_auxv), | 3011 | INF("auxv", S_IRUSR, proc_pid_auxv), |
3012 | ONE("status", S_IRUGO, proc_pid_status), | 3012 | ONE("status", S_IRUGO, proc_pid_status), |
3013 | ONE("personality", S_IRUSR, proc_pid_personality), | 3013 | ONE("personality", S_IRUSR, proc_pid_personality), |
3014 | INF("limits", S_IRUSR, proc_pid_limits), | 3014 | INF("limits", S_IRUGO, proc_pid_limits), |
3015 | #ifdef CONFIG_SCHED_DEBUG | 3015 | #ifdef CONFIG_SCHED_DEBUG |
3016 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), | 3016 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
3017 | #endif | 3017 | #endif |
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index f53505de0712..5cbb81e134ac 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c | |||
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, | |||
170 | int reiserfs_unpack(struct inode *inode, struct file *filp) | 170 | int reiserfs_unpack(struct inode *inode, struct file *filp) |
171 | { | 171 | { |
172 | int retval = 0; | 172 | int retval = 0; |
173 | int depth; | ||
173 | int index; | 174 | int index; |
174 | struct page *page; | 175 | struct page *page; |
175 | struct address_space *mapping; | 176 | struct address_space *mapping; |
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) | |||
188 | /* we need to make sure nobody is changing the file size beneath | 189 | /* we need to make sure nobody is changing the file size beneath |
189 | ** us | 190 | ** us |
190 | */ | 191 | */ |
191 | mutex_lock(&inode->i_mutex); | 192 | reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); |
192 | reiserfs_write_lock(inode->i_sb); | 193 | depth = reiserfs_write_lock_once(inode->i_sb); |
193 | 194 | ||
194 | write_from = inode->i_size & (blocksize - 1); | 195 | write_from = inode->i_size & (blocksize - 1); |
195 | /* if we are on a block boundary, we are already unpacked. */ | 196 | /* if we are on a block boundary, we are already unpacked. */ |
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) | |||
224 | 225 | ||
225 | out: | 226 | out: |
226 | mutex_unlock(&inode->i_mutex); | 227 | mutex_unlock(&inode->i_mutex); |
227 | reiserfs_write_unlock(inode->i_sb); | 228 | reiserfs_write_unlock_once(inode->i_sb, depth); |
228 | return retval; | 229 | return retval; |
229 | } | 230 | } |
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig index e668127c8b2e..2bc24a8c4039 100644 --- a/fs/smbfs/Kconfig +++ b/fs/smbfs/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | config SMB_FS | 1 | config SMB_FS |
2 | tristate "SMB file system support (OBSOLETE, please use CIFS)" | 2 | tristate "SMB file system support (OBSOLETE, please use CIFS)" |
3 | depends on BKL # probably unfixable | ||
3 | depends on INET | 4 | depends on INET |
4 | select NLS | 5 | select NLS |
5 | help | 6 | help |
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 23c1e598792a..442f34ff1af8 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c | |||
@@ -148,6 +148,65 @@ void sysfs_remove_group(struct kobject * kobj, | |||
148 | sysfs_put(sd); | 148 | sysfs_put(sd); |
149 | } | 149 | } |
150 | 150 | ||
151 | /** | ||
152 | * sysfs_merge_group - merge files into a pre-existing attribute group. | ||
153 | * @kobj: The kobject containing the group. | ||
154 | * @grp: The files to create and the attribute group they belong to. | ||
155 | * | ||
156 | * This function returns an error if the group doesn't exist or any of the | ||
157 | * files already exist in that group, in which case none of the new files | ||
158 | * are created. | ||
159 | */ | ||
160 | int sysfs_merge_group(struct kobject *kobj, | ||
161 | const struct attribute_group *grp) | ||
162 | { | ||
163 | struct sysfs_dirent *dir_sd; | ||
164 | int error = 0; | ||
165 | struct attribute *const *attr; | ||
166 | int i; | ||
167 | |||
168 | if (grp) | ||
169 | dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); | ||
170 | else | ||
171 | dir_sd = sysfs_get(kobj->sd); | ||
172 | if (!dir_sd) | ||
173 | return -ENOENT; | ||
174 | |||
175 | for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) | ||
176 | error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); | ||
177 | if (error) { | ||
178 | while (--i >= 0) | ||
179 | sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name); | ||
180 | } | ||
181 | sysfs_put(dir_sd); | ||
182 | |||
183 | return error; | ||
184 | } | ||
185 | EXPORT_SYMBOL_GPL(sysfs_merge_group); | ||
186 | |||
187 | /** | ||
188 | * sysfs_unmerge_group - remove files from a pre-existing attribute group. | ||
189 | * @kobj: The kobject containing the group. | ||
190 | * @grp: The files to remove and the attribute group they belong to. | ||
191 | */ | ||
192 | void sysfs_unmerge_group(struct kobject *kobj, | ||
193 | const struct attribute_group *grp) | ||
194 | { | ||
195 | struct sysfs_dirent *dir_sd; | ||
196 | struct attribute *const *attr; | ||
197 | |||
198 | if (grp) | ||
199 | dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name); | ||
200 | else | ||
201 | dir_sd = sysfs_get(kobj->sd); | ||
202 | if (dir_sd) { | ||
203 | for (attr = grp->attrs; *attr; ++attr) | ||
204 | sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); | ||
205 | sysfs_put(dir_sd); | ||
206 | } | ||
207 | } | ||
208 | EXPORT_SYMBOL_GPL(sysfs_unmerge_group); | ||
209 | |||
151 | 210 | ||
152 | EXPORT_SYMBOL_GPL(sysfs_create_group); | 211 | EXPORT_SYMBOL_GPL(sysfs_create_group); |
153 | EXPORT_SYMBOL_GPL(sysfs_update_group); | 212 | EXPORT_SYMBOL_GPL(sysfs_update_group); |
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 0e0e99bd6bce..f8def3c8ea4c 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | config UDF_FS | 1 | config UDF_FS |
2 | tristate "UDF file system support" | 2 | tristate "UDF file system support" |
3 | depends on BKL # needs serious work to remove | ||
3 | select CRC_ITU_T | 4 | select CRC_ITU_T |
4 | help | 5 | help |
5 | This is the new file system used on some CD-ROMs and DVDs. Say Y if | 6 | This is the new file system used on some CD-ROMs and DVDs. Say Y if |
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index e4f10a40768a..30c8f223253d 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig | |||
@@ -1,6 +1,7 @@ | |||
1 | config UFS_FS | 1 | config UFS_FS |
2 | tristate "UFS file system support (read only)" | 2 | tristate "UFS file system support (read only)" |
3 | depends on BLOCK | 3 | depends on BLOCK |
4 | depends on BKL # probably fixable | ||
4 | help | 5 | help |
5 | BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, | 6 | BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, |
6 | OpenBSD and NeXTstep) use a file system called UFS. Some System V | 7 | OpenBSD and NeXTstep) use a file system called UFS. Some System V |
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index d59c4a65d492..81976ffed7d6 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -668,14 +668,11 @@ xfs_inode_set_reclaim_tag( | |||
668 | xfs_perag_put(pag); | 668 | xfs_perag_put(pag); |
669 | } | 669 | } |
670 | 670 | ||
671 | void | 671 | STATIC void |
672 | __xfs_inode_clear_reclaim_tag( | 672 | __xfs_inode_clear_reclaim( |
673 | xfs_mount_t *mp, | ||
674 | xfs_perag_t *pag, | 673 | xfs_perag_t *pag, |
675 | xfs_inode_t *ip) | 674 | xfs_inode_t *ip) |
676 | { | 675 | { |
677 | radix_tree_tag_clear(&pag->pag_ici_root, | ||
678 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | ||
679 | pag->pag_ici_reclaimable--; | 676 | pag->pag_ici_reclaimable--; |
680 | if (!pag->pag_ici_reclaimable) { | 677 | if (!pag->pag_ici_reclaimable) { |
681 | /* clear the reclaim tag from the perag radix tree */ | 678 | /* clear the reclaim tag from the perag radix tree */ |
@@ -689,6 +686,17 @@ __xfs_inode_clear_reclaim_tag( | |||
689 | } | 686 | } |
690 | } | 687 | } |
691 | 688 | ||
689 | void | ||
690 | __xfs_inode_clear_reclaim_tag( | ||
691 | xfs_mount_t *mp, | ||
692 | xfs_perag_t *pag, | ||
693 | xfs_inode_t *ip) | ||
694 | { | ||
695 | radix_tree_tag_clear(&pag->pag_ici_root, | ||
696 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | ||
697 | __xfs_inode_clear_reclaim(pag, ip); | ||
698 | } | ||
699 | |||
692 | /* | 700 | /* |
693 | * Inodes in different states need to be treated differently, and the return | 701 | * Inodes in different states need to be treated differently, and the return |
694 | * value of xfs_iflush is not sufficient to get this right. The following table | 702 | * value of xfs_iflush is not sufficient to get this right. The following table |
@@ -838,6 +846,7 @@ reclaim: | |||
838 | if (!radix_tree_delete(&pag->pag_ici_root, | 846 | if (!radix_tree_delete(&pag->pag_ici_root, |
839 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) | 847 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) |
840 | ASSERT(0); | 848 | ASSERT(0); |
849 | __xfs_inode_clear_reclaim(pag, ip); | ||
841 | write_unlock(&pag->pag_ici_lock); | 850 | write_unlock(&pag->pag_ici_lock); |
842 | 851 | ||
843 | /* | 852 | /* |
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ed575fb4b495..7e206fc1fa36 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c | |||
@@ -405,9 +405,15 @@ xlog_cil_push( | |||
405 | new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); | 405 | new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); |
406 | new_ctx->ticket = xlog_cil_ticket_alloc(log); | 406 | new_ctx->ticket = xlog_cil_ticket_alloc(log); |
407 | 407 | ||
408 | /* lock out transaction commit, but don't block on background push */ | 408 | /* |
409 | * Lock out transaction commit, but don't block for background pushes | ||
410 | * unless we are well over the CIL space limit. See the definition of | ||
411 | * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic | ||
412 | * used here. | ||
413 | */ | ||
409 | if (!down_write_trylock(&cil->xc_ctx_lock)) { | 414 | if (!down_write_trylock(&cil->xc_ctx_lock)) { |
410 | if (!push_seq) | 415 | if (!push_seq && |
416 | cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) | ||
411 | goto out_free_ticket; | 417 | goto out_free_ticket; |
412 | down_write(&cil->xc_ctx_lock); | 418 | down_write(&cil->xc_ctx_lock); |
413 | } | 419 | } |
@@ -422,7 +428,7 @@ xlog_cil_push( | |||
422 | goto out_skip; | 428 | goto out_skip; |
423 | 429 | ||
424 | /* check for a previously pushed sequence */ | 430 | /* check for a previously pushed sequence */ |
425 | if (push_seq < cil->xc_ctx->sequence) | 431 | if (push_seq && push_seq < cil->xc_ctx->sequence) |
426 | goto out_skip; | 432 | goto out_skip; |
427 | 433 | ||
428 | /* | 434 | /* |
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ced52b98b322..edcdfe01617f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
@@ -426,13 +426,13 @@ struct xfs_cil { | |||
426 | }; | 426 | }; |
427 | 427 | ||
428 | /* | 428 | /* |
429 | * The amount of log space we should the CIL to aggregate is difficult to size. | 429 | * The amount of log space we allow the CIL to aggregate is difficult to size. |
430 | * Whatever we chose we have to make we can get a reservation for the log space | 430 | * Whatever we choose, we have to make sure we can get a reservation for the |
431 | * effectively, that it is large enough to capture sufficient relogging to | 431 | * log space effectively, that it is large enough to capture sufficient |
432 | * reduce log buffer IO significantly, but it is not too large for the log or | 432 | * relogging to reduce log buffer IO significantly, but it is not too large for |
433 | * induces too much latency when writing out through the iclogs. We track both | 433 | * the log or induces too much latency when writing out through the iclogs. We |
434 | * space consumed and the number of vectors in the checkpoint context, so we | 434 | * track both space consumed and the number of vectors in the checkpoint |
435 | * need to decide which to use for limiting. | 435 | * context, so we need to decide which to use for limiting. |
436 | * | 436 | * |
437 | * Every log buffer we write out during a push needs a header reserved, which | 437 | * Every log buffer we write out during a push needs a header reserved, which |
438 | * is at least one sector and more for v2 logs. Hence we need a reservation of | 438 | * is at least one sector and more for v2 logs. Hence we need a reservation of |
@@ -459,16 +459,21 @@ struct xfs_cil { | |||
459 | * checkpoint transaction ticket is specific to the checkpoint context, rather | 459 | * checkpoint transaction ticket is specific to the checkpoint context, rather |
460 | * than the CIL itself. | 460 | * than the CIL itself. |
461 | * | 461 | * |
462 | * With dynamic reservations, we can basically make up arbitrary limits for the | 462 | * With dynamic reservations, we can effectively make up arbitrary limits for |
463 | * checkpoint size so long as they don't violate any other size rules. Hence | 463 | * the checkpoint size so long as they don't violate any other size rules. |
464 | * the initial maximum size for the checkpoint transaction will be set to a | 464 | * Recovery imposes a rule that no transaction exceed half the log, so we are |
465 | * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit | 465 | * limited by that. Furthermore, the log transaction reservation subsystem |
466 | * right now based on the latency of writing out a large amount of data through | 466 | * tries to keep 25% of the log free, so we need to keep below that limit or we |
467 | * the circular iclog buffers. | 467 | * risk running out of free log space to start any new transactions. |
468 | * | ||
469 | * In order to keep background CIL push efficient, we will set a lower | ||
470 | * threshold at which background pushing is attempted without blocking current | ||
471 | * transaction commits. A separate, higher bound defines when CIL pushes are | ||
472 | * enforced to ensure we stay within our maximum checkpoint size bounds. Both | ||
473 | * thresholds still give us plenty of space for aggregation on large logs. | ||
468 | */ | 474 | */ |
/*
 * CIL space thresholds, derived from the log size: 1/8 of the log, and a
 * higher hard limit of 3/16 of the log.  The macro argument is parenthesized
 * so that any pointer expression may be passed.
 */
#define XLOG_CIL_SPACE_LIMIT(log)	((log)->l_logsize >> 3)
#define XLOG_CIL_HARD_SPACE_LIMIT(log)	(3 * ((log)->l_logsize >> 4))
472 | 477 | ||
473 | /* | 478 | /* |
474 | * The reservation head lsn is not made up of a cycle number and block number. | 479 | * The reservation head lsn is not made up of a cycle number and block number. |