aboutsummaryrefslogtreecommitdiffstats
path: root/fs/exofs/inode.c
diff options
context:
space:
mode:
authorBoaz Harrosh <bharrosh@panasas.com>2010-02-01 06:35:51 -0500
committerBoaz Harrosh <bharrosh@panasas.com>2010-02-28 06:43:08 -0500
commit5d952b8391692553c31e620a92d6e09262a9a307 (patch)
treeb3a1a0490fc98b6304685d64bb4774235ec94a2d /fs/exofs/inode.c
parentd9c740d2253e75db8cef8f87a3125c450f3ebd82 (diff)
exofs: RAID0 support
We now support striping over mirror devices. Including variable sized stripe_unit. Some limits: * stripe_unit must be a multiple of PAGE_SIZE * stripe_unit * stripe_count is maximum upto 32-bit (4Gb) Tested RAID0 over mirrors, RAID0 only, mirrors only. All check. Design notes: * I'm not using a vectored raid-engine mechanism yet. Following the pnfs-objects-layout data-map structure, "Mirror" is just a private case of "group_width" == 1, and RAID0 is a private case of "Mirrors" == 1. The performance lose of the general case over the particular special case optimization is totally negligible, also considering the extra code size. * In general I added a prepare_stripes() stage that divides the to-be-io pages to the participating devices, the previous exofs_ios_write/read, now becomes _write/read_mirrors and a new write/read upper layer loops on all devices calling _write/read_mirrors. Effectively the prepare_stripes stage is the all secret. Also truncate need fixing to accommodate for striping. * In a RAID0 arrangement, in a regular usage scenario, if all inode layouts will start at the same device, the small files fill up the first device and the later devices stay empty, the farther the device the emptier it is. To fix that, each inode will start at a different stripe_unit, according to it's obj_id modulus number-of-stripe-units. And will then span all stripe-units in the same incrementing order wrapping back to the beginning of the device table. We call it a stripe-units moving window. Special consideration was taken to keep all devices in a mirror arrangement identical. So a broken osd-device could just be cloned from one of the mirrors and no FS scrubbing is needed. (We do that by rotating stripe-unit at a time and not a single device at a time.) TODO: We no longer verify object_length == inode->i_size in exofs_iget. (since i_size is stripped on multiple objects now). I should introduce a multiple-device attribute reading, and use it in exofs_iget. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs/inode.c')
-rw-r--r--fs/exofs/inode.c26
1 files changed, 4 insertions, 22 deletions
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 0163546ba05a..2b3163ea56eb 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
869 0); 869 0);
870 870
871/* 871/*
872 * Read an inode from the OSD, and return it as is. We also return the size 872 * Read the Linux inode info from the OSD, and return it as is. In exofs the
873 * attribute in the 'obj_size' argument. 873 * inode info is in an application specific page/attribute of the osd-object.
874 */ 874 */
875static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 875static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
876 struct exofs_fcb *inode, uint64_t *obj_size) 876 struct exofs_fcb *inode)
877{ 877{
878 struct exofs_sb_info *sbi = sb->s_fs_info; 878 struct exofs_sb_info *sbi = sb->s_fs_info;
879 struct osd_attr attrs[] = { 879 struct osd_attr attrs[] = {
880 [0] = g_attr_inode_data, 880 [0] = g_attr_inode_data,
881 [1] = g_attr_inode_file_layout, 881 [1] = g_attr_inode_file_layout,
882 [2] = g_attr_inode_dir_layout, 882 [2] = g_attr_inode_dir_layout,
883 [3] = g_attr_logical_length,
884 }; 883 };
885 struct exofs_io_state *ios; 884 struct exofs_io_state *ios;
886 struct exofs_on_disk_inode_layout *layout; 885 struct exofs_on_disk_inode_layout *layout;
@@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
944 } 943 }
945 } 944 }
946 945
947 *obj_size = ~0;
948 ret = extract_attr_from_ios(ios, &attrs[3]);
949 if (ret) {
950 EXOFS_ERR("%s: extract_attr of logical_length failed\n",
951 __func__);
952 goto out;
953 }
954 *obj_size = get_unaligned_be64(attrs[3].val_ptr);
955
956out: 946out:
957 exofs_put_io_state(ios); 947 exofs_put_io_state(ios);
958 return ret; 948 return ret;
@@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
971 struct exofs_i_info *oi; 961 struct exofs_i_info *oi;
972 struct exofs_fcb fcb; 962 struct exofs_fcb fcb;
973 struct inode *inode; 963 struct inode *inode;
974 uint64_t obj_size;
975 int ret; 964 int ret;
976 965
977 inode = iget_locked(sb, ino); 966 inode = iget_locked(sb, ino);
@@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
983 __oi_init(oi); 972 __oi_init(oi);
984 973
985 /* read the inode from the osd */ 974 /* read the inode from the osd */
986 ret = exofs_get_inode(sb, oi, &fcb, &obj_size); 975 ret = exofs_get_inode(sb, oi, &fcb);
987 if (ret) 976 if (ret)
988 goto bad_inode; 977 goto bad_inode;
989 978
@@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1004 inode->i_blkbits = EXOFS_BLKSHIFT; 993 inode->i_blkbits = EXOFS_BLKSHIFT;
1005 inode->i_generation = le32_to_cpu(fcb.i_generation); 994 inode->i_generation = le32_to_cpu(fcb.i_generation);
1006 995
1007 if ((inode->i_size != obj_size) &&
1008 (!exofs_inode_is_fast_symlink(inode))) {
1009 EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
1010 inode->i_size, _LLU(obj_size));
1011 /* FIXME: call exofs_inode_recovery() */
1012 }
1013
1014 oi->i_dir_start_lookup = 0; 996 oi->i_dir_start_lookup = 0;
1015 997
1016 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { 998 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {