diff options
author | Boaz Harrosh <bharrosh@panasas.com> | 2011-02-03 10:53:25 -0500 |
---|---|---|
committer | Boaz Harrosh <bharrosh@panasas.com> | 2011-03-15 09:02:51 -0400 |
commit | 1cea312ad49d9cb964179a784fedb1fcfe396283 (patch) | |
tree | 27c45af006b48b1a079698605ea9007398f652b5 /fs/exofs | |
parent | 9ed96484311b89360b80a4181d856cbdb21630fd (diff) |
exofs: Write sbi->s_nextid as part of the Create command
Before when creating a new inode, we'd set the sb->s_dirt flag,
and sometime later the system would write out s_nextid as part
of the sb_info. Also on inode sync we would force the sb sync
as well.
Define the s_nextid as a new partition attribute and set it
every time we create a new object.
At mount we read it from its new place.
We now never set sb->s_dirt anywhere in exofs. write_super
is actually never called. The call to exofs_write_super from
exofs_put_super is also removed because the VFS always calls
->sync_fs twice before calling ->put_super.
To stay backward-and-forward compatible we also write the old
s_nextid in the super_block object at unmount, and support a
zero-length attribute on mount.
This also fixes a BUG where in layouts when group_width was not
a divisor of EXOFS_SUPER_ID (0x10000) the s_nextid was not read
from the device it was written to. This happened because of the sliding
window layout trick: the read was always done from device 0, but the
write was done via the raid engine, which might slide the device view.
Now we read and write through the raid engine.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs')
-rw-r--r-- | fs/exofs/common.h | 18 | ||||
-rw-r--r-- | fs/exofs/exofs.h | 4 | ||||
-rw-r--r-- | fs/exofs/file.c | 11 | ||||
-rw-r--r-- | fs/exofs/inode.c | 4 | ||||
-rw-r--r-- | fs/exofs/super.c | 135 |
5 files changed, 141 insertions, 31 deletions
diff --git a/fs/exofs/common.h b/fs/exofs/common.h index f0d520312d8b..5e74ad3d4009 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h | |||
@@ -53,10 +53,14 @@ | |||
53 | #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ | 53 | #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ |
54 | 54 | ||
55 | /* exofs Application specific page/attribute */ | 55 | /* exofs Application specific page/attribute */ |
56 | /* Inode attrs */ | ||
56 | # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) | 57 | # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) |
57 | # define EXOFS_ATTR_INODE_DATA 1 | 58 | # define EXOFS_ATTR_INODE_DATA 1 |
58 | # define EXOFS_ATTR_INODE_FILE_LAYOUT 2 | 59 | # define EXOFS_ATTR_INODE_FILE_LAYOUT 2 |
59 | # define EXOFS_ATTR_INODE_DIR_LAYOUT 3 | 60 | # define EXOFS_ATTR_INODE_DIR_LAYOUT 3 |
61 | /* Partition attrs */ | ||
62 | # define EXOFS_APAGE_SB_DATA (0xF0000000U + 3) | ||
63 | # define EXOFS_ATTR_SB_STATS 1 | ||
60 | 64 | ||
61 | /* | 65 | /* |
62 | * The maximum number of files we can have is limited by the size of the | 66 | * The maximum number of files we can have is limited by the size of the |
@@ -86,8 +90,8 @@ enum { | |||
86 | */ | 90 | */ |
87 | enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; | 91 | enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; |
88 | struct exofs_fscb { | 92 | struct exofs_fscb { |
89 | __le64 s_nextid; /* Highest object ID used */ | 93 | __le64 s_nextid; /* Only used after mkfs */ |
90 | __le64 s_numfiles; /* Number of files on fs */ | 94 | __le64 s_numfiles; /* Only used after mkfs */ |
91 | __le32 s_version; /* == EXOFS_FSCB_VER */ | 95 | __le32 s_version; /* == EXOFS_FSCB_VER */ |
92 | __le16 s_magic; /* Magic signature */ | 96 | __le16 s_magic; /* Magic signature */ |
93 | __le16 s_newfs; /* Non-zero if this is a new fs */ | 97 | __le16 s_newfs; /* Non-zero if this is a new fs */ |
@@ -98,6 +102,16 @@ struct exofs_fscb { | |||
98 | } __packed; | 102 | } __packed; |
99 | 103 | ||
100 | /* | 104 | /* |
105 | * This struct is set on the FS partition's attributes. | ||
106 | * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together | ||
107 | * with the create command, to atomically persist the sb writeable information. | ||
108 | */ | ||
109 | struct exofs_sb_stats { | ||
110 | __le64 s_nextid; /* Highest object ID used */ | ||
111 | __le64 s_numfiles; /* Number of files on fs */ | ||
112 | } __packed; | ||
113 | |||
114 | /* | ||
101 | * Describes the raid used in the FS. It is part of the device table. | 115 | * Describes the raid used in the FS. It is part of the device table. |
102 | * This here is taken from the pNFS-objects definition. In exofs we | 116 | * This here is taken from the pNFS-objects definition. In exofs we |
103 | * use one raid policy through-out the filesystem. (NOTE: the funny | 117 | * use one raid policy through-out the filesystem. (NOTE: the funny |
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 99fcb9126a97..c965806c2821 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h | |||
@@ -77,7 +77,7 @@ struct exofs_layout { | |||
77 | * our extension to the in-memory superblock | 77 | * our extension to the in-memory superblock |
78 | */ | 78 | */ |
79 | struct exofs_sb_info { | 79 | struct exofs_sb_info { |
80 | struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ | 80 | struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ |
81 | int s_timeout; /* timeout for OSD operations */ | 81 | int s_timeout; /* timeout for OSD operations */ |
82 | uint64_t s_nextid; /* highest object ID used */ | 82 | uint64_t s_nextid; /* highest object ID used */ |
83 | uint32_t s_numfiles; /* number of files on fs */ | 83 | uint32_t s_numfiles; /* number of files on fs */ |
@@ -281,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, | |||
281 | struct inode *); | 281 | struct inode *); |
282 | 282 | ||
283 | /* super.c */ | 283 | /* super.c */ |
284 | int exofs_sync_fs(struct super_block *sb, int wait); | 284 | int exofs_sbi_write_stats(struct exofs_sb_info *sbi); |
285 | 285 | ||
286 | /********************* | 286 | /********************* |
287 | * operation vectors * | 287 | * operation vectors * |
diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 4c0d6bac9143..45ca323d8363 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c | |||
@@ -45,17 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp) | |||
45 | static int exofs_file_fsync(struct file *filp, int datasync) | 45 | static int exofs_file_fsync(struct file *filp, int datasync) |
46 | { | 46 | { |
47 | int ret; | 47 | int ret; |
48 | struct inode *inode = filp->f_mapping->host; | ||
49 | struct super_block *sb; | ||
50 | |||
51 | ret = sync_inode_metadata(inode, 1); | ||
52 | |||
53 | /* This is a good place to write the sb */ | ||
54 | /* TODO: Sechedule an sb-sync on create */ | ||
55 | sb = inode->i_sb; | ||
56 | if (sb->s_dirt) | ||
57 | exofs_sync_fs(sb, 1); | ||
58 | 48 | ||
49 | ret = sync_inode_metadata(filp->f_mapping->host, 1); | ||
59 | return ret; | 50 | return ret; |
60 | } | 51 | } |
61 | 52 | ||
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 681b3cb9b4d8..0c713cfbebf0 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -1102,6 +1102,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) | |||
1102 | } | 1102 | } |
1103 | return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; | 1103 | return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; |
1104 | } | 1104 | } |
1105 | |||
1105 | /* | 1106 | /* |
1106 | * Callback function from exofs_new_inode(). The important thing is that we | 1107 | * Callback function from exofs_new_inode(). The important thing is that we |
1107 | * set the obj_created flag so that other methods know that the object exists on | 1108 | * set the obj_created flag so that other methods know that the object exists on |
@@ -1160,7 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1160 | sbi = sb->s_fs_info; | 1161 | sbi = sb->s_fs_info; |
1161 | 1162 | ||
1162 | inode->i_mapping->backing_dev_info = sb->s_bdi; | 1163 | inode->i_mapping->backing_dev_info = sb->s_bdi; |
1163 | sb->s_dirt = 1; | ||
1164 | inode_init_owner(inode, dir, mode); | 1164 | inode_init_owner(inode, dir, mode); |
1165 | inode->i_ino = sbi->s_nextid++; | 1165 | inode->i_ino = sbi->s_nextid++; |
1166 | inode->i_blkbits = EXOFS_BLKSHIFT; | 1166 | inode->i_blkbits = EXOFS_BLKSHIFT; |
@@ -1171,6 +1171,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1171 | spin_unlock(&sbi->s_next_gen_lock); | 1171 | spin_unlock(&sbi->s_next_gen_lock); |
1172 | insert_inode_hash(inode); | 1172 | insert_inode_hash(inode); |
1173 | 1173 | ||
1174 | exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ | ||
1175 | |||
1174 | mark_inode_dirty(inode); | 1176 | mark_inode_dirty(inode); |
1175 | 1177 | ||
1176 | ret = exofs_get_io_state(&sbi->layout, &ios); | 1178 | ret = exofs_get_io_state(&sbi->layout, &ios); |
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 474989eeb7d6..5eb0851e5481 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -213,6 +213,101 @@ static void destroy_inodecache(void) | |||
213 | static const struct super_operations exofs_sops; | 213 | static const struct super_operations exofs_sops; |
214 | static const struct export_operations exofs_export_ops; | 214 | static const struct export_operations exofs_export_ops; |
215 | 215 | ||
216 | static const struct osd_attr g_attr_sb_stats = ATTR_DEF( | ||
217 | EXOFS_APAGE_SB_DATA, | ||
218 | EXOFS_ATTR_SB_STATS, | ||
219 | sizeof(struct exofs_sb_stats)); | ||
220 | |||
221 | static int __sbi_read_stats(struct exofs_sb_info *sbi) | ||
222 | { | ||
223 | struct osd_attr attrs[] = { | ||
224 | [0] = g_attr_sb_stats, | ||
225 | }; | ||
226 | struct exofs_io_state *ios; | ||
227 | int ret; | ||
228 | |||
229 | ret = exofs_get_io_state(&sbi->layout, &ios); | ||
230 | if (unlikely(ret)) { | ||
231 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); | ||
232 | return ret; | ||
233 | } | ||
234 | |||
235 | ios->cred = sbi->s_cred; | ||
236 | |||
237 | ios->in_attr = attrs; | ||
238 | ios->in_attr_len = ARRAY_SIZE(attrs); | ||
239 | |||
240 | ret = exofs_sbi_read(ios); | ||
241 | if (unlikely(ret)) { | ||
242 | EXOFS_ERR("Error reading super_block stats => %d\n", ret); | ||
243 | goto out; | ||
244 | } | ||
245 | |||
246 | ret = extract_attr_from_ios(ios, &attrs[0]); | ||
247 | if (ret) { | ||
248 | EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); | ||
249 | goto out; | ||
250 | } | ||
251 | if (attrs[0].len) { | ||
252 | struct exofs_sb_stats *ess; | ||
253 | |||
254 | if (unlikely(attrs[0].len != sizeof(*ess))) { | ||
255 | EXOFS_ERR("%s: Wrong version of exofs_sb_stats " | ||
256 | "size(%d) != expected(%zd)\n", | ||
257 | __func__, attrs[0].len, sizeof(*ess)); | ||
258 | goto out; | ||
259 | } | ||
260 | |||
261 | ess = attrs[0].val_ptr; | ||
262 | sbi->s_nextid = le64_to_cpu(ess->s_nextid); | ||
263 | sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); | ||
264 | } | ||
265 | |||
266 | out: | ||
267 | exofs_put_io_state(ios); | ||
268 | return ret; | ||
269 | } | ||
270 | |||
271 | static void stats_done(struct exofs_io_state *ios, void *p) | ||
272 | { | ||
273 | exofs_put_io_state(ios); | ||
274 | /* Good thanks nothing to do anymore */ | ||
275 | } | ||
276 | |||
277 | /* Asynchronously write the stats attribute */ | ||
278 | int exofs_sbi_write_stats(struct exofs_sb_info *sbi) | ||
279 | { | ||
280 | struct osd_attr attrs[] = { | ||
281 | [0] = g_attr_sb_stats, | ||
282 | }; | ||
283 | struct exofs_io_state *ios; | ||
284 | int ret; | ||
285 | |||
286 | ret = exofs_get_io_state(&sbi->layout, &ios); | ||
287 | if (unlikely(ret)) { | ||
288 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); | ||
289 | return ret; | ||
290 | } | ||
291 | |||
292 | sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); | ||
293 | sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); | ||
294 | attrs[0].val_ptr = &sbi->s_ess; | ||
295 | |||
296 | ios->cred = sbi->s_cred; | ||
297 | ios->done = stats_done; | ||
298 | ios->private = sbi; | ||
299 | ios->out_attr = attrs; | ||
300 | ios->out_attr_len = ARRAY_SIZE(attrs); | ||
301 | |||
302 | ret = exofs_sbi_write(ios); | ||
303 | if (unlikely(ret)) { | ||
304 | EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); | ||
305 | exofs_put_io_state(ios); | ||
306 | } | ||
307 | |||
308 | return ret; | ||
309 | } | ||
310 | |||
216 | /* | 311 | /* |
217 | * Write the superblock to the OSD | 312 | * Write the superblock to the OSD |
218 | */ | 313 | */ |
@@ -223,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
223 | struct exofs_io_state *ios; | 318 | struct exofs_io_state *ios; |
224 | int ret = -ENOMEM; | 319 | int ret = -ENOMEM; |
225 | 320 | ||
226 | lock_super(sb); | 321 | fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); |
322 | if (unlikely(!fscb)) | ||
323 | return -ENOMEM; | ||
324 | |||
227 | sbi = sb->s_fs_info; | 325 | sbi = sb->s_fs_info; |
228 | fscb = &sbi->s_fscb; | ||
229 | 326 | ||
327 | /* NOTE: We no longer dirty the super_block anywhere in exofs. The | ||
328 | * reason we write the fscb here on unmount is so we can stay backwards | ||
329 | * compatible with fscb->s_version == 1. (What we are not compatible | ||
330 | * with is if a new version FS crashed and then we try to mount an old | ||
331 | * version). Otherwise the exofs_fscb is read-only from mkfs time. All | ||
332 | * the writeable info is set in exofs_sbi_write_stats() above. | ||
333 | */ | ||
230 | ret = exofs_get_io_state(&sbi->layout, &ios); | 334 | ret = exofs_get_io_state(&sbi->layout, &ios); |
231 | if (ret) | 335 | if (unlikely(ret)) |
232 | goto out; | 336 | goto out; |
233 | 337 | ||
234 | /* Note: We only write the changing part of the fscb. .i.e upto the | 338 | lock_super(sb); |
235 | * the fscb->s_dev_table_oid member. There is no read-modify-write | 339 | |
236 | * here. | ||
237 | */ | ||
238 | ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); | 340 | ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); |
239 | memset(fscb, 0, ios->length); | 341 | memset(fscb, 0, ios->length); |
240 | fscb->s_nextid = cpu_to_le64(sbi->s_nextid); | 342 | fscb->s_nextid = cpu_to_le64(sbi->s_nextid); |
@@ -249,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
249 | ios->cred = sbi->s_cred; | 351 | ios->cred = sbi->s_cred; |
250 | 352 | ||
251 | ret = exofs_sbi_write(ios); | 353 | ret = exofs_sbi_write(ios); |
252 | if (unlikely(ret)) { | 354 | if (unlikely(ret)) |
253 | EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); | 355 | EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); |
254 | goto out; | 356 | else |
255 | } | 357 | sb->s_dirt = 0; |
256 | sb->s_dirt = 0; | ||
257 | 358 | ||
359 | |||
360 | unlock_super(sb); | ||
258 | out: | 361 | out: |
259 | EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); | 362 | EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); |
260 | exofs_put_io_state(ios); | 363 | exofs_put_io_state(ios); |
261 | unlock_super(sb); | 364 | kfree(fscb); |
262 | return ret; | 365 | return ret; |
263 | } | 366 | } |
264 | 367 | ||
@@ -302,9 +405,6 @@ static void exofs_put_super(struct super_block *sb) | |||
302 | int num_pend; | 405 | int num_pend; |
303 | struct exofs_sb_info *sbi = sb->s_fs_info; | 406 | struct exofs_sb_info *sbi = sb->s_fs_info; |
304 | 407 | ||
305 | if (sb->s_dirt) | ||
306 | exofs_write_super(sb); | ||
307 | |||
308 | /* make sure there are no pending commands */ | 408 | /* make sure there are no pending commands */ |
309 | for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; | 409 | for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; |
310 | num_pend = atomic_read(&sbi->s_curr_pending)) { | 410 | num_pend = atomic_read(&sbi->s_curr_pending)) { |
@@ -629,6 +729,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
629 | goto free_sbi; | 729 | goto free_sbi; |
630 | 730 | ||
631 | sb->s_magic = le16_to_cpu(fscb.s_magic); | 731 | sb->s_magic = le16_to_cpu(fscb.s_magic); |
732 | /* NOTE: we read below to be backward compatible with old versions */ | ||
632 | sbi->s_nextid = le64_to_cpu(fscb.s_nextid); | 733 | sbi->s_nextid = le64_to_cpu(fscb.s_nextid); |
633 | sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); | 734 | sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); |
634 | 735 | ||
@@ -639,7 +740,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
639 | ret = -EINVAL; | 740 | ret = -EINVAL; |
640 | goto free_sbi; | 741 | goto free_sbi; |
641 | } | 742 | } |
642 | if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { | 743 | if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { |
643 | EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", | 744 | EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", |
644 | EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); | 745 | EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); |
645 | ret = -EINVAL; | 746 | ret = -EINVAL; |
@@ -657,6 +758,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
657 | goto free_sbi; | 758 | goto free_sbi; |
658 | } | 759 | } |
659 | 760 | ||
761 | __sbi_read_stats(sbi); | ||
762 | |||
660 | /* set up operation vectors */ | 763 | /* set up operation vectors */ |
661 | sbi->bdi.ra_pages = __ra_pages(&sbi->layout); | 764 | sbi->bdi.ra_pages = __ra_pages(&sbi->layout); |
662 | sb->s_bdi = &sbi->bdi; | 765 | sb->s_bdi = &sbi->bdi; |