aboutsummaryrefslogtreecommitdiffstats
path: root/fs/exofs
diff options
context:
space:
mode:
authorBoaz Harrosh <bharrosh@panasas.com>2011-02-03 10:53:25 -0500
committerBoaz Harrosh <bharrosh@panasas.com>2011-03-15 09:02:51 -0400
commit1cea312ad49d9cb964179a784fedb1fcfe396283 (patch)
tree27c45af006b48b1a079698605ea9007398f652b5 /fs/exofs
parent9ed96484311b89360b80a4181d856cbdb21630fd (diff)
exofs: Write sbi->s_nextid as part of the Create command
Previously, when creating a new inode, we would set the sb->s_dirt flag, and sometime later the system would write out s_nextid as part of the sb_info. Also, on inode sync we would force the sb sync as well. Define the s_nextid as a new partition attribute and set it every time we create a new object. At mount we read it from its new place. We now never set sb->s_dirt anywhere in exofs, so write_super is actually never called. The call to exofs_write_super from exofs_put_super is also removed, because the VFS always calls ->sync_fs (twice) before calling ->put_super. To stay backward-and-forward compatible we also write the old s_nextid in the super_block object at unmount, and support a zero-length attribute on mount. This also fixes a BUG where, in layouts where group_width was not a divisor of EXOFS_SUPER_ID (0x10000), the s_nextid was not read from the device it was written to. This happened because of the sliding window layout trick: the read was always done from device 0, but the write was done via the raid engine, which might slide the device view. Now we both read and write through the raid engine. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Diffstat (limited to 'fs/exofs')
-rw-r--r--fs/exofs/common.h18
-rw-r--r--fs/exofs/exofs.h4
-rw-r--r--fs/exofs/file.c11
-rw-r--r--fs/exofs/inode.c4
-rw-r--r--fs/exofs/super.c135
5 files changed, 141 insertions, 31 deletions
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8..5e74ad3d400 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
54 54
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56/* Inode attrs */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 58# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 59# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 60# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
61/* Partition attrs */
62# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
63# define EXOFS_ATTR_SB_STATS 1
60 64
61/* 65/*
62 * The maximum number of files we can have is limited by the size of the 66 * The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
86 */ 90 */
87enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; 91enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
88struct exofs_fscb { 92struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */ 93 __le64 s_nextid; /* Only used after mkfs */
90 __le64 s_numfiles; /* Number of files on fs */ 94 __le64 s_numfiles; /* Only used after mkfs */
91 __le32 s_version; /* == EXOFS_FSCB_VER */ 95 __le32 s_version; /* == EXOFS_FSCB_VER */
92 __le16 s_magic; /* Magic signature */ 96 __le16 s_magic; /* Magic signature */
93 __le16 s_newfs; /* Non-zero if this is a new fs */ 97 __le16 s_newfs; /* Non-zero if this is a new fs */
@@ -98,6 +102,16 @@ struct exofs_fscb {
98} __packed; 102} __packed;
99 103
100/* 104/*
105 * This struct is set on the FS partition's attributes.
106 * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
107 * with the create command, to atomically persist the sb writeable information.
108 */
109struct exofs_sb_stats {
110 __le64 s_nextid; /* Highest object ID used */
111 __le64 s_numfiles; /* Number of files on fs */
112} __packed;
113
114/*
101 * Describes the raid used in the FS. It is part of the device table. 115 * Describes the raid used in the FS. It is part of the device table.
102 * This here is taken from the pNFS-objects definition. In exofs we 116 * This here is taken from the pNFS-objects definition. In exofs we
103 * use one raid policy through-out the filesystem. (NOTE: the funny 117 * use one raid policy through-out the filesystem. (NOTE: the funny
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 99fcb9126a9..c965806c282 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -77,7 +77,7 @@ struct exofs_layout {
77 * our extension to the in-memory superblock 77 * our extension to the in-memory superblock
78 */ 78 */
79struct exofs_sb_info { 79struct exofs_sb_info {
80 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 81 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 82 uint64_t s_nextid; /* highest object ID used */
83 uint32_t s_numfiles; /* number of files on fs */ 83 uint32_t s_numfiles; /* number of files on fs */
@@ -281,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
281 struct inode *); 281 struct inode *);
282 282
283/* super.c */ 283/* super.c */
284int exofs_sync_fs(struct super_block *sb, int wait); 284int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
285 285
286/********************* 286/*********************
287 * operation vectors * 287 * operation vectors *
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 4c0d6bac914..45ca323d836 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,17 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
45static int exofs_file_fsync(struct file *filp, int datasync) 45static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host;
49 struct super_block *sb;
50
51 ret = sync_inode_metadata(inode, 1);
52
53 /* This is a good place to write the sb */
54 /* TODO: Sechedule an sb-sync on create */
55 sb = inode->i_sb;
56 if (sb->s_dirt)
57 exofs_sync_fs(sb, 1);
58 48
49 ret = sync_inode_metadata(filp->f_mapping->host, 1);
59 return ret; 50 return ret;
60} 51}
61 52
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 681b3cb9b4d..0c713cfbebf 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1102,6 +1102,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1102 } 1102 }
1103 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1103 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1104} 1104}
1105
1105/* 1106/*
1106 * Callback function from exofs_new_inode(). The important thing is that we 1107 * Callback function from exofs_new_inode(). The important thing is that we
1107 * set the obj_created flag so that other methods know that the object exists on 1108 * set the obj_created flag so that other methods know that the object exists on
@@ -1160,7 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1160 sbi = sb->s_fs_info; 1161 sbi = sb->s_fs_info;
1161 1162
1162 inode->i_mapping->backing_dev_info = sb->s_bdi; 1163 inode->i_mapping->backing_dev_info = sb->s_bdi;
1163 sb->s_dirt = 1;
1164 inode_init_owner(inode, dir, mode); 1164 inode_init_owner(inode, dir, mode);
1165 inode->i_ino = sbi->s_nextid++; 1165 inode->i_ino = sbi->s_nextid++;
1166 inode->i_blkbits = EXOFS_BLKSHIFT; 1166 inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1171,6 +1171,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1171 spin_unlock(&sbi->s_next_gen_lock); 1171 spin_unlock(&sbi->s_next_gen_lock);
1172 insert_inode_hash(inode); 1172 insert_inode_hash(inode);
1173 1173
1174 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1175
1174 mark_inode_dirty(inode); 1176 mark_inode_dirty(inode);
1175 1177
1176 ret = exofs_get_io_state(&sbi->layout, &ios); 1178 ret = exofs_get_io_state(&sbi->layout, &ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 474989eeb7d..5eb0851e548 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -213,6 +213,101 @@ static void destroy_inodecache(void)
213static const struct super_operations exofs_sops; 213static const struct super_operations exofs_sops;
214static const struct export_operations exofs_export_ops; 214static const struct export_operations exofs_export_ops;
215 215
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA,
218 EXOFS_ATTR_SB_STATS,
219 sizeof(struct exofs_sb_stats));
220
221static int __sbi_read_stats(struct exofs_sb_info *sbi)
222{
223 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats,
225 };
226 struct exofs_io_state *ios;
227 int ret;
228
229 ret = exofs_get_io_state(&sbi->layout, &ios);
230 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
232 return ret;
233 }
234
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs);
239
240 ret = exofs_sbi_read(ios);
241 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out;
244 }
245
246 ret = extract_attr_from_ios(ios, &attrs[0]);
247 if (ret) {
248 EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
249 goto out;
250 }
251 if (attrs[0].len) {
252 struct exofs_sb_stats *ess;
253
254 if (unlikely(attrs[0].len != sizeof(*ess))) {
255 EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
256 "size(%d) != expected(%zd)\n",
257 __func__, attrs[0].len, sizeof(*ess));
258 goto out;
259 }
260
261 ess = attrs[0].val_ptr;
262 sbi->s_nextid = le64_to_cpu(ess->s_nextid);
263 sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
264 }
265
266out:
267 exofs_put_io_state(ios);
268 return ret;
269}
270
/*
 * "done" callback for the stats write: nothing to check or report, the
 * io_state just has to be released. @p is not used.
 */
static void stats_done(struct exofs_io_state *ios, void *p)
{
	exofs_put_io_state(ios);
}
276
277/* Asynchronously write the stats attribute */
278int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
279{
280 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats,
282 };
283 struct exofs_io_state *ios;
284 int ret;
285
286 ret = exofs_get_io_state(&sbi->layout, &ios);
287 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
289 return ret;
290 }
291
292 sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess;
295
296 ios->cred = sbi->s_cred;
297 ios->done = stats_done;
298 ios->private = sbi;
299 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs);
301
302 ret = exofs_sbi_write(ios);
303 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
305 exofs_put_io_state(ios);
306 }
307
308 return ret;
309}
310
216/* 311/*
217 * Write the superblock to the OSD 312 * Write the superblock to the OSD
218 */ 313 */
@@ -223,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
223 struct exofs_io_state *ios; 318 struct exofs_io_state *ios;
224 int ret = -ENOMEM; 319 int ret = -ENOMEM;
225 320
226 lock_super(sb); 321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
322 if (unlikely(!fscb))
323 return -ENOMEM;
324
227 sbi = sb->s_fs_info; 325 sbi = sb->s_fs_info;
228 fscb = &sbi->s_fscb;
229 326
327 /* NOTE: We no longer dirty the super_block anywhere in exofs. The
328 * reason we write the fscb here on unmount is so we can stay backwards
329 * compatible with fscb->s_version == 1. (What we are not compatible
330 * with is if a new version FS crashed and then we try to mount an old
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above.
333 */
230 ret = exofs_get_io_state(&sbi->layout, &ios); 334 ret = exofs_get_io_state(&sbi->layout, &ios);
231 if (ret) 335 if (unlikely(ret))
232 goto out; 336 goto out;
233 337
234 /* Note: We only write the changing part of the fscb. .i.e upto the 338 lock_super(sb);
235 * the fscb->s_dev_table_oid member. There is no read-modify-write 339
236 * here.
237 */
238 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 340 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
239 memset(fscb, 0, ios->length); 341 memset(fscb, 0, ios->length);
240 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 342 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -249,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
249 ios->cred = sbi->s_cred; 351 ios->cred = sbi->s_cred;
250 352
251 ret = exofs_sbi_write(ios); 353 ret = exofs_sbi_write(ios);
252 if (unlikely(ret)) { 354 if (unlikely(ret))
253 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
254 goto out; 356 else
255 } 357 sb->s_dirt = 0;
256 sb->s_dirt = 0;
257 358
359
360 unlock_super(sb);
258out: 361out:
259 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
260 exofs_put_io_state(ios); 363 exofs_put_io_state(ios);
261 unlock_super(sb); 364 kfree(fscb);
262 return ret; 365 return ret;
263} 366}
264 367
@@ -302,9 +405,6 @@ static void exofs_put_super(struct super_block *sb)
302 int num_pend; 405 int num_pend;
303 struct exofs_sb_info *sbi = sb->s_fs_info; 406 struct exofs_sb_info *sbi = sb->s_fs_info;
304 407
305 if (sb->s_dirt)
306 exofs_write_super(sb);
307
308 /* make sure there are no pending commands */ 408 /* make sure there are no pending commands */
309 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; 409 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
310 num_pend = atomic_read(&sbi->s_curr_pending)) { 410 num_pend = atomic_read(&sbi->s_curr_pending)) {
@@ -629,6 +729,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
629 goto free_sbi; 729 goto free_sbi;
630 730
631 sb->s_magic = le16_to_cpu(fscb.s_magic); 731 sb->s_magic = le16_to_cpu(fscb.s_magic);
732 /* NOTE: we read below to be backward compatible with old versions */
632 sbi->s_nextid = le64_to_cpu(fscb.s_nextid); 733 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
633 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); 734 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
634 735
@@ -639,7 +740,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
639 ret = -EINVAL; 740 ret = -EINVAL;
640 goto free_sbi; 741 goto free_sbi;
641 } 742 }
642 if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { 743 if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
643 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", 744 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
644 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); 745 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
645 ret = -EINVAL; 746 ret = -EINVAL;
@@ -657,6 +758,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
657 goto free_sbi; 758 goto free_sbi;
658 } 759 }
659 760
761 __sbi_read_stats(sbi);
762
660 /* set up operation vectors */ 763 /* set up operation vectors */
661 sbi->bdi.ra_pages = __ra_pages(&sbi->layout); 764 sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
662 sb->s_bdi = &sbi->bdi; 765 sb->s_bdi = &sbi->bdi;