Diffstat (limited to 'fs')
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/aio.c | 113
-rw-r--r--  fs/attr.c | 5
-rw-r--r--  fs/btrfs/inode.c | 8
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 4
-rw-r--r--  fs/ceph/addr.c | 8
-rw-r--r--  fs/ceph/inode.c | 136
-rw-r--r--  fs/cifs/cifsproto.h | 7
-rw-r--r--  fs/cifs/cifssmb.c | 6
-rw-r--r--  fs/cifs/dir.c | 11
-rw-r--r--  fs/cifs/inode.c | 6
-rw-r--r--  fs/cifs/link.c | 26
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/dcache.c | 7
-rw-r--r--  fs/dlm/lowcomms.c | 8
-rw-r--r--  fs/eventpoll.c | 4
-rw-r--r--  fs/ext2/super.c | 1
-rw-r--r--  fs/ext4/ext4.h | 10
-rw-r--r--  fs/ext4/ext4_jbd2.c | 9
-rw-r--r--  fs/ext4/extents.c | 45
-rw-r--r--  fs/ext4/inode.c | 16
-rw-r--r--  fs/ext4/mballoc.c | 17
-rw-r--r--  fs/ext4/super.c | 21
-rw-r--r--  fs/f2fs/Makefile | 2
-rw-r--r--  fs/f2fs/checkpoint.c | 195
-rw-r--r--  fs/f2fs/data.c | 621
-rw-r--r--  fs/f2fs/debug.c | 53
-rw-r--r--  fs/f2fs/dir.c | 47
-rw-r--r--  fs/f2fs/f2fs.h | 195
-rw-r--r--  fs/f2fs/file.c | 84
-rw-r--r--  fs/f2fs/gc.c | 22
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/inline.c | 222
-rw-r--r--  fs/f2fs/inode.c | 23
-rw-r--r--  fs/f2fs/namei.c | 5
-rw-r--r--  fs/f2fs/node.c | 272
-rw-r--r--  fs/f2fs/node.h | 8
-rw-r--r--  fs/f2fs/recovery.c | 49
-rw-r--r--  fs/f2fs/segment.c | 584
-rw-r--r--  fs/f2fs/segment.h | 81
-rw-r--r--  fs/f2fs/super.c | 72
-rw-r--r--  fs/f2fs/xattr.c | 2
-rw-r--r--  fs/fs-writeback.c | 15
-rw-r--r--  fs/fuse/dev.c | 25
-rw-r--r--  fs/fuse/dir.c | 14
-rw-r--r--  fs/fuse/file.c | 41
-rw-r--r--  fs/fuse/fuse_i.h | 5
-rw-r--r--  fs/gfs2/aops.c | 49
-rw-r--r--  fs/gfs2/dir.c | 90
-rw-r--r--  fs/gfs2/dir.h | 19
-rw-r--r--  fs/gfs2/glock.c | 31
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 36
-rw-r--r--  fs/gfs2/incore.h | 23
-rw-r--r--  fs/gfs2/inode.c | 118
-rw-r--r--  fs/gfs2/log.c | 4
-rw-r--r--  fs/gfs2/lops.c | 5
-rw-r--r--  fs/gfs2/main.c | 1
-rw-r--r--  fs/gfs2/meta_io.c | 8
-rw-r--r--  fs/gfs2/ops_fstype.c | 70
-rw-r--r--  fs/gfs2/quota.c | 342
-rw-r--r--  fs/gfs2/quota.h | 1
-rw-r--r--  fs/gfs2/rgrp.c | 113
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 43
-rw-r--r--  fs/jbd/journal.c | 8
-rw-r--r--  fs/jbd/transaction.c | 4
-rw-r--r--  fs/jbd2/journal.c | 18
-rw-r--r--  fs/jbd2/recovery.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 16
-rw-r--r--  fs/kernfs/Makefile | 5
-rw-r--r--  fs/kernfs/dir.c | 1073
-rw-r--r--  fs/kernfs/file.c | 867
-rw-r--r--  fs/kernfs/inode.c | 377
-rw-r--r--  fs/kernfs/kernfs-internal.h | 122
-rw-r--r--  fs/kernfs/mount.c | 165
-rw-r--r--  fs/kernfs/symlink.c | 151
-rw-r--r--  fs/namespace.c | 4
-rw-r--r--  fs/nilfs2/segment.c | 10
-rw-r--r--  fs/notify/dnotify/dnotify.c | 34
-rw-r--r--  fs/notify/fanotify/fanotify.c | 224
-rw-r--r--  fs/notify/fanotify/fanotify.h | 23
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 41
-rw-r--r--  fs/notify/fsnotify.c | 42
-rw-r--r--  fs/notify/group.c | 1
-rw-r--r--  fs/notify/inotify/inotify.h | 21
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 149
-rw-r--r--  fs/notify/inotify/inotify_user.c | 119
-rw-r--r--  fs/notify/notification.c | 334
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/alloc.c | 10
-rw-r--r--  fs/ocfs2/cluster/Makefile | 2
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 4
-rw-r--r--  fs/ocfs2/cluster/ver.c | 42
-rw-r--r--  fs/ocfs2/cluster/ver.h | 31
-rw-r--r--  fs/ocfs2/dlm/Makefile | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmver.c | 42
-rw-r--r--  fs/ocfs2/dlm/dlmver.h | 31
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.c | 42
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.h | 31
-rw-r--r--  fs/ocfs2/dlmglue.c | 4
-rw-r--r--  fs/ocfs2/file.c | 3
-rw-r--r--  fs/ocfs2/ioctl.c | 7
-rw-r--r--  fs/ocfs2/move_extents.c | 77
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 3
-rw-r--r--  fs/ocfs2/stack_user.c | 308
-rw-r--r--  fs/ocfs2/stackglue.c | 16
-rw-r--r--  fs/ocfs2/stackglue.h | 15
-rw-r--r--  fs/ocfs2/suballoc.c | 12
-rw-r--r--  fs/ocfs2/suballoc.h | 12
-rw-r--r--  fs/ocfs2/super.c | 20
-rw-r--r--  fs/ocfs2/ver.c | 43
-rw-r--r--  fs/ocfs2/ver.h | 31
-rw-r--r--  fs/posix_acl.c | 84
-rw-r--r--  fs/proc/meminfo.c | 37
-rw-r--r--  fs/pstore/platform.c | 7
-rw-r--r--  fs/ramfs/inode.c | 2
-rw-r--r--  fs/read_write.c | 4
-rw-r--r--  fs/splice.c | 18
-rw-r--r--  fs/super.c | 3
-rw-r--r--  fs/sysfs/Makefile | 2
-rw-r--r--  fs/sysfs/dir.c | 1075
-rw-r--r--  fs/sysfs/file.c | 963
-rw-r--r--  fs/sysfs/group.c | 102
-rw-r--r--  fs/sysfs/inode.c | 331
-rw-r--r--  fs/sysfs/mount.c | 184
-rw-r--r--  fs/sysfs/symlink.c | 219
-rw-r--r--  fs/sysfs/sysfs.h | 236
-rw-r--r--  fs/udf/namei.c | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 2
-rw-r--r--  fs/xfs/xfs_attr.c | 5
-rw-r--r--  fs/xfs/xfs_attr_list.c | 8
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 36
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 50
-rw-r--r--  fs/xfs/xfs_buf.c | 63
-rw-r--r--  fs/xfs/xfs_buf.h | 11
-rw-r--r--  fs/xfs/xfs_buf_item.c | 124
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 26
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 4
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 58
-rw-r--r--  fs/xfs/xfs_dquot.c | 7
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 67
-rw-r--r--  fs/xfs/xfs_dquot_item.h | 3
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 21
-rw-r--r--  fs/xfs/xfs_file.c | 10
-rw-r--r--  fs/xfs/xfs_ialloc.c | 53
-rw-r--r--  fs/xfs/xfs_ialloc.h | 21
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 10
-rw-r--r--  fs/xfs/xfs_inode.c | 85
-rw-r--r--  fs/xfs/xfs_inode.h | 4
-rw-r--r--  fs/xfs/xfs_inode_fork.c | 17
-rw-r--r--  fs/xfs/xfs_inode_item.c | 400
-rw-r--r--  fs/xfs/xfs_inode_item.h | 5
-rw-r--r--  fs/xfs/xfs_ioctl.c | 4
-rw-r--r--  fs/xfs/xfs_iops.c | 79
-rw-r--r--  fs/xfs/xfs_itable.c | 22
-rw-r--r--  fs/xfs/xfs_log.h | 46
-rw-r--r--  fs/xfs/xfs_log_cil.c | 74
-rw-r--r--  fs/xfs/xfs_log_recover.c | 46
-rw-r--r--  fs/xfs/xfs_qm.c | 86
-rw-r--r--  fs/xfs/xfs_qm.h | 18
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 18
-rw-r--r--  fs/xfs/xfs_quota_priv.h | 42
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 13
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 10
-rw-r--r--  fs/xfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_vnode.h | 9
174 files changed, 7376 insertions(+), 6140 deletions(-)
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3ec28f..39a824f44e7c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -53,7 +53,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/aio.c b/fs/aio.c
index 6efb7f6cb22e..062a5f6a1448 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx)
 	int i;
 
 	for (i = 0; i < ctx->nr_pages; i++) {
+		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
 			 page_count(ctx->ring_pages[i]));
-		put_page(ctx->ring_pages[i]);
+		page = ctx->ring_pages[i];
+		if (!page)
+			continue;
+		ctx->ring_pages[i] = NULL;
+		put_page(page);
 	}
 
 	put_aio_ring_file(ctx);
@@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	unsigned long flags;
 	int rc;
 
+	rc = 0;
+
+	/* Make sure the old page hasn't already been changed */
+	spin_lock(&mapping->private_lock);
+	ctx = mapping->private_data;
+	if (ctx) {
+		pgoff_t idx;
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		idx = old->index;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			if (ctx->ring_pages[idx] != old)
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else
+		rc = -EINVAL;
+	spin_unlock(&mapping->private_lock);
+
+	if (rc != 0)
+		return rc;
+
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
-	put_page(old);
+	get_page(new);
 
-	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
-		get_page(old);
+		put_page(new);
 		return rc;
 	}
 
-	get_page(new);
-
 	/* We can potentially race against kioctx teardown here.  Use the
 	 * address_space's private data lock to protect the mapping's
 	 * private_data.
@@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		migrate_page_copy(new, old);
 		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages)
-			ctx->ring_pages[idx] = new;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			/* And only do the move if things haven't changed */
+			if (ctx->ring_pages[idx] == old)
+				ctx->ring_pages[idx] = new;
+			else
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	} else
 		rc = -EBUSY;
 	spin_unlock(&mapping->private_lock);
 
+	if (rc == MIGRATEPAGE_SUCCESS)
+		put_page(old);
+	else
+		put_page(new);
+
 	return rc;
 }
 #endif
@@ -326,7 +362,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct aio_ring *ring;
 	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
-	unsigned long size, populate;
+	unsigned long size, unused;
 	int nr_pages;
 	int i;
 	struct file *file;
@@ -347,6 +383,20 @@ static int aio_setup_ring(struct kioctx *ctx)
 		return -EAGAIN;
 	}
 
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
+
+	ctx->ring_pages = ctx->internal_pages;
+	if (nr_pages > AIO_RING_PAGES) {
+		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!ctx->ring_pages) {
+			put_aio_ring_file(ctx);
+			return -ENOMEM;
+		}
+	}
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(file->f_inode->i_mapping,
@@ -358,19 +408,14 @@ static int aio_setup_ring(struct kioctx *ctx)
 		SetPageUptodate(page);
 		SetPageDirty(page);
 		unlock_page(page);
+
+		ctx->ring_pages[i] = page;
 	}
-	ctx->aio_ring_file = file;
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
-			/ sizeof(struct io_event);
+	ctx->nr_pages = i;
 
-	ctx->ring_pages = ctx->internal_pages;
-	if (nr_pages > AIO_RING_PAGES) {
-		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
-					  GFP_KERNEL);
-		if (!ctx->ring_pages) {
-			put_aio_ring_file(ctx);
-			return -ENOMEM;
-		}
+	if (unlikely(i != nr_pages)) {
+		aio_free_ring(ctx);
+		return -EAGAIN;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -379,9 +424,9 @@ static int aio_setup_ring(struct kioctx *ctx)
 	down_write(&mm->mmap_sem);
 	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
 				       PROT_READ | PROT_WRITE,
-				       MAP_SHARED | MAP_POPULATE, 0, &populate);
+				       MAP_SHARED, 0, &unused);
+	up_write(&mm->mmap_sem);
 	if (IS_ERR((void *)ctx->mmap_base)) {
-		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
@@ -389,27 +434,6 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 
-	/* We must do this while still holding mmap_sem for write, as we
-	 * need to be protected against userspace attempting to mremap()
-	 * or munmap() the ring buffer.
-	 */
-	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
-				       1, 0, ctx->ring_pages, NULL);
-
-	/* Dropping the reference here is safe as the page cache will hold
-	 * onto the pages for us.  It is also required so that page migration
-	 * can unmap the pages and get the right reference count.
-	 */
-	for (i = 0; i < ctx->nr_pages; i++)
-		put_page(ctx->ring_pages[i]);
-
-	up_write(&mm->mmap_sem);
-
-	if (unlikely(ctx->nr_pages != nr_pages)) {
-		aio_free_ring(ctx);
-		return -EAGAIN;
-	}
-
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events;	/* trusted copy */
 
@@ -652,7 +676,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
 	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
+	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
 
 	err = ioctx_add_table(ctx, mm);
 	if (err)
diff --git a/fs/attr.c b/fs/attr.c
index 267968d94673..5d4e59d56e85 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -202,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		return -EPERM;
 	}
 
-	if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
-		if (attr->ia_size != inode->i_size)
-			inode_inc_iversion(inode);
-	}
-
 	if ((ia_valid & ATTR_MODE)) {
 		umode_t amode = attr->ia_mode;
 		/* Flag setting protected by i_mutex */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..471a4f7f4044 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4354,8 +4354,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	 * these flags set.  For all other operations the VFS set these flags
 	 * explicitly if it wants a timestamp update.
 	 */
-	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+	if (newsize != oldsize) {
+		inode_inc_iversion(inode);
+		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+			inode->i_ctime = inode->i_mtime =
+				current_fs_time(inode->i_sb);
+	}
 
 	if (newsize > oldsize) {
 		truncate_pagecache(inode, newsize);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 6fc82010dc15..c8d9ddf84c69 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache)
 
 	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
 	if (ret) {
-		test_msg("Error removing middle peice %d\n", ret);
+		test_msg("Error removing middle piece %d\n", ret);
 		return ret;
 	}
 
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	}
 
 	if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
-		test_msg("Left over peices after removing overlapping\n");
+		test_msg("Left over pieces after removing overlapping\n");
 		return -1;
 	}
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1e561c059539..ec3ba43b9faa 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -210,9 +210,13 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 	if (err < 0) {
 		SetPageError(page);
 		goto out;
-	} else if (err < PAGE_CACHE_SIZE) {
+	} else {
+		if (err < PAGE_CACHE_SIZE) {
 		/* zero fill remainder of page */
 		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+		} else {
+			flush_dcache_page(page);
+		}
 	}
 	SetPageUptodate(page);
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9a8e396aed89..278fd2891288 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -978,7 +978,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-	int i = 0;
 	int err = 0;
 
 	dout("fill_trace %p is_dentry %d is_target %d\n", req,
@@ -1039,6 +1038,29 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		}
 	}
 
+	if (rinfo->head->is_target) {
+		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+		in = ceph_get_inode(sb, vino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			goto done;
+		}
+		req->r_target_inode = in;
+
+		err = fill_inode(in, &rinfo->targeti, NULL,
+				session, req->r_request_started,
+				(le32_to_cpu(rinfo->head->result) == 0) ?
+				req->r_fmode : -1,
+				&req->r_caps_reservation);
+		if (err < 0) {
+			pr_err("fill_inode badness %p %llx.%llx\n",
+				in, ceph_vinop(in));
+			goto done;
+		}
+	}
+
 	/*
 	 * ignore null lease/binding on snapdir ENOENT, or else we
 	 * will have trouble splicing in the virtual snapdir later
@@ -1108,7 +1130,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		     ceph_dentry(req->r_old_dentry)->offset);
 
 		dn = req->r_old_dentry;  /* use old_dentry */
-		in = dn->d_inode;
 	}
 
 	/* null dentry? */
@@ -1130,44 +1151,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		}
 
 		/* attach proper inode */
-		ininfo = rinfo->targeti.in;
-		vino.ino = le64_to_cpu(ininfo->ino);
-		vino.snap = le64_to_cpu(ininfo->snapid);
-		in = dn->d_inode;
-		if (!in) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				pr_err("fill_trace bad get_inode "
-				       "%llx.%llx\n", vino.ino, vino.snap);
-				err = PTR_ERR(in);
-				d_drop(dn);
-				goto done;
-			}
+		if (!dn->d_inode) {
+			ihold(in);
 			dn = splice_dentry(dn, in, &have_lease, true);
 			if (IS_ERR(dn)) {
 				err = PTR_ERR(dn);
 				goto done;
 			}
 			req->r_dentry = dn;  /* may have spliced */
-			ihold(in);
-		} else if (ceph_ino(in) == vino.ino &&
-			   ceph_snap(in) == vino.snap) {
-			ihold(in);
-		} else {
+		} else if (dn->d_inode && dn->d_inode != in) {
 			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
-			     dn, in, ceph_ino(in), ceph_snap(in),
-			     vino.ino, vino.snap);
+			     dn, dn->d_inode, ceph_vinop(dn->d_inode),
+			     ceph_vinop(in));
 			have_lease = false;
-			in = NULL;
 		}
 
 		if (have_lease)
 			update_dentry_lease(dn, rinfo->dlease, session,
 					    req->r_request_started);
 		dout(" final dn %p\n", dn);
-		i++;
-	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-		    req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
+	} else if (!req->r_aborted &&
+		   (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+		    req->r_op == CEPH_MDS_OP_MKSNAP)) {
 		struct dentry *dn = req->r_dentry;
 
 		/* fill out a snapdir LOOKUPSNAP dentry */
@@ -1177,52 +1182,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		ininfo = rinfo->targeti.in;
 		vino.ino = le64_to_cpu(ininfo->ino);
 		vino.snap = le64_to_cpu(ininfo->snapid);
-		in = ceph_get_inode(sb, vino);
-		if (IS_ERR(in)) {
-			pr_err("fill_inode get_inode badness %llx.%llx\n",
-			       vino.ino, vino.snap);
-			err = PTR_ERR(in);
-			d_delete(dn);
-			goto done;
-		}
 		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		ihold(in);
 		dn = splice_dentry(dn, in, NULL, true);
 		if (IS_ERR(dn)) {
 			err = PTR_ERR(dn);
 			goto done;
 		}
 		req->r_dentry = dn;  /* may have spliced */
-		ihold(in);
-		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
-	}
-
-	if (rinfo->head->is_target) {
-		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
-		if (in == NULL || ceph_ino(in) != vino.ino ||
-		    ceph_snap(in) != vino.snap) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				err = PTR_ERR(in);
-				goto done;
-			}
-		}
-		req->r_target_inode = in;
-
-		err = fill_inode(in,
-				 &rinfo->targeti, NULL,
-				 session, req->r_request_started,
-				 (le32_to_cpu(rinfo->head->result) == 0) ?
-				 req->r_fmode : -1,
-				 &req->r_caps_reservation);
-		if (err < 0) {
-			pr_err("fill_inode badness %p %llx.%llx\n",
-			       in, ceph_vinop(in));
-			goto done;
-		}
 	}
-
 done:
 	dout("fill_trace done err=%d\n", err);
 	return err;
@@ -1272,7 +1240,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	struct qstr dname;
 	struct dentry *dn;
 	struct inode *in;
-	int err = 0, i;
+	int err = 0, ret, i;
 	struct inode *snapdir = NULL;
 	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
 	struct ceph_dentry_info *di;
@@ -1305,6 +1273,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 		ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
 	}
 
+	/* FIXME: release caps/leases if error occurs */
 	for (i = 0; i < rinfo->dir_nr; i++) {
 		struct ceph_vino vino;
 
@@ -1329,9 +1298,10 @@ retry_lookup:
 				err = -ENOMEM;
 				goto out;
 			}
-			err = ceph_init_dentry(dn);
-			if (err < 0) {
+			ret = ceph_init_dentry(dn);
+			if (ret < 0) {
 				dput(dn);
+				err = ret;
 				goto out;
 			}
 		} else if (dn->d_inode &&
@@ -1351,9 +1321,6 @@ retry_lookup:
 			spin_unlock(&parent->d_lock);
 		}
 
-		di = dn->d_fsdata;
-		di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
-
 		/* inode */
 		if (dn->d_inode) {
 			in = dn->d_inode;
@@ -1366,26 +1333,39 @@ retry_lookup:
 				err = PTR_ERR(in);
 				goto out;
 			}
-			dn = splice_dentry(dn, in, NULL, false);
-			if (IS_ERR(dn))
-				dn = NULL;
 		}
 
 		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
 			       req->r_request_started, -1,
 			       &req->r_caps_reservation) < 0) {
 			pr_err("fill_inode badness on %p\n", in);
+			if (!dn->d_inode)
+				iput(in);
+			d_drop(dn);
 			goto next_item;
 		}
-		if (dn)
-			update_dentry_lease(dn, rinfo->dir_dlease[i],
-					    req->r_session,
-					    req->r_request_started);
+
+		if (!dn->d_inode) {
+			dn = splice_dentry(dn, in, NULL, false);
+			if (IS_ERR(dn)) {
+				err = PTR_ERR(dn);
+				dn = NULL;
+				goto next_item;
+			}
+		}
+
+		di = dn->d_fsdata;
+		di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+
+		update_dentry_lease(dn, rinfo->dir_dlease[i],
+				    req->r_session,
+				    req->r_request_started);
next_item:
 		if (dn)
 			dput(dn);
 	}
-	req->r_did_prepopulate = true;
+	if (err == 0)
+		req->r_did_prepopulate = true;
 
 out:
 	if (snapdir) {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index aa3397620342..2c29db6a247e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -477,9 +477,10 @@ extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
 extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
-extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		const unsigned char *path,
-		struct cifs_sb_info *cifs_sb, unsigned int xid);
+extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+			      struct cifs_sb_info *cifs_sb,
			      struct cifs_fattr *fattr,
+			      const unsigned char *path);
 extern int mdfour(unsigned char *, unsigned char *, int);
 extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
 			const struct nls_table *codepage);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 124aa0230c1b..d707edb6b852 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4010,7 +4010,7 @@ QFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in QFileInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4179,7 +4179,7 @@ UnixQFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4263,7 +4263,7 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 11ff5f116b20..a514e0a65f69 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -193,7 +193,7 @@ check_name(struct dentry *direntry)
 static int
 cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	       struct tcon_link *tlink, unsigned oflags, umode_t mode,
-	       __u32 *oplock, struct cifs_fid *fid, int *created)
+	       __u32 *oplock, struct cifs_fid *fid)
 {
 	int rc = -ENOENT;
 	int create_options = CREATE_NOT_DIR;
@@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 			.device	= 0,
 		};
 
-		*created |= FILE_CREATED;
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
 			args.uid = current_fsuid();
 			if (inode->i_mode & S_ISGID)
@@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 	cifs_add_pending_open(&fid, tlink, &open);
 
 	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
-			    &oplock, &fid, opened);
+			    &oplock, &fid);
 
 	if (rc) {
 		cifs_del_pending_open(&open);
 		goto out;
 	}
 
+	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		*opened |= FILE_CREATED;
+
 	rc = finish_open(file, direntry, generic_file_open, opened);
 	if (rc) {
 		if (server->ops->close)
@@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	struct TCP_Server_Info *server;
 	struct cifs_fid fid;
 	__u32 oplock;
-	int created = FILE_CREATED;
 
 	cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
 		 inode, direntry->d_name.name, direntry);
@@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	server->ops->new_lease_key(&fid);
 
 	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
-			    &oplock, &fid, &created);
+			    &oplock, &fid);
 	if (!rc && server->ops->close)
 		server->ops->close(xid, tcon, &fid);
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 36f9ebb93ceb..49719b8228e5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -383,7 +383,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+					       full_path);
 		if (tmprc)
 			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
@@ -799,7 +800,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+					   full_path);
 		if (tmprc)
 			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cc0234710ddb..92aee08483a5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -354,34 +354,30 @@ open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
 
 
 int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		   const unsigned char *path,
-		   struct cifs_sb_info *cifs_sb, unsigned int xid)
+CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+		   const unsigned char *path)
 {
-	int rc = 0;
+	int rc;
 	u8 *buf = NULL;
 	unsigned int link_len = 0;
 	unsigned int bytes_read = 0;
-	struct cifs_tcon *ptcon;
 
 	if (!CIFSCouldBeMFSymlink(fattr))
 		/* it's not a symlink */
 		return 0;
 
 	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
+	if (!buf)
+		return -ENOMEM;
 
-	ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
-	if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
-		rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
-				&bytes_read, cifs_sb, xid);
+	if (tcon->ses->server->ops->query_mf_symlink)
+		rc = tcon->ses->server->ops->query_mf_symlink(path, buf,
+						&bytes_read, cifs_sb, xid);
 	else
-		goto out;
+		rc = -ENOSYS;
 
-	if (rc != 0)
+	if (rc)
 		goto out;
 
 	if (bytes_read == 0) /* not a symlink */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dc52e13d58e0..3881610b6438 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
 	struct i2c_msg			__user *tmsgs;
 	struct i2c_msg32		__user *umsgs;
 	compat_caddr_t			datap;
-	int nmsgs, i;
+	u32 nmsgs;
+	int i;
 
 	if (get_user(nmsgs, &udata->nmsgs))
 		return -EFAULT;
diff --git a/fs/dcache.c b/fs/dcache.c
index 6055d61811d3..cb4a10690868 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	 * thus don't need to be hashed.  They also don't need a name until a
 	 * user wants to identify the object in /proc/pid/fd/.  The little hack
 	 * below allows us to generate a name for these objects on demand:
+	 *
+	 * Some pseudo inodes are mountable.  When they are mounted
+	 * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
+	 * and instead have d_path return the mounted path.
 	 */
-	if (path->dentry->d_op && path->dentry->d_op->d_dname)
+	if (path->dentry->d_op && path->dentry->d_op->d_dname &&
+	    (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
 	rcu_read_lock();
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d90909ec6aa6..a5e34dd6a32c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con,
 				     struct msghdr *msg, char *buf)
 {
 	union sctp_notification *sn = (union sctp_notification *)buf;
+	struct linger linger;
 
 	switch (sn->sn_header.sn_type) {
 	case SCTP_SEND_FAILED:
@@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con,
 			}
 			add_sock(new_con->sock, new_con);
 
+			linger.l_onoff = 1;
+			linger.l_linger = 0;
+			ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
+						(char *)&linger, sizeof(linger));
+			if (ret < 0)
+				log_print("set socket option SO_LINGER failed");
+
 			log_print("connecting to %d sctp association %d",
 				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8b5e2584c840..af903128891c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1907,10 +1907,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			}
 		}
 	}
-	if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) {
-		tep = tf.file->private_data;
-		mutex_lock_nested(&tep->mtx, 1);
-	}
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 288534920fe5..20d6697bd638 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1493,6 +1493,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 				sb->s_blocksize - offset : towrite;
 
 		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
 		err = ext2_get_block(inode, blk, &tmp_bh, 1);
 		if (err < 0)
 			goto out;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e6185031c1cc..ece55565b9cd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -268,6 +268,16 @@ struct ext4_io_submit {
 /* Translate # of blks to # of clusters */
 #define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \
 				 (sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &				\
+				  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &				\
+				  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) &				\
+				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) &				\
+				 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
 
 /*
  * Structure of a blocks group descriptor
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 17ac112ab101..3fe29de832c8 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 		if (WARN_ON_ONCE(err)) {
 			ext4_journal_abort_handle(where, line, __func__, bh,
 						  handle, err);
+			ext4_error_inode(inode, where, line,
+					 bh->b_blocknr,
+					 "journal_dirty_metadata failed: "
+					 "handle type %u started at line %u, "
+					 "credits %u/%u, errcode %d",
+					 handle->h_type,
+					 handle->h_line_no,
+					 handle->h_requested_credits,
+					 handle->h_buffer_credits, err);
 		}
 	} else {
 		if (inode)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 35f65cf4f318..3384dc4bed40 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
 	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
+	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+	ext4_lblk_t last = lblock + len - 1;
 
-	if (len == 0)
+	if (lblock > last)
 		return 0;
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
@@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	if (depth == 0) {
 		/* leaf entries */
 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+		ext4_fsblk_t pblock = 0;
+		ext4_lblk_t lblock = 0;
+		ext4_lblk_t prev = 0;
+		int len = 0;
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
+
+			/* Check for overlapping extents */
+			lblock = le32_to_cpu(ext->ee_block);
+			len = ext4_ext_get_actual_len(ext);
+			if ((lblock <= prev) && prev) {
+				pblock = ext4_ext_pblock(ext);
+				es->s_last_error_block = cpu_to_le64(pblock);
+				return 0;
+			}
 			ext++;
 			entries--;
+			prev = lblock + len - 1;
 		}
 	} else {
 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
@@ -1834,8 +1851,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 	depth = ext_depth(inode);
 	if (!path[depth].p_ext)
 		goto out;
-	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
-	b2 &= ~(sbi->s_cluster_ratio - 1);
+	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
 
 	/*
 	 * get the next allocated block if the extent in the path
@@ -1845,7 +1861,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 		b2 = ext4_ext_next_allocated_block(path);
 		if (b2 == EXT_MAX_BLOCKS)
 			goto out;
-		b2 &= ~(sbi->s_cluster_ratio - 1);
+		b2 = EXT4_LBLK_CMASK(sbi, b2);
 	}
 
 	/* check for wrap through zero on extent logical start block*/
@@ -2504,7 +2520,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * extent, we have to mark the cluster as used (store negative
 	 * cluster number in partial_cluster).
 	 */
-	unaligned = pblk & (sbi->s_cluster_ratio - 1);
+	unaligned = EXT4_PBLK_COFF(sbi, pblk);
 	if (unaligned && (ee_len == num) &&
 	    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 		*partial_cluster = EXT4_B2C(sbi, pblk);
@@ -2598,7 +2614,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			 * accidentally freeing it later on
 			 */
 			pblk = ext4_ext_pblock(ex);
-			if (pblk & (sbi->s_cluster_ratio - 1))
+			if (EXT4_PBLK_COFF(sbi, pblk))
 				*partial_cluster =
 					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
@@ -3753,7 +3769,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	ext4_lblk_t lblk_start, lblk_end;
-	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
 	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
 
 	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
@@ -3812,9 +3828,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
 
 	/* Check towards left side */
-	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
 	if (c_offset) {
-		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
 		lblk_to = lblk_from + c_offset - 1;
 
 		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
@@ -3822,7 +3838,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	}
 
 	/* Now check towards right. */
-	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
 	if (allocated_clusters && c_offset) {
 		lblk_from = lblk_start + num_blks;
 		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
@@ -4030,7 +4046,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 			struct ext4_ext_path *path)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ext4_lblk_t ex_cluster_start, ex_cluster_end;
 	ext4_lblk_t rr_cluster_start;
 	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
@@ -4048,8 +4064,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 	    (rr_cluster_start == ex_cluster_start)) {
 		if (rr_cluster_start == ex_cluster_end)
 			ee_start += ee_len - 1;
-		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
-			c_offset;
+		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
 		map->m_len = min(map->m_len,
 				 (unsigned) sbi->s_cluster_ratio - c_offset);
 		/*
@@ -4203,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
@@ -4271,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * needed so that future calls to get_implied_cluster_alloc()
 	 * work correctly.
 	 */
-	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
 	ar.goal -= offset;
 	ar.logical -= offset;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 075763474118..31fa964742bc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1206,7 +1206,6 @@ static int ext4_journalled_write_end(struct file *file,
  */
 static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1218,7 +1217,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1238,10 +1236,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		return -ENOSPC;
 	}
 	ei->i_reserved_meta_blocks += md_needed;
@@ -1255,7 +1249,6 @@
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1277,7 +1270,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1297,10 +1289,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
 		return -ENOSPC;
 	}
@@ -4598,6 +4586,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			if (attr->ia_size > sbi->s_bitmap_maxbytes)
 				return -EFBIG;
 		}
+
+		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+			inode_inc_iversion(inode);
+
 		if (S_ISREG(inode->i_mode) &&
 		    (attr->ia_size < inode->i_size)) {
 			if (ext4_should_order_data(inode)) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4d113efa024c..04a5c7504be9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3442,6 +3442,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 {
 	struct ext4_prealloc_space *pa;
 	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+	BUG_ON(atomic_read(&pa->pa_count));
+	BUG_ON(pa->pa_deleted == 0);
 	kmem_cache_free(ext4_pspace_cachep, pa);
 }
 
@@ -3455,11 +3458,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	ext4_group_t grp;
 	ext4_fsblk_t grp_blk;
 
-	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
-		return;
-
 	/* in this short window concurrent discard can set pa_deleted */
 	spin_lock(&pa->pa_lock);
+	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+		spin_unlock(&pa->pa_lock);
+		return;
+	}
+
 	if (pa->pa_deleted == 1) {
 		spin_unlock(&pa->pa_lock);
 		return;
@@ -4121,7 +4126,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
 
 	/* set up allocation goals */
-	ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
 	ac->ac_status = AC_STATUS_CONTINUE;
 	ac->ac_sb = sb;
 	ac->ac_inode = ar->inode;
@@ -4663,7 +4668,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	 * blocks at the beginning or the end unless we are explicitly
 	 * requested to avoid doing so.
 	 */
-	overflow = block & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_PBLK_COFF(sbi, block);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
 			overflow = sbi->s_cluster_ratio - overflow;
@@ -4677,7 +4682,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			count += overflow;
 		}
 	}
-	overflow = count & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_LBLK_COFF(sbi, count);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
 			if (count > overflow)
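
The mballoc hunks replace open-coded `& (s_cluster_ratio - 1)` arithmetic with named helpers. Beyond readability, the point is integer width: s_cluster_ratio is a plain unsigned int, so the open-coded mask can be applied at the wrong width against a 64-bit physical block number. A sketch of the helper shapes this series appears to rely on (an assumption of this edit, not a quotation of fs/ext4/ext4.h):

	/* Assumed helper shapes: cast the mask to the width of the block
	 * type before masking, so 64-bit block numbers are not truncated. */
	#define EXT4_LBLK_COFF(s, lblk)		((lblk) & \
						 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
	#define EXT4_LBLK_CMASK(s, lblk)	((lblk) & \
						 ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
	#define EXT4_PBLK_COFF(s, pblk)		((pblk) & \
						 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
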
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c977f4e4e63b..1f7784de05b6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -792,7 +792,7 @@ static void ext4_put_super(struct super_block *sb)
 	}
 
 	ext4_es_unregister_shrinker(sbi);
-	del_timer(&sbi->s_err_report);
+	del_timer_sync(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
@@ -3316,11 +3316,19 @@ int ext4_calculate_overhead(struct super_block *sb)
 }
 
 
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t resv_clusters;
 
 	/*
+	 * There's no need to reserve anything when we aren't using extents.
+	 * The space estimates are exact, there are no unwritten extents,
+	 * hole punching doesn't need new metadata... This is needed especially
+	 * to keep ext2/3 backward compatibility.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+		return 0;
+	/*
 	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
 	 * This should cover the situations where we can not afford to run
 	 * out of space like for example punch hole, or converting
@@ -3328,7 +3336,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
 	 * allocation would require 1, or 2 blocks, higher numbers are
 	 * very rare.
 	 */
-	resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+			EXT4_SB(sb)->s_cluster_bits;
 
 	do_div(resv_clusters, 50);
 	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
@@ -4071,10 +4080,10 @@ no_journal:
 			 "available");
 	}
 
-	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
-			 "reserved pool", ext4_calculate_resv_clusters(sbi));
+			 "reserved pool", ext4_calculate_resv_clusters(sb));
 		goto failed_mount4a;
 	}
 
@@ -4184,7 +4193,7 @@ failed_mount_wq:
 	}
 failed_mount3:
 	ext4_es_unregister_shrinker(sbi);
-	del_timer(&sbi->s_err_report);
+	del_timer_sync(&sbi->s_err_report);
 	if (sbi->s_flex_groups)
 		ext4_kvfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
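
For concreteness, the reservation rule in ext4_calculate_resv_clusters() reduces to min(2% of all clusters, 4096 clusters), and to zero on filesystems without the extents feature. A standalone restatement (this helper is illustrative, not the kernel function):

	/* Illustrative restatement of the reservation rule above. */
	unsigned long long resv_clusters(unsigned long long blocks,
					 unsigned int cluster_bits,
					 int has_extents)
	{
		unsigned long long resv;

		if (!has_extents)
			return 0;			/* ext2/3-compatible case */
		resv = (blocks >> cluster_bits) / 50;	/* 2% of all clusters */
		return resv < 4096 ? resv : 4096;	/* capped at 4096 */
	}

On a 1 TiB filesystem with 4 KiB blocks and no bigalloc (cluster_bits == 0), 2% is roughly 5.3 million clusters, so the 4096-cluster cap (16 MiB) is what actually gets reserved.
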
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 27a0820340b9..2e35da12d292 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_F2FS_FS) += f2fs.o
 
-f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
 f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5716e5eb4e8e..293d0486a40f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab;
  */
 struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 {
-	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page = NULL;
 repeat:
 	page = grab_cache_page(mapping, index);
@@ -50,7 +50,7 @@ repeat:
  */
 struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 {
-	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page;
 repeat:
 	page = grab_cache_page(mapping, index);
@@ -61,11 +61,12 @@ repeat:
 	if (PageUptodate(page))
 		goto out;
 
-	if (f2fs_readpage(sbi, page, index, READ_SYNC))
+	if (f2fs_submit_page_bio(sbi, page, index,
+				READ_SYNC | REQ_META | REQ_PRIO))
 		goto repeat;
 
 	lock_page(page);
-	if (page->mapping != mapping) {
+	if (unlikely(page->mapping != mapping)) {
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
@@ -81,13 +82,12 @@ static int f2fs_write_meta_page(struct page *page,
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	/* Should not write any meta pages, if any IO error occurred */
-	if (wbc->for_reclaim || sbi->por_doing ||
-			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
-		dec_page_count(sbi, F2FS_DIRTY_META);
-		wbc->pages_skipped++;
-		set_page_dirty(page);
-		return AOP_WRITEPAGE_ACTIVATE;
-	}
+	if (unlikely(sbi->por_doing ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+		goto redirty_out;
+
+	if (wbc->for_reclaim)
+		goto redirty_out;
 
 	wait_on_page_writeback(page);
 
@@ -95,24 +95,31 @@ static int f2fs_write_meta_page(struct page *page,
 	dec_page_count(sbi, F2FS_DIRTY_META);
 	unlock_page(page);
 	return 0;
+
+redirty_out:
+	dec_page_count(sbi, F2FS_DIRTY_META);
+	wbc->pages_skipped++;
+	set_page_dirty(page);
+	return AOP_WRITEPAGE_ACTIVATE;
 }
 
 static int f2fs_write_meta_pages(struct address_space *mapping,
 				struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-	struct block_device *bdev = sbi->sb->s_bdev;
+	int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
 	long written;
 
 	if (wbc->for_kupdate)
 		return 0;
 
-	if (get_pages(sbi, F2FS_DIRTY_META) == 0)
+	/* collect a number of dirty meta pages and write together */
+	if (get_pages(sbi, F2FS_DIRTY_META) < nrpages)
 		return 0;
 
 	/* if mounting failed, skip writing node pages */
 	mutex_lock(&sbi->cp_mutex);
-	written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
+	written = sync_meta_pages(sbi, META, nrpages);
 	mutex_unlock(&sbi->cp_mutex);
 	wbc->nr_to_write -= written;
 	return 0;
@@ -121,7 +128,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
 long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 						long nr_to_write)
 {
-	struct address_space *mapping = sbi->meta_inode->i_mapping;
+	struct address_space *mapping = META_MAPPING(sbi);
 	pgoff_t index = 0, end = LONG_MAX;
 	struct pagevec pvec;
 	long nwritten = 0;
@@ -136,7 +143,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 				PAGECACHE_TAG_DIRTY,
 				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
-		if (nr_pages == 0)
+		if (unlikely(nr_pages == 0))
 			break;
 
 		for (i = 0; i < nr_pages; i++) {
@@ -149,7 +156,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 				unlock_page(page);
 				break;
 			}
-			if (nwritten++ >= nr_to_write)
+			nwritten++;
+			if (unlikely(nwritten >= nr_to_write))
 				break;
 		}
 		pagevec_release(&pvec);
@@ -157,7 +165,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 	}
 
 	if (nwritten)
-		f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
+		f2fs_submit_merged_bio(sbi, type, WRITE);
 
 	return nwritten;
 }
@@ -186,31 +194,24 @@ const struct address_space_operations f2fs_meta_aops = {
 
 int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 {
-	unsigned int max_orphans;
 	int err = 0;
 
-	/*
-	 * considering 512 blocks in a segment 5 blocks are needed for cp
-	 * and log segment summaries. Remaining blocks are used to keep
-	 * orphan entries with the limitation one reserved segment
-	 * for cp pack we can have max 1020*507 orphan entries
-	 */
-	max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
-	mutex_lock(&sbi->orphan_inode_mutex);
-	if (sbi->n_orphans >= max_orphans)
+	spin_lock(&sbi->orphan_inode_lock);
+	if (unlikely(sbi->n_orphans >= sbi->max_orphans))
 		err = -ENOSPC;
 	else
 		sbi->n_orphans++;
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
+
 	return err;
 }
 
 void release_orphan_inode(struct f2fs_sb_info *sbi)
 {
-	mutex_lock(&sbi->orphan_inode_mutex);
+	spin_lock(&sbi->orphan_inode_lock);
 	f2fs_bug_on(sbi->n_orphans == 0);
 	sbi->n_orphans--;
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -218,27 +219,30 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	struct list_head *head, *this;
 	struct orphan_inode_entry *new = NULL, *orphan = NULL;
 
-	mutex_lock(&sbi->orphan_inode_mutex);
+	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+	new->ino = ino;
+
+	spin_lock(&sbi->orphan_inode_lock);
 	head = &sbi->orphan_inode_list;
 	list_for_each(this, head) {
 		orphan = list_entry(this, struct orphan_inode_entry, list);
-		if (orphan->ino == ino)
-			goto out;
+		if (orphan->ino == ino) {
+			spin_unlock(&sbi->orphan_inode_lock);
+			kmem_cache_free(orphan_entry_slab, new);
+			return;
+		}
+
 		if (orphan->ino > ino)
 			break;
 		orphan = NULL;
 	}
 
-	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
-	new->ino = ino;
-
 	/* add new_oentry into list which is sorted by inode number */
 	if (orphan)
 		list_add(&new->list, this->prev);
 	else
 		list_add_tail(&new->list, head);
-out:
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -246,7 +250,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	struct list_head *head;
 	struct orphan_inode_entry *orphan;
 
-	mutex_lock(&sbi->orphan_inode_mutex);
+	spin_lock(&sbi->orphan_inode_lock);
 	head = &sbi->orphan_inode_list;
 	list_for_each_entry(orphan, head, list) {
 		if (orphan->ino == ino) {
@@ -257,7 +261,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 			break;
 		}
 	}
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -270,12 +274,12 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 	iput(inode);
 }
 
-int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 {
 	block_t start_blk, orphan_blkaddr, i, j;
 
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
-		return 0;
+		return;
 
 	sbi->por_doing = true;
 	start_blk = __start_cp_addr(sbi) + 1;
@@ -295,29 +299,39 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	/* clear Orphan Flag */
 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
 	sbi->por_doing = false;
-	return 0;
+	return;
 }
 
 static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 {
-	struct list_head *head, *this, *next;
+	struct list_head *head;
 	struct f2fs_orphan_block *orphan_blk = NULL;
-	struct page *page = NULL;
 	unsigned int nentries = 0;
-	unsigned short index = 1;
-	unsigned short orphan_blocks;
-
-	orphan_blocks = (unsigned short)((sbi->n_orphans +
+	unsigned short index;
+	unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
 		(F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+	struct page *page = NULL;
+	struct orphan_inode_entry *orphan = NULL;
+
+	for (index = 0; index < orphan_blocks; index++)
+		grab_meta_page(sbi, start_blk + index);
 
-	mutex_lock(&sbi->orphan_inode_mutex);
+	index = 1;
+	spin_lock(&sbi->orphan_inode_lock);
 	head = &sbi->orphan_inode_list;
 
 	/* loop for each orphan inode entry and write them in Journal block */
-	list_for_each_safe(this, next, head) {
-		struct orphan_inode_entry *orphan;
+	list_for_each_entry(orphan, head, list) {
+		if (!page) {
+			page = find_get_page(META_MAPPING(sbi), start_blk++);
+			f2fs_bug_on(!page);
+			orphan_blk =
+				(struct f2fs_orphan_block *)page_address(page);
+			memset(orphan_blk, 0, sizeof(*orphan_blk));
+			f2fs_put_page(page, 0);
+		}
 
-		orphan = list_entry(this, struct orphan_inode_entry, list);
+		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
 
 		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
 			/*
@@ -331,29 +345,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 			set_page_dirty(page);
 			f2fs_put_page(page, 1);
 			index++;
-			start_blk++;
 			nentries = 0;
 			page = NULL;
 		}
-		if (page)
-			goto page_exist;
+	}
 
-		page = grab_meta_page(sbi, start_blk);
-		orphan_blk = (struct f2fs_orphan_block *)page_address(page);
-		memset(orphan_blk, 0, sizeof(*orphan_blk));
-page_exist:
-		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+	if (page) {
+		orphan_blk->blk_addr = cpu_to_le16(index);
+		orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+		orphan_blk->entry_count = cpu_to_le32(nentries);
+		set_page_dirty(page);
+		f2fs_put_page(page, 1);
 	}
-	if (!page)
-		goto end;
 
-	orphan_blk->blk_addr = cpu_to_le16(index);
-	orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
-	orphan_blk->entry_count = cpu_to_le32(nentries);
-	set_page_dirty(page);
-	f2fs_put_page(page, 1);
-end:
-	mutex_unlock(&sbi->orphan_inode_mutex);
+	spin_unlock(&sbi->orphan_inode_lock);
 }
 
 static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -428,7 +433,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
 	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
 
 	/* The second checkpoint pack should start at the next segment */
-	cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+	cp_start_blk_no += ((unsigned long long)1) <<
+				le32_to_cpu(fsb->log_blocks_per_seg);
 	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
 
 	if (cp1 && cp2) {
@@ -465,7 +471,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
 		entry = list_entry(this, struct dir_inode_entry, list);
-		if (entry->inode == inode)
+		if (unlikely(entry->inode == inode))
 			return -EEXIST;
 	}
 	list_add_tail(&new->list, head);
@@ -513,8 +519,8 @@ void add_dirty_dir_inode(struct inode *inode)
 void remove_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	struct list_head *head = &sbi->dir_inode_list;
-	struct list_head *this;
+
+	struct list_head *this, *head;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
@@ -525,6 +531,7 @@ void remove_dirty_dir_inode(struct inode *inode)
 		return;
 	}
 
+	head = &sbi->dir_inode_list;
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
 		entry = list_entry(this, struct dir_inode_entry, list);
@@ -546,11 +553,13 @@ void remove_dirty_dir_inode(struct inode *inode)
 
 struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
-	struct list_head *head = &sbi->dir_inode_list;
-	struct list_head *this;
+
+	struct list_head *this, *head;
 	struct inode *inode = NULL;
 
 	spin_lock(&sbi->dir_inode_lock);
+
+	head = &sbi->dir_inode_list;
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
 		entry = list_entry(this, struct dir_inode_entry, list);
@@ -565,11 +574,13 @@ struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
 
 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
 {
-	struct list_head *head = &sbi->dir_inode_list;
+	struct list_head *head;
 	struct dir_inode_entry *entry;
 	struct inode *inode;
retry:
 	spin_lock(&sbi->dir_inode_lock);
+
+	head = &sbi->dir_inode_list;
 	if (list_empty(head)) {
 		spin_unlock(&sbi->dir_inode_lock);
 		return;
@@ -585,7 +596,7 @@ retry:
 		 * We should submit the bio, since several dentry pages in
 		 * the freeing inode are still under writeback.
 		 */
-		f2fs_submit_bio(sbi, DATA, true);
+		f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	}
 	goto retry;
 }
@@ -760,8 +771,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	/* wait for previously submitted node/meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
 
-	filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
-	filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
+	filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
 
 	/* update user_block_counts */
 	sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -770,7 +781,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	/* Here, we only have one bio having CP pack */
 	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
 
-	if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+	if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
 		clear_prefree_segments(sbi);
 		F2FS_RESET_SB_DIRT(sbi);
 	}
@@ -791,9 +802,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
 
-	f2fs_submit_bio(sbi, DATA, true);
-	f2fs_submit_bio(sbi, NODE, true);
-	f2fs_submit_bio(sbi, META, true);
+	f2fs_submit_merged_bio(sbi, DATA, WRITE);
+	f2fs_submit_merged_bio(sbi, NODE, WRITE);
+	f2fs_submit_merged_bio(sbi, META, WRITE);
 
 	/*
 	 * update checkpoint pack index
@@ -818,20 +829,28 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 void init_orphan_info(struct f2fs_sb_info *sbi)
 {
-	mutex_init(&sbi->orphan_inode_mutex);
+	spin_lock_init(&sbi->orphan_inode_lock);
 	INIT_LIST_HEAD(&sbi->orphan_inode_list);
 	sbi->n_orphans = 0;
+	/*
+	 * considering 512 blocks in a segment, 8 blocks are needed for cp
+	 * and log segment summaries. Remaining blocks are used to keep
+	 * orphan entries; with the limitation of one reserved segment
+	 * for the cp pack, we can have at most 1020*504 orphan entries
+	 */
+	sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
+				* F2FS_ORPHANS_PER_BLOCK;
 }
 
 int __init create_checkpoint_caches(void)
 {
 	orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
 			sizeof(struct orphan_inode_entry), NULL);
-	if (unlikely(!orphan_entry_slab))
+	if (!orphan_entry_slab)
 		return -ENOMEM;
 	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
 			sizeof(struct dir_inode_entry), NULL);
-	if (unlikely(!inode_entry_slab)) {
+	if (!inode_entry_slab) {
 		kmem_cache_destroy(orphan_entry_slab);
 		return -ENOMEM;
 	}
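
The relocated limit computation in init_orphan_info() also corrects the arithmetic in the old acquire_orphan_inode() comment: of the blocks_per_seg blocks in the checkpoint segment, 2 go to the cp pack header/footer and NR_CURSEG_TYPE (6) to the current segment summaries, and the remainder hold orphan entries. A worked instance using the values from the patch's own comment (assuming 4 KiB blocks, where one orphan block carries F2FS_ORPHANS_PER_BLOCK == 1020 inode numbers):

	/* Worked instance of the sbi->max_orphans formula above. */
	unsigned int max_orphans(unsigned int blocks_per_seg)
	{
		/* 2 cp pack blocks + 6 (NR_CURSEG_TYPE) summary blocks */
		return (blocks_per_seg - 2 - 6) * 1020;
	}
	/* 512 blocks per segment: (512 - 8) * 1020 = 514,080 entries */
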
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa3438c571fa..0ae558723506 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -24,6 +24,195 @@
24#include "segment.h" 24#include "segment.h"
25#include <trace/events/f2fs.h> 25#include <trace/events/f2fs.h>
26 26
27static void f2fs_read_end_io(struct bio *bio, int err)
28{
29 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
30 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
31
32 do {
33 struct page *page = bvec->bv_page;
34
35 if (--bvec >= bio->bi_io_vec)
36 prefetchw(&bvec->bv_page->flags);
37
38 if (unlikely(!uptodate)) {
39 ClearPageUptodate(page);
40 SetPageError(page);
41 } else {
42 SetPageUptodate(page);
43 }
44 unlock_page(page);
45 } while (bvec >= bio->bi_io_vec);
46
47 bio_put(bio);
48}
49
50static void f2fs_write_end_io(struct bio *bio, int err)
51{
52 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
53 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
54 struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb);
55
56 do {
57 struct page *page = bvec->bv_page;
58
59 if (--bvec >= bio->bi_io_vec)
60 prefetchw(&bvec->bv_page->flags);
61
62 if (unlikely(!uptodate)) {
63 SetPageError(page);
64 set_bit(AS_EIO, &page->mapping->flags);
65 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
66 sbi->sb->s_flags |= MS_RDONLY;
67 }
68 end_page_writeback(page);
69 dec_page_count(sbi, F2FS_WRITEBACK);
70 } while (bvec >= bio->bi_io_vec);
71
72 if (bio->bi_private)
73 complete(bio->bi_private);
74
75 if (!get_pages(sbi, F2FS_WRITEBACK) &&
76 !list_empty(&sbi->cp_wait.task_list))
77 wake_up(&sbi->cp_wait);
78
79 bio_put(bio);
80}
81
82/*
83 * Low-level block read/write IO operations.
84 */
85static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
86 int npages, bool is_read)
87{
88 struct bio *bio;
89
90 /* No failure on bio allocation */
91 bio = bio_alloc(GFP_NOIO, npages);
92
93 bio->bi_bdev = sbi->sb->s_bdev;
94 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
95 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
96
97 return bio;
98}
99
100static void __submit_merged_bio(struct f2fs_bio_info *io)
101{
102 struct f2fs_io_info *fio = &io->fio;
103 int rw;
104
105 if (!io->bio)
106 return;
107
108 rw = fio->rw;
109
110 if (is_read_io(rw)) {
111 trace_f2fs_submit_read_bio(io->sbi->sb, rw,
112 fio->type, io->bio);
113 submit_bio(rw, io->bio);
114 } else {
115 trace_f2fs_submit_write_bio(io->sbi->sb, rw,
116 fio->type, io->bio);
117 /*
118 * META_FLUSH is only from the checkpoint procedure, and we
119 * should wait this metadata bio for FS consistency.
120 */
121 if (fio->type == META_FLUSH) {
122 DECLARE_COMPLETION_ONSTACK(wait);
123 io->bio->bi_private = &wait;
124 submit_bio(rw, io->bio);
125 wait_for_completion(&wait);
126 } else {
127 submit_bio(rw, io->bio);
128 }
129 }
130
131 io->bio = NULL;
132}
133
134void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
135 enum page_type type, int rw)
136{
137 enum page_type btype = PAGE_TYPE_OF_BIO(type);
138 struct f2fs_bio_info *io;
139
140 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
141
142 mutex_lock(&io->io_mutex);
143
144 /* change META to META_FLUSH in the checkpoint procedure */
145 if (type >= META_FLUSH) {
146 io->fio.type = META_FLUSH;
147 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
148 }
149 __submit_merged_bio(io);
150 mutex_unlock(&io->io_mutex);
151}
152
153/*
154 * Fill the locked page with data located in the block address.
155 * Return unlocked page.
156 */
157int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
158 block_t blk_addr, int rw)
159{
160 struct bio *bio;
161
162 trace_f2fs_submit_page_bio(page, blk_addr, rw);
163
164 /* Allocate a new bio */
165 bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
166
167 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
168 bio_put(bio);
169 f2fs_put_page(page, 1);
170 return -EFAULT;
171 }
172
173 submit_bio(rw, bio);
174 return 0;
175}
176
177void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
178 block_t blk_addr, struct f2fs_io_info *fio)
179{
180 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
181 struct f2fs_bio_info *io;
182 bool is_read = is_read_io(fio->rw);
183
184 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
185
186 verify_block_addr(sbi, blk_addr);
187
188 mutex_lock(&io->io_mutex);
189
190 if (!is_read)
191 inc_page_count(sbi, F2FS_WRITEBACK);
192
193 if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
194 io->fio.rw != fio->rw))
195 __submit_merged_bio(io);
196alloc_new:
197 if (io->bio == NULL) {
198 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
199
200 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
201 io->fio = *fio;
202 }
203
204 if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
205 PAGE_CACHE_SIZE) {
206 __submit_merged_bio(io);
207 goto alloc_new;
208 }
209
210 io->last_block_in_bio = blk_addr;
211
212 mutex_unlock(&io->io_mutex);
213 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
214}
215
27/* 216/*
28 * Lock ordering for the change of data block address: 217 * Lock ordering for the change of data block address:
29 * ->data_page 218 * ->data_page
@@ -37,7 +226,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
37 struct page *node_page = dn->node_page; 226 struct page *node_page = dn->node_page;
38 unsigned int ofs_in_node = dn->ofs_in_node; 227 unsigned int ofs_in_node = dn->ofs_in_node;
39 228
40 f2fs_wait_on_page_writeback(node_page, NODE, false); 229 f2fs_wait_on_page_writeback(node_page, NODE);
41 230
42 rn = F2FS_NODE(node_page); 231 rn = F2FS_NODE(node_page);
43 232
@@ -51,19 +240,39 @@ int reserve_new_block(struct dnode_of_data *dn)
51{ 240{
52 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 241 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
53 242
54 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 243 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
55 return -EPERM; 244 return -EPERM;
56 if (!inc_valid_block_count(sbi, dn->inode, 1)) 245 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
57 return -ENOSPC; 246 return -ENOSPC;
58 247
59 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); 248 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
60 249
61 __set_data_blkaddr(dn, NEW_ADDR); 250 __set_data_blkaddr(dn, NEW_ADDR);
62 dn->data_blkaddr = NEW_ADDR; 251 dn->data_blkaddr = NEW_ADDR;
252 mark_inode_dirty(dn->inode);
63 sync_inode_page(dn); 253 sync_inode_page(dn);
64 return 0; 254 return 0;
65} 255}
66 256
257int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
258{
259 bool need_put = dn->inode_page ? false : true;
260 int err;
261
262 /* if inode_page exists, index should be zero */
263 f2fs_bug_on(!need_put && index);
264
265 err = get_dnode_of_data(dn, index, ALLOC_NODE);
266 if (err)
267 return err;
268
269 if (dn->data_blkaddr == NULL_ADDR)
270 err = reserve_new_block(dn);
271 if (err || need_put)
272 f2fs_put_dnode(dn);
273 return err;
274}
275
67static int check_extent_cache(struct inode *inode, pgoff_t pgofs, 276static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
68 struct buffer_head *bh_result) 277 struct buffer_head *bh_result)
69{ 278{
@@ -71,6 +280,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
71 pgoff_t start_fofs, end_fofs; 280 pgoff_t start_fofs, end_fofs;
72 block_t start_blkaddr; 281 block_t start_blkaddr;
73 282
283 if (is_inode_flag_set(fi, FI_NO_EXTENT))
284 return 0;
285
74 read_lock(&fi->ext.ext_lock); 286 read_lock(&fi->ext.ext_lock);
75 if (fi->ext.len == 0) { 287 if (fi->ext.len == 0) {
76 read_unlock(&fi->ext.ext_lock); 288 read_unlock(&fi->ext.ext_lock);
@@ -109,6 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
109 struct f2fs_inode_info *fi = F2FS_I(dn->inode); 321 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
110 pgoff_t fofs, start_fofs, end_fofs; 322 pgoff_t fofs, start_fofs, end_fofs;
111 block_t start_blkaddr, end_blkaddr; 323 block_t start_blkaddr, end_blkaddr;
324 int need_update = true;
112 325
113 f2fs_bug_on(blk_addr == NEW_ADDR); 326 f2fs_bug_on(blk_addr == NEW_ADDR);
114 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 327 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -117,6 +330,9 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
117 /* Update the page address in the parent node */ 330 /* Update the page address in the parent node */
118 __set_data_blkaddr(dn, blk_addr); 331 __set_data_blkaddr(dn, blk_addr);
119 332
333 if (is_inode_flag_set(fi, FI_NO_EXTENT))
334 return;
335
120 write_lock(&fi->ext.ext_lock); 336 write_lock(&fi->ext.ext_lock);
121 337
122 start_fofs = fi->ext.fofs; 338 start_fofs = fi->ext.fofs;
@@ -163,14 +379,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
163 fofs - start_fofs + 1; 379 fofs - start_fofs + 1;
164 fi->ext.len -= fofs - start_fofs + 1; 380 fi->ext.len -= fofs - start_fofs + 1;
165 } 381 }
166 goto end_update; 382 } else {
383 need_update = false;
167 } 384 }
168 write_unlock(&fi->ext.ext_lock);
169 return;
170 385
386 /* Finally, if the extent is very fragmented, let's drop the cache. */
387 if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
388 fi->ext.len = 0;
389 set_inode_flag(fi, FI_NO_EXTENT);
390 need_update = true;
391 }
171end_update: 392end_update:
172 write_unlock(&fi->ext.ext_lock); 393 write_unlock(&fi->ext.ext_lock);
173 sync_inode_page(dn); 394 if (need_update)
395 sync_inode_page(dn);
396 return;
174} 397}
175 398
176struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 399struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
@@ -196,7 +419,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
196 return ERR_PTR(-ENOENT); 419 return ERR_PTR(-ENOENT);
197 420
198 /* By fallocate(), there is no cached page, but with NEW_ADDR */ 421 /* By fallocate(), there is no cached page, but with NEW_ADDR */
199 if (dn.data_blkaddr == NEW_ADDR) 422 if (unlikely(dn.data_blkaddr == NEW_ADDR))
200 return ERR_PTR(-EINVAL); 423 return ERR_PTR(-EINVAL);
201 424
202 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); 425 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
@@ -208,11 +431,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
208 return page; 431 return page;
209 } 432 }
210 433
211 err = f2fs_readpage(sbi, page, dn.data_blkaddr, 434 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
212 sync ? READ_SYNC : READA); 435 sync ? READ_SYNC : READA);
436 if (err)
437 return ERR_PTR(err);
438
213 if (sync) { 439 if (sync) {
214 wait_on_page_locked(page); 440 wait_on_page_locked(page);
215 if (!PageUptodate(page)) { 441 if (unlikely(!PageUptodate(page))) {
216 f2fs_put_page(page, 0); 442 f2fs_put_page(page, 0);
217 return ERR_PTR(-EIO); 443 return ERR_PTR(-EIO);
218 } 444 }
@@ -246,7 +472,7 @@ repeat:
246 } 472 }
247 f2fs_put_dnode(&dn); 473 f2fs_put_dnode(&dn);
248 474
249 if (dn.data_blkaddr == NULL_ADDR) { 475 if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
250 f2fs_put_page(page, 1); 476 f2fs_put_page(page, 1);
251 return ERR_PTR(-ENOENT); 477 return ERR_PTR(-ENOENT);
252 } 478 }
@@ -266,16 +492,16 @@ repeat:
266 return page; 492 return page;
267 } 493 }
268 494
269 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 495 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
270 if (err) 496 if (err)
271 return ERR_PTR(err); 497 return ERR_PTR(err);
272 498
273 lock_page(page); 499 lock_page(page);
274 if (!PageUptodate(page)) { 500 if (unlikely(!PageUptodate(page))) {
275 f2fs_put_page(page, 1); 501 f2fs_put_page(page, 1);
276 return ERR_PTR(-EIO); 502 return ERR_PTR(-EIO);
277 } 503 }
278 if (page->mapping != mapping) { 504 if (unlikely(page->mapping != mapping)) {
279 f2fs_put_page(page, 1); 505 f2fs_put_page(page, 1);
280 goto repeat; 506 goto repeat;
281 } 507 }
@@ -286,12 +512,12 @@ repeat:
286 * Caller ensures that this data page is never allocated. 512 * Caller ensures that this data page is never allocated.
287 * A new zero-filled data page is allocated in the page cache. 513 * A new zero-filled data page is allocated in the page cache.
288 * 514 *
289 * Also, caller should grab and release a mutex by calling mutex_lock_op() and 515 * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
290 * mutex_unlock_op(). 516 * f2fs_unlock_op().
291 * Note that, npage is set only by make_empty_dir. 517 * Note that, ipage is set only by make_empty_dir.
292 */ 518 */
293struct page *get_new_data_page(struct inode *inode, 519struct page *get_new_data_page(struct inode *inode,
294 struct page *npage, pgoff_t index, bool new_i_size) 520 struct page *ipage, pgoff_t index, bool new_i_size)
295{ 521{
296 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 522 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
297 struct address_space *mapping = inode->i_mapping; 523 struct address_space *mapping = inode->i_mapping;
@@ -299,24 +525,16 @@ struct page *get_new_data_page(struct inode *inode,
299 struct dnode_of_data dn; 525 struct dnode_of_data dn;
300 int err; 526 int err;
301 527
302 set_new_dnode(&dn, inode, npage, npage, 0); 528 set_new_dnode(&dn, inode, ipage, NULL, 0);
303 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 529 err = f2fs_reserve_block(&dn, index);
304 if (err) 530 if (err)
305 return ERR_PTR(err); 531 return ERR_PTR(err);
306
307 if (dn.data_blkaddr == NULL_ADDR) {
308 if (reserve_new_block(&dn)) {
309 if (!npage)
310 f2fs_put_dnode(&dn);
311 return ERR_PTR(-ENOSPC);
312 }
313 }
314 if (!npage)
315 f2fs_put_dnode(&dn);
316repeat: 532repeat:
317 page = grab_cache_page(mapping, index); 533 page = grab_cache_page(mapping, index);
318 if (!page) 534 if (!page) {
319 return ERR_PTR(-ENOMEM); 535 err = -ENOMEM;
536 goto put_err;
537 }
320 538
321 if (PageUptodate(page)) 539 if (PageUptodate(page))
322 return page; 540 return page;
@@ -325,15 +543,18 @@ repeat:
325 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 543 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
326 SetPageUptodate(page); 544 SetPageUptodate(page);
327 } else { 545 } else {
328 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 546 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
547 READ_SYNC);
329 if (err) 548 if (err)
330 return ERR_PTR(err); 549 goto put_err;
550
331 lock_page(page); 551 lock_page(page);
332 if (!PageUptodate(page)) { 552 if (unlikely(!PageUptodate(page))) {
333 f2fs_put_page(page, 1); 553 f2fs_put_page(page, 1);
334 return ERR_PTR(-EIO); 554 err = -EIO;
555 goto put_err;
335 } 556 }
336 if (page->mapping != mapping) { 557 if (unlikely(page->mapping != mapping)) {
337 f2fs_put_page(page, 1); 558 f2fs_put_page(page, 1);
338 goto repeat; 559 goto repeat;
339 } 560 }
@@ -344,140 +565,187 @@ repeat:
344 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); 565 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
345 /* Only the directory inode sets new_i_size */ 566 /* Only the directory inode sets new_i_size */
346 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); 567 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
347 mark_inode_dirty_sync(inode);
348 } 568 }
349 return page; 569 return page;
350}
351
352static void read_end_io(struct bio *bio, int err)
353{
354 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
355 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
356 570
357 do { 571put_err:
358 struct page *page = bvec->bv_page; 572 f2fs_put_dnode(&dn);
359 573 return ERR_PTR(err);
360 if (--bvec >= bio->bi_io_vec)
361 prefetchw(&bvec->bv_page->flags);
362
363 if (uptodate) {
364 SetPageUptodate(page);
365 } else {
366 ClearPageUptodate(page);
367 SetPageError(page);
368 }
369 unlock_page(page);
370 } while (bvec >= bio->bi_io_vec);
371 bio_put(bio);
372} 574}
373 575
374/* 576static int __allocate_data_block(struct dnode_of_data *dn)
375 * Fill the locked page with data located in the block address.
376 * Return unlocked page.
377 */
378int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
379 block_t blk_addr, int type)
380{ 577{
381 struct block_device *bdev = sbi->sb->s_bdev; 578 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
382 struct bio *bio; 579 struct f2fs_summary sum;
580 block_t new_blkaddr;
581 struct node_info ni;
582 int type;
383 583
384 trace_f2fs_readpage(page, blk_addr, type); 584 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
585 return -EPERM;
586 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
587 return -ENOSPC;
385 588
386 down_read(&sbi->bio_sem); 589 __set_data_blkaddr(dn, NEW_ADDR);
590 dn->data_blkaddr = NEW_ADDR;
387 591
388 /* Allocate a new bio */ 592 get_node_info(sbi, dn->nid, &ni);
389 bio = f2fs_bio_alloc(bdev, 1); 593 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
390 594
391 /* Initialize the bio */ 595 type = CURSEG_WARM_DATA;
392 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
393 bio->bi_end_io = read_end_io;
394 596
395 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 597 allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
396 bio_put(bio);
397 up_read(&sbi->bio_sem);
398 f2fs_put_page(page, 1);
399 return -EFAULT;
400 }
401 598
402 submit_bio(type, bio); 599 /* direct IO doesn't use extent cache to maximize the performance */
403 up_read(&sbi->bio_sem); 600 set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
601 update_extent_cache(new_blkaddr, dn);
602 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
603
604 dn->data_blkaddr = new_blkaddr;
404 return 0; 605 return 0;
405} 606}
406 607
407/* 608/*
408 * This function should be used by the data read flow only where it 609 * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
409 * does not check the "create" flag that indicates block allocation. 610 * If original data blocks are allocated, then give them to blockdev.
410 * The reason for this special functionality is to exploit VFS readahead 611 * Otherwise,
411 * mechanism. 612 * a. preallocate requested block addresses
613 * b. do not use extent cache for better performance
614 * c. give the block addresses to blockdev
412 */ 615 */
413static int get_data_block_ro(struct inode *inode, sector_t iblock, 616static int get_data_block(struct inode *inode, sector_t iblock,
414 struct buffer_head *bh_result, int create) 617 struct buffer_head *bh_result, int create)
415{ 618{
619 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
416 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 620 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
417 unsigned maxblocks = bh_result->b_size >> blkbits; 621 unsigned maxblocks = bh_result->b_size >> blkbits;
418 struct dnode_of_data dn; 622 struct dnode_of_data dn;
419 pgoff_t pgofs; 623 int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
420 int err; 624 pgoff_t pgofs, end_offset;
625 int err = 0, ofs = 1;
626 bool allocated = false;
421 627
422 /* Get the page offset from the block offset(iblock) */ 628 /* Get the page offset from the block offset(iblock) */
423 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); 629 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
424 630
425 if (check_extent_cache(inode, pgofs, bh_result)) { 631 if (check_extent_cache(inode, pgofs, bh_result))
426 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 632 goto out;
427 return 0; 633
428 } 634 if (create)
635 f2fs_lock_op(sbi);
429 636
430 /* When reading holes, we need its node page */ 637 /* When reading holes, we need its node page */
431 set_new_dnode(&dn, inode, NULL, NULL, 0); 638 set_new_dnode(&dn, inode, NULL, NULL, 0);
432 err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); 639 err = get_dnode_of_data(&dn, pgofs, mode);
433 if (err) { 640 if (err) {
434 trace_f2fs_get_data_block(inode, iblock, bh_result, err); 641 if (err == -ENOENT)
435 return (err == -ENOENT) ? 0 : err; 642 err = 0;
643 goto unlock_out;
644 }
645 if (dn.data_blkaddr == NEW_ADDR)
646 goto put_out;
647
648 if (dn.data_blkaddr != NULL_ADDR) {
649 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
650 } else if (create) {
651 err = __allocate_data_block(&dn);
652 if (err)
653 goto put_out;
654 allocated = true;
655 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
656 } else {
657 goto put_out;
436 } 658 }
437 659
438 /* It does not support data allocation */ 660 end_offset = IS_INODE(dn.node_page) ?
439 f2fs_bug_on(create); 661 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
662 bh_result->b_size = (((size_t)1) << blkbits);
663 dn.ofs_in_node++;
664 pgofs++;
665
666get_next:
667 if (dn.ofs_in_node >= end_offset) {
668 if (allocated)
669 sync_inode_page(&dn);
670 allocated = false;
671 f2fs_put_dnode(&dn);
440 672
441 if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { 673 set_new_dnode(&dn, inode, NULL, NULL, 0);
442 int i; 674 err = get_dnode_of_data(&dn, pgofs, mode);
443 unsigned int end_offset; 675 if (err) {
676 if (err == -ENOENT)
677 err = 0;
678 goto unlock_out;
679 }
680 if (dn.data_blkaddr == NEW_ADDR)
681 goto put_out;
444 682
445 end_offset = IS_INODE(dn.node_page) ? 683 end_offset = IS_INODE(dn.node_page) ?
446 ADDRS_PER_INODE(F2FS_I(inode)) : 684 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
447 ADDRS_PER_BLOCK; 685 }
448
449 clear_buffer_new(bh_result);
450 686
687 if (maxblocks > (bh_result->b_size >> blkbits)) {
688 block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
689 if (blkaddr == NULL_ADDR && create) {
690 err = __allocate_data_block(&dn);
691 if (err)
692 goto sync_out;
693 allocated = true;
694 blkaddr = dn.data_blkaddr;
695 }
451 /* Give more consecutive addresses for the read ahead */ 696 /* Give more consecutive addresses for the read ahead */
452 for (i = 0; i < end_offset - dn.ofs_in_node; i++) 697 if (blkaddr == (bh_result->b_blocknr + ofs)) {
453 if (((datablock_addr(dn.node_page, 698 ofs++;
454 dn.ofs_in_node + i)) 699 dn.ofs_in_node++;
455 != (dn.data_blkaddr + i)) || maxblocks == i) 700 pgofs++;
456 break; 701 bh_result->b_size += (((size_t)1) << blkbits);
457 map_bh(bh_result, inode->i_sb, dn.data_blkaddr); 702 goto get_next;
458 bh_result->b_size = (i << blkbits); 703 }
459 } 704 }
705sync_out:
706 if (allocated)
707 sync_inode_page(&dn);
708put_out:
460 f2fs_put_dnode(&dn); 709 f2fs_put_dnode(&dn);
461 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 710unlock_out:
462 return 0; 711 if (create)
712 f2fs_unlock_op(sbi);
713out:
714 trace_f2fs_get_data_block(inode, iblock, bh_result, err);
715 return err;
463} 716}
464 717
465static int f2fs_read_data_page(struct file *file, struct page *page) 718static int f2fs_read_data_page(struct file *file, struct page *page)
466{ 719{
467 return mpage_readpage(page, get_data_block_ro); 720 struct inode *inode = page->mapping->host;
721 int ret;
722
723 /* If the file has inline data, try to read it directlly */
724 if (f2fs_has_inline_data(inode))
725 ret = f2fs_read_inline_data(inode, page);
726 else
727 ret = mpage_readpage(page, get_data_block);
728
729 return ret;
468} 730}
469 731
470static int f2fs_read_data_pages(struct file *file, 732static int f2fs_read_data_pages(struct file *file,
471 struct address_space *mapping, 733 struct address_space *mapping,
472 struct list_head *pages, unsigned nr_pages) 734 struct list_head *pages, unsigned nr_pages)
473{ 735{
474 return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); 736 struct inode *inode = file->f_mapping->host;
737
738 /* If the file has inline data, skip readpages */
739 if (f2fs_has_inline_data(inode))
740 return 0;
741
742 return mpage_readpages(mapping, pages, nr_pages, get_data_block);
475} 743}
476 744
477int do_write_data_page(struct page *page) 745int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
478{ 746{
479 struct inode *inode = page->mapping->host; 747 struct inode *inode = page->mapping->host;
480 block_t old_blk_addr, new_blk_addr; 748 block_t old_blkaddr, new_blkaddr;
481 struct dnode_of_data dn; 749 struct dnode_of_data dn;
482 int err = 0; 750 int err = 0;
483 751
@@ -486,10 +754,10 @@ int do_write_data_page(struct page *page)
486 if (err) 754 if (err)
487 return err; 755 return err;
488 756
489 old_blk_addr = dn.data_blkaddr; 757 old_blkaddr = dn.data_blkaddr;
490 758
491 /* This page is already truncated */ 759 /* This page is already truncated */
492 if (old_blk_addr == NULL_ADDR) 760 if (old_blkaddr == NULL_ADDR)
493 goto out_writepage; 761 goto out_writepage;
494 762
495 set_page_writeback(page); 763 set_page_writeback(page);
@@ -498,15 +766,13 @@ int do_write_data_page(struct page *page)
498 * If current allocation needs SSR, 766 * If current allocation needs SSR,
499 * it had better in-place writes for updated data. 767 * it had better in-place writes for updated data.
500 */ 768 */
501 if (unlikely(old_blk_addr != NEW_ADDR && 769 if (unlikely(old_blkaddr != NEW_ADDR &&
502 !is_cold_data(page) && 770 !is_cold_data(page) &&
503 need_inplace_update(inode))) { 771 need_inplace_update(inode))) {
504 rewrite_data_page(F2FS_SB(inode->i_sb), page, 772 rewrite_data_page(page, old_blkaddr, fio);
505 old_blk_addr);
506 } else { 773 } else {
507 write_data_page(inode, page, &dn, 774 write_data_page(page, &dn, &new_blkaddr, fio);
508 old_blk_addr, &new_blk_addr); 775 update_extent_cache(new_blkaddr, &dn);
509 update_extent_cache(new_blk_addr, &dn);
510 } 776 }
511out_writepage: 777out_writepage:
512 f2fs_put_dnode(&dn); 778 f2fs_put_dnode(&dn);
@@ -521,9 +787,13 @@ static int f2fs_write_data_page(struct page *page,
521 loff_t i_size = i_size_read(inode); 787 loff_t i_size = i_size_read(inode);
522 const pgoff_t end_index = ((unsigned long long) i_size) 788 const pgoff_t end_index = ((unsigned long long) i_size)
523 >> PAGE_CACHE_SHIFT; 789 >> PAGE_CACHE_SHIFT;
524 unsigned offset; 790 unsigned offset = 0;
525 bool need_balance_fs = false; 791 bool need_balance_fs = false;
526 int err = 0; 792 int err = 0;
793 struct f2fs_io_info fio = {
794 .type = DATA,
795 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
796 };
527 797
528 if (page->index < end_index) 798 if (page->index < end_index)
529 goto write; 799 goto write;
@@ -543,7 +813,7 @@ static int f2fs_write_data_page(struct page *page,
543 813
544 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 814 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
545write: 815write:
546 if (sbi->por_doing) { 816 if (unlikely(sbi->por_doing)) {
547 err = AOP_WRITEPAGE_ACTIVATE; 817 err = AOP_WRITEPAGE_ACTIVATE;
548 goto redirty_out; 818 goto redirty_out;
549 } 819 }
@@ -552,10 +822,18 @@ write:
552 if (S_ISDIR(inode->i_mode)) { 822 if (S_ISDIR(inode->i_mode)) {
553 dec_page_count(sbi, F2FS_DIRTY_DENTS); 823 dec_page_count(sbi, F2FS_DIRTY_DENTS);
554 inode_dec_dirty_dents(inode); 824 inode_dec_dirty_dents(inode);
555 err = do_write_data_page(page); 825 err = do_write_data_page(page, &fio);
556 } else { 826 } else {
557 f2fs_lock_op(sbi); 827 f2fs_lock_op(sbi);
558 err = do_write_data_page(page); 828
829 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
830 err = f2fs_write_inline_data(inode, page, offset);
831 f2fs_unlock_op(sbi);
832 goto out;
833 } else {
834 err = do_write_data_page(page, &fio);
835 }
836
559 f2fs_unlock_op(sbi); 837 f2fs_unlock_op(sbi);
560 need_balance_fs = true; 838 need_balance_fs = true;
561 } 839 }
@@ -564,8 +842,10 @@ write:
564 else if (err) 842 else if (err)
565 goto redirty_out; 843 goto redirty_out;
566 844
567 if (wbc->for_reclaim) 845 if (wbc->for_reclaim) {
568 f2fs_submit_bio(sbi, DATA, true); 846 f2fs_submit_merged_bio(sbi, DATA, WRITE);
847 need_balance_fs = false;
848 }
569 849
570 clear_cold_data(page); 850 clear_cold_data(page);
571out: 851out:
@@ -617,7 +897,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
617 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); 897 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
618 if (locked) 898 if (locked)
619 mutex_unlock(&sbi->writepages); 899 mutex_unlock(&sbi->writepages);
620 f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); 900
901 f2fs_submit_merged_bio(sbi, DATA, WRITE);
621 902
622 remove_dirty_dir_inode(inode); 903 remove_dirty_dir_inode(inode);
623 904
@@ -638,27 +919,28 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
638 919
639 f2fs_balance_fs(sbi); 920 f2fs_balance_fs(sbi);
640repeat: 921repeat:
922 err = f2fs_convert_inline_data(inode, pos + len);
923 if (err)
924 return err;
925
641 page = grab_cache_page_write_begin(mapping, index, flags); 926 page = grab_cache_page_write_begin(mapping, index, flags);
642 if (!page) 927 if (!page)
643 return -ENOMEM; 928 return -ENOMEM;
644 *pagep = page; 929 *pagep = page;
645 930
646 f2fs_lock_op(sbi); 931 if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
932 goto inline_data;
647 933
934 f2fs_lock_op(sbi);
648 set_new_dnode(&dn, inode, NULL, NULL, 0); 935 set_new_dnode(&dn, inode, NULL, NULL, 0);
649 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 936 err = f2fs_reserve_block(&dn, index);
650 if (err)
651 goto err;
652
653 if (dn.data_blkaddr == NULL_ADDR)
654 err = reserve_new_block(&dn);
655
656 f2fs_put_dnode(&dn);
657 if (err)
658 goto err;
659
660 f2fs_unlock_op(sbi); 937 f2fs_unlock_op(sbi);
661 938
939 if (err) {
940 f2fs_put_page(page, 1);
941 return err;
942 }
943inline_data:
662 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) 944 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
663 return 0; 945 return 0;
664 946
@@ -674,15 +956,19 @@ repeat:
674 if (dn.data_blkaddr == NEW_ADDR) { 956 if (dn.data_blkaddr == NEW_ADDR) {
675 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 957 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
676 } else { 958 } else {
677 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 959 if (f2fs_has_inline_data(inode))
960 err = f2fs_read_inline_data(inode, page);
961 else
962 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
963 READ_SYNC);
678 if (err) 964 if (err)
679 return err; 965 return err;
680 lock_page(page); 966 lock_page(page);
681 if (!PageUptodate(page)) { 967 if (unlikely(!PageUptodate(page))) {
682 f2fs_put_page(page, 1); 968 f2fs_put_page(page, 1);
683 return -EIO; 969 return -EIO;
684 } 970 }
685 if (page->mapping != mapping) { 971 if (unlikely(page->mapping != mapping)) {
686 f2fs_put_page(page, 1); 972 f2fs_put_page(page, 1);
687 goto repeat; 973 goto repeat;
688 } 974 }
@@ -691,11 +977,6 @@ out:
691 SetPageUptodate(page); 977 SetPageUptodate(page);
692 clear_cold_data(page); 978 clear_cold_data(page);
693 return 0; 979 return 0;
694
695err:
696 f2fs_unlock_op(sbi);
697 f2fs_put_page(page, 1);
698 return err;
699} 980}
700 981
701static int f2fs_write_end(struct file *file, 982static int f2fs_write_end(struct file *file,
@@ -714,23 +995,43 @@ static int f2fs_write_end(struct file *file,
714 update_inode_page(inode); 995 update_inode_page(inode);
715 } 996 }
716 997
717 unlock_page(page); 998 f2fs_put_page(page, 1);
718 page_cache_release(page);
719 return copied; 999 return copied;
720} 1000}
721 1001
1002static int check_direct_IO(struct inode *inode, int rw,
1003 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
1004{
1005 unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
1006 int i;
1007
1008 if (rw == READ)
1009 return 0;
1010
1011 if (offset & blocksize_mask)
1012 return -EINVAL;
1013
1014 for (i = 0; i < nr_segs; i++)
1015 if (iov[i].iov_len & blocksize_mask)
1016 return -EINVAL;
1017 return 0;
1018}
1019
722static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, 1020static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
723 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 1021 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
724{ 1022{
725 struct file *file = iocb->ki_filp; 1023 struct file *file = iocb->ki_filp;
726 struct inode *inode = file->f_mapping->host; 1024 struct inode *inode = file->f_mapping->host;
727 1025
728 if (rw == WRITE) 1026 /* Let buffer I/O handle the inline data case. */
1027 if (f2fs_has_inline_data(inode))
1028 return 0;
1029
1030 if (check_direct_IO(inode, rw, iov, offset, nr_segs))
729 return 0; 1031 return 0;
730 1032
731 /* Needs synchronization with the cleaner */
732 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 1033 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
733 get_data_block_ro); 1034 get_data_block);
734} 1035}
735 1036
736static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, 1037static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
@@ -759,6 +1060,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
759 trace_f2fs_set_page_dirty(page, DATA); 1060 trace_f2fs_set_page_dirty(page, DATA);
760 1061
761 SetPageUptodate(page); 1062 SetPageUptodate(page);
1063 mark_inode_dirty(inode);
1064
762 if (!PageDirty(page)) { 1065 if (!PageDirty(page)) {
763 __set_page_dirty_nobuffers(page); 1066 __set_page_dirty_nobuffers(page);
764 set_dirty_dir_page(inode, page); 1067 set_dirty_dir_page(inode, page);
@@ -769,7 +1072,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
769 1072
770static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) 1073static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
771{ 1074{
772 return generic_block_bmap(mapping, block, get_data_block_ro); 1075 return generic_block_bmap(mapping, block, get_data_block);
773} 1076}
774 1077
775const struct address_space_operations f2fs_dblock_aops = { 1078const struct address_space_operations f2fs_dblock_aops = {
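
Annotation: the new check_direct_IO() helper gates direct writes on block alignment — both the starting offset and every iovec length must be a multiple of the filesystem block size, or f2fs_direct_IO() returns 0 and the request falls back to buffered I/O. Reads are exempt (check_direct_IO() returns 0 immediately for rw == READ). A minimal user-space sketch of the same test, assuming only that the block size is a power of two:

	#include <sys/types.h>
	#include <sys/uio.h>
	#include <stdbool.h>

	/* True when a write described by (offset, iov[]) satisfies the rule
	 * check_direct_IO() enforces: offset and all segment lengths are
	 * multiples of blocksize (a power of two, so mask = size - 1). */
	static bool dio_write_aligned(unsigned int blocksize,
				      const struct iovec *iov,
				      off_t offset, unsigned long nr_segs)
	{
		unsigned int mask = blocksize - 1;
		unsigned long i;

		if (offset & mask)
			return false;
		for (i = 0; i < nr_segs; i++)
			if (iov[i].iov_len & mask)
				return false;
		return true;
	}
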
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a84b0a8e6854..3de9d20d0c14 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -24,7 +24,7 @@
24#include "gc.h" 24#include "gc.h"
25 25
26static LIST_HEAD(f2fs_stat_list); 26static LIST_HEAD(f2fs_stat_list);
27static struct dentry *debugfs_root; 27static struct dentry *f2fs_debugfs_root;
28static DEFINE_MUTEX(f2fs_stat_mutex); 28static DEFINE_MUTEX(f2fs_stat_mutex);
29 29
30static void update_general_status(struct f2fs_sb_info *sbi) 30static void update_general_status(struct f2fs_sb_info *sbi)
@@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
45 si->valid_count = valid_user_blocks(sbi); 45 si->valid_count = valid_user_blocks(sbi);
46 si->valid_node_count = valid_node_count(sbi); 46 si->valid_node_count = valid_node_count(sbi);
47 si->valid_inode_count = valid_inode_count(sbi); 47 si->valid_inode_count = valid_inode_count(sbi);
48 si->inline_inode = sbi->inline_inode;
48 si->utilization = utilization(sbi); 49 si->utilization = utilization(sbi);
49 50
50 si->free_segs = free_segments(sbi); 51 si->free_segs = free_segments(sbi);
51 si->free_secs = free_sections(sbi); 52 si->free_secs = free_sections(sbi);
52 si->prefree_count = prefree_segments(sbi); 53 si->prefree_count = prefree_segments(sbi);
53 si->dirty_count = dirty_segments(sbi); 54 si->dirty_count = dirty_segments(sbi);
54 si->node_pages = sbi->node_inode->i_mapping->nrpages; 55 si->node_pages = NODE_MAPPING(sbi)->nrpages;
55 si->meta_pages = sbi->meta_inode->i_mapping->nrpages; 56 si->meta_pages = META_MAPPING(sbi)->nrpages;
56 si->nats = NM_I(sbi)->nat_cnt; 57 si->nats = NM_I(sbi)->nat_cnt;
57 si->sits = SIT_I(sbi)->dirty_sentries; 58 si->sits = SIT_I(sbi)->dirty_sentries;
58 si->fnids = NM_I(sbi)->fcnt; 59 si->fnids = NM_I(sbi)->fcnt;
@@ -165,9 +166,9 @@ get_cache:
165 /* free nids */ 166 /* free nids */
166 si->cache_mem = NM_I(sbi)->fcnt; 167 si->cache_mem = NM_I(sbi)->fcnt;
167 si->cache_mem += NM_I(sbi)->nat_cnt; 168 si->cache_mem += NM_I(sbi)->nat_cnt;
168 npages = sbi->node_inode->i_mapping->nrpages; 169 npages = NODE_MAPPING(sbi)->nrpages;
169 si->cache_mem += npages << PAGE_CACHE_SHIFT; 170 si->cache_mem += npages << PAGE_CACHE_SHIFT;
170 npages = sbi->meta_inode->i_mapping->nrpages; 171 npages = META_MAPPING(sbi)->nrpages;
171 si->cache_mem += npages << PAGE_CACHE_SHIFT; 172 si->cache_mem += npages << PAGE_CACHE_SHIFT;
172 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); 173 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
173 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); 174 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
@@ -200,6 +201,8 @@ static int stat_show(struct seq_file *s, void *v)
200 seq_printf(s, "Other: %u)\n - Data: %u\n", 201 seq_printf(s, "Other: %u)\n - Data: %u\n",
201 si->valid_node_count - si->valid_inode_count, 202 si->valid_node_count - si->valid_inode_count,
202 si->valid_count - si->valid_node_count); 203 si->valid_count - si->valid_node_count);
204 seq_printf(s, " - Inline_data Inode: %u\n",
205 si->inline_inode);
203 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", 206 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
204 si->main_area_segs, si->main_area_sections, 207 si->main_area_segs, si->main_area_sections,
205 si->main_area_zones); 208 si->main_area_zones);
@@ -242,14 +245,14 @@ static int stat_show(struct seq_file *s, void *v)
242 seq_printf(s, " - node blocks : %d\n", si->node_blks); 245 seq_printf(s, " - node blocks : %d\n", si->node_blks);
243 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", 246 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
244 si->hit_ext, si->total_ext); 247 si->hit_ext, si->total_ext);
245 seq_printf(s, "\nBalancing F2FS Async:\n"); 248 seq_puts(s, "\nBalancing F2FS Async:\n");
246 seq_printf(s, " - nodes %4d in %4d\n", 249 seq_printf(s, " - nodes: %4d in %4d\n",
247 si->ndirty_node, si->node_pages); 250 si->ndirty_node, si->node_pages);
248 seq_printf(s, " - dents %4d in dirs:%4d\n", 251 seq_printf(s, " - dents: %4d in dirs:%4d\n",
249 si->ndirty_dent, si->ndirty_dirs); 252 si->ndirty_dent, si->ndirty_dirs);
250 seq_printf(s, " - meta %4d in %4d\n", 253 seq_printf(s, " - meta: %4d in %4d\n",
251 si->ndirty_meta, si->meta_pages); 254 si->ndirty_meta, si->meta_pages);
252 seq_printf(s, " - NATs %5d > %lu\n", 255 seq_printf(s, " - NATs: %5d > %lu\n",
253 si->nats, NM_WOUT_THRESHOLD); 256 si->nats, NM_WOUT_THRESHOLD);
254 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", 257 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
255 si->sits, si->fnids); 258 si->sits, si->fnids);
@@ -340,14 +343,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
340 343
341void __init f2fs_create_root_stats(void) 344void __init f2fs_create_root_stats(void)
342{ 345{
343 debugfs_root = debugfs_create_dir("f2fs", NULL); 346 struct dentry *file;
344 if (debugfs_root) 347
345 debugfs_create_file("status", S_IRUGO, debugfs_root, 348 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
346 NULL, &stat_fops); 349 if (!f2fs_debugfs_root)
350 goto bail;
351
352 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
353 NULL, &stat_fops);
354 if (!file)
355 goto free_debugfs_dir;
356
357 return;
358
359free_debugfs_dir:
360 debugfs_remove(f2fs_debugfs_root);
361
362bail:
363 f2fs_debugfs_root = NULL;
364 return;
347} 365}
348 366
349void f2fs_destroy_root_stats(void) 367void f2fs_destroy_root_stats(void)
350{ 368{
351 debugfs_remove_recursive(debugfs_root); 369 if (!f2fs_debugfs_root)
352 debugfs_root = NULL; 370 return;
371
372 debugfs_remove_recursive(f2fs_debugfs_root);
373 f2fs_debugfs_root = NULL;
353} 374}
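
Annotation: the debug.c rework replaces unconditional debugfs creation with a guarded pattern — keep the root dentry pointer NULL on any failure, so f2fs_destroy_root_stats() can bail out early rather than call debugfs_remove_recursive() on a stale pointer. The same pattern in isolation (stat_fops stands in for whatever file_operations back the status file; in this kernel era debugfs_create_file() returns NULL on failure):

	#include <linux/debugfs.h>

	static struct dentry *demo_debugfs_root;

	static void demo_create_root_stats(void)
	{
		struct dentry *file;

		demo_debugfs_root = debugfs_create_dir("demo", NULL);
		if (!demo_debugfs_root)
			return;

		file = debugfs_create_file("status", S_IRUGO,
					   demo_debugfs_root, NULL, &stat_fops);
		if (!file) {
			/* tear down the half-built hierarchy */
			debugfs_remove(demo_debugfs_root);
			demo_debugfs_root = NULL;
		}
	}
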
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 594fc1bb64ef..2b7c255bcbdf 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -190,9 +190,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
190 unsigned int max_depth; 190 unsigned int max_depth;
191 unsigned int level; 191 unsigned int level;
192 192
193 if (namelen > F2FS_NAME_LEN)
194 return NULL;
195
196 if (npages == 0) 193 if (npages == 0)
197 return NULL; 194 return NULL;
198 195
@@ -259,20 +256,17 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
259 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 256 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
260 mark_inode_dirty(dir); 257 mark_inode_dirty(dir);
261 258
262 /* update parent inode number before releasing dentry page */
263 F2FS_I(inode)->i_pino = dir->i_ino;
264
265 f2fs_put_page(page, 1); 259 f2fs_put_page(page, 1);
266} 260}
267 261
268static void init_dent_inode(const struct qstr *name, struct page *ipage) 262static void init_dent_inode(const struct qstr *name, struct page *ipage)
269{ 263{
270 struct f2fs_node *rn; 264 struct f2fs_inode *ri;
271 265
272 /* copy name info. to this inode page */ 266 /* copy name info. to this inode page */
273 rn = F2FS_NODE(ipage); 267 ri = F2FS_INODE(ipage);
274 rn->i.i_namelen = cpu_to_le32(name->len); 268 ri->i_namelen = cpu_to_le32(name->len);
275 memcpy(rn->i.i_name, name->name, name->len); 269 memcpy(ri->i_name, name->name, name->len);
276 set_page_dirty(ipage); 270 set_page_dirty(ipage);
277} 271}
278 272
@@ -348,11 +342,11 @@ static struct page *init_inode_metadata(struct inode *inode,
348 342
349 err = f2fs_init_acl(inode, dir, page); 343 err = f2fs_init_acl(inode, dir, page);
350 if (err) 344 if (err)
351 goto error; 345 goto put_error;
352 346
353 err = f2fs_init_security(inode, dir, name, page); 347 err = f2fs_init_security(inode, dir, name, page);
354 if (err) 348 if (err)
355 goto error; 349 goto put_error;
356 350
357 wait_on_page_writeback(page); 351 wait_on_page_writeback(page);
358 } else { 352 } else {
@@ -376,8 +370,9 @@ static struct page *init_inode_metadata(struct inode *inode,
376 } 370 }
377 return page; 371 return page;
378 372
379error: 373put_error:
380 f2fs_put_page(page, 1); 374 f2fs_put_page(page, 1);
375error:
381 remove_inode_page(inode); 376 remove_inode_page(inode);
382 return ERR_PTR(err); 377 return ERR_PTR(err);
383} 378}
@@ -393,6 +388,8 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
393 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); 388 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
394 } 389 }
395 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 390 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
391 mark_inode_dirty(dir);
392
396 if (F2FS_I(dir)->i_current_depth != current_depth) { 393 if (F2FS_I(dir)->i_current_depth != current_depth) {
397 F2FS_I(dir)->i_current_depth = current_depth; 394 F2FS_I(dir)->i_current_depth = current_depth;
398 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); 395 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -400,8 +397,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
400 397
401 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) 398 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
402 update_inode_page(dir); 399 update_inode_page(dir);
403 else
404 mark_inode_dirty(dir);
405 400
406 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) 401 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
407 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 402 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
@@ -432,10 +427,11 @@ next:
432} 427}
433 428
434/* 429/*
435 * Caller should grab and release a mutex by calling mutex_lock_op() and 430 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
436 * mutex_unlock_op(). 431 * f2fs_unlock_op().
437 */ 432 */
438int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) 433int __f2fs_add_link(struct inode *dir, const struct qstr *name,
434 struct inode *inode)
439{ 435{
440 unsigned int bit_pos; 436 unsigned int bit_pos;
441 unsigned int level; 437 unsigned int level;
@@ -461,7 +457,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
461 } 457 }
462 458
463start: 459start:
464 if (current_depth == MAX_DIR_HASH_DEPTH) 460 if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
465 return -ENOSPC; 461 return -ENOSPC;
466 462
467 /* Increase the depth, if required */ 463 /* Increase the depth, if required */
@@ -554,14 +550,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
554 550
555 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 551 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
556 552
557 if (inode && S_ISDIR(inode->i_mode)) {
558 drop_nlink(dir);
559 update_inode_page(dir);
560 } else {
561 mark_inode_dirty(dir);
562 }
563
564 if (inode) { 553 if (inode) {
554 if (S_ISDIR(inode->i_mode)) {
555 drop_nlink(dir);
556 update_inode_page(dir);
557 }
565 inode->i_ctime = CURRENT_TIME; 558 inode->i_ctime = CURRENT_TIME;
566 drop_nlink(inode); 559 drop_nlink(inode);
567 if (S_ISDIR(inode->i_mode)) { 560 if (S_ISDIR(inode->i_mode)) {
@@ -636,7 +629,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
636 629
637 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); 630 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
638 631
639 for ( ; n < npages; n++) { 632 for (; n < npages; n++) {
640 dentry_page = get_lock_data_page(inode, n); 633 dentry_page = get_lock_data_page(inode, n);
641 if (IS_ERR(dentry_page)) 634 if (IS_ERR(dentry_page))
642 continue; 635 continue;
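
Annotation: the updated comment on __f2fs_add_link() reflects the lock-type change — the operation lock is now the cp_rwsem taken through f2fs_lock_op()/f2fs_unlock_op() rather than a mutex. A sketch of the expected call site, assuming the usual wrapper shape (the f2fs_add_link() wrapper itself is not part of the quoted hunks):

	f2fs_lock_op(sbi);
	err = __f2fs_add_link(dir, &dentry->d_name, inode);
	f2fs_unlock_op(sbi);
	if (err)
		goto fail;
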
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 89dc7508faf2..af51a0bd2dee 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,8 +22,10 @@
22 22
23#ifdef CONFIG_F2FS_CHECK_FS 23#ifdef CONFIG_F2FS_CHECK_FS
24#define f2fs_bug_on(condition) BUG_ON(condition) 24#define f2fs_bug_on(condition) BUG_ON(condition)
25#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
25#else 26#else
26#define f2fs_bug_on(condition) 27#define f2fs_bug_on(condition)
28#define f2fs_down_write(x, y) down_write(x)
27#endif 29#endif
28 30
29/* 31/*
@@ -37,6 +39,7 @@
37#define F2FS_MOUNT_POSIX_ACL 0x00000020 39#define F2FS_MOUNT_POSIX_ACL 0x00000020
38#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 40#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
39#define F2FS_MOUNT_INLINE_XATTR 0x00000080 41#define F2FS_MOUNT_INLINE_XATTR 0x00000080
42#define F2FS_MOUNT_INLINE_DATA 0x00000100
40 43
41#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 44#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
42#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) 45#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -97,6 +100,13 @@ struct dir_inode_entry {
97 struct inode *inode; /* vfs inode pointer */ 100 struct inode *inode; /* vfs inode pointer */
98}; 101};
99 102
103/* for the list of block addresses to be discarded */
104struct discard_entry {
105 struct list_head list; /* list head */
106 block_t blkaddr; /* block address to be discarded */
107 int len; /* # of consecutive blocks of the discard */
108};
109
100/* for the list of fsync inodes, used only during recovery */ 110/* for the list of fsync inodes, used only during recovery */
101struct fsync_inode_entry { 111struct fsync_inode_entry {
102 struct list_head list; /* list head */ 112 struct list_head list; /* list head */
@@ -155,13 +165,15 @@ enum {
155 LOOKUP_NODE, /* look up a node without readahead */ 165 LOOKUP_NODE, /* look up a node without readahead */
156 LOOKUP_NODE_RA, /* 166 LOOKUP_NODE_RA, /*
157 * look up a node with readahead called 167 * look up a node with readahead called
158 * by get_datablock_ro. 168 * by get_data_block.
159 */ 169 */
160}; 170};
161 171
162#define F2FS_LINK_MAX 32000 /* maximum link count per file */ 172#define F2FS_LINK_MAX 32000 /* maximum link count per file */
163 173
164/* for in-memory extent cache entry */ 174/* for in-memory extent cache entry */
175#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */
176
165struct extent_info { 177struct extent_info {
166 rwlock_t ext_lock; /* rwlock for consistency */ 178 rwlock_t ext_lock; /* rwlock for consistency */
167 unsigned int fofs; /* start offset in a file */ 179 unsigned int fofs; /* start offset in a file */
@@ -308,6 +320,14 @@ struct f2fs_sm_info {
308 320
309 /* a threshold to reclaim prefree segments */ 321 /* a threshold to reclaim prefree segments */
310 unsigned int rec_prefree_segments; 322 unsigned int rec_prefree_segments;
323
324 /* for small discard management */
325 struct list_head discard_list; /* 4KB discard list */
326 int nr_discards; /* # of discards in the list */
327 int max_discards; /* max. discards to be issued */
328
329 unsigned int ipu_policy; /* in-place-update policy */
330 unsigned int min_ipu_util; /* in-place-update threshold */
311}; 331};
312 332
313/* 333/*
@@ -338,6 +358,7 @@ enum count_type {
338 * with waiting the bio's completion 358 * with waiting the bio's completion
339 * ... Only can be used with META. 359 * ... Only can be used with META.
340 */ 360 */
361#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
341enum page_type { 362enum page_type {
342 DATA, 363 DATA,
343 NODE, 364 NODE,
@@ -346,6 +367,20 @@ enum page_type {
346 META_FLUSH, 367 META_FLUSH,
347}; 368};
348 369
370struct f2fs_io_info {
371 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
372 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
373};
374
375#define is_read_io(rw) (((rw) & 1) == READ)
376struct f2fs_bio_info {
377 struct f2fs_sb_info *sbi; /* f2fs superblock */
378 struct bio *bio; /* bios to merge */
379 sector_t last_block_in_bio; /* last block number */
380 struct f2fs_io_info fio; /* store buffered io info. */
381 struct mutex io_mutex; /* mutex for bio */
382};
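
Annotation: f2fs_io_info bundles the page type and rw flags that previously travelled as separate arguments, while f2fs_bio_info keeps one mergeable bio per page type so consecutive blocks can be submitted together. A sketch of the caller side, assuming the f2fs_submit_page_mbio() declaration added to the data.c prototypes below (blk_addr is a placeholder for the target block address):

	struct f2fs_io_info fio = {
		.type = DATA,		/* selects sbi->write_io[DATA] */
		.rw = WRITE_SYNC,	/* request flags for the merged bio */
	};

	/* queue the page; the accumulated bio is flushed later by
	 * f2fs_submit_merged_bio(sbi, DATA, WRITE) */
	f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
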
383
349struct f2fs_sb_info { 384struct f2fs_sb_info {
350 struct super_block *sb; /* pointer to VFS super block */ 385 struct super_block *sb; /* pointer to VFS super block */
351 struct proc_dir_entry *s_proc; /* proc entry */ 386 struct proc_dir_entry *s_proc; /* proc entry */
@@ -359,9 +394,10 @@ struct f2fs_sb_info {
359 394
360 /* for segment-related operations */ 395 /* for segment-related operations */
361 struct f2fs_sm_info *sm_info; /* segment manager */ 396 struct f2fs_sm_info *sm_info; /* segment manager */
362 struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ 397
363 sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ 398 /* for bio operations */
364 struct rw_semaphore bio_sem; /* IO semaphore */ 399 struct f2fs_bio_info read_io; /* for read bios */
400 struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
365 401
366 /* for checkpoint */ 402 /* for checkpoint */
367 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ 403 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -376,8 +412,9 @@ struct f2fs_sb_info {
376 412
377 /* for orphan inode management */ 413 /* for orphan inode management */
378 struct list_head orphan_inode_list; /* orphan inode list */ 414 struct list_head orphan_inode_list; /* orphan inode list */
379 struct mutex orphan_inode_mutex; /* for orphan inode list */ 415 spinlock_t orphan_inode_lock; /* for orphan inode list */
380 unsigned int n_orphans; /* # of orphan inodes */ 416 unsigned int n_orphans; /* # of orphan inodes */
417 unsigned int max_orphans; /* max orphan inodes */
381 418
382 /* for directory inode management */ 419 /* for directory inode management */
383 struct list_head dir_inode_list; /* dir inode list */ 420 struct list_head dir_inode_list; /* dir inode list */
@@ -414,6 +451,9 @@ struct f2fs_sb_info {
414 struct f2fs_gc_kthread *gc_thread; /* GC thread */ 451 struct f2fs_gc_kthread *gc_thread; /* GC thread */
415 unsigned int cur_victim_sec; /* current victim section num */ 452 unsigned int cur_victim_sec; /* current victim section num */
416 453
454 /* maximum # of trials to find a victim segment for SSR and GC */
455 unsigned int max_victim_search;
456
417 /* 457 /*
418 * for stat information. 458 * for stat information.
419 * one is for the LFS mode, and the other is for the SSR mode. 459 * one is for the LFS mode, and the other is for the SSR mode.
@@ -423,6 +463,7 @@ struct f2fs_sb_info {
423 unsigned int segment_count[2]; /* # of allocated segments */ 463 unsigned int segment_count[2]; /* # of allocated segments */
424 unsigned int block_count[2]; /* # of allocated blocks */ 464 unsigned int block_count[2]; /* # of allocated blocks */
425 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ 465 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
466 int inline_inode; /* # of inline_data inodes */
426 int bg_gc; /* background gc calls */ 467 int bg_gc; /* background gc calls */
427 unsigned int n_dirty_dirs; /* # of dir inodes */ 468 unsigned int n_dirty_dirs; /* # of dir inodes */
428#endif 469#endif
@@ -462,6 +503,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page)
462 return (struct f2fs_node *)page_address(page); 503 return (struct f2fs_node *)page_address(page);
463} 504}
464 505
506static inline struct f2fs_inode *F2FS_INODE(struct page *page)
507{
508 return &((struct f2fs_node *)page_address(page))->i;
509}
510
465static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) 511static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
466{ 512{
467 return (struct f2fs_nm_info *)(sbi->nm_info); 513 return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -487,6 +533,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
487 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); 533 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
488} 534}
489 535
536static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi)
537{
538 return sbi->meta_inode->i_mapping;
539}
540
541static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
542{
543 return sbi->node_inode->i_mapping;
544}
545
490static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) 546static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
491{ 547{
492 sbi->s_dirty = 1; 548 sbi->s_dirty = 1;
@@ -534,7 +590,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
534 590
535static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) 591static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
536{ 592{
537 down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); 593 f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
538} 594}
539 595
540static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) 596static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -548,7 +604,7 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
548static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) 604static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
549{ 605{
550 WARN_ON((nid >= NM_I(sbi)->max_nid)); 606 WARN_ON((nid >= NM_I(sbi)->max_nid));
551 if (nid >= NM_I(sbi)->max_nid) 607 if (unlikely(nid >= NM_I(sbi)->max_nid))
552 return -EINVAL; 608 return -EINVAL;
553 return 0; 609 return 0;
554} 610}
@@ -561,9 +617,9 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
561static inline int F2FS_HAS_BLOCKS(struct inode *inode) 617static inline int F2FS_HAS_BLOCKS(struct inode *inode)
562{ 618{
563 if (F2FS_I(inode)->i_xattr_nid) 619 if (F2FS_I(inode)->i_xattr_nid)
564 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); 620 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
565 else 621 else
566 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); 622 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
567} 623}
568 624
569static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, 625static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
@@ -574,7 +630,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
574 spin_lock(&sbi->stat_lock); 630 spin_lock(&sbi->stat_lock);
575 valid_block_count = 631 valid_block_count =
576 sbi->total_valid_block_count + (block_t)count; 632 sbi->total_valid_block_count + (block_t)count;
577 if (valid_block_count > sbi->user_block_count) { 633 if (unlikely(valid_block_count > sbi->user_block_count)) {
578 spin_unlock(&sbi->stat_lock); 634 spin_unlock(&sbi->stat_lock);
579 return false; 635 return false;
580 } 636 }
@@ -585,7 +641,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
585 return true; 641 return true;
586} 642}
587 643
588static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, 644static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
589 struct inode *inode, 645 struct inode *inode,
590 blkcnt_t count) 646 blkcnt_t count)
591{ 647{
@@ -595,7 +651,6 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
595 inode->i_blocks -= count; 651 inode->i_blocks -= count;
596 sbi->total_valid_block_count -= (block_t)count; 652 sbi->total_valid_block_count -= (block_t)count;
597 spin_unlock(&sbi->stat_lock); 653 spin_unlock(&sbi->stat_lock);
598 return 0;
599} 654}
600 655
601static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) 656static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -686,50 +741,48 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
686} 741}
687 742
688static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, 743static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
689 struct inode *inode, 744 struct inode *inode)
690 unsigned int count)
691{ 745{
692 block_t valid_block_count; 746 block_t valid_block_count;
693 unsigned int valid_node_count; 747 unsigned int valid_node_count;
694 748
695 spin_lock(&sbi->stat_lock); 749 spin_lock(&sbi->stat_lock);
696 750
697 valid_block_count = sbi->total_valid_block_count + (block_t)count; 751 valid_block_count = sbi->total_valid_block_count + 1;
698 sbi->alloc_valid_block_count += (block_t)count; 752 if (unlikely(valid_block_count > sbi->user_block_count)) {
699 valid_node_count = sbi->total_valid_node_count + count;
700
701 if (valid_block_count > sbi->user_block_count) {
702 spin_unlock(&sbi->stat_lock); 753 spin_unlock(&sbi->stat_lock);
703 return false; 754 return false;
704 } 755 }
705 756
706 if (valid_node_count > sbi->total_node_count) { 757 valid_node_count = sbi->total_valid_node_count + 1;
758 if (unlikely(valid_node_count > sbi->total_node_count)) {
707 spin_unlock(&sbi->stat_lock); 759 spin_unlock(&sbi->stat_lock);
708 return false; 760 return false;
709 } 761 }
710 762
711 if (inode) 763 if (inode)
712 inode->i_blocks += count; 764 inode->i_blocks++;
713 sbi->total_valid_node_count = valid_node_count; 765
714 sbi->total_valid_block_count = valid_block_count; 766 sbi->alloc_valid_block_count++;
767 sbi->total_valid_node_count++;
768 sbi->total_valid_block_count++;
715 spin_unlock(&sbi->stat_lock); 769 spin_unlock(&sbi->stat_lock);
716 770
717 return true; 771 return true;
718} 772}
719 773
720static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, 774static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
721 struct inode *inode, 775 struct inode *inode)
722 unsigned int count)
723{ 776{
724 spin_lock(&sbi->stat_lock); 777 spin_lock(&sbi->stat_lock);
725 778
726 f2fs_bug_on(sbi->total_valid_block_count < count); 779 f2fs_bug_on(!sbi->total_valid_block_count);
727 f2fs_bug_on(sbi->total_valid_node_count < count); 780 f2fs_bug_on(!sbi->total_valid_node_count);
728 f2fs_bug_on(inode->i_blocks < count); 781 f2fs_bug_on(!inode->i_blocks);
729 782
730 inode->i_blocks -= count; 783 inode->i_blocks--;
731 sbi->total_valid_node_count -= count; 784 sbi->total_valid_node_count--;
732 sbi->total_valid_block_count -= (block_t)count; 785 sbi->total_valid_block_count--;
733 786
734 spin_unlock(&sbi->stat_lock); 787 spin_unlock(&sbi->stat_lock);
735} 788}
@@ -751,13 +804,12 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
751 spin_unlock(&sbi->stat_lock); 804 spin_unlock(&sbi->stat_lock);
752} 805}
753 806
754static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) 807static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
755{ 808{
756 spin_lock(&sbi->stat_lock); 809 spin_lock(&sbi->stat_lock);
757 f2fs_bug_on(!sbi->total_valid_inode_count); 810 f2fs_bug_on(!sbi->total_valid_inode_count);
758 sbi->total_valid_inode_count--; 811 sbi->total_valid_inode_count--;
759 spin_unlock(&sbi->stat_lock); 812 spin_unlock(&sbi->stat_lock);
760 return 0;
761} 813}
762 814
763static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) 815static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
@@ -771,7 +823,7 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
771 823
772static inline void f2fs_put_page(struct page *page, int unlock) 824static inline void f2fs_put_page(struct page *page, int unlock)
773{ 825{
774 if (!page || IS_ERR(page)) 826 if (!page)
775 return; 827 return;
776 828
777 if (unlock) { 829 if (unlock) {
@@ -876,7 +928,9 @@ enum {
876 FI_NO_ALLOC, /* should not allocate any blocks */ 928 FI_NO_ALLOC, /* should not allocate any blocks */
877 FI_UPDATE_DIR, /* should update inode block for consistency */ 929 FI_UPDATE_DIR, /* should update inode block for consistency */
878 FI_DELAY_IPUT, /* used for the recovery */ 930 FI_DELAY_IPUT, /* used for the recovery */
931 FI_NO_EXTENT, /* not to use the extent cache */
879 FI_INLINE_XATTR, /* used for inline xattr */ 932 FI_INLINE_XATTR, /* used for inline xattr */
 933 FI_INLINE_DATA, /* used for inline data */
880}; 934};
881 935
882static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 936static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -914,6 +968,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
914{ 968{
915 if (ri->i_inline & F2FS_INLINE_XATTR) 969 if (ri->i_inline & F2FS_INLINE_XATTR)
916 set_inode_flag(fi, FI_INLINE_XATTR); 970 set_inode_flag(fi, FI_INLINE_XATTR);
971 if (ri->i_inline & F2FS_INLINE_DATA)
972 set_inode_flag(fi, FI_INLINE_DATA);
917} 973}
918 974
919static inline void set_raw_inline(struct f2fs_inode_info *fi, 975static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -923,6 +979,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
923 979
924 if (is_inode_flag_set(fi, FI_INLINE_XATTR)) 980 if (is_inode_flag_set(fi, FI_INLINE_XATTR))
925 ri->i_inline |= F2FS_INLINE_XATTR; 981 ri->i_inline |= F2FS_INLINE_XATTR;
982 if (is_inode_flag_set(fi, FI_INLINE_DATA))
983 ri->i_inline |= F2FS_INLINE_DATA;
926} 984}
927 985
928static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 986static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
@@ -948,6 +1006,18 @@ static inline int inline_xattr_size(struct inode *inode)
948 return 0; 1006 return 0;
949} 1007}
950 1008
1009static inline int f2fs_has_inline_data(struct inode *inode)
1010{
1011 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1012}
1013
1014static inline void *inline_data_addr(struct page *page)
1015{
1016 struct f2fs_inode *ri;
1017 ri = (struct f2fs_inode *)page_address(page);
1018 return (void *)&(ri->i_addr[1]);
1019}
1020
951static inline int f2fs_readonly(struct super_block *sb) 1021static inline int f2fs_readonly(struct super_block *sb)
952{ 1022{
953 return sb->s_flags & MS_RDONLY; 1023 return sb->s_flags & MS_RDONLY;
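
Annotation: inline_data_addr() pins down the on-disk layout — the inline payload occupies the inode's data-pointer area starting at i_addr[1], and i_addr[0] is deliberately left unused so the conversion path in inline.c (further below) can reserve a real block without clobbering the payload. Reading it back is a plain copy, assuming the caller holds the locked inode page:

	void *src = inline_data_addr(ipage);	/* i.e. &ri->i_addr[1] */
	memcpy(buf, src, MAX_INLINE_DATA);	/* at most MAX_INLINE_DATA bytes */
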
@@ -958,6 +1028,7 @@ static inline int f2fs_readonly(struct super_block *sb)
958 */ 1028 */
959int f2fs_sync_file(struct file *, loff_t, loff_t, int); 1029int f2fs_sync_file(struct file *, loff_t, loff_t, int);
960void truncate_data_blocks(struct dnode_of_data *); 1030void truncate_data_blocks(struct dnode_of_data *);
1031int truncate_blocks(struct inode *, u64);
961void f2fs_truncate(struct inode *); 1032void f2fs_truncate(struct inode *);
962int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1033int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
963int f2fs_setattr(struct dentry *, struct iattr *); 1034int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1027,7 +1098,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1027int truncate_inode_blocks(struct inode *, pgoff_t); 1098int truncate_inode_blocks(struct inode *, pgoff_t);
1028int truncate_xattr_node(struct inode *, struct page *); 1099int truncate_xattr_node(struct inode *, struct page *);
1029int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); 1100int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
1030int remove_inode_page(struct inode *); 1101void remove_inode_page(struct inode *);
1031struct page *new_inode_page(struct inode *, const struct qstr *); 1102struct page *new_inode_page(struct inode *, const struct qstr *);
1032struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); 1103struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
1033void ra_node_page(struct f2fs_sb_info *, nid_t); 1104void ra_node_page(struct f2fs_sb_info *, nid_t);
@@ -1059,19 +1130,19 @@ void clear_prefree_segments(struct f2fs_sb_info *);
1059int npages_for_summary_flush(struct f2fs_sb_info *); 1130int npages_for_summary_flush(struct f2fs_sb_info *);
1060void allocate_new_segments(struct f2fs_sb_info *); 1131void allocate_new_segments(struct f2fs_sb_info *);
1061struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1132struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1062struct bio *f2fs_bio_alloc(struct block_device *, int);
1063void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
1064void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
1065void write_meta_page(struct f2fs_sb_info *, struct page *); 1133void write_meta_page(struct f2fs_sb_info *, struct page *);
1066void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, 1134void write_node_page(struct f2fs_sb_info *, struct page *,
1067 block_t, block_t *); 1135 struct f2fs_io_info *, unsigned int, block_t, block_t *);
1068void write_data_page(struct inode *, struct page *, struct dnode_of_data*, 1136void write_data_page(struct page *, struct dnode_of_data *, block_t *,
1069 block_t, block_t *); 1137 struct f2fs_io_info *);
1070void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); 1138void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
1071void recover_data_page(struct f2fs_sb_info *, struct page *, 1139void recover_data_page(struct f2fs_sb_info *, struct page *,
1072 struct f2fs_summary *, block_t, block_t); 1140 struct f2fs_summary *, block_t, block_t);
1073void rewrite_node_page(struct f2fs_sb_info *, struct page *, 1141void rewrite_node_page(struct f2fs_sb_info *, struct page *,
1074 struct f2fs_summary *, block_t, block_t); 1142 struct f2fs_summary *, block_t, block_t);
1143void allocate_data_block(struct f2fs_sb_info *, struct page *,
1144 block_t, block_t *, struct f2fs_summary *, int);
1145void f2fs_wait_on_page_writeback(struct page *, enum page_type);
1075void write_data_summaries(struct f2fs_sb_info *, block_t); 1146void write_data_summaries(struct f2fs_sb_info *, block_t);
1076void write_node_summaries(struct f2fs_sb_info *, block_t); 1147void write_node_summaries(struct f2fs_sb_info *, block_t);
1077int lookup_journal_in_cursum(struct f2fs_summary_block *, 1148int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1079,6 +1150,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
1079void flush_sit_entries(struct f2fs_sb_info *); 1150void flush_sit_entries(struct f2fs_sb_info *);
1080int build_segment_manager(struct f2fs_sb_info *); 1151int build_segment_manager(struct f2fs_sb_info *);
1081void destroy_segment_manager(struct f2fs_sb_info *); 1152void destroy_segment_manager(struct f2fs_sb_info *);
1153int __init create_segment_manager_caches(void);
1154void destroy_segment_manager_caches(void);
1082 1155
1083/* 1156/*
1084 * checkpoint.c 1157 * checkpoint.c
@@ -1090,7 +1163,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *);
1090void release_orphan_inode(struct f2fs_sb_info *); 1163void release_orphan_inode(struct f2fs_sb_info *);
1091void add_orphan_inode(struct f2fs_sb_info *, nid_t); 1164void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1092void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1165void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1093int recover_orphan_inodes(struct f2fs_sb_info *); 1166void recover_orphan_inodes(struct f2fs_sb_info *);
1094int get_valid_checkpoint(struct f2fs_sb_info *); 1167int get_valid_checkpoint(struct f2fs_sb_info *);
1095void set_dirty_dir_page(struct inode *, struct page *); 1168void set_dirty_dir_page(struct inode *, struct page *);
1096void add_dirty_dir_inode(struct inode *); 1169void add_dirty_dir_inode(struct inode *);
@@ -1105,13 +1178,17 @@ void destroy_checkpoint_caches(void);
1105/* 1178/*
1106 * data.c 1179 * data.c
1107 */ 1180 */
1181void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
1182int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
1183void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
1184 struct f2fs_io_info *);
1108int reserve_new_block(struct dnode_of_data *); 1185int reserve_new_block(struct dnode_of_data *);
1186int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
1109void update_extent_cache(block_t, struct dnode_of_data *); 1187void update_extent_cache(block_t, struct dnode_of_data *);
1110struct page *find_data_page(struct inode *, pgoff_t, bool); 1188struct page *find_data_page(struct inode *, pgoff_t, bool);
1111struct page *get_lock_data_page(struct inode *, pgoff_t); 1189struct page *get_lock_data_page(struct inode *, pgoff_t);
1112struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); 1190struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
1113int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); 1191int do_write_data_page(struct page *, struct f2fs_io_info *);
1114int do_write_data_page(struct page *);
1115 1192
1116/* 1193/*
1117 * gc.c 1194 * gc.c
@@ -1144,7 +1221,7 @@ struct f2fs_stat_info {
1144 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; 1221 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1145 int nats, sits, fnids; 1222 int nats, sits, fnids;
1146 int total_count, utilization; 1223 int total_count, utilization;
1147 int bg_gc; 1224 int bg_gc, inline_inode;
1148 unsigned int valid_count, valid_node_count, valid_inode_count; 1225 unsigned int valid_count, valid_node_count, valid_inode_count;
1149 unsigned int bimodal, avg_vblocks; 1226 unsigned int bimodal, avg_vblocks;
1150 int util_free, util_valid, util_invalid; 1227 int util_free, util_valid, util_invalid;
@@ -1164,7 +1241,7 @@ struct f2fs_stat_info {
1164 1241
1165static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) 1242static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1166{ 1243{
1167 return (struct f2fs_stat_info*)sbi->stat_info; 1244 return (struct f2fs_stat_info *)sbi->stat_info;
1168} 1245}
1169 1246
1170#define stat_inc_call_count(si) ((si)->call_count++) 1247#define stat_inc_call_count(si) ((si)->call_count++)
@@ -1173,6 +1250,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1173#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) 1250#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
1174#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) 1251#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++)
1175#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) 1252#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++)
1253#define stat_inc_inline_inode(inode) \
1254 do { \
1255 if (f2fs_has_inline_data(inode)) \
1256 ((F2FS_SB(inode->i_sb))->inline_inode++); \
1257 } while (0)
1258#define stat_dec_inline_inode(inode) \
1259 do { \
1260 if (f2fs_has_inline_data(inode)) \
1261 ((F2FS_SB(inode->i_sb))->inline_inode--); \
1262 } while (0)
1263
1176#define stat_inc_seg_type(sbi, curseg) \ 1264#define stat_inc_seg_type(sbi, curseg) \
1177 ((sbi)->segment_count[(curseg)->alloc_type]++) 1265 ((sbi)->segment_count[(curseg)->alloc_type]++)
1178#define stat_inc_block_count(sbi, curseg) \ 1266#define stat_inc_block_count(sbi, curseg) \
@@ -1216,6 +1304,8 @@ void f2fs_destroy_root_stats(void);
1216#define stat_dec_dirty_dir(sbi) 1304#define stat_dec_dirty_dir(sbi)
1217#define stat_inc_total_hit(sb) 1305#define stat_inc_total_hit(sb)
1218#define stat_inc_read_hit(sb) 1306#define stat_inc_read_hit(sb)
1307#define stat_inc_inline_inode(inode)
1308#define stat_dec_inline_inode(inode)
1219#define stat_inc_seg_type(sbi, curseg) 1309#define stat_inc_seg_type(sbi, curseg)
1220#define stat_inc_block_count(sbi, curseg) 1310#define stat_inc_block_count(sbi, curseg)
1221#define stat_inc_seg_count(si, type) 1311#define stat_inc_seg_count(si, type)
@@ -1238,4 +1328,13 @@ extern const struct address_space_operations f2fs_meta_aops;
1238extern const struct inode_operations f2fs_dir_inode_operations; 1328extern const struct inode_operations f2fs_dir_inode_operations;
1239extern const struct inode_operations f2fs_symlink_inode_operations; 1329extern const struct inode_operations f2fs_symlink_inode_operations;
1240extern const struct inode_operations f2fs_special_inode_operations; 1330extern const struct inode_operations f2fs_special_inode_operations;
1331
1332/*
1333 * inline.c
1334 */
1335bool f2fs_may_inline(struct inode *);
1336int f2fs_read_inline_data(struct inode *, struct page *);
1337int f2fs_convert_inline_data(struct inode *, pgoff_t);
1338int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
1339int recover_inline_data(struct inode *, struct page *);
1241#endif 1340#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7d714f4972d5..85e91ca88d57 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33 struct page *page = vmf->page; 33 struct page *page = vmf->page;
34 struct inode *inode = file_inode(vma->vm_file); 34 struct inode *inode = file_inode(vma->vm_file);
35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
36 block_t old_blk_addr;
37 struct dnode_of_data dn; 36 struct dnode_of_data dn;
38 int err; 37 int err;
39 38
@@ -44,30 +43,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
44 /* block allocation */ 43 /* block allocation */
45 f2fs_lock_op(sbi); 44 f2fs_lock_op(sbi);
46 set_new_dnode(&dn, inode, NULL, NULL, 0); 45 set_new_dnode(&dn, inode, NULL, NULL, 0);
47 err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); 46 err = f2fs_reserve_block(&dn, page->index);
48 if (err) {
49 f2fs_unlock_op(sbi);
50 goto out;
51 }
52
53 old_blk_addr = dn.data_blkaddr;
54
55 if (old_blk_addr == NULL_ADDR) {
56 err = reserve_new_block(&dn);
57 if (err) {
58 f2fs_put_dnode(&dn);
59 f2fs_unlock_op(sbi);
60 goto out;
61 }
62 }
63 f2fs_put_dnode(&dn);
64 f2fs_unlock_op(sbi); 47 f2fs_unlock_op(sbi);
48 if (err)
49 goto out;
65 50
66 file_update_time(vma->vm_file); 51 file_update_time(vma->vm_file);
67 lock_page(page); 52 lock_page(page);
68 if (page->mapping != inode->i_mapping || 53 if (unlikely(page->mapping != inode->i_mapping ||
69 page_offset(page) > i_size_read(inode) || 54 page_offset(page) > i_size_read(inode) ||
70 !PageUptodate(page)) { 55 !PageUptodate(page))) {
71 unlock_page(page); 56 unlock_page(page);
72 err = -EFAULT; 57 err = -EFAULT;
73 goto out; 58 goto out;
@@ -130,12 +115,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
130 int ret = 0; 115 int ret = 0;
131 bool need_cp = false; 116 bool need_cp = false;
132 struct writeback_control wbc = { 117 struct writeback_control wbc = {
133 .sync_mode = WB_SYNC_ALL, 118 .sync_mode = WB_SYNC_NONE,
134 .nr_to_write = LONG_MAX, 119 .nr_to_write = LONG_MAX,
135 .for_reclaim = 0, 120 .for_reclaim = 0,
136 }; 121 };
137 122
138 if (f2fs_readonly(inode->i_sb)) 123 if (unlikely(f2fs_readonly(inode->i_sb)))
139 return 0; 124 return 0;
140 125
141 trace_f2fs_sync_file_enter(inode); 126 trace_f2fs_sync_file_enter(inode);
@@ -217,7 +202,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
217 raw_node = F2FS_NODE(dn->node_page); 202 raw_node = F2FS_NODE(dn->node_page);
218 addr = blkaddr_in_node(raw_node) + ofs; 203 addr = blkaddr_in_node(raw_node) + ofs;
219 204
220 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { 205 for (; count > 0; count--, addr++, dn->ofs_in_node++) {
221 block_t blkaddr = le32_to_cpu(*addr); 206 block_t blkaddr = le32_to_cpu(*addr);
222 if (blkaddr == NULL_ADDR) 207 if (blkaddr == NULL_ADDR)
223 continue; 208 continue;
@@ -256,7 +241,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
256 return; 241 return;
257 242
258 lock_page(page); 243 lock_page(page);
259 if (page->mapping != inode->i_mapping) { 244 if (unlikely(page->mapping != inode->i_mapping)) {
260 f2fs_put_page(page, 1); 245 f2fs_put_page(page, 1);
261 return; 246 return;
262 } 247 }
@@ -266,21 +251,24 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
266 f2fs_put_page(page, 1); 251 f2fs_put_page(page, 1);
267} 252}
268 253
269static int truncate_blocks(struct inode *inode, u64 from) 254int truncate_blocks(struct inode *inode, u64 from)
270{ 255{
271 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 256 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
272 unsigned int blocksize = inode->i_sb->s_blocksize; 257 unsigned int blocksize = inode->i_sb->s_blocksize;
273 struct dnode_of_data dn; 258 struct dnode_of_data dn;
274 pgoff_t free_from; 259 pgoff_t free_from;
275 int count = 0; 260 int count = 0, err = 0;
276 int err;
277 261
278 trace_f2fs_truncate_blocks_enter(inode, from); 262 trace_f2fs_truncate_blocks_enter(inode, from);
279 263
264 if (f2fs_has_inline_data(inode))
265 goto done;
266
280 free_from = (pgoff_t) 267 free_from = (pgoff_t)
281 ((from + blocksize - 1) >> (sbi->log_blocksize)); 268 ((from + blocksize - 1) >> (sbi->log_blocksize));
282 269
283 f2fs_lock_op(sbi); 270 f2fs_lock_op(sbi);
271
284 set_new_dnode(&dn, inode, NULL, NULL, 0); 272 set_new_dnode(&dn, inode, NULL, NULL, 0);
285 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); 273 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
286 if (err) { 274 if (err) {
@@ -308,7 +296,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
308free_next: 296free_next:
309 err = truncate_inode_blocks(inode, free_from); 297 err = truncate_inode_blocks(inode, free_from);
310 f2fs_unlock_op(sbi); 298 f2fs_unlock_op(sbi);
311 299done:
312 /* lastly zero out the first data page */ 300 /* lastly zero out the first data page */
313 truncate_partial_data_page(inode, from); 301 truncate_partial_data_page(inode, from);
314 302
@@ -382,6 +370,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
382 370
383 if ((attr->ia_valid & ATTR_SIZE) && 371 if ((attr->ia_valid & ATTR_SIZE) &&
384 attr->ia_size != i_size_read(inode)) { 372 attr->ia_size != i_size_read(inode)) {
373 err = f2fs_convert_inline_data(inode, attr->ia_size);
374 if (err)
375 return err;
376
385 truncate_setsize(inode, attr->ia_size); 377 truncate_setsize(inode, attr->ia_size);
386 f2fs_truncate(inode); 378 f2fs_truncate(inode);
387 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 379 f2fs_balance_fs(F2FS_SB(inode->i_sb));
@@ -459,12 +451,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
459 return 0; 451 return 0;
460} 452}
461 453
462static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) 454static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
463{ 455{
464 pgoff_t pg_start, pg_end; 456 pgoff_t pg_start, pg_end;
465 loff_t off_start, off_end; 457 loff_t off_start, off_end;
466 int ret = 0; 458 int ret = 0;
467 459
460 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
461 if (ret)
462 return ret;
463
468 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 464 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
469 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 465 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
470 466
@@ -499,12 +495,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
499 } 495 }
500 } 496 }
501 497
502 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
503 i_size_read(inode) <= (offset + len)) {
504 i_size_write(inode, offset);
505 mark_inode_dirty(inode);
506 }
507
508 return ret; 498 return ret;
509} 499}
510 500
@@ -521,6 +511,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
521 if (ret) 511 if (ret)
522 return ret; 512 return ret;
523 513
514 ret = f2fs_convert_inline_data(inode, offset + len);
515 if (ret)
516 return ret;
517
524 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 518 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
525 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 519 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
526 520
@@ -532,22 +526,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
532 526
533 f2fs_lock_op(sbi); 527 f2fs_lock_op(sbi);
534 set_new_dnode(&dn, inode, NULL, NULL, 0); 528 set_new_dnode(&dn, inode, NULL, NULL, 0);
535 ret = get_dnode_of_data(&dn, index, ALLOC_NODE); 529 ret = f2fs_reserve_block(&dn, index);
536 if (ret) {
537 f2fs_unlock_op(sbi);
538 break;
539 }
540
541 if (dn.data_blkaddr == NULL_ADDR) {
542 ret = reserve_new_block(&dn);
543 if (ret) {
544 f2fs_put_dnode(&dn);
545 f2fs_unlock_op(sbi);
546 break;
547 }
548 }
549 f2fs_put_dnode(&dn);
550 f2fs_unlock_op(sbi); 530 f2fs_unlock_op(sbi);
531 if (ret)
532 break;
551 533
552 if (pg_start == pg_end) 534 if (pg_start == pg_end)
553 new_size = offset + len; 535 new_size = offset + len;
@@ -578,7 +560,7 @@ static long f2fs_fallocate(struct file *file, int mode,
578 return -EOPNOTSUPP; 560 return -EOPNOTSUPP;
579 561
580 if (mode & FALLOC_FL_PUNCH_HOLE) 562 if (mode & FALLOC_FL_PUNCH_HOLE)
581 ret = punch_hole(inode, offset, len, mode); 563 ret = punch_hole(inode, offset, len);
582 else 564 else
583 ret = expand_inode_data(inode, offset, len, mode); 565 ret = expand_inode_data(inode, offset, len, mode);
584 566
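Annotation: each file.c hunk above collapses the same open-coded sequence — get_dnode_of_data(), an optional reserve_new_block() when the block address is NULL_ADDR, then f2fs_put_dnode() — into a single f2fs_reserve_block() call. A minimal sketch of what that helper plausibly looks like, reconstructed from the deleted call-site code (the actual body lives in data.c, outside this diff):

	int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
	{
		int err;

		err = get_dnode_of_data(dn, index, ALLOC_NODE);
		if (err)
			return err;

		if (dn->data_blkaddr == NULL_ADDR)
			err = reserve_new_block(dn);

		f2fs_put_dnode(dn);
		return err;
	}
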
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b7ad1ec7e4cc..ea0371e854b4 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -119,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
119 kfree(gc_th); 119 kfree(gc_th);
120 sbi->gc_thread = NULL; 120 sbi->gc_thread = NULL;
121 } 121 }
122
123out: 122out:
124 return err; 123 return err;
125} 124}
@@ -164,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
164 p->ofs_unit = sbi->segs_per_sec; 163 p->ofs_unit = sbi->segs_per_sec;
165 } 164 }
166 165
167 if (p->max_search > MAX_VICTIM_SEARCH) 166 if (p->max_search > sbi->max_victim_search)
168 p->max_search = MAX_VICTIM_SEARCH; 167 p->max_search = sbi->max_victim_search;
169 168
170 p->offset = sbi->last_victim[p->gc_mode]; 169 p->offset = sbi->last_victim[p->gc_mode];
171} 170}
@@ -429,7 +428,7 @@ next_step:
429 428
430 /* set page dirty and write it */ 429 /* set page dirty and write it */
431 if (gc_type == FG_GC) { 430 if (gc_type == FG_GC) {
432 f2fs_wait_on_page_writeback(node_page, NODE, true); 431 f2fs_wait_on_page_writeback(node_page, NODE);
433 set_page_dirty(node_page); 432 set_page_dirty(node_page);
434 } else { 433 } else {
435 if (!PageWriteback(node_page)) 434 if (!PageWriteback(node_page))
@@ -521,6 +520,11 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
521 520
522static void move_data_page(struct inode *inode, struct page *page, int gc_type) 521static void move_data_page(struct inode *inode, struct page *page, int gc_type)
523{ 522{
523 struct f2fs_io_info fio = {
524 .type = DATA,
525 .rw = WRITE_SYNC,
526 };
527
524 if (gc_type == BG_GC) { 528 if (gc_type == BG_GC) {
525 if (PageWriteback(page)) 529 if (PageWriteback(page))
526 goto out; 530 goto out;
@@ -529,7 +533,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
529 } else { 533 } else {
530 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 534 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
531 535
532 f2fs_wait_on_page_writeback(page, DATA, true); 536 f2fs_wait_on_page_writeback(page, DATA);
533 537
534 if (clear_page_dirty_for_io(page) && 538 if (clear_page_dirty_for_io(page) &&
535 S_ISDIR(inode->i_mode)) { 539 S_ISDIR(inode->i_mode)) {
@@ -537,7 +541,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
537 inode_dec_dirty_dents(inode); 541 inode_dec_dirty_dents(inode);
538 } 542 }
539 set_cold_data(page); 543 set_cold_data(page);
540 do_write_data_page(page); 544 do_write_data_page(page, &fio);
541 clear_cold_data(page); 545 clear_cold_data(page);
542 } 546 }
543out: 547out:
@@ -631,7 +635,7 @@ next_iput:
631 goto next_step; 635 goto next_step;
632 636
633 if (gc_type == FG_GC) { 637 if (gc_type == FG_GC) {
634 f2fs_submit_bio(sbi, DATA, true); 638 f2fs_submit_merged_bio(sbi, DATA, WRITE);
635 639
636 /* 640 /*
637 * In the case of FG_GC, it'd be better to reclaim this victim 641 * In the case of FG_GC, it'd be better to reclaim this victim
@@ -664,8 +668,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
664 668
665 /* read segment summary of victim */ 669 /* read segment summary of victim */
666 sum_page = get_sum_page(sbi, segno); 670 sum_page = get_sum_page(sbi, segno);
667 if (IS_ERR(sum_page))
668 return;
669 671
670 blk_start_plug(&plug); 672 blk_start_plug(&plug);
671 673
@@ -697,7 +699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
697 699
698 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
699gc_more: 701gc_more:
700 if (!(sbi->sb->s_flags & MS_ACTIVE)) 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
701 goto stop; 703 goto stop;
702 704
703 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 705 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 507056d22205..5d5eb6047bf4 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -20,7 +20,7 @@
20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ 20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
21 21
22/* Search max. number of dirty segments to select a victim segment */ 22/* Search max. number of dirty segments to select a victim segment */
23#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ 23#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
24 24
25struct f2fs_gc_kthread { 25struct f2fs_gc_kthread {
26 struct task_struct *f2fs_gc_task; 26 struct task_struct *f2fs_gc_task;
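
Annotation: renaming MAX_VICTIM_SEARCH to DEF_MAX_VICTIM_SEARCH turns the compile-time search bound into a default for the new per-superblock sbi->max_victim_search field, which select_policy() now clamps against. Presumably the field is seeded at mount time, outside the quoted hunks, along the lines of:

	/* hypothetical mount-time default, mirroring the rename above */
	sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
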
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
new file mode 100644
index 000000000000..31ee5b164ff9
--- /dev/null
+++ b/fs/f2fs/inline.c
@@ -0,0 +1,222 @@
1/*
2 * fs/f2fs/inline.c
3 * Copyright (c) 2013, Intel Corporation
4 * Authors: Huajun Li <huajun.li@intel.com>
5 * Haicheng Li <haicheng.li@intel.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13
14#include "f2fs.h"
15
16bool f2fs_may_inline(struct inode *inode)
17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks;
20 loff_t i_size;
21
22 if (!test_opt(sbi, INLINE_DATA))
23 return false;
24
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
26 if (inode->i_blocks > nr_blocks)
27 return false;
28
29 i_size = i_size_read(inode);
30 if (i_size > MAX_INLINE_DATA)
31 return false;
32
33 return true;
34}
35
36int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage;
40 void *src_addr, *dst_addr;
41
42 if (page->index) {
43 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
44 goto out;
45 }
46
47 ipage = get_node_page(sbi, inode->i_ino);
48 if (IS_ERR(ipage))
49 return PTR_ERR(ipage);
50
51 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
52
53 /* Copy the whole inline data block */
54 src_addr = inline_data_addr(ipage);
55 dst_addr = kmap(page);
56 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
57 kunmap(page);
58 f2fs_put_page(ipage, 1);
59
60out:
61 SetPageUptodate(page);
62 unlock_page(page);
63
64 return 0;
65}
66
67static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
68{
69 int err;
70 struct page *ipage;
71 struct dnode_of_data dn;
72 void *src_addr, *dst_addr;
73 block_t new_blk_addr;
74 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
75 struct f2fs_io_info fio = {
76 .type = DATA,
77 .rw = WRITE_SYNC | REQ_PRIO,
78 };
79
80 f2fs_lock_op(sbi);
81 ipage = get_node_page(sbi, inode->i_ino);
82 if (IS_ERR(ipage))
83 return PTR_ERR(ipage);
84
85 /*
86 * i_addr[0] is not used for inline data,
87 * so reserving a new block there will not destroy the inline data
88 */
89 set_new_dnode(&dn, inode, ipage, NULL, 0);
90 err = f2fs_reserve_block(&dn, 0);
91 if (err) {
92 f2fs_unlock_op(sbi);
93 return err;
94 }
95
96 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
97
98 /* Copy the whole inline data block */
99 src_addr = inline_data_addr(ipage);
100 dst_addr = kmap(page);
101 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
102 kunmap(page);
103 SetPageUptodate(page);
104
105 /* write data page to try to make data consistent */
106 set_page_writeback(page);
107 write_data_page(page, &dn, &new_blk_addr, &fio);
108 update_extent_cache(new_blk_addr, &dn);
109 f2fs_wait_on_page_writeback(page, DATA);
110
111 /* clear inline data and flag after data writeback */
112 zero_user_segment(ipage, INLINE_DATA_OFFSET,
113 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
114 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
115 stat_dec_inline_inode(inode);
116
117 sync_inode_page(&dn);
118 f2fs_put_dnode(&dn);
119 f2fs_unlock_op(sbi);
120 return err;
121}
122
123int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
124{
125 struct page *page;
126 int err;
127
128 if (!f2fs_has_inline_data(inode))
129 return 0;
130 else if (to_size <= MAX_INLINE_DATA)
131 return 0;
132
133 page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS);
134 if (!page)
135 return -ENOMEM;
136
137 err = __f2fs_convert_inline_data(inode, page);
138 f2fs_put_page(page, 1);
139 return err;
140}
141
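
For the conversion threshold above: assuming the usual 4 KB-block constants (not part of this patch) of 923 block-address slots in the inode, 50 reserved for inline xattrs, and i_addr[0] kept free as noted in __f2fs_convert_inline_data(), MAX_INLINE_DATA works out to 4 * (923 - 50 - 1) = 3488 bytes, so a write extending the file past roughly 3.4 KB forces the conversion.
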
142int f2fs_write_inline_data(struct inode *inode,
143 struct page *page, unsigned size)
144{
145 void *src_addr, *dst_addr;
146 struct page *ipage;
147 struct dnode_of_data dn;
148 int err;
149
150 set_new_dnode(&dn, inode, NULL, NULL, 0);
151 err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
152 if (err)
153 return err;
154 ipage = dn.inode_page;
155
156 zero_user_segment(ipage, INLINE_DATA_OFFSET,
157 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
158 src_addr = kmap(page);
159 dst_addr = inline_data_addr(ipage);
160 memcpy(dst_addr, src_addr, size);
161 kunmap(page);
162
163 /* Release the first data block if it is allocated */
164 if (!f2fs_has_inline_data(inode)) {
165 truncate_data_blocks_range(&dn, 1);
166 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
167 stat_inc_inline_inode(inode);
168 }
169
170 sync_inode_page(&dn);
171 f2fs_put_dnode(&dn);
172
173 return 0;
174}
175
176int recover_inline_data(struct inode *inode, struct page *npage)
177{
178 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
179 struct f2fs_inode *ri = NULL;
180 void *src_addr, *dst_addr;
181 struct page *ipage;
182
183 /*
184 * The inline_data recovery policy is as follows.
185 * [prev.] [next] of inline_data flag
186 * o o -> recover inline_data
187 * o x -> remove inline_data, and then recover data blocks
188 * x o -> remove data blocks, and then recover inline_data
189 * x x -> recover data blocks
190 */
191 if (IS_INODE(npage))
192 ri = F2FS_INODE(npage);
193
194 if (f2fs_has_inline_data(inode) &&
195 ri && ri->i_inline & F2FS_INLINE_DATA) {
196process_inline:
197 ipage = get_node_page(sbi, inode->i_ino);
198 f2fs_bug_on(IS_ERR(ipage));
199
200 src_addr = inline_data_addr(npage);
201 dst_addr = inline_data_addr(ipage);
202 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
203 update_inode(inode, ipage);
204 f2fs_put_page(ipage, 1);
205 return -1;
206 }
207
208 if (f2fs_has_inline_data(inode)) {
209 ipage = get_node_page(sbi, inode->i_ino);
210 f2fs_bug_on(IS_ERR(ipage));
211 zero_user_segment(ipage, INLINE_DATA_OFFSET,
212 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
213 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
214 update_inode(inode, ipage);
215 f2fs_put_page(ipage, 1);
216 } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
217 truncate_blocks(inode, 0);
218 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
219 goto process_inline;
220 }
221 return 0;
222}
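
The recovery table in recover_inline_data() reads as a pure decision over two flags: whether the on-disk inode currently has FI_INLINE_DATA (prev) and whether the fsync'ed node page carries F2FS_INLINE_DATA (next). A minimal user-space sketch of that table follows; the enum and function names are illustrative, not f2fs API:

    #include <stdbool.h>
    #include <stdio.h>

    enum recovery_action {
            RECOVER_INLINE,           /* o o: copy inline payload from the node page */
            DROP_INLINE_THEN_BLOCKS,  /* o x: clear inline flag, recover data blocks */
            DROP_BLOCKS_THEN_INLINE,  /* x o: truncate blocks, recover inline payload */
            RECOVER_BLOCKS,           /* x x: plain data-block recovery */
    };

    static enum recovery_action pick_action(bool prev_inline, bool next_inline)
    {
            if (prev_inline && next_inline)
                    return RECOVER_INLINE;
            if (prev_inline)
                    return DROP_INLINE_THEN_BLOCKS;
            if (next_inline)
                    return DROP_BLOCKS_THEN_INLINE;
            return RECOVER_BLOCKS;
    }

    int main(void)
    {
            /* x o: flag lost on the inode but present on the fsync'ed node */
            printf("%d\n", pick_action(false, true));
            return 0;
    }

Note that the inline-recovery paths return nonzero from recover_inline_data(), which is what lets do_recover_data() (see the recovery.c hunk below) skip block recovery entirely.
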
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d0eaa9faeca0..4d67ed736dca 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -42,9 +42,11 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
44 if (ri->i_addr[0]) 44 if (ri->i_addr[0])
45 inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); 45 inode->i_rdev =
46 old_decode_dev(le32_to_cpu(ri->i_addr[0]));
46 else 47 else
47 inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); 48 inode->i_rdev =
49 new_decode_dev(le32_to_cpu(ri->i_addr[1]));
48 } 50 }
49} 51}
50 52
@@ -52,11 +54,13 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
52{ 54{
53 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 55 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
54 if (old_valid_dev(inode->i_rdev)) { 56 if (old_valid_dev(inode->i_rdev)) {
55 ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); 57 ri->i_addr[0] =
58 cpu_to_le32(old_encode_dev(inode->i_rdev));
56 ri->i_addr[1] = 0; 59 ri->i_addr[1] = 0;
57 } else { 60 } else {
58 ri->i_addr[0] = 0; 61 ri->i_addr[0] = 0;
59 ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); 62 ri->i_addr[1] =
63 cpu_to_le32(new_encode_dev(inode->i_rdev));
60 ri->i_addr[2] = 0; 64 ri->i_addr[2] = 0;
61 } 65 }
62 } 66 }
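
Context for the i_addr[0]/i_addr[1] split being reflowed here: the old device-number format packs major:minor into 8+8 bits and only fits small numbers, while the new format uses 12+20 bits. A stand-alone sketch of the same packing rules (local re-implementations for illustration, mirroring include/linux/kdev_t.h):

    #include <stdio.h>

    /* Local stand-ins for the kdev_t.h packing rules, illustration only. */
    #define MINORBITS 20
    #define MKDEV(ma, mi) (((ma) << MINORBITS) | (mi))
    #define MAJOR(dev) ((unsigned)((dev) >> MINORBITS))
    #define MINOR(dev) ((unsigned)((dev) & ((1U << MINORBITS) - 1)))

    static int old_valid(unsigned dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; }
    static unsigned short old_enc(unsigned dev) { return (MAJOR(dev) << 8) | MINOR(dev); }
    static unsigned new_enc(unsigned dev)
    {
            unsigned major = MAJOR(dev), minor = MINOR(dev);
            return (minor & 0xff) | (major << 8) | ((minor & ~0xffU) << 12);
    }

    int main(void)
    {
            unsigned small = MKDEV(8, 1);     /* fits old 8:8 format -> i_addr[0] */
            unsigned big = MKDEV(259, 70000); /* needs new 12:20 format -> i_addr[1] */

            printf("8:1       old_valid=%d old_enc=0x%04x\n", old_valid(small), old_enc(small));
            printf("259:70000 old_valid=%d new_enc=0x%08x\n", old_valid(big), new_enc(big));
            return 0;
    }
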
@@ -67,7 +71,6 @@ static int do_read_inode(struct inode *inode)
67 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 71 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
68 struct f2fs_inode_info *fi = F2FS_I(inode); 72 struct f2fs_inode_info *fi = F2FS_I(inode);
69 struct page *node_page; 73 struct page *node_page;
70 struct f2fs_node *rn;
71 struct f2fs_inode *ri; 74 struct f2fs_inode *ri;
72 75
73 /* Check if ino is within scope */ 76 /* Check if ino is within scope */
@@ -81,8 +84,7 @@ static int do_read_inode(struct inode *inode)
81 if (IS_ERR(node_page)) 84 if (IS_ERR(node_page))
82 return PTR_ERR(node_page); 85 return PTR_ERR(node_page);
83 86
84 rn = F2FS_NODE(node_page); 87 ri = F2FS_INODE(node_page);
85 ri = &(rn->i);
86 88
87 inode->i_mode = le16_to_cpu(ri->i_mode); 89 inode->i_mode = le16_to_cpu(ri->i_mode);
88 i_uid_write(inode, le32_to_cpu(ri->i_uid)); 90 i_uid_write(inode, le32_to_cpu(ri->i_uid));
@@ -175,13 +177,11 @@ bad_inode:
175 177
176void update_inode(struct inode *inode, struct page *node_page) 178void update_inode(struct inode *inode, struct page *node_page)
177{ 179{
178 struct f2fs_node *rn;
179 struct f2fs_inode *ri; 180 struct f2fs_inode *ri;
180 181
181 f2fs_wait_on_page_writeback(node_page, NODE, false); 182 f2fs_wait_on_page_writeback(node_page, NODE);
182 183
183 rn = F2FS_NODE(node_page); 184 ri = F2FS_INODE(node_page);
184 ri = &(rn->i);
185 185
186 ri->i_mode = cpu_to_le16(inode->i_mode); 186 ri->i_mode = cpu_to_le16(inode->i_mode);
187 ri->i_advise = F2FS_I(inode)->i_advise; 187 ri->i_advise = F2FS_I(inode)->i_advise;
@@ -281,6 +281,7 @@ void f2fs_evict_inode(struct inode *inode)
281 281
282 f2fs_lock_op(sbi); 282 f2fs_lock_op(sbi);
283 remove_inode_page(inode); 283 remove_inode_page(inode);
284 stat_dec_inline_inode(inode);
284 f2fs_unlock_op(sbi); 285 f2fs_unlock_op(sbi);
285 286
286 sb_end_intwrite(inode->i_sb); 287 sb_end_intwrite(inode->i_sb);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 575adac17f8b..3d32f2969c5e 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -424,11 +424,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
424 } 424 }
425 425
426 f2fs_set_link(new_dir, new_entry, new_page, old_inode); 426 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
427 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
427 428
428 new_inode->i_ctime = CURRENT_TIME; 429 new_inode->i_ctime = CURRENT_TIME;
429 if (old_dir_entry) 430 if (old_dir_entry)
430 drop_nlink(new_inode); 431 drop_nlink(new_inode);
431 drop_nlink(new_inode); 432 drop_nlink(new_inode);
433 mark_inode_dirty(new_inode);
432 434
433 if (!new_inode->i_nlink) 435 if (!new_inode->i_nlink)
434 add_orphan_inode(sbi, new_inode->i_ino); 436 add_orphan_inode(sbi, new_inode->i_ino);
@@ -457,11 +459,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
457 if (old_dir != new_dir) { 459 if (old_dir != new_dir) {
458 f2fs_set_link(old_inode, old_dir_entry, 460 f2fs_set_link(old_inode, old_dir_entry,
459 old_dir_page, new_dir); 461 old_dir_page, new_dir);
462 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
463 update_inode_page(old_inode);
460 } else { 464 } else {
461 kunmap(old_dir_page); 465 kunmap(old_dir_page);
462 f2fs_put_page(old_dir_page, 0); 466 f2fs_put_page(old_dir_page, 0);
463 } 467 }
464 drop_nlink(old_dir); 468 drop_nlink(old_dir);
469 mark_inode_dirty(old_dir);
465 update_inode_page(old_dir); 470 update_inode_page(old_dir);
466 } 471 }
467 472
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4ac4150d421d..b0649b76eb4f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -87,17 +87,19 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
87 */ 87 */
88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) 88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
89{ 89{
90 struct address_space *mapping = sbi->meta_inode->i_mapping; 90 struct address_space *mapping = META_MAPPING(sbi);
91 struct f2fs_nm_info *nm_i = NM_I(sbi); 91 struct f2fs_nm_info *nm_i = NM_I(sbi);
92 struct blk_plug plug;
93 struct page *page; 92 struct page *page;
94 pgoff_t index; 93 pgoff_t index;
95 int i; 94 int i;
95 struct f2fs_io_info fio = {
96 .type = META,
97 .rw = READ_SYNC | REQ_META | REQ_PRIO
98 };
96 99
97 blk_start_plug(&plug);
98 100
99 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { 101 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
100 if (nid >= nm_i->max_nid) 102 if (unlikely(nid >= nm_i->max_nid))
101 nid = 0; 103 nid = 0;
102 index = current_nat_addr(sbi, nid); 104 index = current_nat_addr(sbi, nid);
103 105
@@ -105,15 +107,15 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
105 if (!page) 107 if (!page)
106 continue; 108 continue;
107 if (PageUptodate(page)) { 109 if (PageUptodate(page)) {
110 mark_page_accessed(page);
108 f2fs_put_page(page, 1); 111 f2fs_put_page(page, 1);
109 continue; 112 continue;
110 } 113 }
111 if (f2fs_readpage(sbi, page, index, READ)) 114 f2fs_submit_page_mbio(sbi, page, index, &fio);
112 continue; 115 mark_page_accessed(page);
113
114 f2fs_put_page(page, 0); 116 f2fs_put_page(page, 0);
115 } 117 }
116 blk_finish_plug(&plug); 118 f2fs_submit_merged_bio(sbi, META, READ);
117} 119}
118 120
119static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) 121static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
@@ -391,8 +393,8 @@ got:
391 393
392/* 394/*
393 * Caller should call f2fs_put_dnode(dn). 395 * Caller should call f2fs_put_dnode(dn).
394 * Also, it should grab and release a mutex by calling mutex_lock_op() and 396 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
395 * mutex_unlock_op() only if ro is not set RDONLY_NODE. 397 * f2fs_unlock_op() only if mode is not set to RDONLY_NODE.
396 * In the case of RDONLY_NODE, we don't need to care about mutex. 398 * In the case of RDONLY_NODE, we don't need to care about mutex.
397 */ 399 */
398int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 400int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
@@ -502,7 +504,7 @@ static void truncate_node(struct dnode_of_data *dn)
502 504
503 /* Deallocate node address */ 505 /* Deallocate node address */
504 invalidate_blocks(sbi, ni.blk_addr); 506 invalidate_blocks(sbi, ni.blk_addr);
505 dec_valid_node_count(sbi, dn->inode, 1); 507 dec_valid_node_count(sbi, dn->inode);
506 set_node_addr(sbi, &ni, NULL_ADDR); 508 set_node_addr(sbi, &ni, NULL_ADDR);
507 509
508 if (dn->nid == dn->inode->i_ino) { 510 if (dn->nid == dn->inode->i_ino) {
@@ -516,6 +518,10 @@ invalidate:
516 F2FS_SET_SB_DIRT(sbi); 518 F2FS_SET_SB_DIRT(sbi);
517 519
518 f2fs_put_page(dn->node_page, 1); 520 f2fs_put_page(dn->node_page, 1);
521
522 invalidate_mapping_pages(NODE_MAPPING(sbi),
523 dn->node_page->index, dn->node_page->index);
524
519 dn->node_page = NULL; 525 dn->node_page = NULL;
520 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); 526 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
521} 527}
@@ -631,19 +637,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
631 return 0; 637 return 0;
632 638
633 /* get indirect nodes in the path */ 639 /* get indirect nodes in the path */
634 for (i = 0; i < depth - 1; i++) { 640 for (i = 0; i < idx + 1; i++) {
635 /* reference count will be increased */ 641 /* reference count will be increased */
636 pages[i] = get_node_page(sbi, nid[i]); 642 pages[i] = get_node_page(sbi, nid[i]);
637 if (IS_ERR(pages[i])) { 643 if (IS_ERR(pages[i])) {
638 depth = i + 1;
639 err = PTR_ERR(pages[i]); 644 err = PTR_ERR(pages[i]);
645 idx = i - 1;
640 goto fail; 646 goto fail;
641 } 647 }
642 nid[i + 1] = get_nid(pages[i], offset[i + 1], false); 648 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
643 } 649 }
644 650
645 /* free direct nodes linked to a partial indirect node */ 651 /* free direct nodes linked to a partial indirect node */
646 for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { 652 for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
647 child_nid = get_nid(pages[idx], i, false); 653 child_nid = get_nid(pages[idx], i, false);
648 if (!child_nid) 654 if (!child_nid)
649 continue; 655 continue;
@@ -654,7 +660,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
654 set_nid(pages[idx], i, 0, false); 660 set_nid(pages[idx], i, 0, false);
655 } 661 }
656 662
657 if (offset[depth - 1] == 0) { 663 if (offset[idx + 1] == 0) {
658 dn->node_page = pages[idx]; 664 dn->node_page = pages[idx];
659 dn->nid = nid[idx]; 665 dn->nid = nid[idx];
660 truncate_node(dn); 666 truncate_node(dn);
@@ -662,9 +668,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
662 f2fs_put_page(pages[idx], 1); 668 f2fs_put_page(pages[idx], 1);
663 } 669 }
664 offset[idx]++; 670 offset[idx]++;
665 offset[depth - 1] = 0; 671 offset[idx + 1] = 0;
672 idx--;
666fail: 673fail:
667 for (i = depth - 3; i >= 0; i--) 674 for (i = idx; i >= 0; i--)
668 f2fs_put_page(pages[i], 1); 675 f2fs_put_page(pages[i], 1);
669 676
670 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); 677 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
@@ -678,11 +685,10 @@ fail:
678int truncate_inode_blocks(struct inode *inode, pgoff_t from) 685int truncate_inode_blocks(struct inode *inode, pgoff_t from)
679{ 686{
680 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 687 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
681 struct address_space *node_mapping = sbi->node_inode->i_mapping;
682 int err = 0, cont = 1; 688 int err = 0, cont = 1;
683 int level, offset[4], noffset[4]; 689 int level, offset[4], noffset[4];
684 unsigned int nofs = 0; 690 unsigned int nofs = 0;
685 struct f2fs_node *rn; 691 struct f2fs_inode *ri;
686 struct dnode_of_data dn; 692 struct dnode_of_data dn;
687 struct page *page; 693 struct page *page;
688 694
@@ -699,7 +705,7 @@ restart:
699 set_new_dnode(&dn, inode, page, NULL, 0); 705 set_new_dnode(&dn, inode, page, NULL, 0);
700 unlock_page(page); 706 unlock_page(page);
701 707
702 rn = F2FS_NODE(page); 708 ri = F2FS_INODE(page);
703 switch (level) { 709 switch (level) {
704 case 0: 710 case 0:
705 case 1: 711 case 1:
@@ -709,7 +715,7 @@ restart:
709 nofs = noffset[1]; 715 nofs = noffset[1];
710 if (!offset[level - 1]) 716 if (!offset[level - 1])
711 goto skip_partial; 717 goto skip_partial;
712 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 718 err = truncate_partial_nodes(&dn, ri, offset, level);
713 if (err < 0 && err != -ENOENT) 719 if (err < 0 && err != -ENOENT)
714 goto fail; 720 goto fail;
715 nofs += 1 + NIDS_PER_BLOCK; 721 nofs += 1 + NIDS_PER_BLOCK;
@@ -718,7 +724,7 @@ restart:
718 nofs = 5 + 2 * NIDS_PER_BLOCK; 724 nofs = 5 + 2 * NIDS_PER_BLOCK;
719 if (!offset[level - 1]) 725 if (!offset[level - 1])
720 goto skip_partial; 726 goto skip_partial;
721 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 727 err = truncate_partial_nodes(&dn, ri, offset, level);
722 if (err < 0 && err != -ENOENT) 728 if (err < 0 && err != -ENOENT)
723 goto fail; 729 goto fail;
724 break; 730 break;
@@ -728,7 +734,7 @@ restart:
728 734
729skip_partial: 735skip_partial:
730 while (cont) { 736 while (cont) {
731 dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); 737 dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
732 switch (offset[0]) { 738 switch (offset[0]) {
733 case NODE_DIR1_BLOCK: 739 case NODE_DIR1_BLOCK:
734 case NODE_DIR2_BLOCK: 740 case NODE_DIR2_BLOCK:
@@ -751,14 +757,14 @@ skip_partial:
751 if (err < 0 && err != -ENOENT) 757 if (err < 0 && err != -ENOENT)
752 goto fail; 758 goto fail;
753 if (offset[1] == 0 && 759 if (offset[1] == 0 &&
754 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { 760 ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
755 lock_page(page); 761 lock_page(page);
756 if (page->mapping != node_mapping) { 762 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
757 f2fs_put_page(page, 1); 763 f2fs_put_page(page, 1);
758 goto restart; 764 goto restart;
759 } 765 }
760 wait_on_page_writeback(page); 766 wait_on_page_writeback(page);
761 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 767 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
762 set_page_dirty(page); 768 set_page_dirty(page);
763 unlock_page(page); 769 unlock_page(page);
764 } 770 }
@@ -794,38 +800,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
794 set_new_dnode(&dn, inode, page, npage, nid); 800 set_new_dnode(&dn, inode, page, npage, nid);
795 801
796 if (page) 802 if (page)
797 dn.inode_page_locked = 1; 803 dn.inode_page_locked = true;
798 truncate_node(&dn); 804 truncate_node(&dn);
799 return 0; 805 return 0;
800} 806}
801 807
802/* 808/*
803 * Caller should grab and release a mutex by calling mutex_lock_op() and 809 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
804 * mutex_unlock_op(). 810 * f2fs_unlock_op().
805 */ 811 */
806int remove_inode_page(struct inode *inode) 812void remove_inode_page(struct inode *inode)
807{ 813{
808 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 814 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
809 struct page *page; 815 struct page *page;
810 nid_t ino = inode->i_ino; 816 nid_t ino = inode->i_ino;
811 struct dnode_of_data dn; 817 struct dnode_of_data dn;
812 int err;
813 818
814 page = get_node_page(sbi, ino); 819 page = get_node_page(sbi, ino);
815 if (IS_ERR(page)) 820 if (IS_ERR(page))
816 return PTR_ERR(page); 821 return;
817 822
818 err = truncate_xattr_node(inode, page); 823 if (truncate_xattr_node(inode, page)) {
819 if (err) {
820 f2fs_put_page(page, 1); 824 f2fs_put_page(page, 1);
821 return err; 825 return;
822 } 826 }
823
824 /* 0 is possible, if f2fs_new_inode() failed */ 827 /* 0 is possible, if f2fs_new_inode() failed */
825 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 828 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
826 set_new_dnode(&dn, inode, page, page, ino); 829 set_new_dnode(&dn, inode, page, page, ino);
827 truncate_node(&dn); 830 truncate_node(&dn);
828 return 0;
829} 831}
830 832
831struct page *new_inode_page(struct inode *inode, const struct qstr *name) 833struct page *new_inode_page(struct inode *inode, const struct qstr *name)
@@ -843,19 +845,18 @@ struct page *new_node_page(struct dnode_of_data *dn,
843 unsigned int ofs, struct page *ipage) 845 unsigned int ofs, struct page *ipage)
844{ 846{
845 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 847 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
846 struct address_space *mapping = sbi->node_inode->i_mapping;
847 struct node_info old_ni, new_ni; 848 struct node_info old_ni, new_ni;
848 struct page *page; 849 struct page *page;
849 int err; 850 int err;
850 851
851 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 852 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
852 return ERR_PTR(-EPERM); 853 return ERR_PTR(-EPERM);
853 854
854 page = grab_cache_page(mapping, dn->nid); 855 page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
855 if (!page) 856 if (!page)
856 return ERR_PTR(-ENOMEM); 857 return ERR_PTR(-ENOMEM);
857 858
858 if (!inc_valid_node_count(sbi, dn->inode, 1)) { 859 if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
859 err = -ENOSPC; 860 err = -ENOSPC;
860 goto fail; 861 goto fail;
861 } 862 }
@@ -898,14 +899,14 @@ fail:
898 * LOCKED_PAGE: f2fs_put_page(page, 1) 899 * LOCKED_PAGE: f2fs_put_page(page, 1)
899 * error: nothing 900 * error: nothing
900 */ 901 */
901static int read_node_page(struct page *page, int type) 902static int read_node_page(struct page *page, int rw)
902{ 903{
903 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 904 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
904 struct node_info ni; 905 struct node_info ni;
905 906
906 get_node_info(sbi, page->index, &ni); 907 get_node_info(sbi, page->index, &ni);
907 908
908 if (ni.blk_addr == NULL_ADDR) { 909 if (unlikely(ni.blk_addr == NULL_ADDR)) {
909 f2fs_put_page(page, 1); 910 f2fs_put_page(page, 1);
910 return -ENOENT; 911 return -ENOENT;
911 } 912 }
@@ -913,7 +914,7 @@ static int read_node_page(struct page *page, int type)
913 if (PageUptodate(page)) 914 if (PageUptodate(page))
914 return LOCKED_PAGE; 915 return LOCKED_PAGE;
915 916
916 return f2fs_readpage(sbi, page, ni.blk_addr, type); 917 return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
917} 918}
918 919
919/* 920/*
@@ -921,18 +922,17 @@ static int read_node_page(struct page *page, int type)
921 */ 922 */
922void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) 923void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
923{ 924{
924 struct address_space *mapping = sbi->node_inode->i_mapping;
925 struct page *apage; 925 struct page *apage;
926 int err; 926 int err;
927 927
928 apage = find_get_page(mapping, nid); 928 apage = find_get_page(NODE_MAPPING(sbi), nid);
929 if (apage && PageUptodate(apage)) { 929 if (apage && PageUptodate(apage)) {
930 f2fs_put_page(apage, 0); 930 f2fs_put_page(apage, 0);
931 return; 931 return;
932 } 932 }
933 f2fs_put_page(apage, 0); 933 f2fs_put_page(apage, 0);
934 934
935 apage = grab_cache_page(mapping, nid); 935 apage = grab_cache_page(NODE_MAPPING(sbi), nid);
936 if (!apage) 936 if (!apage)
937 return; 937 return;
938 938
@@ -945,11 +945,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
945 945
946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) 946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
947{ 947{
948 struct address_space *mapping = sbi->node_inode->i_mapping;
949 struct page *page; 948 struct page *page;
950 int err; 949 int err;
951repeat: 950repeat:
952 page = grab_cache_page(mapping, nid); 951 page = grab_cache_page(NODE_MAPPING(sbi), nid);
953 if (!page) 952 if (!page)
954 return ERR_PTR(-ENOMEM); 953 return ERR_PTR(-ENOMEM);
955 954
@@ -960,11 +959,11 @@ repeat:
960 goto got_it; 959 goto got_it;
961 960
962 lock_page(page); 961 lock_page(page);
963 if (!PageUptodate(page)) { 962 if (unlikely(!PageUptodate(page))) {
964 f2fs_put_page(page, 1); 963 f2fs_put_page(page, 1);
965 return ERR_PTR(-EIO); 964 return ERR_PTR(-EIO);
966 } 965 }
967 if (page->mapping != mapping) { 966 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
968 f2fs_put_page(page, 1); 967 f2fs_put_page(page, 1);
969 goto repeat; 968 goto repeat;
970 } 969 }
@@ -981,7 +980,6 @@ got_it:
981struct page *get_node_page_ra(struct page *parent, int start) 980struct page *get_node_page_ra(struct page *parent, int start)
982{ 981{
983 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 982 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
984 struct address_space *mapping = sbi->node_inode->i_mapping;
985 struct blk_plug plug; 983 struct blk_plug plug;
986 struct page *page; 984 struct page *page;
987 int err, i, end; 985 int err, i, end;
@@ -992,7 +990,7 @@ struct page *get_node_page_ra(struct page *parent, int start)
992 if (!nid) 990 if (!nid)
993 return ERR_PTR(-ENOENT); 991 return ERR_PTR(-ENOENT);
994repeat: 992repeat:
995 page = grab_cache_page(mapping, nid); 993 page = grab_cache_page(NODE_MAPPING(sbi), nid);
996 if (!page) 994 if (!page)
997 return ERR_PTR(-ENOMEM); 995 return ERR_PTR(-ENOMEM);
998 996
@@ -1017,12 +1015,12 @@ repeat:
1017 blk_finish_plug(&plug); 1015 blk_finish_plug(&plug);
1018 1016
1019 lock_page(page); 1017 lock_page(page);
1020 if (page->mapping != mapping) { 1018 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1021 f2fs_put_page(page, 1); 1019 f2fs_put_page(page, 1);
1022 goto repeat; 1020 goto repeat;
1023 } 1021 }
1024page_hit: 1022page_hit:
1025 if (!PageUptodate(page)) { 1023 if (unlikely(!PageUptodate(page))) {
1026 f2fs_put_page(page, 1); 1024 f2fs_put_page(page, 1);
1027 return ERR_PTR(-EIO); 1025 return ERR_PTR(-EIO);
1028 } 1026 }
@@ -1048,7 +1046,6 @@ void sync_inode_page(struct dnode_of_data *dn)
1048int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, 1046int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
1049 struct writeback_control *wbc) 1047 struct writeback_control *wbc)
1050{ 1048{
1051 struct address_space *mapping = sbi->node_inode->i_mapping;
1052 pgoff_t index, end; 1049 pgoff_t index, end;
1053 struct pagevec pvec; 1050 struct pagevec pvec;
1054 int step = ino ? 2 : 0; 1051 int step = ino ? 2 : 0;
@@ -1062,7 +1059,7 @@ next_step:
1062 1059
1063 while (index <= end) { 1060 while (index <= end) {
1064 int i, nr_pages; 1061 int i, nr_pages;
1065 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1062 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1066 PAGECACHE_TAG_DIRTY, 1063 PAGECACHE_TAG_DIRTY,
1067 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 1064 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1068 if (nr_pages == 0) 1065 if (nr_pages == 0)
@@ -1095,7 +1092,7 @@ next_step:
1095 else if (!trylock_page(page)) 1092 else if (!trylock_page(page))
1096 continue; 1093 continue;
1097 1094
1098 if (unlikely(page->mapping != mapping)) { 1095 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1099continue_unlock: 1096continue_unlock:
1100 unlock_page(page); 1097 unlock_page(page);
1101 continue; 1098 continue;
@@ -1122,7 +1119,7 @@ continue_unlock:
1122 set_fsync_mark(page, 0); 1119 set_fsync_mark(page, 0);
1123 set_dentry_mark(page, 0); 1120 set_dentry_mark(page, 0);
1124 } 1121 }
1125 mapping->a_ops->writepage(page, wbc); 1122 NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
1126 wrote++; 1123 wrote++;
1127 1124
1128 if (--wbc->nr_to_write == 0) 1125 if (--wbc->nr_to_write == 0)
@@ -1143,31 +1140,31 @@ continue_unlock:
1143 } 1140 }
1144 1141
1145 if (wrote) 1142 if (wrote)
1146 f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); 1143 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1147
1148 return nwritten; 1144 return nwritten;
1149} 1145}
1150 1146
1151int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) 1147int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1152{ 1148{
1153 struct address_space *mapping = sbi->node_inode->i_mapping;
1154 pgoff_t index = 0, end = LONG_MAX; 1149 pgoff_t index = 0, end = LONG_MAX;
1155 struct pagevec pvec; 1150 struct pagevec pvec;
1156 int nr_pages;
1157 int ret2 = 0, ret = 0; 1151 int ret2 = 0, ret = 0;
1158 1152
1159 pagevec_init(&pvec, 0); 1153 pagevec_init(&pvec, 0);
1160 while ((index <= end) && 1154
1161 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1155 while (index <= end) {
1162 PAGECACHE_TAG_WRITEBACK, 1156 int i, nr_pages;
1163 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 1157 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1164 unsigned i; 1158 PAGECACHE_TAG_WRITEBACK,
1159 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1160 if (nr_pages == 0)
1161 break;
1165 1162
1166 for (i = 0; i < nr_pages; i++) { 1163 for (i = 0; i < nr_pages; i++) {
1167 struct page *page = pvec.pages[i]; 1164 struct page *page = pvec.pages[i];
1168 1165
1169 /* until radix tree lookup accepts end_index */ 1166 /* until radix tree lookup accepts end_index */
1170 if (page->index > end) 1167 if (unlikely(page->index > end))
1171 continue; 1168 continue;
1172 1169
1173 if (ino && ino_of_node(page) == ino) { 1170 if (ino && ino_of_node(page) == ino) {
@@ -1180,9 +1177,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1180 cond_resched(); 1177 cond_resched();
1181 } 1178 }
1182 1179
1183 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 1180 if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
1184 ret2 = -ENOSPC; 1181 ret2 = -ENOSPC;
1185 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 1182 if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
1186 ret2 = -EIO; 1183 ret2 = -EIO;
1187 if (!ret) 1184 if (!ret)
1188 ret = ret2; 1185 ret = ret2;
@@ -1196,8 +1193,12 @@ static int f2fs_write_node_page(struct page *page,
1196 nid_t nid; 1193 nid_t nid;
1197 block_t new_addr; 1194 block_t new_addr;
1198 struct node_info ni; 1195 struct node_info ni;
1196 struct f2fs_io_info fio = {
1197 .type = NODE,
1198 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
1199 };
1199 1200
1200 if (sbi->por_doing) 1201 if (unlikely(sbi->por_doing))
1201 goto redirty_out; 1202 goto redirty_out;
1202 1203
1203 wait_on_page_writeback(page); 1204 wait_on_page_writeback(page);
@@ -1209,7 +1210,7 @@ static int f2fs_write_node_page(struct page *page,
1209 get_node_info(sbi, nid, &ni); 1210 get_node_info(sbi, nid, &ni);
1210 1211
1211 /* This page is already truncated */ 1212 /* This page is already truncated */
1212 if (ni.blk_addr == NULL_ADDR) { 1213 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1213 dec_page_count(sbi, F2FS_DIRTY_NODES); 1214 dec_page_count(sbi, F2FS_DIRTY_NODES);
1214 unlock_page(page); 1215 unlock_page(page);
1215 return 0; 1216 return 0;
@@ -1220,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page,
1220 1221
1221 mutex_lock(&sbi->node_write); 1222 mutex_lock(&sbi->node_write);
1222 set_page_writeback(page); 1223 set_page_writeback(page);
1223 write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); 1224 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1224 set_node_addr(sbi, &ni, new_addr); 1225 set_node_addr(sbi, &ni, new_addr);
1225 dec_page_count(sbi, F2FS_DIRTY_NODES); 1226 dec_page_count(sbi, F2FS_DIRTY_NODES);
1226 mutex_unlock(&sbi->node_write); 1227 mutex_unlock(&sbi->node_write);
@@ -1255,6 +1256,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1255 1256
1256 /* if mounting is failed, skip writing node pages */ 1257 /* if mounting is failed, skip writing node pages */
1257 wbc->nr_to_write = 3 * max_hw_blocks(sbi); 1258 wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1259 wbc->sync_mode = WB_SYNC_NONE;
1258 sync_node_pages(sbi, 0, wbc); 1260 sync_node_pages(sbi, 0, wbc);
1259 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - 1261 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
1260 wbc->nr_to_write); 1262 wbc->nr_to_write);
@@ -1333,7 +1335,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1333 return -1; 1335 return -1;
1334 1336
1335 /* 0 nid should not be used */ 1337 /* 0 nid should not be used */
1336 if (nid == 0) 1338 if (unlikely(nid == 0))
1337 return 0; 1339 return 0;
1338 1340
1339 if (build) { 1341 if (build) {
@@ -1386,7 +1388,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i,
1386 1388
1387 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { 1389 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1388 1390
1389 if (start_nid >= nm_i->max_nid) 1391 if (unlikely(start_nid >= nm_i->max_nid))
1390 break; 1392 break;
1391 1393
1392 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1394 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
@@ -1420,7 +1422,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1420 f2fs_put_page(page, 1); 1422 f2fs_put_page(page, 1);
1421 1423
1422 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); 1424 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1423 if (nid >= nm_i->max_nid) 1425 if (unlikely(nid >= nm_i->max_nid))
1424 nid = 0; 1426 nid = 0;
1425 1427
1426 if (i++ == FREE_NID_PAGES) 1428 if (i++ == FREE_NID_PAGES)
@@ -1454,7 +1456,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1454 struct free_nid *i = NULL; 1456 struct free_nid *i = NULL;
1455 struct list_head *this; 1457 struct list_head *this;
1456retry: 1458retry:
1457 if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) 1459 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
1458 return false; 1460 return false;
1459 1461
1460 spin_lock(&nm_i->free_nid_list_lock); 1462 spin_lock(&nm_i->free_nid_list_lock);
@@ -1535,13 +1537,12 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1535 1537
1536int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1538int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1537{ 1539{
1538 struct address_space *mapping = sbi->node_inode->i_mapping; 1540 struct f2fs_inode *src, *dst;
1539 struct f2fs_node *src, *dst;
1540 nid_t ino = ino_of_node(page); 1541 nid_t ino = ino_of_node(page);
1541 struct node_info old_ni, new_ni; 1542 struct node_info old_ni, new_ni;
1542 struct page *ipage; 1543 struct page *ipage;
1543 1544
1544 ipage = grab_cache_page(mapping, ino); 1545 ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
1545 if (!ipage) 1546 if (!ipage)
1546 return -ENOMEM; 1547 return -ENOMEM;
1547 1548
@@ -1552,19 +1553,19 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1552 SetPageUptodate(ipage); 1553 SetPageUptodate(ipage);
1553 fill_node_footer(ipage, ino, ino, 0, true); 1554 fill_node_footer(ipage, ino, ino, 0, true);
1554 1555
1555 src = F2FS_NODE(page); 1556 src = F2FS_INODE(page);
1556 dst = F2FS_NODE(ipage); 1557 dst = F2FS_INODE(ipage);
1557 1558
1558 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); 1559 memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
1559 dst->i.i_size = 0; 1560 dst->i_size = 0;
1560 dst->i.i_blocks = cpu_to_le64(1); 1561 dst->i_blocks = cpu_to_le64(1);
1561 dst->i.i_links = cpu_to_le32(1); 1562 dst->i_links = cpu_to_le32(1);
1562 dst->i.i_xattr_nid = 0; 1563 dst->i_xattr_nid = 0;
1563 1564
1564 new_ni = old_ni; 1565 new_ni = old_ni;
1565 new_ni.ino = ino; 1566 new_ni.ino = ino;
1566 1567
1567 if (!inc_valid_node_count(sbi, NULL, 1)) 1568 if (unlikely(!inc_valid_node_count(sbi, NULL)))
1568 WARN_ON(1); 1569 WARN_ON(1);
1569 set_node_addr(sbi, &new_ni, NEW_ADDR); 1570 set_node_addr(sbi, &new_ni, NEW_ADDR);
1570 inc_valid_inode_count(sbi); 1571 inc_valid_inode_count(sbi);
@@ -1572,47 +1573,88 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1572 return 0; 1573 return 0;
1573} 1574}
1574 1575
1576/*
1577 * ra_sum_pages() merges contiguous pages into one bio and submits it.
1578 * These pre-read pages are linked in the pages list.
1579 */
1580static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1581 int start, int nrpages)
1582{
1583 struct page *page;
1584 int page_idx = start;
1585 struct f2fs_io_info fio = {
1586 .type = META,
1587 .rw = READ_SYNC | REQ_META | REQ_PRIO
1588 };
1589
1590 for (; page_idx < start + nrpages; page_idx++) {
1591 /* allocate a temporary page to read the node summary info */
1592 page = alloc_page(GFP_F2FS_ZERO);
1593 if (!page) {
1594 struct page *tmp;
1595 list_for_each_entry_safe(page, tmp, pages, lru) {
1596 list_del(&page->lru);
1597 unlock_page(page);
1598 __free_pages(page, 0);
1599 }
1600 return -ENOMEM;
1601 }
1602
1603 lock_page(page);
1604 page->index = page_idx;
1605 list_add_tail(&page->lru, pages);
1606 }
1607
1608 list_for_each_entry(page, pages, lru)
1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610
1611 f2fs_submit_merged_bio(sbi, META, READ);
1612 return 0;
1613}
1614
1575int restore_node_summary(struct f2fs_sb_info *sbi, 1615int restore_node_summary(struct f2fs_sb_info *sbi,
1576 unsigned int segno, struct f2fs_summary_block *sum) 1616 unsigned int segno, struct f2fs_summary_block *sum)
1577{ 1617{
1578 struct f2fs_node *rn; 1618 struct f2fs_node *rn;
1579 struct f2fs_summary *sum_entry; 1619 struct f2fs_summary *sum_entry;
1580 struct page *page; 1620 struct page *page, *tmp;
1581 block_t addr; 1621 block_t addr;
1582 int i, last_offset; 1622 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1583 1623 int i, last_offset, nrpages, err = 0;
1584 /* alloc temporal page for read node */ 1624 LIST_HEAD(page_list);
1585 page = alloc_page(GFP_NOFS | __GFP_ZERO);
1586 if (!page)
1587 return -ENOMEM;
1588 lock_page(page);
1589 1625
1590 /* scan the node segment */ 1626 /* scan the node segment */
1591 last_offset = sbi->blocks_per_seg; 1627 last_offset = sbi->blocks_per_seg;
1592 addr = START_BLOCK(sbi, segno); 1628 addr = START_BLOCK(sbi, segno);
1593 sum_entry = &sum->entries[0]; 1629 sum_entry = &sum->entries[0];
1594 1630
1595 for (i = 0; i < last_offset; i++, sum_entry++) { 1631 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
1596 /* 1632 nrpages = min(last_offset - i, bio_blocks);
1597 * In order to read next node page,
1598 * we must clear PageUptodate flag.
1599 */
1600 ClearPageUptodate(page);
1601 1633
1602 if (f2fs_readpage(sbi, page, addr, READ_SYNC)) 1634 /* read ahead node pages */
1603 goto out; 1635 err = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 if (err)
1637 return err;
1604 1638
1605 lock_page(page); 1639 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1606 rn = F2FS_NODE(page); 1640
1607 sum_entry->nid = rn->footer.nid; 1641 lock_page(page);
1608 sum_entry->version = 0; 1642 if (unlikely(!PageUptodate(page))) {
1609 sum_entry->ofs_in_node = 0; 1643 err = -EIO;
1610 addr++; 1644 } else {
1645 rn = F2FS_NODE(page);
1646 sum_entry->nid = rn->footer.nid;
1647 sum_entry->version = 0;
1648 sum_entry->ofs_in_node = 0;
1649 sum_entry++;
1650 }
1651
1652 list_del(&page->lru);
1653 unlock_page(page);
1654 __free_pages(page, 0);
1655 }
1611 } 1656 }
1612 unlock_page(page); 1657 return err;
1613out:
1614 __free_pages(page, 0);
1615 return 0;
1616} 1658}
1617 1659
1618static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) 1660static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
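
Stepping back from the restore_node_summary() rewrite above: it replaces one synchronous read per summary block with read-ahead in bio-sized windows (nrpages = min(last_offset - i, bio_blocks)). The batching pattern itself is generic; a minimal sketch with illustrative names, not the f2fs API:

    #include <stdio.h>

    /* Walk [0, total) in windows of at most batch items, the way
     * restore_node_summary() now walks a segment in MAX_BIO_BLOCKS-sized
     * read-ahead windows. */
    static void process_in_batches(int total, int batch)
    {
            for (int i = 0; i < total; i += batch) {
                    int n = (total - i < batch) ? total - i : batch;
                    printf("read ahead blocks [%d, %d)\n", i, i + n);
                    /* ...submit one merged bio for these n blocks, then consume them... */
            }
    }

    int main(void)
    {
            process_in_batches(512, 16); /* e.g. 512 blocks per segment, 16-block bios */
            return 0;
    }
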
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3496bb3e15dc..c4c79885c993 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -224,7 +224,13 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
224 * | `- direct node (5 + N => 5 + 2N - 1) 224 * | `- direct node (5 + N => 5 + 2N - 1)
225 * `- double indirect node (5 + 2N) 225 * `- double indirect node (5 + 2N)
226 * `- indirect node (6 + 2N) 226 * `- indirect node (6 + 2N)
227 * `- direct node (x(N + 1)) 227 * `- direct node
228 * ......
229 * `- indirect node ((6 + 2N) + x(N + 1))
230 * `- direct node
231 * ......
232 * `- indirect node ((6 + 2N) + (N - 1)(N + 1))
233 * `- direct node
228 */ 234 */
229static inline bool IS_DNODE(struct page *node_page) 235static inline bool IS_DNODE(struct page *node_page)
230{ 236{
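
To make the extended offset formulas in the comment concrete, assuming the common 4 KB-block constants of N = NIDS_PER_BLOCK = 1018 slots per node block and 923 direct pointers in the inode (neither shown in this hunk), a back-of-the-envelope sketch of the tree's capacity:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed defaults for a 4 KB block: 923 direct pointers in the
             * inode (DEF_ADDRS_PER_INODE), N = 1018 (NIDS_PER_BLOCK). */
            long long n = 1018, inode_ptrs = 923;

            long long blocks = inode_ptrs   /* direct, in the inode */
                    + 2 * n                 /* two direct node blocks */
                    + 2 * n * n             /* two indirect nodes */
                    + n * n * n;            /* one double-indirect node */

            printf("max addressable blocks: %lld (~%.2f TiB at 4 KiB/block)\n",
                   blocks, blocks * 4096.0 / (1024.0 * 1024 * 1024 * 1024));
            return 0;
    }
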
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fdc81161f254..976a7a934db5 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 struct f2fs_node *raw_node = F2FS_NODE(ipage); 43 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
44 struct f2fs_inode *raw_inode = &(raw_node->i);
45 nid_t pino = le32_to_cpu(raw_inode->i_pino); 44 nid_t pino = le32_to_cpu(raw_inode->i_pino);
46 struct f2fs_dir_entry *de; 45 struct f2fs_dir_entry *de;
47 struct qstr name; 46 struct qstr name;
@@ -62,6 +61,12 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
62 61
63 name.len = le32_to_cpu(raw_inode->i_namelen); 62 name.len = le32_to_cpu(raw_inode->i_namelen);
64 name.name = raw_inode->i_name; 63 name.name = raw_inode->i_name;
64
65 if (unlikely(name.len > F2FS_NAME_LEN)) {
66 WARN_ON(1);
67 err = -ENAMETOOLONG;
68 goto out;
69 }
65retry: 70retry:
66 de = f2fs_find_entry(dir, &name, &page); 71 de = f2fs_find_entry(dir, &name, &page);
67 if (de && inode->i_ino == le32_to_cpu(de->ino)) 72 if (de && inode->i_ino == le32_to_cpu(de->ino))
@@ -90,17 +95,16 @@ out_unmap_put:
90 kunmap(page); 95 kunmap(page);
91 f2fs_put_page(page, 0); 96 f2fs_put_page(page, 0);
92out: 97out:
93 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " 98 f2fs_msg(inode->i_sb, KERN_NOTICE,
94 "ino = %x, name = %s, dir = %lx, err = %d", 99 "%s: ino = %x, name = %s, dir = %lx, err = %d",
95 ino_of_node(ipage), raw_inode->i_name, 100 __func__, ino_of_node(ipage), raw_inode->i_name,
96 IS_ERR(dir) ? 0 : dir->i_ino, err); 101 IS_ERR(dir) ? 0 : dir->i_ino, err);
97 return err; 102 return err;
98} 103}
99 104
100static int recover_inode(struct inode *inode, struct page *node_page) 105static int recover_inode(struct inode *inode, struct page *node_page)
101{ 106{
102 struct f2fs_node *raw_node = F2FS_NODE(node_page); 107 struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
103 struct f2fs_inode *raw_inode = &(raw_node->i);
104 108
105 if (!IS_INODE(node_page)) 109 if (!IS_INODE(node_page))
106 return 0; 110 return 0;
@@ -143,9 +147,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
143 while (1) { 147 while (1) {
144 struct fsync_inode_entry *entry; 148 struct fsync_inode_entry *entry;
145 149
146 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 150 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
147 if (err) 151 if (err)
148 goto out; 152 return err;
149 153
150 lock_page(page); 154 lock_page(page);
151 155
@@ -191,9 +195,10 @@ next:
191 /* check next segment */ 195 /* check next segment */
192 blkaddr = next_blkaddr_of_node(page); 196 blkaddr = next_blkaddr_of_node(page);
193 } 197 }
198
194 unlock_page(page); 199 unlock_page(page);
195out:
196 __free_pages(page, 0); 200 __free_pages(page, 0);
201
197 return err; 202 return err;
198} 203}
199 204
@@ -293,6 +298,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
293 struct node_info ni; 298 struct node_info ni;
294 int err = 0, recovered = 0; 299 int err = 0, recovered = 0;
295 300
301 if (recover_inline_data(inode, page))
302 goto out;
303
296 start = start_bidx_of_node(ofs_of_node(page), fi); 304 start = start_bidx_of_node(ofs_of_node(page), fi);
297 if (IS_INODE(page)) 305 if (IS_INODE(page))
298 end = start + ADDRS_PER_INODE(fi); 306 end = start + ADDRS_PER_INODE(fi);
@@ -300,12 +308,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 end = start + ADDRS_PER_BLOCK; 308 end = start + ADDRS_PER_BLOCK;
301 309
302 f2fs_lock_op(sbi); 310 f2fs_lock_op(sbi);
311
303 set_new_dnode(&dn, inode, NULL, NULL, 0); 312 set_new_dnode(&dn, inode, NULL, NULL, 0);
304 313
305 err = get_dnode_of_data(&dn, start, ALLOC_NODE); 314 err = get_dnode_of_data(&dn, start, ALLOC_NODE);
306 if (err) { 315 if (err) {
307 f2fs_unlock_op(sbi); 316 f2fs_unlock_op(sbi);
308 return err; 317 goto out;
309 } 318 }
310 319
311 wait_on_page_writeback(dn.node_page); 320 wait_on_page_writeback(dn.node_page);
@@ -356,10 +365,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
356err: 365err:
357 f2fs_put_dnode(&dn); 366 f2fs_put_dnode(&dn);
358 f2fs_unlock_op(sbi); 367 f2fs_unlock_op(sbi);
359 368out:
360 f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " 369 f2fs_msg(sbi->sb, KERN_NOTICE,
361 "recovered_data = %d blocks, err = %d", 370 "recover_data: ino = %lx, recovered = %d blocks, err = %d",
362 inode->i_ino, recovered, err); 371 inode->i_ino, recovered, err);
363 return err; 372 return err;
364} 373}
365 374
@@ -377,7 +386,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
377 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 386 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
378 387
379 /* read node page */ 388 /* read node page */
380 page = alloc_page(GFP_NOFS | __GFP_ZERO); 389 page = alloc_page(GFP_F2FS_ZERO);
381 if (!page) 390 if (!page)
382 return -ENOMEM; 391 return -ENOMEM;
383 392
@@ -386,9 +395,9 @@ static int recover_data(struct f2fs_sb_info *sbi,
386 while (1) { 395 while (1) {
387 struct fsync_inode_entry *entry; 396 struct fsync_inode_entry *entry;
388 397
389 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 398 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
390 if (err) 399 if (err)
391 goto out; 400 return err;
392 401
393 lock_page(page); 402 lock_page(page);
394 403
@@ -412,8 +421,8 @@ next:
412 /* check next segment */ 421 /* check next segment */
413 blkaddr = next_blkaddr_of_node(page); 422 blkaddr = next_blkaddr_of_node(page);
414 } 423 }
424
415 unlock_page(page); 425 unlock_page(page);
416out:
417 __free_pages(page, 0); 426 __free_pages(page, 0);
418 427
419 if (!err) 428 if (!err)
@@ -429,7 +438,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
429 438
430 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
431 sizeof(struct fsync_inode_entry), NULL); 440 sizeof(struct fsync_inode_entry), NULL);
432 if (unlikely(!fsync_entry_slab)) 441 if (!fsync_entry_slab)
433 return -ENOMEM; 442 return -ENOMEM;
434 443
435 INIT_LIST_HEAD(&inode_list); 444 INIT_LIST_HEAD(&inode_list);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fa284d397199..7caac5f2ca9e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,12 +14,163 @@
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/prefetch.h> 15#include <linux/prefetch.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/swap.h>
17 18
18#include "f2fs.h" 19#include "f2fs.h"
19#include "segment.h" 20#include "segment.h"
20#include "node.h" 21#include "node.h"
21#include <trace/events/f2fs.h> 22#include <trace/events/f2fs.h>
22 23
24#define __reverse_ffz(x) __reverse_ffs(~(x))
25
26static struct kmem_cache *discard_entry_slab;
27
28/*
29 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
30 * MSB and LSB are reversed in a byte by f2fs_set_bit.
31 */
32static inline unsigned long __reverse_ffs(unsigned long word)
33{
34 int num = 0;
35
36#if BITS_PER_LONG == 64
37 if ((word & 0xffffffff) == 0) {
38 num += 32;
39 word >>= 32;
40 }
41#endif
42 if ((word & 0xffff) == 0) {
43 num += 16;
44 word >>= 16;
45 }
46 if ((word & 0xff) == 0) {
47 num += 8;
48 word >>= 8;
49 }
50 if ((word & 0xf0) == 0)
51 num += 4;
52 else
53 word >>= 4;
54 if ((word & 0xc) == 0)
55 num += 2;
56 else
57 word >>= 2;
58 if ((word & 0x2) == 0)
59 num += 1;
60 return num;
61}
62
63/*
64 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
65 * f2fs_set_bit makes MSB and LSB reversed in a byte.
66 * Example:
67 * LSB <--> MSB
68 * f2fs_set_bit(0, bitmap) => 0000 0001
69 * f2fs_set_bit(7, bitmap) => 1000 0000
70 */
71static unsigned long __find_rev_next_bit(const unsigned long *addr,
72 unsigned long size, unsigned long offset)
73{
74 const unsigned long *p = addr + BIT_WORD(offset);
75 unsigned long result = offset & ~(BITS_PER_LONG - 1);
76 unsigned long tmp;
77 unsigned long mask, submask;
78 unsigned long quot, rest;
79
80 if (offset >= size)
81 return size;
82
83 size -= result;
84 offset %= BITS_PER_LONG;
85 if (!offset)
86 goto aligned;
87
88 tmp = *(p++);
89 quot = (offset >> 3) << 3;
90 rest = offset & 0x7;
91 mask = ~0UL << quot;
92 submask = (unsigned char)(0xff << rest) >> rest;
93 submask <<= quot;
94 mask &= submask;
95 tmp &= mask;
96 if (size < BITS_PER_LONG)
97 goto found_first;
98 if (tmp)
99 goto found_middle;
100
101 size -= BITS_PER_LONG;
102 result += BITS_PER_LONG;
103aligned:
104 while (size & ~(BITS_PER_LONG-1)) {
105 tmp = *(p++);
106 if (tmp)
107 goto found_middle;
108 result += BITS_PER_LONG;
109 size -= BITS_PER_LONG;
110 }
111 if (!size)
112 return result;
113 tmp = *p;
114found_first:
115 tmp &= (~0UL >> (BITS_PER_LONG - size));
116 if (tmp == 0UL) /* Are any bits set? */
117 return result + size; /* Nope. */
118found_middle:
119 return result + __reverse_ffs(tmp);
120}
121
122static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
123 unsigned long size, unsigned long offset)
124{
125 const unsigned long *p = addr + BIT_WORD(offset);
126 unsigned long result = offset & ~(BITS_PER_LONG - 1);
127 unsigned long tmp;
128 unsigned long mask, submask;
129 unsigned long quot, rest;
130
131 if (offset >= size)
132 return size;
133
134 size -= result;
135 offset %= BITS_PER_LONG;
136 if (!offset)
137 goto aligned;
138
139 tmp = *(p++);
140 quot = (offset >> 3) << 3;
141 rest = offset & 0x7;
142 mask = ~(~0UL << quot);
143 submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
144 submask <<= quot;
145 mask += submask;
146 tmp |= mask;
147 if (size < BITS_PER_LONG)
148 goto found_first;
149 if (~tmp)
150 goto found_middle;
151
152 size -= BITS_PER_LONG;
153 result += BITS_PER_LONG;
154aligned:
155 while (size & ~(BITS_PER_LONG - 1)) {
156 tmp = *(p++);
157 if (~tmp)
158 goto found_middle;
159 result += BITS_PER_LONG;
160 size -= BITS_PER_LONG;
161 }
162 if (!size)
163 return result;
164 tmp = *p;
165
166found_first:
167 tmp |= ~0UL << size;
168 if (tmp == ~0UL) /* Are any bits zero? */
169 return result + size; /* Nope. */
170found_middle:
171 return result + __reverse_ffz(tmp);
172}
173
23/* 174/*
24 * This function balances dirty node and dentry pages. 175 * This function balances dirty node and dentry pages.
25 * In addition, it controls garbage collection. 176 * In addition, it controls garbage collection.
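
Why segment.c now needs private __reverse_ffs()/__find_rev_next_bit() helpers at all: f2fs_set_bit() numbers bits MSB-first within each byte, so the generic LSB-first ffs()/find_next_bit() would report wrong positions. A small stand-alone demo of that convention (f2fs_set_bit re-implemented locally for illustration):

    #include <stdio.h>

    /* f2fs's bit convention: bit 0 is the MSB of byte 0. */
    static void f2fs_set_bit(unsigned nr, unsigned char *addr)
    {
            addr[nr >> 3] |= 1 << (7 - (nr & 7));
    }

    int main(void)
    {
            unsigned char map[2] = { 0, 0 };

            f2fs_set_bit(0, map); /* sets 0x80 in byte 0, not 0x01 */
            f2fs_set_bit(7, map); /* sets 0x01 in byte 0 */
            f2fs_set_bit(9, map); /* sets 0x40 in byte 1 */

            printf("byte0=0x%02x byte1=0x%02x\n", map[0], map[1]);
            /* A generic LSB-first ffs() on byte0 would report bit 0 for 0x01
             * (f2fs bit 7), hence the byte-reversed find_next_bit variants. */
            return 0;
    }
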
@@ -116,6 +267,56 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
116 mutex_unlock(&dirty_i->seglist_lock); 267 mutex_unlock(&dirty_i->seglist_lock);
117} 268}
118 269
270static void f2fs_issue_discard(struct f2fs_sb_info *sbi,
271 block_t blkstart, block_t blklen)
272{
273 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
274 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
275 blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
276 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
277}
278
279static void add_discard_addrs(struct f2fs_sb_info *sbi,
280 unsigned int segno, struct seg_entry *se)
281{
282 struct list_head *head = &SM_I(sbi)->discard_list;
283 struct discard_entry *new;
284 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
285 int max_blocks = sbi->blocks_per_seg;
286 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
287 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
288 unsigned long dmap[entries];
289 unsigned int start = 0, end = -1;
290 int i;
291
292 if (!test_opt(sbi, DISCARD))
293 return;
294
295 /* zero block will be discarded through the prefree list */
296 if (!se->valid_blocks || se->valid_blocks == max_blocks)
297 return;
298
299 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
300 for (i = 0; i < entries; i++)
301 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
302
303 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
304 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
305 if (start >= max_blocks)
306 break;
307
308 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
309
310 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
311 INIT_LIST_HEAD(&new->list);
312 new->blkaddr = START_BLOCK(sbi, segno) + start;
313 new->len = end - start;
314
315 list_add_tail(&new->list, head);
316 SM_I(sbi)->nr_discards += end - start;
317 }
318}
319
119/* 320/*
120 * Should call clear_prefree_segments after checkpoint is done. 321 * Should call clear_prefree_segments after checkpoint is done.
121 */ 322 */
@@ -138,6 +339,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
138 339
139void clear_prefree_segments(struct f2fs_sb_info *sbi) 340void clear_prefree_segments(struct f2fs_sb_info *sbi)
140{ 341{
342 struct list_head *head = &(SM_I(sbi)->discard_list);
343 struct list_head *this, *next;
344 struct discard_entry *entry;
141 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 345 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
142 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 346 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
143 unsigned int total_segs = TOTAL_SEGS(sbi); 347 unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -160,14 +364,19 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
160 if (!test_opt(sbi, DISCARD)) 364 if (!test_opt(sbi, DISCARD))
161 continue; 365 continue;
162 366
163 blkdev_issue_discard(sbi->sb->s_bdev, 367 f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
164 START_BLOCK(sbi, start) << 368 (end - start) << sbi->log_blocks_per_seg);
165 sbi->log_sectors_per_block,
166 (1 << (sbi->log_sectors_per_block +
167 sbi->log_blocks_per_seg)) * (end - start),
168 GFP_NOFS, 0);
169 } 369 }
170 mutex_unlock(&dirty_i->seglist_lock); 370 mutex_unlock(&dirty_i->seglist_lock);
371
372 /* send small discards */
373 list_for_each_safe(this, next, head) {
374 entry = list_entry(this, struct discard_entry, list);
375 f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
376 list_del(&entry->list);
377 SM_I(sbi)->nr_discards -= entry->len;
378 kmem_cache_free(discard_entry_slab, entry);
379 }
171} 380}
172 381
173static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 382static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
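
Assuming the common 4 KB-block, 512-byte-sector layout (log_sectors_per_block = 3), the open-coded shifts removed above and the new f2fs_issue_discard() helper compute the same thing: start sector = block << 3, length = (end - start) segments * blocks_per_seg blocks * 8 sectors each; the helper just centralizes the block-to-sector conversion behind SECTOR_FROM_BLOCK().
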
@@ -459,13 +668,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
459 struct curseg_info *seg, block_t start) 668 struct curseg_info *seg, block_t start)
460{ 669{
461 struct seg_entry *se = get_seg_entry(sbi, seg->segno); 670 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
462 block_t ofs; 671 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
463 for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { 672 unsigned long target_map[entries];
464 if (!f2fs_test_bit(ofs, se->ckpt_valid_map) 673 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
465 && !f2fs_test_bit(ofs, se->cur_valid_map)) 674 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
466 break; 675 int i, pos;
467 } 676
468 seg->next_blkoff = ofs; 677 for (i = 0; i < entries; i++)
678 target_map[i] = ckpt_map[i] | cur_map[i];
679
680 pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
681
682 seg->next_blkoff = pos;
469} 683}
470 684
471/* 685/*
@@ -573,148 +787,6 @@ static const struct segment_allocation default_salloc_ops = {
573 .allocate_segment = allocate_segment_by_default, 787 .allocate_segment = allocate_segment_by_default,
574}; 788};
575 789
576static void f2fs_end_io_write(struct bio *bio, int err)
577{
578 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
579 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
580 struct bio_private *p = bio->bi_private;
581
582 do {
583 struct page *page = bvec->bv_page;
584
585 if (--bvec >= bio->bi_io_vec)
586 prefetchw(&bvec->bv_page->flags);
587 if (!uptodate) {
588 SetPageError(page);
589 if (page->mapping)
590 set_bit(AS_EIO, &page->mapping->flags);
591 set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
592 p->sbi->sb->s_flags |= MS_RDONLY;
593 }
594 end_page_writeback(page);
595 dec_page_count(p->sbi, F2FS_WRITEBACK);
596 } while (bvec >= bio->bi_io_vec);
597
598 if (p->is_sync)
599 complete(p->wait);
600
601 if (!get_pages(p->sbi, F2FS_WRITEBACK) &&
602 !list_empty(&p->sbi->cp_wait.task_list))
603 wake_up(&p->sbi->cp_wait);
604
605 kfree(p);
606 bio_put(bio);
607}
608
609struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
610{
611 struct bio *bio;
612
613 /* No failure on bio allocation */
614 bio = bio_alloc(GFP_NOIO, npages);
615 bio->bi_bdev = bdev;
616 bio->bi_private = NULL;
617
618 return bio;
619}
620
621static void do_submit_bio(struct f2fs_sb_info *sbi,
622 enum page_type type, bool sync)
623{
624 int rw = sync ? WRITE_SYNC : WRITE;
625 enum page_type btype = type > META ? META : type;
626
627 if (type >= META_FLUSH)
628 rw = WRITE_FLUSH_FUA;
629
630 if (btype == META)
631 rw |= REQ_META;
632
633 if (sbi->bio[btype]) {
634 struct bio_private *p = sbi->bio[btype]->bi_private;
635 p->sbi = sbi;
636 sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
637
638 trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
639
640 if (type == META_FLUSH) {
641 DECLARE_COMPLETION_ONSTACK(wait);
642 p->is_sync = true;
643 p->wait = &wait;
644 submit_bio(rw, sbi->bio[btype]);
645 wait_for_completion(&wait);
646 } else {
647 p->is_sync = false;
648 submit_bio(rw, sbi->bio[btype]);
649 }
650 sbi->bio[btype] = NULL;
651 }
652}
653
654void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
655{
656 down_write(&sbi->bio_sem);
657 do_submit_bio(sbi, type, sync);
658 up_write(&sbi->bio_sem);
659}
660
661static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
662 block_t blk_addr, enum page_type type)
663{
664 struct block_device *bdev = sbi->sb->s_bdev;
665 int bio_blocks;
666
667 verify_block_addr(sbi, blk_addr);
668
669 down_write(&sbi->bio_sem);
670
671 inc_page_count(sbi, F2FS_WRITEBACK);
672
673 if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
674 do_submit_bio(sbi, type, false);
675alloc_new:
676 if (sbi->bio[type] == NULL) {
677 struct bio_private *priv;
678retry:
679 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
680 if (!priv) {
681 cond_resched();
682 goto retry;
683 }
684
685 bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
686 sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks);
687 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
688 sbi->bio[type]->bi_private = priv;
689 /*
690 * The end_io will be assigned at the sumbission phase.
690 * The end_io will be assigned at the submission phase.
691 * Until then, let bio_add_page() merge consecutive IOs as much
692 * as possible.
693 */
694 }
695
696 if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
697 PAGE_CACHE_SIZE) {
698 do_submit_bio(sbi, type, false);
699 goto alloc_new;
700 }
701
702 sbi->last_block_in_bio[type] = blk_addr;
703
704 up_write(&sbi->bio_sem);
705 trace_f2fs_submit_write_page(page, blk_addr, type);
706}
707
708void f2fs_wait_on_page_writeback(struct page *page,
709 enum page_type type, bool sync)
710{
711 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
712 if (PageWriteback(page)) {
713 f2fs_submit_bio(sbi, type, sync);
714 wait_on_page_writeback(page);
715 }
716}
717
718static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 790static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
719{ 791{
720 struct curseg_info *curseg = CURSEG_I(sbi, type); 792 struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -782,16 +854,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type)
782 return __get_segment_type_6(page, p_type); 854 return __get_segment_type_6(page, p_type);
783} 855}
784 856
785static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, 857void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
786 block_t old_blkaddr, block_t *new_blkaddr, 858 block_t old_blkaddr, block_t *new_blkaddr,
787 struct f2fs_summary *sum, enum page_type p_type) 859 struct f2fs_summary *sum, int type)
788{ 860{
789 struct sit_info *sit_i = SIT_I(sbi); 861 struct sit_info *sit_i = SIT_I(sbi);
790 struct curseg_info *curseg; 862 struct curseg_info *curseg;
791 unsigned int old_cursegno; 863 unsigned int old_cursegno;
792 int type;
793 864
794 type = __get_segment_type(page, p_type);
795 curseg = CURSEG_I(sbi, type); 865 curseg = CURSEG_I(sbi, type);
796 866
797 mutex_lock(&curseg->curseg_mutex); 867 mutex_lock(&curseg->curseg_mutex);
@@ -824,49 +894,64 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
824 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 894 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
825 mutex_unlock(&sit_i->sentry_lock); 895 mutex_unlock(&sit_i->sentry_lock);
826 896
827 if (p_type == NODE) 897 if (page && IS_NODESEG(type))
828 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); 898 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
829 899
830 /* writeout dirty page into bdev */
831 submit_write_page(sbi, page, *new_blkaddr, p_type);
832
833 mutex_unlock(&curseg->curseg_mutex); 900 mutex_unlock(&curseg->curseg_mutex);
834} 901}
835 902
903static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
904 block_t old_blkaddr, block_t *new_blkaddr,
905 struct f2fs_summary *sum, struct f2fs_io_info *fio)
906{
907 int type = __get_segment_type(page, fio->type);
908
909 allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
910
911 /* writeout dirty page into bdev */
912 f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
913}
914
836void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) 915void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
837{ 916{
917 struct f2fs_io_info fio = {
918 .type = META,
919 .rw = WRITE_SYNC | REQ_META | REQ_PRIO
920 };
921
838 set_page_writeback(page); 922 set_page_writeback(page);
839 submit_write_page(sbi, page, page->index, META); 923 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
840} 924}
841 925
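write_meta_page() above shows the new calling convention: the page type and the block-layer rw flags travel together in one f2fs_io_info instead of separate parameters. A standalone sketch of that descriptor pattern; the flag values are placeholders, not the real REQ_* bits:

#include <stdio.h>

enum page_type { DATA, NODE, META };

struct io_info {
        enum page_type type;
        unsigned rw;
};

static void submit_page(const char *what, const struct io_info *io)
{
        printf("%s: type=%d rw=%#x\n", what, io->type, io->rw);
}

int main(void)
{
        /* Stand-ins for WRITE_SYNC | REQ_META | REQ_PRIO. */
        struct io_info fio = { .type = META, .rw = 0x1 | 0x8 | 0x10 };

        submit_page("meta page", &fio);
        return 0;
}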
842void write_node_page(struct f2fs_sb_info *sbi, struct page *page, 926void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
927 struct f2fs_io_info *fio,
843 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) 928 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
844{ 929{
845 struct f2fs_summary sum; 930 struct f2fs_summary sum;
846 set_summary(&sum, nid, 0, 0); 931 set_summary(&sum, nid, 0, 0);
847 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); 932 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
848} 933}
849 934
850void write_data_page(struct inode *inode, struct page *page, 935void write_data_page(struct page *page, struct dnode_of_data *dn,
851 struct dnode_of_data *dn, block_t old_blkaddr, 936 block_t *new_blkaddr, struct f2fs_io_info *fio)
852 block_t *new_blkaddr)
853{ 937{
854 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 938 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
855 struct f2fs_summary sum; 939 struct f2fs_summary sum;
856 struct node_info ni; 940 struct node_info ni;
857 941
858 f2fs_bug_on(old_blkaddr == NULL_ADDR); 942 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
859 get_node_info(sbi, dn->nid, &ni); 943 get_node_info(sbi, dn->nid, &ni);
860 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 944 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
861 945
862 do_write_page(sbi, page, old_blkaddr, 946 do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
863 new_blkaddr, &sum, DATA);
864} 947}
865 948
866void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, 949void rewrite_data_page(struct page *page, block_t old_blkaddr,
867 block_t old_blk_addr) 950 struct f2fs_io_info *fio)
868{ 951{
869 submit_write_page(sbi, page, old_blk_addr, DATA); 952 struct inode *inode = page->mapping->host;
953 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
954 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
870} 955}
871 956
872void recover_data_page(struct f2fs_sb_info *sbi, 957void recover_data_page(struct f2fs_sb_info *sbi,
@@ -925,6 +1010,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
925 unsigned int segno, old_cursegno; 1010 unsigned int segno, old_cursegno;
926 block_t next_blkaddr = next_blkaddr_of_node(page); 1011 block_t next_blkaddr = next_blkaddr_of_node(page);
927 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); 1012 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
1013 struct f2fs_io_info fio = {
1014 .type = NODE,
1015 .rw = WRITE_SYNC,
1016 };
928 1017
929 curseg = CURSEG_I(sbi, type); 1018 curseg = CURSEG_I(sbi, type);
930 1019
@@ -953,8 +1042,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
953 1042
954 /* rewrite node page */ 1043 /* rewrite node page */
955 set_page_writeback(page); 1044 set_page_writeback(page);
956 submit_write_page(sbi, page, new_blkaddr, NODE); 1045 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
957 f2fs_submit_bio(sbi, NODE, true); 1046 f2fs_submit_merged_bio(sbi, NODE, WRITE);
958 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1047 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
959 1048
960 locate_dirty_segment(sbi, old_cursegno); 1049 locate_dirty_segment(sbi, old_cursegno);
@@ -964,6 +1053,16 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
964 mutex_unlock(&curseg->curseg_mutex); 1053 mutex_unlock(&curseg->curseg_mutex);
965} 1054}
966 1055
1056void f2fs_wait_on_page_writeback(struct page *page,
1057 enum page_type type)
1058{
1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1060 if (PageWriteback(page)) {
1061 f2fs_submit_merged_bio(sbi, type, WRITE);
1062 wait_on_page_writeback(page);
1063 }
1064}
1065
967static int read_compacted_summaries(struct f2fs_sb_info *sbi) 1066static int read_compacted_summaries(struct f2fs_sb_info *sbi)
968{ 1067{
969 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1068 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1314,6 +1413,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
1314 1413
1315 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1414 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1316 1415
1416 /* add discard candidates */
1417 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
1418 add_discard_addrs(sbi, segno, se);
1419
1317 if (flushed) 1420 if (flushed)
1318 goto to_sit_page; 1421 goto to_sit_page;
1319 1422
@@ -1480,41 +1583,94 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1480 return restore_curseg_summaries(sbi); 1583 return restore_curseg_summaries(sbi);
1481} 1584}
1482 1585
1586static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
1587{
1588 struct address_space *mapping = META_MAPPING(sbi);
1589 struct page *page;
1590 block_t blk_addr, prev_blk_addr = 0;
1591 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1592 int blkno = start;
1593 struct f2fs_io_info fio = {
1594 .type = META,
1595 .rw = READ_SYNC | REQ_META | REQ_PRIO
1596 };
1597
1598 for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
1599
1600 blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
1601
1602 if (blkno != start && prev_blk_addr + 1 != blk_addr)
1603 break;
1604 prev_blk_addr = blk_addr;
1605repeat:
1606 page = grab_cache_page(mapping, blk_addr);
1607 if (!page) {
1608 cond_resched();
1609 goto repeat;
1610 }
1611 if (PageUptodate(page)) {
1612 mark_page_accessed(page);
1613 f2fs_put_page(page, 1);
1614 continue;
1615 }
1616
1617 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
1618
1619 mark_page_accessed(page);
1620 f2fs_put_page(page, 0);
1621 }
1622
1623 f2fs_submit_merged_bio(sbi, META, READ);
1624 return blkno - start;
1625}
1626
1483static void build_sit_entries(struct f2fs_sb_info *sbi) 1627static void build_sit_entries(struct f2fs_sb_info *sbi)
1484{ 1628{
1485 struct sit_info *sit_i = SIT_I(sbi); 1629 struct sit_info *sit_i = SIT_I(sbi);
1486 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1630 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1487 struct f2fs_summary_block *sum = curseg->sum_blk; 1631 struct f2fs_summary_block *sum = curseg->sum_blk;
1488 unsigned int start; 1632 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1633 unsigned int i, start, end;
1634 unsigned int readed, start_blk = 0;
1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1489 1636
1490 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1637 do {
1491 struct seg_entry *se = &sit_i->sentries[start]; 1638 readed = ra_sit_pages(sbi, start_blk, nrpages);
1492 struct f2fs_sit_block *sit_blk; 1639
1493 struct f2fs_sit_entry sit; 1640 start = start_blk * sit_i->sents_per_block;
1494 struct page *page; 1641 end = (start_blk + readed) * sit_i->sents_per_block;
1495 int i; 1642
1496 1643 for (; start < end && start < TOTAL_SEGS(sbi); start++) {
1497 mutex_lock(&curseg->curseg_mutex); 1644 struct seg_entry *se = &sit_i->sentries[start];
1498 for (i = 0; i < sits_in_cursum(sum); i++) { 1645 struct f2fs_sit_block *sit_blk;
1499 if (le32_to_cpu(segno_in_journal(sum, i)) == start) { 1646 struct f2fs_sit_entry sit;
1500 sit = sit_in_journal(sum, i); 1647 struct page *page;
1501 mutex_unlock(&curseg->curseg_mutex); 1648
1502 goto got_it; 1649 mutex_lock(&curseg->curseg_mutex);
1650 for (i = 0; i < sits_in_cursum(sum); i++) {
1651 if (le32_to_cpu(segno_in_journal(sum, i))
1652 == start) {
1653 sit = sit_in_journal(sum, i);
1654 mutex_unlock(&curseg->curseg_mutex);
1655 goto got_it;
1656 }
1503 } 1657 }
1504 } 1658 mutex_unlock(&curseg->curseg_mutex);
1505 mutex_unlock(&curseg->curseg_mutex); 1659
1506 page = get_current_sit_page(sbi, start); 1660 page = get_current_sit_page(sbi, start);
1507 sit_blk = (struct f2fs_sit_block *)page_address(page); 1661 sit_blk = (struct f2fs_sit_block *)page_address(page);
1508 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; 1662 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
1509 f2fs_put_page(page, 1); 1663 f2fs_put_page(page, 1);
1510got_it: 1664got_it:
1511 check_block_count(sbi, start, &sit); 1665 check_block_count(sbi, start, &sit);
1512 seg_info_from_raw_sit(se, &sit); 1666 seg_info_from_raw_sit(se, &sit);
1513 if (sbi->segs_per_sec > 1) { 1667 if (sbi->segs_per_sec > 1) {
1514 struct sec_entry *e = get_sec_entry(sbi, start); 1668 struct sec_entry *e = get_sec_entry(sbi, start);
1515 e->valid_blocks += se->valid_blocks; 1669 e->valid_blocks += se->valid_blocks;
1670 }
1516 } 1671 }
1517 } 1672 start_blk += readed;
1673 } while (start_blk < sit_blk_cnt);
1518} 1674}
1519 1675
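The reworked build_sit_entries() follows a readahead-then-consume shape: prefetch a batch of consecutive SIT blocks, walk the segment entries they cover, repeat until all blocks are done. A schematic standalone sketch; all sizes are invented, and ra() stands in for ra_sit_pages():

#include <stdio.h>

#define SIT_BLOCKS      10      /* SIT_BLK_CNT analogue       */
#define SENTS_PER_BLK   55      /* sents_per_block analogue   */

static int ra(int start, int nrpages)
{
        int n = nrpages;

        if (start + n > SIT_BLOCKS)
                n = SIT_BLOCKS - start;
        /* the real helper submits merged READ bios here */
        return n;
}

int main(void)
{
        int start_blk = 0;
        const int nrpages = 4;          /* MAX_BIO_BLOCKS analogue */

        do {
                int readed = ra(start_blk, nrpages);
                int start = start_blk * SENTS_PER_BLK;
                int end = (start_blk + readed) * SENTS_PER_BLK;

                printf("consume segments %d..%d\n", start, end - 1);
                start_blk += readed;
        } while (start_blk < SIT_BLOCKS);
        return 0;
}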
1520static void init_free_segmap(struct f2fs_sb_info *sbi) 1676static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -1644,6 +1800,12 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1644 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1645 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1646 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1802 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS;
1803 sm_info->ipu_policy = F2FS_IPU_DISABLE;
1804 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
1805
1806 INIT_LIST_HEAD(&sm_info->discard_list);
1807 sm_info->nr_discards = 0;
1808 sm_info->max_discards = 0;
1647 1809
1648 err = build_sit_info(sbi); 1810 err = build_sit_info(sbi);
1649 if (err) 1811 if (err)
@@ -1760,3 +1922,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1760 sbi->sm_info = NULL; 1922 sbi->sm_info = NULL;
1761 kfree(sm_info); 1923 kfree(sm_info);
1762} 1924}
1925
1926int __init create_segment_manager_caches(void)
1927{
1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1929 sizeof(struct discard_entry), NULL);
1930 if (!discard_entry_slab)
1931 return -ENOMEM;
1932 return 0;
1933}
1934
1935void destroy_segment_manager_caches(void)
1936{
1937 kmem_cache_destroy(discard_entry_slab);
1938}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 269f690b4e24..5731682d7516 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -20,13 +20,8 @@
20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) 20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) 21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
22 22
23#define IS_DATASEG(t) \ 23#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
24 ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ 24#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
25 (t == CURSEG_WARM_DATA))
26
27#define IS_NODESEG(t) \
28 ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
29 (t == CURSEG_WARM_NODE))
30 25
31#define IS_CURSEG(sbi, seg) \ 26#define IS_CURSEG(sbi, seg) \
32 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ 27 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
@@ -83,25 +78,20 @@
83 (segno / SIT_ENTRY_PER_BLOCK) 78 (segno / SIT_ENTRY_PER_BLOCK)
84#define START_SEGNO(sit_i, segno) \ 79#define START_SEGNO(sit_i, segno) \
85 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 80 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
81#define SIT_BLK_CNT(sbi) \
82 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 83#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 84 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) 85#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections) 86#define TOTAL_SECS(sbi) (sbi->total_sections)
90 87
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 88#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
92 (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 89 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 90#define SECTOR_TO_BLOCK(sbi, sectors) \
94 (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 91 (sectors >> (sbi)->log_sectors_per_block)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 92#define MAX_BIO_BLOCKS(max_hw_blocks) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 93 (min((int)max_hw_blocks, BIO_MAX_PAGES))
97 94
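To see why the new SECTOR_FROM_BLOCK casts to sector_t first: with 4KiB blocks over 512-byte sectors, log_sectors_per_block is 3, and shifting a 32-bit block address left by 3 can overflow before it is widened. A small self-contained demonstration with illustrative constants:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

#define LOG_SECTORS_PER_BLOCK 3
/* The cast widens BEFORE the shift, as in the fixed macro. */
#define SECTOR_FROM_BLOCK(blk) (((sector_t)(blk)) << LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sec)   ((sec) >> LOG_SECTORS_PER_BLOCK)

int main(void)
{
        uint32_t blk = 0x30000000u;     /* a block ~3TiB into the volume */

        /* 0x180000000: needs more than 32 bits to hold. */
        printf("sector %#llx\n",
               (unsigned long long)SECTOR_FROM_BLOCK(blk));
        printf("block  %#x\n",
               (unsigned)SECTOR_TO_BLOCK(SECTOR_FROM_BLOCK(blk)));
        return 0;
}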
98/* during checkpoint, bio_private is used to synchronize the last bio */
99struct bio_private {
100 struct f2fs_sb_info *sbi;
101 bool is_sync;
102 void *wait;
103};
104
105/* 95/*
106 * indicate a block allocation direction: RIGHT and LEFT. 96 * indicate a block allocation direction: RIGHT and LEFT.
107 * RIGHT means allocating new sections towards the end of volume. 97 * RIGHT means allocating new sections towards the end of volume.
@@ -458,8 +448,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
458 448
459static inline bool need_SSR(struct f2fs_sb_info *sbi) 449static inline bool need_SSR(struct f2fs_sb_info *sbi)
460{ 450{
461 return ((prefree_segments(sbi) / sbi->segs_per_sec) 451 return (prefree_segments(sbi) / sbi->segs_per_sec)
462 + free_sections(sbi) < overprovision_sections(sbi)); 452 + free_sections(sbi) < overprovision_sections(sbi);
463} 453}
464 454
465static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 455static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -467,38 +457,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
467 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); 457 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
468 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); 458 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
469 459
470 if (sbi->por_doing) 460 if (unlikely(sbi->por_doing))
471 return false; 461 return false;
472 462
473 return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + 463 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
474 reserved_sections(sbi))); 464 reserved_sections(sbi));
475} 465}
476 466
477static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) 467static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
478{ 468{
479 return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); 469 return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments;
480} 470}
481 471
482static inline int utilization(struct f2fs_sb_info *sbi) 472static inline int utilization(struct f2fs_sb_info *sbi)
483{ 473{
484 return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); 474 return div_u64((u64)valid_user_blocks(sbi) * 100,
475 sbi->user_block_count);
485} 476}
486 477
487/* 478/*
488 * Sometimes f2fs may be better to drop out-of-place update policy. 479 * Sometimes f2fs may be better to drop out-of-place update policy.
489 * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write 480 * And, users can control the policy through sysfs entries.
490 * data in the original place likewise other traditional file systems. 481 * There are five policies with triggering conditions as follows.
491 * But, currently set 100 in percentage, which means it is disabled. 482 * F2FS_IPU_FORCE - all the time,
492 * See below need_inplace_update(). 483 * F2FS_IPU_SSR - if SSR mode is activated,
484 * F2FS_IPU_UTIL - if FS utilization is over threashold,
485 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
486 * threashold,
487 * F2FS_IPUT_DISABLE - disable IPU. (=default option)
493 */ 488 */
494#define MIN_IPU_UTIL 100 489#define DEF_MIN_IPU_UTIL 70
490
491enum {
492 F2FS_IPU_FORCE,
493 F2FS_IPU_SSR,
494 F2FS_IPU_UTIL,
495 F2FS_IPU_SSR_UTIL,
496 F2FS_IPU_DISABLE,
497};
498
495static inline bool need_inplace_update(struct inode *inode) 499static inline bool need_inplace_update(struct inode *inode)
496{ 500{
497 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 501 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
502
503 /* IPU can be done only for the user data */
498 if (S_ISDIR(inode->i_mode)) 504 if (S_ISDIR(inode->i_mode))
499 return false; 505 return false;
500 if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) 506
507 switch (SM_I(sbi)->ipu_policy) {
508 case F2FS_IPU_FORCE:
501 return true; 509 return true;
510 case F2FS_IPU_SSR:
511 if (need_SSR(sbi))
512 return true;
513 break;
514 case F2FS_IPU_UTIL:
515 if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
516 return true;
517 break;
518 case F2FS_IPU_SSR_UTIL:
519 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
520 return true;
521 break;
522 case F2FS_IPU_DISABLE:
523 break;
524 }
502 return false; 525 return false;
503} 526}
504 527
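The decision matrix above reduces to a pure function of the policy and two inputs. A hedged restatement in standalone C; want_ipu() is a made-up name, and need_SSR()/utilization() are mocked as a boolean and a percentage:

#include <stdbool.h>
#include <stdio.h>

enum ipu_policy { IPU_FORCE, IPU_SSR, IPU_UTIL, IPU_SSR_UTIL, IPU_DISABLE };

static bool want_ipu(enum ipu_policy p, bool ssr, int util, int min_util)
{
        switch (p) {
        case IPU_FORCE:
                return true;
        case IPU_SSR:
                return ssr;
        case IPU_UTIL:
                return util > min_util;
        case IPU_SSR_UTIL:
                return ssr && util > min_util;
        case IPU_DISABLE:
                return false;
        }
        return false;
}

int main(void)
{
        /* With the default 70% threshold, SSR_UTIL fires only when
         * SSR is active AND utilization is above the threshold. */
        printf("%d\n", want_ipu(IPU_SSR_UTIL, true, 80, 70));   /* 1 */
        printf("%d\n", want_ipu(IPU_SSR_UTIL, true, 60, 70));   /* 0 */
        return 0;
}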
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bafff72de8e8..1a85f83abd53 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -50,6 +50,7 @@ enum {
50 Opt_active_logs, 50 Opt_active_logs,
51 Opt_disable_ext_identify, 51 Opt_disable_ext_identify,
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data,
53 Opt_err, 54 Opt_err,
54}; 55};
55 56
@@ -65,6 +66,7 @@ static match_table_t f2fs_tokens = {
65 {Opt_active_logs, "active_logs=%u"}, 66 {Opt_active_logs, "active_logs=%u"},
66 {Opt_disable_ext_identify, "disable_ext_identify"}, 67 {Opt_disable_ext_identify, "disable_ext_identify"},
67 {Opt_inline_xattr, "inline_xattr"}, 68 {Opt_inline_xattr, "inline_xattr"},
69 {Opt_inline_data, "inline_data"},
68 {Opt_err, NULL}, 70 {Opt_err, NULL},
69}; 71};
70 72
@@ -72,6 +74,7 @@ static match_table_t f2fs_tokens = {
72enum { 74enum {
73 GC_THREAD, /* struct f2fs_gc_thread */ 75 GC_THREAD, /* struct f2fs_gc_thread */
74 SM_INFO, /* struct f2fs_sm_info */ 76 SM_INFO, /* struct f2fs_sm_info */
77 F2FS_SBI, /* struct f2fs_sb_info */
75}; 78};
76 79
77struct f2fs_attr { 80struct f2fs_attr {
@@ -89,6 +92,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
89 return (unsigned char *)sbi->gc_thread; 92 return (unsigned char *)sbi->gc_thread;
90 else if (struct_type == SM_INFO) 93 else if (struct_type == SM_INFO)
91 return (unsigned char *)SM_I(sbi); 94 return (unsigned char *)SM_I(sbi);
95 else if (struct_type == F2FS_SBI)
96 return (unsigned char *)sbi;
92 return NULL; 97 return NULL;
93} 98}
94 99
@@ -175,6 +180,10 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
175F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); 180F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
176F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 181F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
177F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 182F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
183F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
184F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
185F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
186F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
178 187
179#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 188#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
180static struct attribute *f2fs_attrs[] = { 189static struct attribute *f2fs_attrs[] = {
@@ -183,6 +192,10 @@ static struct attribute *f2fs_attrs[] = {
183 ATTR_LIST(gc_no_gc_sleep_time), 192 ATTR_LIST(gc_no_gc_sleep_time),
184 ATTR_LIST(gc_idle), 193 ATTR_LIST(gc_idle),
185 ATTR_LIST(reclaim_segments), 194 ATTR_LIST(reclaim_segments),
195 ATTR_LIST(max_small_discards),
196 ATTR_LIST(ipu_policy),
197 ATTR_LIST(min_ipu_util),
198 ATTR_LIST(max_victim_search),
186 NULL, 199 NULL,
187}; 200};
188 201
@@ -311,6 +324,9 @@ static int parse_options(struct super_block *sb, char *options)
311 case Opt_disable_ext_identify: 324 case Opt_disable_ext_identify:
312 set_opt(sbi, DISABLE_EXT_IDENTIFY); 325 set_opt(sbi, DISABLE_EXT_IDENTIFY);
313 break; 326 break;
327 case Opt_inline_data:
328 set_opt(sbi, INLINE_DATA);
329 break;
314 default: 330 default:
315 f2fs_msg(sb, KERN_ERR, 331 f2fs_msg(sb, KERN_ERR,
316 "Unrecognized mount option \"%s\" or missing value", 332 "Unrecognized mount option \"%s\" or missing value",
@@ -325,7 +341,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
325{ 341{
326 struct f2fs_inode_info *fi; 342 struct f2fs_inode_info *fi;
327 343
328 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); 344 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
329 if (!fi) 345 if (!fi)
330 return NULL; 346 return NULL;
331 347
@@ -508,7 +524,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
508#endif 524#endif
509 if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) 525 if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
510 seq_puts(seq, ",disable_ext_identify"); 526 seq_puts(seq, ",disable_ext_identify");
511 527 if (test_opt(sbi, INLINE_DATA))
528 seq_puts(seq, ",inline_data");
512 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 529 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
513 530
514 return 0; 531 return 0;
@@ -518,7 +535,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
518{ 535{
519 struct super_block *sb = seq->private; 536 struct super_block *sb = seq->private;
520 struct f2fs_sb_info *sbi = F2FS_SB(sb); 537 struct f2fs_sb_info *sbi = F2FS_SB(sb);
521 unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); 538 unsigned int total_segs =
539 le32_to_cpu(sbi->raw_super->segment_count_main);
522 int i; 540 int i;
523 541
524 for (i = 0; i < total_segs; i++) { 542 for (i = 0; i < total_segs; i++) {
@@ -618,7 +636,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
618 struct f2fs_sb_info *sbi = F2FS_SB(sb); 636 struct f2fs_sb_info *sbi = F2FS_SB(sb);
619 struct inode *inode; 637 struct inode *inode;
620 638
621 if (ino < F2FS_ROOT_INO(sbi)) 639 if (unlikely(ino < F2FS_ROOT_INO(sbi)))
622 return ERR_PTR(-ESTALE); 640 return ERR_PTR(-ESTALE);
623 641
624 /* 642 /*
@@ -629,7 +647,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
629 inode = f2fs_iget(sb, ino); 647 inode = f2fs_iget(sb, ino);
630 if (IS_ERR(inode)) 648 if (IS_ERR(inode))
631 return ERR_CAST(inode); 649 return ERR_CAST(inode);
632 if (generation && inode->i_generation != generation) { 650 if (unlikely(generation && inode->i_generation != generation)) {
633 /* we didn't find the right inode.. */ 651 /* we didn't find the right inode.. */
634 iput(inode); 652 iput(inode);
635 return ERR_PTR(-ESTALE); 653 return ERR_PTR(-ESTALE);
@@ -732,10 +750,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
732 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); 750 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
733 fsmeta += le32_to_cpu(raw_super->segment_count_ssa); 751 fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
734 752
735 if (fsmeta >= total) 753 if (unlikely(fsmeta >= total))
736 return 1; 754 return 1;
737 755
738 if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 756 if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
739 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); 757 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
740 return 1; 758 return 1;
741 } 759 }
@@ -763,6 +781,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
763 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); 781 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
764 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); 782 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
765 sbi->cur_victim_sec = NULL_SECNO; 783 sbi->cur_victim_sec = NULL_SECNO;
784 sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
766 785
767 for (i = 0; i < NR_COUNT_TYPE; i++) 786 for (i = 0; i < NR_COUNT_TYPE; i++)
768 atomic_set(&sbi->nr_pages[i], 0); 787 atomic_set(&sbi->nr_pages[i], 0);
@@ -798,9 +817,10 @@ retry:
798 /* sanity checking of raw super */ 817 /* sanity checking of raw super */
799 if (sanity_check_raw_super(sb, *raw_super)) { 818 if (sanity_check_raw_super(sb, *raw_super)) {
800 brelse(*raw_super_buf); 819 brelse(*raw_super_buf);
801 f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " 820 f2fs_msg(sb, KERN_ERR,
802 "in %dth superblock", block + 1); 821 "Can't find valid F2FS filesystem in %dth superblock",
803 if(block == 0) { 822 block + 1);
823 if (block == 0) {
804 block++; 824 block++;
805 goto retry; 825 goto retry;
806 } else { 826 } else {
@@ -818,6 +838,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
818 struct buffer_head *raw_super_buf; 838 struct buffer_head *raw_super_buf;
819 struct inode *root; 839 struct inode *root;
820 long err = -EINVAL; 840 long err = -EINVAL;
841 int i;
821 842
822 /* allocate memory for f2fs-specific super block info */ 843 /* allocate memory for f2fs-specific super block info */
823 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); 844 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
@@ -825,7 +846,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
825 return -ENOMEM; 846 return -ENOMEM;
826 847
827 /* set a block size */ 848 /* set a block size */
828 if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { 849 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
829 f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); 850 f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
830 goto free_sbi; 851 goto free_sbi;
831 } 852 }
@@ -874,7 +895,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
874 mutex_init(&sbi->node_write); 895 mutex_init(&sbi->node_write);
875 sbi->por_doing = false; 896 sbi->por_doing = false;
876 spin_lock_init(&sbi->stat_lock); 897 spin_lock_init(&sbi->stat_lock);
877 init_rwsem(&sbi->bio_sem); 898
899 mutex_init(&sbi->read_io.io_mutex);
900 sbi->read_io.sbi = sbi;
901 sbi->read_io.bio = NULL;
902 for (i = 0; i < NR_PAGE_TYPE; i++) {
903 mutex_init(&sbi->write_io[i].io_mutex);
904 sbi->write_io[i].sbi = sbi;
905 sbi->write_io[i].bio = NULL;
906 }
907
878 init_rwsem(&sbi->cp_rwsem); 908 init_rwsem(&sbi->cp_rwsem);
879 init_waitqueue_head(&sbi->cp_wait); 909 init_waitqueue_head(&sbi->cp_wait);
880 init_sb_info(sbi); 910 init_sb_info(sbi);
@@ -939,9 +969,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
939 } 969 }
940 970
941 /* if there are any orphan nodes, free them */ 971 recover_orphan_inodes(sbi);
942 err = -EINVAL; 972 recover_orphan_inodes(sbi);
943 if (recover_orphan_inodes(sbi))
944 goto free_node_inode;
945 973
946 /* read root inode and dentry */ 974 /* read root inode and dentry */
947 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); 975 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
@@ -950,8 +978,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
950 err = PTR_ERR(root); 978 err = PTR_ERR(root);
951 goto free_node_inode; 979 goto free_node_inode;
952 } 980 }
953 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) 981 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
982 err = -EINVAL;
954 goto free_root_inode; 983 goto free_root_inode;
984 }
955 985
956 sb->s_root = d_make_root(root); /* allocate root dentry */ 986 sb->s_root = d_make_root(root); /* allocate root dentry */
957 if (!sb->s_root) { 987 if (!sb->s_root) {
@@ -1053,7 +1083,7 @@ static int __init init_inodecache(void)
1053{ 1083{
1054 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1084 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
1055 sizeof(struct f2fs_inode_info), NULL); 1085 sizeof(struct f2fs_inode_info), NULL);
1056 if (f2fs_inode_cachep == NULL) 1086 if (!f2fs_inode_cachep)
1057 return -ENOMEM; 1087 return -ENOMEM;
1058 return 0; 1088 return 0;
1059} 1089}
@@ -1078,9 +1108,12 @@ static int __init init_f2fs_fs(void)
1078 err = create_node_manager_caches(); 1108 err = create_node_manager_caches();
1079 if (err) 1109 if (err)
1080 goto free_inodecache; 1110 goto free_inodecache;
1081 err = create_gc_caches(); 1111 err = create_segment_manager_caches();
1082 if (err) 1112 if (err)
1083 goto free_node_manager_caches; 1113 goto free_node_manager_caches;
1114 err = create_gc_caches();
1115 if (err)
1116 goto free_segment_manager_caches;
1084 err = create_checkpoint_caches(); 1117 err = create_checkpoint_caches();
1085 if (err) 1118 if (err)
1086 goto free_gc_caches; 1119 goto free_gc_caches;
@@ -1102,6 +1135,8 @@ free_checkpoint_caches:
1102 destroy_checkpoint_caches(); 1135 destroy_checkpoint_caches();
1103free_gc_caches: 1136free_gc_caches:
1104 destroy_gc_caches(); 1137 destroy_gc_caches();
1138free_segment_manager_caches:
1139 destroy_segment_manager_caches();
1105free_node_manager_caches: 1140free_node_manager_caches:
1106 destroy_node_manager_caches(); 1141 destroy_node_manager_caches();
1107free_inodecache: 1142free_inodecache:
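The reordered init path keeps the usual kernel unwind ladder: each successful create_*_caches() call gains a matching destroy label, executed in reverse order on failure. A compact standalone illustration with stubbed init/destroy pairs:

#include <stdio.h>

static int init_node(void)    { return 0; }
static int init_segment(void) { return 0; }
static int init_gc(void)      { return -1; /* simulate a failure */ }

static void destroy_node(void)    { puts("destroy node caches"); }
static void destroy_segment(void) { puts("destroy segment caches"); }

static int init_all(void)
{
        int err;

        err = init_node();
        if (err)
                goto fail;
        err = init_segment();
        if (err)
                goto free_node;
        err = init_gc();
        if (err)
                goto free_segment;
        return 0;

free_segment:
        destroy_segment();      /* undo in reverse order */
free_node:
        destroy_node();
fail:
        return err;
}

int main(void)
{
        return init_all() ? 1 : 0;
}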
@@ -1117,6 +1152,7 @@ static void __exit exit_f2fs_fs(void)
1117 unregister_filesystem(&f2fs_fs_type); 1152 unregister_filesystem(&f2fs_fs_type);
1118 destroy_checkpoint_caches(); 1153 destroy_checkpoint_caches();
1119 destroy_gc_caches(); 1154 destroy_gc_caches();
1155 destroy_segment_manager_caches();
1120 destroy_node_manager_caches(); 1156 destroy_node_manager_caches();
1121 destroy_inodecache(); 1157 destroy_inodecache();
1122 kset_unregister(f2fs_kset); 1158 kset_unregister(f2fs_kset);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index aa7a3f139fe5..b0fb8a27f3da 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -522,7 +522,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index,
522 if (found) 522 if (found)
523 free = free + ENTRY_SIZE(here); 523 free = free + ENTRY_SIZE(here);
524 524
525 if (free < newsize) { 525 if (unlikely(free < newsize)) {
526 error = -ENOSPC; 526 error = -ENOSPC;
527 goto exit; 527 goto exit;
528 } 528 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f4a10ece2f1..e0259a163f98 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -516,13 +516,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
516 } 516 }
517 WARN_ON(inode->i_state & I_SYNC); 517 WARN_ON(inode->i_state & I_SYNC);
518 /* 518 /*
519 * Skip inode if it is clean. We don't want to mess with writeback 519 * Skip inode if it is clean and we have no outstanding writeback in
520 * lists in this function since flusher thread may be doing for example 520 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
521 * sync in parallel and if we move the inode, it could get skipped. So 521 * function since flusher thread may be doing for example sync in
522 * here we make sure inode is on some writeback list and leave it there 522 * parallel and if we move the inode, it could get skipped. So here we
523 * unless we have completely cleaned the inode. 523 * make sure inode is on some writeback list and leave it there unless
524 * we have completely cleaned the inode.
524 */ 525 */
525 if (!(inode->i_state & I_DIRTY)) 526 if (!(inode->i_state & I_DIRTY) &&
527 (wbc->sync_mode != WB_SYNC_ALL ||
528 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
526 goto out; 529 goto out;
527 inode->i_state |= I_SYNC; 530 inode->i_state |= I_SYNC;
528 spin_unlock(&inode->i_lock); 531 spin_unlock(&inode->i_lock);
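The amended condition reads as a small predicate: skip a clean inode unless this is a WB_SYNC_ALL sync that still has pages under writeback to wait for. A sketch with the three tests reduced to booleans:

#include <stdbool.h>
#include <stdio.h>

static bool skip_inode(bool dirty, bool sync_all, bool writeback_pages)
{
        return !dirty && (!sync_all || !writeback_pages);
}

int main(void)
{
        printf("%d\n", skip_inode(false, false, true)); /* 1: skipped   */
        printf("%d\n", skip_inode(false, true,  true)); /* 0: must wait */
        return 0;
}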
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ef74ad5fd362..0a648bb455ae 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); 1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1297} 1297}
1298 1298
1299static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1300 struct pipe_buffer *buf)
1301{
1302 return 1;
1303}
1304
1305static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1306 .can_merge = 0,
1307 .map = generic_pipe_buf_map,
1308 .unmap = generic_pipe_buf_unmap,
1309 .confirm = generic_pipe_buf_confirm,
1310 .release = generic_pipe_buf_release,
1311 .steal = fuse_dev_pipe_buf_steal,
1312 .get = generic_pipe_buf_get,
1313};
1314
1315static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, 1299static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1316 struct pipe_inode_info *pipe, 1300 struct pipe_inode_info *pipe,
1317 size_t len, unsigned int flags) 1301 size_t len, unsigned int flags)
@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1358 buf->page = bufs[page_nr].page; 1342 buf->page = bufs[page_nr].page;
1359 buf->offset = bufs[page_nr].offset; 1343 buf->offset = bufs[page_nr].offset;
1360 buf->len = bufs[page_nr].len; 1344 buf->len = bufs[page_nr].len;
1361 buf->ops = &fuse_dev_pipe_buf_ops; 1345 /*
1346 * Need to be careful about this. Having buf->ops in module
1347 * code can Oops if the buffer persists after module unload.
1348 */
1349 buf->ops = &nosteal_pipe_buf_ops;
1362 1350
1363 pipe->nrbufs++; 1351 pipe->nrbufs++;
1364 page_nr++; 1352 page_nr++;
@@ -1599,7 +1587,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
1599 1587
1600 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); 1588 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1601 err = fuse_copy_page(cs, &page, offset, this_num, 0); 1589 err = fuse_copy_page(cs, &page, offset, this_num, 0);
1602 if (!err && offset == 0 && (num != 0 || file_size == end)) 1590 if (!err && offset == 0 &&
1591 (this_num == PAGE_CACHE_SIZE || file_size == end))
1603 SetPageUptodate(page); 1592 SetPageUptodate(page);
1604 unlock_page(page); 1593 unlock_page(page);
1605 page_cache_release(page); 1594 page_cache_release(page);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c3eb2c46c8f1..1d1292c581c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode)
112 get_fuse_inode(inode)->i_time = 0; 112 get_fuse_inode(inode)->i_time = 0;
113} 113}
114 114
115/**
116 * Mark the attributes as stale due to an atime change. Avoid the invalidate if
117 * atime is not used.
118 */
119void fuse_invalidate_atime(struct inode *inode)
120{
121 if (!IS_RDONLY(inode))
122 fuse_invalidate_attr(inode);
123}
124
115/* 125/*
116 * Just mark the entry as stale, so that a next attempt to look it up 126 * Just mark the entry as stale, so that a next attempt to look it up
117 * will result in a new lookup call to userspace 127 * will result in a new lookup call to userspace
@@ -1371,7 +1381,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
1371 } 1381 }
1372 1382
1373 __free_page(page); 1383 __free_page(page);
1374 fuse_invalidate_attr(inode); /* atime changed */ 1384 fuse_invalidate_atime(inode);
1375 return err; 1385 return err;
1376} 1386}
1377 1387
@@ -1404,7 +1414,7 @@ static char *read_link(struct dentry *dentry)
1404 link[req->out.args[0].size] = '\0'; 1414 link[req->out.args[0].size] = '\0';
1405 out: 1415 out:
1406 fuse_put_request(fc, req); 1416 fuse_put_request(fc, req);
1407 fuse_invalidate_attr(inode); /* atime changed */ 1417 fuse_invalidate_atime(inode);
1408 return link; 1418 return link;
1409} 1419}
1410 1420
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7e70506297bc..74f6ca500504 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
127 if (atomic_dec_and_test(&ff->count)) { 127 if (atomic_dec_and_test(&ff->count)) {
128 struct fuse_req *req = ff->reserved_req; 128 struct fuse_req *req = ff->reserved_req;
129 129
130 if (sync) { 130 if (ff->fc->no_open) {
131 /*
132 * Drop the release request when client does not
133 * implement 'open'
134 */
135 req->background = 0;
136 path_put(&req->misc.release.path);
137 fuse_put_request(ff->fc, req);
138 } else if (sync) {
131 req->background = 0; 139 req->background = 0;
132 fuse_request_send(ff->fc, req); 140 fuse_request_send(ff->fc, req);
133 path_put(&req->misc.release.path); 141 path_put(&req->misc.release.path);
@@ -144,27 +152,36 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
144int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 152int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
145 bool isdir) 153 bool isdir)
146{ 154{
147 struct fuse_open_out outarg;
148 struct fuse_file *ff; 155 struct fuse_file *ff;
149 int err;
150 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 156 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
151 157
152 ff = fuse_file_alloc(fc); 158 ff = fuse_file_alloc(fc);
153 if (!ff) 159 if (!ff)
154 return -ENOMEM; 160 return -ENOMEM;
155 161
156 err = fuse_send_open(fc, nodeid, file, opcode, &outarg); 162 ff->fh = 0;
157 if (err) { 163 ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
158 fuse_file_free(ff); 164 if (!fc->no_open || isdir) {
159 return err; 165 struct fuse_open_out outarg;
166 int err;
167
168 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
169 if (!err) {
170 ff->fh = outarg.fh;
171 ff->open_flags = outarg.open_flags;
172
173 } else if (err != -ENOSYS || isdir) {
174 fuse_file_free(ff);
175 return err;
176 } else {
177 fc->no_open = 1;
178 }
160 } 179 }
161 180
162 if (isdir) 181 if (isdir)
163 outarg.open_flags &= ~FOPEN_DIRECT_IO; 182 ff->open_flags &= ~FOPEN_DIRECT_IO;
164 183
165 ff->fh = outarg.fh;
166 ff->nodeid = nodeid; 184 ff->nodeid = nodeid;
167 ff->open_flags = outarg.open_flags;
168 file->private_data = fuse_file_get(ff); 185 file->private_data = fuse_file_get(ff);
169 186
170 return 0; 187 return 0;
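The change above latches the first -ENOSYS reply to FUSE_OPEN into fc->no_open, so later opens skip the round trip and fall back to defaults. A minimal sketch of that latch pattern; send_open() and the default flag value are hypothetical stand-ins, not the real FUSE request or FOPEN_KEEP_CACHE:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool no_open;

static int send_open(unsigned *server_flags)
{
        (void)server_flags;
        return -ENOSYS;                 /* server lacks open() */
}

static int do_open(unsigned *open_flags)
{
        *open_flags = 0x2;              /* default when open is skipped */
        if (!no_open) {
                unsigned server_flags = 0;
                int err = send_open(&server_flags);

                if (!err)
                        *open_flags = server_flags;
                else if (err != -ENOSYS)
                        return err;     /* a real error: fail the open  */
                else
                        no_open = true; /* remember: don't ask again    */
        }
        return 0;
}

int main(void)
{
        unsigned flags;

        do_open(&flags);                /* first call hits -ENOSYS      */
        do_open(&flags);                /* second call skips the request */
        printf("no_open=%d flags=%#x\n", no_open, flags);
        return 0;
}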
@@ -687,7 +704,7 @@ static int fuse_readpage(struct file *file, struct page *page)
687 SetPageUptodate(page); 704 SetPageUptodate(page);
688 } 705 }
689 706
690 fuse_invalidate_attr(inode); /* atime changed */ 707 fuse_invalidate_atime(inode);
691 out: 708 out:
692 unlock_page(page); 709 unlock_page(page);
693 return err; 710 return err;
@@ -716,7 +733,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
716 fuse_read_update_size(inode, pos, 733 fuse_read_update_size(inode, pos,
717 req->misc.read.attr_ver); 734 req->misc.read.attr_ver);
718 } 735 }
719 fuse_invalidate_attr(inode); /* atime changed */ 736 fuse_invalidate_atime(inode);
720 } 737 }
721 738
722 for (i = 0; i < req->num_pages; i++) { 739 for (i = 0; i < req->num_pages; i++) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7d2730912667..2da5db2c8bdb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -485,6 +485,9 @@ struct fuse_conn {
485 * and hence races in setting them will not cause malfunction 485 * and hence races in setting them will not cause malfunction
486 */ 486 */
487 487
488 /** Is open/release not implemented by fs? */
489 unsigned no_open:1;
490
488 /** Is fsync not implemented by fs? */ 491 /** Is fsync not implemented by fs? */
489 unsigned no_fsync:1; 492 unsigned no_fsync:1;
490 493
@@ -788,6 +791,8 @@ void fuse_invalidate_attr(struct inode *inode);
788 791
789void fuse_invalidate_entry_cache(struct dentry *entry); 792void fuse_invalidate_entry_cache(struct dentry *entry);
790 793
794void fuse_invalidate_atime(struct inode *inode);
795
791/** 796/**
792 * Acquire reference to fuse_conn 797 * Acquire reference to fuse_conn
793 */ 798 */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index b7fc035a6943..49436fa7cd4f 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
986{ 986{
987 struct file *file = iocb->ki_filp; 987 struct file *file = iocb->ki_filp;
988 struct inode *inode = file->f_mapping->host; 988 struct inode *inode = file->f_mapping->host;
989 struct address_space *mapping = inode->i_mapping;
989 struct gfs2_inode *ip = GFS2_I(inode); 990 struct gfs2_inode *ip = GFS2_I(inode);
990 struct gfs2_holder gh; 991 struct gfs2_holder gh;
991 int rv; 992 int rv;
@@ -1006,6 +1007,36 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1006 if (rv != 1) 1007 if (rv != 1)
1007 goto out; /* dio not valid, fall back to buffered i/o */ 1008 goto out; /* dio not valid, fall back to buffered i/o */
1008 1009
1010 /*
1011 * Now since we are holding a deferred (CW) lock at this point, you
1012 * might be wondering why this is ever needed. There is a case however
1013 * where we've granted a deferred local lock against a cached exclusive
1014 * glock. That is ok provided all granted local locks are deferred, but
1015 * it also means that it is possible to encounter pages which are
1016 * cached and possibly also mapped. So here we check for that and sort
1017 * them out ahead of the dio. The glock state machine will take care of
1018 * everything else.
1019 *
1020 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
1021 * the first place, mapping->nrpages will always be zero.
1022 */
1023 if (mapping->nrpages) {
1024 loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
1025 loff_t len = iov_length(iov, nr_segs);
1026 loff_t end = PAGE_ALIGN(offset + len) - 1;
1027
1028 rv = 0;
1029 if (len == 0)
1030 goto out;
1031 if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
1033 rv = filemap_write_and_wait_range(mapping, lstart, end);
1034 if (rv)
1035 goto out;
1036 if (rw == WRITE)
1037 truncate_inode_pages_range(mapping, lstart, end);
1038 }
1039
1009 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1040 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1010 offset, nr_segs, gfs2_get_block_direct, 1041 offset, nr_segs, gfs2_get_block_direct,
1011 NULL, NULL, 0); 1042 NULL, NULL, 0);
@@ -1050,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1050 bh = bh->b_this_page; 1081 bh = bh->b_this_page;
1051 } while(bh != head); 1082 } while(bh != head);
1052 spin_unlock(&sdp->sd_ail_lock); 1083 spin_unlock(&sdp->sd_ail_lock);
1053 gfs2_log_unlock(sdp);
1054 1084
1055 head = bh = page_buffers(page); 1085 head = bh = page_buffers(page);
1056 do { 1086 do {
1057 gfs2_log_lock(sdp);
1058 bd = bh->b_private; 1087 bd = bh->b_private;
1059 if (bd) { 1088 if (bd) {
1060 gfs2_assert_warn(sdp, bd->bd_bh == bh); 1089 gfs2_assert_warn(sdp, bd->bd_bh == bh);
1061 if (!list_empty(&bd->bd_list)) { 1090 if (!list_empty(&bd->bd_list))
1062 if (!buffer_pinned(bh)) 1091 list_del_init(&bd->bd_list);
1063 list_del_init(&bd->bd_list); 1092 bd->bd_bh = NULL;
1064 else
1065 bd = NULL;
1066 }
1067 if (bd)
1068 bd->bd_bh = NULL;
1069 bh->b_private = NULL; 1093 bh->b_private = NULL;
1070 }
1071 gfs2_log_unlock(sdp);
1072 if (bd)
1073 kmem_cache_free(gfs2_bufdata_cachep, bd); 1094 kmem_cache_free(gfs2_bufdata_cachep, bd);
1095 }
1074 1096
1075 bh = bh->b_this_page; 1097 bh = bh->b_this_page;
1076 } while (bh != head); 1098 } while (bh != head);
1099 gfs2_log_unlock(sdp);
1077 1100
1078 return try_to_free_buffers(page); 1101 return try_to_free_buffers(page);
1079 1102
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2e5fc268d324..fa32655449c8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
834 struct gfs2_leaf *leaf; 834 struct gfs2_leaf *leaf;
835 struct gfs2_dirent *dent; 835 struct gfs2_dirent *dent;
836 struct qstr name = { .name = "" }; 836 struct qstr name = { .name = "" };
837 struct timespec tv = CURRENT_TIME;
837 838
838 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); 839 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
839 if (error) 840 if (error)
@@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
850 leaf->lf_entries = 0; 851 leaf->lf_entries = 0;
851 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); 852 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
852 leaf->lf_next = 0; 853 leaf->lf_next = 0;
853 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); 854 leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
855 leaf->lf_dist = cpu_to_be32(1);
856 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
857 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
858 memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
854 dent = (struct gfs2_dirent *)(leaf+1); 859 dent = (struct gfs2_dirent *)(leaf+1);
855 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); 860 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
856 *pbh = bh; 861 *pbh = bh;
@@ -1612,11 +1617,31 @@ out:
1612 return ret; 1617 return ret;
1613} 1618}
1614 1619
1620/**
1621 * dir_new_leaf - Add a new leaf onto hash chain
1622 * @inode: The directory
1623 * @name: The name we are adding
1624 *
1625 * This adds a new dir leaf onto an existing leaf when there is not
1626 * enough space to add a new dir entry. This is a last resort after
1627 * we've expanded the hash table to max size and also split existing
1628 * leaf blocks, so it will only occur for very large directories.
1629 *
1630 * The dist parameter is set to 1 for leaf blocks directly attached
1631 * to the hash table, 2 for one layer of indirection, 3 for two layers
1632 * etc. We are thus able to tell the difference between an old leaf
1633 * with dist set to zero (i.e. "don't know") and a new one where we
1634 * set this information for debug/fsck purposes.
1635 *
1636 * Returns: 0 on success, or -ve on error
1637 */
1638
1615static int dir_new_leaf(struct inode *inode, const struct qstr *name) 1639static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1616{ 1640{
1617 struct buffer_head *bh, *obh; 1641 struct buffer_head *bh, *obh;
1618 struct gfs2_inode *ip = GFS2_I(inode); 1642 struct gfs2_inode *ip = GFS2_I(inode);
1619 struct gfs2_leaf *leaf, *oleaf; 1643 struct gfs2_leaf *leaf, *oleaf;
1644 u32 dist = 1;
1620 int error; 1645 int error;
1621 u32 index; 1646 u32 index;
1622 u64 bn; 1647 u64 bn;
@@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1626 if (error) 1651 if (error)
1627 return error; 1652 return error;
1628 do { 1653 do {
1654 dist++;
1629 oleaf = (struct gfs2_leaf *)obh->b_data; 1655 oleaf = (struct gfs2_leaf *)obh->b_data;
1630 bn = be64_to_cpu(oleaf->lf_next); 1656 bn = be64_to_cpu(oleaf->lf_next);
1631 if (!bn) 1657 if (!bn)
@@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1643 brelse(obh); 1669 brelse(obh);
1644 return -ENOSPC; 1670 return -ENOSPC;
1645 } 1671 }
1672 leaf->lf_dist = cpu_to_be32(dist);
1646 oleaf->lf_next = cpu_to_be64(bh->b_blocknr); 1673 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1647 brelse(bh); 1674 brelse(bh);
1648 brelse(obh); 1675 brelse(obh);
@@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1659 1686
1660/** 1687/**
1661 * gfs2_dir_add - Add new filename into directory 1688 * gfs2_dir_add - Add new filename into directory
1662 * @dip: The GFS2 inode 1689 * @inode: The directory inode
1663 * @filename: The new name 1690 * @name: The new name
1664 * @inode: The inode number of the entry 1691 * @nip: The GFS2 inode to be linked in to the directory
1665 * @type: The type of the entry 1692 * @da: The directory addition info
1693 *
1694 * If the call to gfs2_diradd_alloc_required resulted in there being
1695 * no need to allocate any new directory blocks, then it will contain
1696 * a pointer to the directory entry and the bh in which it resides. We
1697 * can use that without having to repeat the search. If there was no
1698 * free space, then we must now create more space.
1666 * 1699 *
1667 * Returns: 0 on success, error code on failure 1700 * Returns: 0 on success, error code on failure
1668 */ 1701 */
1669 1702
1670int gfs2_dir_add(struct inode *inode, const struct qstr *name, 1703int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1671 const struct gfs2_inode *nip) 1704 const struct gfs2_inode *nip, struct gfs2_diradd *da)
1672{ 1705{
1673 struct gfs2_inode *ip = GFS2_I(inode); 1706 struct gfs2_inode *ip = GFS2_I(inode);
1674 struct buffer_head *bh; 1707 struct buffer_head *bh = da->bh;
1675 struct gfs2_dirent *dent; 1708 struct gfs2_dirent *dent = da->dent;
1709 struct timespec tv;
1676 struct gfs2_leaf *leaf; 1710 struct gfs2_leaf *leaf;
1677 int error; 1711 int error;
1678 1712
1679 while(1) { 1713 while(1) {
1680 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, 1714 if (da->bh == NULL) {
1681 &bh); 1715 dent = gfs2_dirent_search(inode, name,
1716 gfs2_dirent_find_space, &bh);
1717 }
1682 if (dent) { 1718 if (dent) {
1683 if (IS_ERR(dent)) 1719 if (IS_ERR(dent))
1684 return PTR_ERR(dent); 1720 return PTR_ERR(dent);
1685 dent = gfs2_init_dirent(inode, dent, name, bh); 1721 dent = gfs2_init_dirent(inode, dent, name, bh);
1686 gfs2_inum_out(nip, dent); 1722 gfs2_inum_out(nip, dent);
1687 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); 1723 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1724 tv = CURRENT_TIME;
1688 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1725 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1689 leaf = (struct gfs2_leaf *)bh->b_data; 1726 leaf = (struct gfs2_leaf *)bh->b_data;
1690 be16_add_cpu(&leaf->lf_entries, 1); 1727 be16_add_cpu(&leaf->lf_entries, 1);
1728 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1729 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1691 } 1730 }
1731 da->dent = NULL;
1732 da->bh = NULL;
1692 brelse(bh); 1733 brelse(bh);
1693 ip->i_entries++; 1734 ip->i_entries++;
1694 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1735 ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
1695 if (S_ISDIR(nip->i_inode.i_mode)) 1736 if (S_ISDIR(nip->i_inode.i_mode))
1696 inc_nlink(&ip->i_inode); 1737 inc_nlink(&ip->i_inode);
1697 mark_inode_dirty(inode); 1738 mark_inode_dirty(inode);
@@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1742 const struct qstr *name = &dentry->d_name; 1783 const struct qstr *name = &dentry->d_name;
1743 struct gfs2_dirent *dent, *prev = NULL; 1784 struct gfs2_dirent *dent, *prev = NULL;
1744 struct buffer_head *bh; 1785 struct buffer_head *bh;
1786 struct timespec tv = CURRENT_TIME;
1745 1787
1746 /* Returns _either_ the entry (if its first in block) or the 1788 /* Returns _either_ the entry (if its first in block) or the
1747 previous entry otherwise */ 1789 previous entry otherwise */
@@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1767 if (!entries) 1809 if (!entries)
1768 gfs2_consist_inode(dip); 1810 gfs2_consist_inode(dip);
1769 leaf->lf_entries = cpu_to_be16(--entries); 1811 leaf->lf_entries = cpu_to_be16(--entries);
1812 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1813 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1770 } 1814 }
1771 brelse(bh); 1815 brelse(bh);
1772 1816
1773 if (!dip->i_entries) 1817 if (!dip->i_entries)
1774 gfs2_consist_inode(dip); 1818 gfs2_consist_inode(dip);
1775 dip->i_entries--; 1819 dip->i_entries--;
1776 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1820 dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
1777 if (S_ISDIR(dentry->d_inode->i_mode)) 1821 if (S_ISDIR(dentry->d_inode->i_mode))
1778 drop_nlink(&dip->i_inode); 1822 drop_nlink(&dip->i_inode);
1779 mark_inode_dirty(&dip->i_inode); 1823 mark_inode_dirty(&dip->i_inode);
@@ -2017,22 +2061,36 @@ out:
2017 * gfs2_diradd_alloc_required - determine whether adding an entry requires an allocation 2061 * gfs2_diradd_alloc_required - determine whether adding an entry requires an allocation
2018 * @ip: the file being written to 2062 * @ip: the file being written to
2019 * @filename: the filename that's going to be added 2063 * @filename: the filename that's going to be added
2064 * @da: The structure to return dir alloc info
2020 * 2065 *
2021 * Returns: 1 if alloc required, 0 if not, -ve on error 2066 * Returns: 0 if ok, -ve on error
2022 */ 2067 */
2023 2068
2024int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) 2069int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2070 struct gfs2_diradd *da)
2025{ 2071{
2072 struct gfs2_inode *ip = GFS2_I(inode);
2073 struct gfs2_sbd *sdp = GFS2_SB(inode);
2074 const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
2026 struct gfs2_dirent *dent; 2075 struct gfs2_dirent *dent;
2027 struct buffer_head *bh; 2076 struct buffer_head *bh;
2028 2077
2078 da->nr_blocks = 0;
2079 da->bh = NULL;
2080 da->dent = NULL;
2081
2029 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); 2082 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
2030 if (!dent) { 2083 if (!dent) {
2031 return 1; 2084 da->nr_blocks = sdp->sd_max_dirres;
2085 if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
2086 (GFS2_DIRENT_SIZE(name->len) < extra))
2087 da->nr_blocks = 1;
2088 return 0;
2032 } 2089 }
2033 if (IS_ERR(dent)) 2090 if (IS_ERR(dent))
2034 return PTR_ERR(dent); 2091 return PTR_ERR(dent);
2035 brelse(bh); 2092 da->bh = bh;
2093 da->dent = dent;
2036 return 0; 2094 return 0;
2037} 2095}
2038 2096
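The two dir.c hunks above change gfs2_diradd_alloc_required() from a boolean probe into one that caches its findings: when free space is found, the buffer head and dirent pointer are parked in struct gfs2_diradd so that gfs2_dir_add() can reuse them instead of repeating the search. A minimal user-space sketch of the probe-then-consume pattern (hypothetical types and names, not the GFS2 API):

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for struct gfs2_diradd: the probe's cached findings. */
    struct diradd {
        unsigned nr_blocks;   /* blocks to reserve if no free slot exists */
        char *slot;           /* cached pointer into the directory block */
    };

    static char dirblock[64]; /* pretend directory block; '\0' means free */

    /* Probe once: either cache the free slot or record the allocation need. */
    static int alloc_required(struct diradd *da)
    {
        da->nr_blocks = 0;
        da->slot = memchr(dirblock, '\0', sizeof(dirblock));
        if (da->slot == NULL)
            da->nr_blocks = 1;        /* caller must reserve a new block */
        return 0;
    }

    /* Add: consume the cached slot rather than searching a second time. */
    static int dir_add(const char *name, struct diradd *da)
    {
        if (da->slot == NULL)
            return -1;                /* would grow the directory here */
        *da->slot = name[0];
        da->slot = NULL;              /* mirrors da->bh/da->dent = NULL */
        return 0;
    }

    int main(void)
    {
        struct diradd da;

        alloc_required(&da);
        printf("need %u new block(s)\n", da.nr_blocks);
        return dir_add("foo", &da);
    }

The error paths matter: a caller that fills the structure but never performs the add must release the cached buffer, which is what the new gfs2_dir_no_add() helper in dir.h below is for.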
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f03bbd1873f..126c65dda028 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,6 +16,14 @@
16struct inode; 16struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19struct buffer_head;
20struct gfs2_dirent;
21
22struct gfs2_diradd {
23 unsigned nr_blocks;
24 struct gfs2_dirent *dent;
25 struct buffer_head *bh;
26};
19 27
20extern struct inode *gfs2_dir_search(struct inode *dir, 28extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename, 29 const struct qstr *filename,
@@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 31extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
24 const struct gfs2_inode *ip); 32 const struct gfs2_inode *ip);
25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 33extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
26 const struct gfs2_inode *ip); 34 const struct gfs2_inode *ip, struct gfs2_diradd *da);
35static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
36{
37 if (da->bh)
38 brelse(da->bh);
39 da->bh = NULL;
40}
27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 41extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, 42extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
29 struct file_ra_state *f_ra); 43 struct file_ra_state *f_ra);
@@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
33extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 47extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
34 48
35extern int gfs2_diradd_alloc_required(struct inode *dir, 49extern int gfs2_diradd_alloc_required(struct inode *dir,
36 const struct qstr *filename); 50 const struct qstr *filename,
51 struct gfs2_diradd *da);
37extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 52extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
38 struct buffer_head **bhp); 53 struct buffer_head **bhp);
39extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); 54extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c8420f7e4db6..ca0be6c69a26 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1552 glock_hash_walk(thaw_glock, sdp); 1552 glock_hash_walk(thaw_glock, sdp);
1553} 1553}
1554 1554
1555static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) 1555static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1556{ 1556{
1557 int ret;
1558 spin_lock(&gl->gl_spin); 1557 spin_lock(&gl->gl_spin);
1559 ret = gfs2_dump_glock(seq, gl); 1558 gfs2_dump_glock(seq, gl);
1560 spin_unlock(&gl->gl_spin); 1559 spin_unlock(&gl->gl_spin);
1561 return ret;
1562} 1560}
1563 1561
1564static void dump_glock_func(struct gfs2_glock *gl) 1562static void dump_glock_func(struct gfs2_glock *gl)
@@ -1647,14 +1645,14 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1647 * @seq: the seq_file struct 1645 * @seq: the seq_file struct
1648 * @gh: the glock holder 1646 * @gh: the glock holder
1649 * 1647 *
1650 * Returns: 0 on success, -ENOBUFS when we run out of space
1651 */ 1648 */
1652 1649
1653static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1650static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1654{ 1651{
1655 struct task_struct *gh_owner = NULL; 1652 struct task_struct *gh_owner = NULL;
1656 char flags_buf[32]; 1653 char flags_buf[32];
1657 1654
1655 rcu_read_lock();
1658 if (gh->gh_owner_pid) 1656 if (gh->gh_owner_pid)
1659 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1657 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1660 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", 1658 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
@@ -1664,7 +1662,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1664 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, 1662 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1665 gh_owner ? gh_owner->comm : "(ended)", 1663 gh_owner ? gh_owner->comm : "(ended)",
1666 (void *)gh->gh_ip); 1664 (void *)gh->gh_ip);
1667 return 0; 1665 rcu_read_unlock();
1668} 1666}
1669 1667
1670static const char *gflags2str(char *buf, const struct gfs2_glock *gl) 1668static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
@@ -1719,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1719 * example. The fields are n = number (id of the object), f = flags, 1717 * example. The fields are n = number (id of the object), f = flags,
1720 * t = type, s = state, r = refcount, e = error, p = pid. 1718 * t = type, s = state, r = refcount, e = error, p = pid.
1721 * 1719 *
1722 * Returns: 0 on success, -ENOBUFS when we run out of space
1723 */ 1720 */
1724 1721
1725int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) 1722void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1726{ 1723{
1727 const struct gfs2_glock_operations *glops = gl->gl_ops; 1724 const struct gfs2_glock_operations *glops = gl->gl_ops;
1728 unsigned long long dtime; 1725 unsigned long long dtime;
1729 const struct gfs2_holder *gh; 1726 const struct gfs2_holder *gh;
1730 char gflags_buf[32]; 1727 char gflags_buf[32];
1731 int error = 0;
1732 1728
1733 dtime = jiffies - gl->gl_demote_time; 1729 dtime = jiffies - gl->gl_demote_time;
1734 dtime *= 1000000/HZ; /* demote time in uSec */ 1730 dtime *= 1000000/HZ; /* demote time in uSec */
@@ -1745,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1745 atomic_read(&gl->gl_revokes), 1741 atomic_read(&gl->gl_revokes),
1746 (int)gl->gl_lockref.count, gl->gl_hold_time); 1742 (int)gl->gl_lockref.count, gl->gl_hold_time);
1747 1743
1748 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1744 list_for_each_entry(gh, &gl->gl_holders, gh_list)
1749 error = dump_holder(seq, gh); 1745 dump_holder(seq, gh);
1750 if (error) 1746
1751 goto out;
1752 }
1753 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) 1747 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
1754 error = glops->go_dump(seq, gl); 1748 glops->go_dump(seq, gl);
1755out:
1756 return error;
1757} 1749}
1758 1750
1759static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) 1751static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1951,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1951 1943
1952static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1944static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
1953{ 1945{
1954 return dump_glock(seq, iter_ptr); 1946 dump_glock(seq, iter_ptr);
1947 return 0;
1955} 1948}
1956 1949
1957static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) 1950static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
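The int-to-void conversion running through dump_glock(), dump_holder(), gfs2_dump_glock() and the go_dump method reflects how seq_file output works: seq_printf() notes overflow internally and the seq_file core simply re-runs ->show() with a larger buffer, so the per-line error returns (and the -ENOBUFS contract they advertised) were dead weight. A user-space sketch of that retry contract (simplified; not the seq_file API):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct emitter {
        char *buf;
        size_t size, used;
        int overflow;         /* sticky flag, checked once by the driver */
    };

    /* Printers return void; overflow is recorded, not propagated. */
    static void emit(struct emitter *em, const char *s)
    {
        size_t len = strlen(s);

        if (em->used + len > em->size) {
            em->overflow = 1;
            return;
        }
        memcpy(em->buf + em->used, s, len);
        em->used += len;
    }

    static void show(struct emitter *em)      /* analogous to ->show() */
    {
        emit(em, "G: glock line\n");
        emit(em, "H: holder line\n");
    }

    int main(void)
    {
        struct emitter em = { .size = 8 };

        do {                  /* the driver grows the buffer and retries */
            char *nb = realloc(em.buf, em.size *= 2);
            if (nb == NULL)
                return 1;
            em.buf = nb;
            em.used = 0;
            em.overflow = 0;
            show(&em);
        } while (em.overflow);
        fwrite(em.buf, 1, em.used, stdout);
        free(em.buf);
        return 0;
    }

The added rcu_read_lock() around pid_task() in dump_holder() is a separate fix: pid_task() must be called under RCU protection, and the old code called it unprotected.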
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 6647d77366ba..32572f71f027 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
199 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
202extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 202extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) 203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
204extern __printf(2, 3) 204extern __printf(2, 3)
205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index db908f697139..3bf0631b5d56 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
133 133
134static void rgrp_go_sync(struct gfs2_glock *gl) 134static void rgrp_go_sync(struct gfs2_glock *gl)
135{ 135{
136 struct address_space *metamapping = gfs2_glock2aspace(gl); 136 struct gfs2_sbd *sdp = gl->gl_sbd;
137 struct address_space *mapping = &sdp->sd_aspace;
137 struct gfs2_rgrpd *rgd; 138 struct gfs2_rgrpd *rgd;
138 int error; 139 int error;
139 140
@@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
141 return; 142 return;
142 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); 143 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
143 144
144 gfs2_log_flush(gl->gl_sbd, gl); 145 gfs2_log_flush(sdp, gl);
145 filemap_fdatawrite(metamapping); 146 filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
146 error = filemap_fdatawait(metamapping); 147 error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
147 mapping_set_error(metamapping, error); 148 mapping_set_error(mapping, error);
148 gfs2_ail_empty_gl(gl); 149 gfs2_ail_empty_gl(gl);
149 150
150 spin_lock(&gl->gl_spin); 151 spin_lock(&gl->gl_spin);
@@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
166 167
167static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 168static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
168{ 169{
169 struct address_space *mapping = gfs2_glock2aspace(gl); 170 struct gfs2_sbd *sdp = gl->gl_sbd;
171 struct address_space *mapping = &sdp->sd_aspace;
170 172
171 WARN_ON_ONCE(!(flags & DIO_METADATA)); 173 WARN_ON_ONCE(!(flags & DIO_METADATA));
172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 174 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
173 truncate_inode_pages(mapping, 0); 175 truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
174 176
175 if (gl->gl_object) { 177 if (gl->gl_object) {
176 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; 178 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
@@ -192,8 +194,11 @@ static void inode_go_sync(struct gfs2_glock *gl)
192 194
193 if (ip && !S_ISREG(ip->i_inode.i_mode)) 195 if (ip && !S_ISREG(ip->i_inode.i_mode))
194 ip = NULL; 196 ip = NULL;
195 if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) 197 if (ip) {
196 unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); 198 if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
199 unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
200 inode_dio_wait(&ip->i_inode);
201 }
197 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 202 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
198 return; 203 return;
199 204
@@ -410,6 +415,9 @@ static int inode_go_lock(struct gfs2_holder *gh)
410 return error; 415 return error;
411 } 416 }
412 417
418 if (gh->gh_state != LM_ST_DEFERRED)
419 inode_dio_wait(&ip->i_inode);
420
413 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && 421 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
414 (gl->gl_state == LM_ST_EXCLUSIVE) && 422 (gl->gl_state == LM_ST_EXCLUSIVE) &&
415 (gh->gh_state == LM_ST_EXCLUSIVE)) { 423 (gh->gh_state == LM_ST_EXCLUSIVE)) {
@@ -429,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh)
429 * @seq: The iterator 437 * @seq: The iterator
430 * @ip: the inode 438 * @ip: the inode
431 * 439 *
432 * Returns: 0 on success, -ENOBUFS when we run out of space
433 */ 440 */
434 441
435static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) 442static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
436{ 443{
437 const struct gfs2_inode *ip = gl->gl_object; 444 const struct gfs2_inode *ip = gl->gl_object;
438 if (ip == NULL) 445 if (ip == NULL)
439 return 0; 446 return;
440 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", 447 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
441 (unsigned long long)ip->i_no_formal_ino, 448 (unsigned long long)ip->i_no_formal_ino,
442 (unsigned long long)ip->i_no_addr, 449 (unsigned long long)ip->i_no_addr,
443 IF2DT(ip->i_inode.i_mode), ip->i_flags, 450 IF2DT(ip->i_inode.i_mode), ip->i_flags,
444 (unsigned int)ip->i_diskflags, 451 (unsigned int)ip->i_diskflags,
445 (unsigned long long)i_size_read(&ip->i_inode)); 452 (unsigned long long)i_size_read(&ip->i_inode));
446 return 0;
447} 453}
448 454
449/** 455/**
@@ -552,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
552 .go_unlock = gfs2_rgrp_go_unlock, 558 .go_unlock = gfs2_rgrp_go_unlock,
553 .go_dump = gfs2_rgrp_dump, 559 .go_dump = gfs2_rgrp_dump,
554 .go_type = LM_TYPE_RGRP, 560 .go_type = LM_TYPE_RGRP,
555 .go_flags = GLOF_ASPACE | GLOF_LVB, 561 .go_flags = GLOF_LVB,
556}; 562};
557 563
558const struct gfs2_glock_operations gfs2_trans_glops = { 564const struct gfs2_glock_operations gfs2_trans_glops = {
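With GLOF_ASPACE removed from gfs2_rgrp_glops, rgrp glocks no longer carry a private address space; they all share the one sd_aspace mapping added to the superblock, and each glock syncs or invalidates only its own byte range via gl_vm.start/gl_vm.end. Presumably that range is just the rgrp's metadata extent expressed in bytes; a sketch of the arithmetic under that assumption (rd_addr/rd_length loosely mirror gfs2_rgrpd fields, bsize is the block size):

    #include <stdio.h>

    /* Byte range an rgrp's metadata occupies in the shared mapping. */
    static void rgrp_vm_range(unsigned long long rd_addr, unsigned rd_length,
                              unsigned bsize,
                              unsigned long long *start, unsigned long long *end)
    {
        *start = rd_addr * bsize;
        *end   = *start + (unsigned long long)rd_length * bsize - 1;
    }

    int main(void)
    {
        unsigned long long start, end;

        rgrp_vm_range(17, 4, 4096, &start, &end);
        printf("writeback/truncate bytes [%llu, %llu]\n", start, end);
        return 0;
    }

The ranged calls (filemap_fdatawrite_range(), truncate_inode_pages_range()) are what make a single shared mapping workable: each rgrp glock touches only its own slice instead of flushing the whole device mapping.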
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ba1ea67f4eeb..cf0e34400f71 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -93,6 +93,7 @@ struct gfs2_rgrpd {
93 struct gfs2_rgrp_lvb *rd_rgl; 93 struct gfs2_rgrp_lvb *rd_rgl;
94 u32 rd_last_alloc; 94 u32 rd_last_alloc;
95 u32 rd_flags; 95 u32 rd_flags;
96 u32 rd_extfail_pt; /* extent failure point */
96#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ 97#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
97#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ 98#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
98#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ 99#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
@@ -217,7 +218,7 @@ struct gfs2_glock_operations {
217 int (*go_demote_ok) (const struct gfs2_glock *gl); 218 int (*go_demote_ok) (const struct gfs2_glock *gl);
218 int (*go_lock) (struct gfs2_holder *gh); 219 int (*go_lock) (struct gfs2_holder *gh);
219 void (*go_unlock) (struct gfs2_holder *gh); 220 void (*go_unlock) (struct gfs2_holder *gh);
220 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 221 void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
221 void (*go_callback)(struct gfs2_glock *gl, bool remote); 222 void (*go_callback)(struct gfs2_glock *gl, bool remote);
222 const int go_type; 223 const int go_type;
223 const unsigned long go_flags; 224 const unsigned long go_flags;
@@ -350,7 +351,15 @@ struct gfs2_glock {
350 atomic_t gl_ail_count; 351 atomic_t gl_ail_count;
351 atomic_t gl_revokes; 352 atomic_t gl_revokes;
352 struct delayed_work gl_work; 353 struct delayed_work gl_work;
353 struct work_struct gl_delete; 354 union {
355 /* For inode and iopen glocks only */
356 struct work_struct gl_delete;
357 /* For rgrp glocks only */
358 struct {
359 loff_t start;
360 loff_t end;
361 } gl_vm;
362 };
354 struct rcu_head gl_rcu; 363 struct rcu_head gl_rcu;
355}; 364};
356 365
@@ -419,10 +428,13 @@ enum {
419}; 428};
420 429
421struct gfs2_quota_data { 430struct gfs2_quota_data {
431 struct hlist_bl_node qd_hlist;
422 struct list_head qd_list; 432 struct list_head qd_list;
423 struct kqid qd_id; 433 struct kqid qd_id;
434 struct gfs2_sbd *qd_sbd;
424 struct lockref qd_lockref; 435 struct lockref qd_lockref;
425 struct list_head qd_lru; 436 struct list_head qd_lru;
437 unsigned qd_hash;
426 438
427 unsigned long qd_flags; /* QDF_... */ 439 unsigned long qd_flags; /* QDF_... */
428 440
@@ -441,6 +453,7 @@ struct gfs2_quota_data {
441 453
442 u64 qd_sync_gen; 454 u64 qd_sync_gen;
443 unsigned long qd_last_warn; 455 unsigned long qd_last_warn;
456 struct rcu_head qd_rcu;
444}; 457};
445 458
446struct gfs2_trans { 459struct gfs2_trans {
@@ -720,13 +733,15 @@ struct gfs2_sbd {
720 spinlock_t sd_trunc_lock; 733 spinlock_t sd_trunc_lock;
721 734
722 unsigned int sd_quota_slots; 735 unsigned int sd_quota_slots;
723 unsigned int sd_quota_chunks; 736 unsigned long *sd_quota_bitmap;
724 unsigned char **sd_quota_bitmap; 737 spinlock_t sd_bitmap_lock;
725 738
726 u64 sd_quota_sync_gen; 739 u64 sd_quota_sync_gen;
727 740
728 /* Log stuff */ 741 /* Log stuff */
729 742
743 struct address_space sd_aspace;
744
730 spinlock_t sd_log_lock; 745 spinlock_t sd_log_lock;
731 746
732 struct gfs2_trans *sd_log_tr; 747 struct gfs2_trans *sd_log_tr;
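The gl_delete/gl_vm union in struct gfs2_glock is a space optimisation: a glock is either an inode/iopen glock (which may need delete work queued) or an rgrp glock (which needs a VM range), never both at once, so the two fields can overlay. The same idiom in miniature, with a type tag playing the role the glock type does in the kernel:

    #include <stdio.h>

    enum glock_kind { KIND_INODE, KIND_RGRP };

    struct glock_like {
        enum glock_kind kind;       /* discriminates the union below */
        union {
            int delete_pending;     /* meaningful for KIND_INODE only */
            struct {                /* meaningful for KIND_RGRP only */
                long long start, end;
            } vm;
        };
    };

    int main(void)
    {
        struct glock_like g = { .kind = KIND_RGRP, .vm = { 0, 4095 } };

        if (g.kind == KIND_RGRP)
            printf("range [%lld, %lld]\n", g.vm.start, g.vm.end);
        printf("object size: %zu bytes\n", sizeof(g));
        return 0;
    }

The other incore.h additions (qd_hlist, qd_sbd, qd_hash, qd_rcu, the flat sd_quota_bitmap with sd_bitmap_lock, and sd_aspace itself) are the data-structure half of the quota and address-space reworks whose code follows below.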
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7119504159f1..890588c7fb33 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
149 ip = GFS2_I(inode); 149 ip = GFS2_I(inode);
150 150
151 if (!inode) 151 if (!inode)
152 return ERR_PTR(-ENOBUFS); 152 return ERR_PTR(-ENOMEM);
153 153
154 if (inode->i_state & I_NEW) { 154 if (inode->i_state & I_NEW) {
155 struct gfs2_sbd *sdp = GFS2_SB(inode); 155 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
469 brelse(dibh); 469 brelse(dibh);
470} 470}
471 471
472/**
473 * gfs2_trans_da_blks - Calculate the number of blocks needed to link an inode
474 * @dip: The directory we are linking into
475 * @da: The dir add information
476 * @nr_inodes: The number of inodes involved
477 *
478 * This calculates the number of blocks we need to reserve in a
479 * transaction to link @nr_inodes into a directory. In most cases
480 * @nr_inodes will be 2 (the directory plus the inode being linked in),
481 * but in the case of a rename, 4 may be required.
482 *
483 * Returns: Number of blocks
484 */
485
486static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
487 const struct gfs2_diradd *da,
488 unsigned nr_inodes)
489{
490 return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
491 (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
492}
493
472static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, 494static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
473 struct gfs2_inode *ip, int arq) 495 struct gfs2_inode *ip, struct gfs2_diradd *da)
474{ 496{
475 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 497 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
476 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 498 struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
477 int error; 499 int error;
478 500
479 if (arq) { 501 if (da->nr_blocks) {
480 error = gfs2_quota_lock_check(dip); 502 error = gfs2_quota_lock_check(dip);
481 if (error) 503 if (error)
482 goto fail_quota_locks; 504 goto fail_quota_locks;
@@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
485 if (error) 507 if (error)
486 goto fail_quota_locks; 508 goto fail_quota_locks;
487 509
488 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 510 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
489 dip->i_rgd->rd_length +
490 2 * RES_DINODE +
491 RES_STATFS + RES_QUOTA, 0);
492 if (error) 511 if (error)
493 goto fail_ipreserv; 512 goto fail_ipreserv;
494 } else { 513 } else {
@@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
497 goto fail_quota_locks; 516 goto fail_quota_locks;
498 } 517 }
499 518
500 error = gfs2_dir_add(&dip->i_inode, name, ip); 519 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
501 if (error) 520 if (error)
502 goto fail_end_trans; 521 goto fail_end_trans;
503 522
@@ -560,7 +579,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
560 struct dentry *d; 579 struct dentry *d;
561 int error; 580 int error;
562 u32 aflags = 0; 581 u32 aflags = 0;
563 int arq; 582 struct gfs2_diradd da = { .bh = NULL, };
564 583
565 if (!name->len || name->len > GFS2_FNAMESIZE) 584 if (!name->len || name->len > GFS2_FNAMESIZE)
566 return -ENAMETOOLONG; 585 return -ENAMETOOLONG;
@@ -585,6 +604,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
585 error = PTR_ERR(inode); 604 error = PTR_ERR(inode);
586 if (!IS_ERR(inode)) { 605 if (!IS_ERR(inode)) {
587 d = d_splice_alias(inode, dentry); 606 d = d_splice_alias(inode, dentry);
607 error = PTR_ERR(d);
608 if (IS_ERR(d))
609 goto fail_gunlock;
588 error = 0; 610 error = 0;
589 if (file) { 611 if (file) {
590 if (S_ISREG(inode->i_mode)) { 612 if (S_ISREG(inode->i_mode)) {
@@ -602,7 +624,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
602 goto fail_gunlock; 624 goto fail_gunlock;
603 } 625 }
604 626
605 arq = error = gfs2_diradd_alloc_required(dir, name); 627 error = gfs2_diradd_alloc_required(dir, name, &da);
606 if (error < 0) 628 if (error < 0)
607 goto fail_gunlock; 629 goto fail_gunlock;
608 630
@@ -690,7 +712,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
690 if (error) 712 if (error)
691 goto fail_gunlock3; 713 goto fail_gunlock3;
692 714
693 error = link_dinode(dip, name, ip, arq); 715 error = link_dinode(dip, name, ip, &da);
694 if (error) 716 if (error)
695 goto fail_gunlock3; 717 goto fail_gunlock3;
696 718
@@ -719,6 +741,7 @@ fail_free_inode:
719 free_inode_nonrcu(inode); 741 free_inode_nonrcu(inode);
720 inode = NULL; 742 inode = NULL;
721fail_gunlock: 743fail_gunlock:
744 gfs2_dir_no_add(&da);
722 gfs2_glock_dq_uninit(ghs); 745 gfs2_glock_dq_uninit(ghs);
723 if (inode && !IS_ERR(inode)) { 746 if (inode && !IS_ERR(inode)) {
724 clear_nlink(inode); 747 clear_nlink(inode);
@@ -779,6 +802,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
779 } 802 }
780 803
781 d = d_splice_alias(inode, dentry); 804 d = d_splice_alias(inode, dentry);
805 if (IS_ERR(d)) {
806 iput(inode);
807 gfs2_glock_dq_uninit(&gh);
808 return d;
809 }
782 if (file && S_ISREG(inode->i_mode)) 810 if (file && S_ISREG(inode->i_mode))
783 error = finish_open(file, dentry, gfs2_open_common, opened); 811 error = finish_open(file, dentry, gfs2_open_common, opened);
784 812
@@ -817,7 +845,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
817 struct gfs2_inode *ip = GFS2_I(inode); 845 struct gfs2_inode *ip = GFS2_I(inode);
818 struct gfs2_holder ghs[2]; 846 struct gfs2_holder ghs[2];
819 struct buffer_head *dibh; 847 struct buffer_head *dibh;
820 int alloc_required; 848 struct gfs2_diradd da = { .bh = NULL, };
821 int error; 849 int error;
822 850
823 if (S_ISDIR(inode->i_mode)) 851 if (S_ISDIR(inode->i_mode))
@@ -872,13 +900,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
872 if (ip->i_inode.i_nlink == (u32)-1) 900 if (ip->i_inode.i_nlink == (u32)-1)
873 goto out_gunlock; 901 goto out_gunlock;
874 902
875 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); 903 error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
876 if (error < 0) 904 if (error < 0)
877 goto out_gunlock; 905 goto out_gunlock;
878 error = 0;
879 906
880 if (alloc_required) { 907 if (da.nr_blocks) {
881 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 908 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
882 error = gfs2_quota_lock_check(dip); 909 error = gfs2_quota_lock_check(dip);
883 if (error) 910 if (error)
884 goto out_gunlock; 911 goto out_gunlock;
@@ -887,10 +914,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
887 if (error) 914 if (error)
888 goto out_gunlock_q; 915 goto out_gunlock_q;
889 916
890 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 917 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
891 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
892 2 * RES_DINODE + RES_STATFS +
893 RES_QUOTA, 0);
894 if (error) 918 if (error)
895 goto out_ipres; 919 goto out_ipres;
896 } else { 920 } else {
@@ -903,7 +927,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
903 if (error) 927 if (error)
904 goto out_end_trans; 928 goto out_end_trans;
905 929
906 error = gfs2_dir_add(dir, &dentry->d_name, ip); 930 error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
907 if (error) 931 if (error)
908 goto out_brelse; 932 goto out_brelse;
909 933
@@ -919,12 +943,13 @@ out_brelse:
919out_end_trans: 943out_end_trans:
920 gfs2_trans_end(sdp); 944 gfs2_trans_end(sdp);
921out_ipres: 945out_ipres:
922 if (alloc_required) 946 if (da.nr_blocks)
923 gfs2_inplace_release(dip); 947 gfs2_inplace_release(dip);
924out_gunlock_q: 948out_gunlock_q:
925 if (alloc_required) 949 if (da.nr_blocks)
926 gfs2_quota_unlock(dip); 950 gfs2_quota_unlock(dip);
927out_gunlock: 951out_gunlock:
952 gfs2_dir_no_add(&da);
928 gfs2_glock_dq(ghs + 1); 953 gfs2_glock_dq(ghs + 1);
929out_child: 954out_child:
930 gfs2_glock_dq(ghs); 955 gfs2_glock_dq(ghs);
@@ -1254,7 +1279,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1254 struct gfs2_rgrpd *nrgd; 1279 struct gfs2_rgrpd *nrgd;
1255 unsigned int num_gh; 1280 unsigned int num_gh;
1256 int dir_rename = 0; 1281 int dir_rename = 0;
1257 int alloc_required = 0; 1282 struct gfs2_diradd da = { .nr_blocks = 0, };
1258 unsigned int x; 1283 unsigned int x;
1259 int error; 1284 int error;
1260 1285
@@ -1388,14 +1413,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1388 goto out_gunlock; 1413 goto out_gunlock;
1389 } 1414 }
1390 1415
1391 if (nip == NULL) 1416 if (nip == NULL) {
1392 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 1417 error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
1393 error = alloc_required; 1418 if (error)
1394 if (error < 0) 1419 goto out_gunlock;
1395 goto out_gunlock; 1420 }
1396 1421
1397 if (alloc_required) { 1422 if (da.nr_blocks) {
1398 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 1423 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
1399 error = gfs2_quota_lock_check(ndip); 1424 error = gfs2_quota_lock_check(ndip);
1400 if (error) 1425 if (error)
1401 goto out_gunlock; 1426 goto out_gunlock;
@@ -1404,10 +1429,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1404 if (error) 1429 if (error)
1405 goto out_gunlock_q; 1430 goto out_gunlock_q;
1406 1431
1407 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 1432 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
1408 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + 1433 4 * RES_LEAF + 4, 0);
1409 4 * RES_DINODE + 4 * RES_LEAF +
1410 RES_STATFS + RES_QUOTA + 4, 0);
1411 if (error) 1434 if (error)
1412 goto out_ipreserv; 1435 goto out_ipreserv;
1413 } else { 1436 } else {
@@ -1441,19 +1464,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1441 if (error) 1464 if (error)
1442 goto out_end_trans; 1465 goto out_end_trans;
1443 1466
1444 error = gfs2_dir_add(ndir, &ndentry->d_name, ip); 1467 error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
1445 if (error) 1468 if (error)
1446 goto out_end_trans; 1469 goto out_end_trans;
1447 1470
1448out_end_trans: 1471out_end_trans:
1449 gfs2_trans_end(sdp); 1472 gfs2_trans_end(sdp);
1450out_ipreserv: 1473out_ipreserv:
1451 if (alloc_required) 1474 if (da.nr_blocks)
1452 gfs2_inplace_release(ndip); 1475 gfs2_inplace_release(ndip);
1453out_gunlock_q: 1476out_gunlock_q:
1454 if (alloc_required) 1477 if (da.nr_blocks)
1455 gfs2_quota_unlock(ndip); 1478 gfs2_quota_unlock(ndip);
1456out_gunlock: 1479out_gunlock:
1480 gfs2_dir_no_add(&da);
1457 while (x--) { 1481 while (x--) {
1458 gfs2_glock_dq(ghs + x); 1482 gfs2_glock_dq(ghs + x);
1459 gfs2_holder_uninit(ghs + x); 1483 gfs2_holder_uninit(ghs + x);
@@ -1607,10 +1631,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1607 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) 1631 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
1608 ogid = ngid = NO_GID_QUOTA_CHANGE; 1632 ogid = ngid = NO_GID_QUOTA_CHANGE;
1609 1633
1610 error = gfs2_quota_lock(ip, nuid, ngid); 1634 error = get_write_access(inode);
1611 if (error) 1635 if (error)
1612 return error; 1636 return error;
1613 1637
1638 error = gfs2_rs_alloc(ip);
1639 if (error)
1640 goto out;
1641
1642 error = gfs2_rindex_update(sdp);
1643 if (error)
1644 goto out;
1645
1646 error = gfs2_quota_lock(ip, nuid, ngid);
1647 if (error)
1648 goto out;
1649
1614 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || 1650 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
1615 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { 1651 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
1616 error = gfs2_quota_check(ip, nuid, ngid); 1652 error = gfs2_quota_check(ip, nuid, ngid);
@@ -1637,6 +1673,8 @@ out_end_trans:
1637 gfs2_trans_end(sdp); 1673 gfs2_trans_end(sdp);
1638out_gunlock_q: 1674out_gunlock_q:
1639 gfs2_quota_unlock(ip); 1675 gfs2_quota_unlock(ip);
1676out:
1677 put_write_access(inode);
1640 return error; 1678 return error;
1641} 1679}
1642 1680
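gfs2_trans_da_blks() centralises a reservation that link_dinode(), gfs2_link() and gfs2_rename() previously each open-coded, and it scales the dinode term by how many inodes the operation touches. Worked through with illustrative numbers (the RES_* values and the rg_blocks() result below are placeholders, not the real GFS2 constants):

    #include <stdio.h>

    /* Placeholder reservation constants, for illustration only. */
    enum { RES_DINODE = 1, RES_QUOTA = 1, RES_STATFS = 1 };

    /* Stand-in for gfs2_rg_blocks(): rgrp bitmap blocks touched. */
    static unsigned rg_blocks(unsigned requested)
    {
        return requested ? 2 : 0;
    }

    static unsigned trans_da_blks(unsigned da_nr_blocks, unsigned nr_inodes)
    {
        return da_nr_blocks + rg_blocks(da_nr_blocks) +
               nr_inodes * RES_DINODE + RES_QUOTA + RES_STATFS;
    }

    int main(void)
    {
        /* link/create touch 2 inodes; a rename can involve 4 */
        printf("link:   %u blocks\n", trans_da_blks(3, 2)); /* 3+2+2+1+1 */
        printf("rename: %u blocks\n", trans_da_blks(3, 4)); /* 3+2+4+1+1 */
        return 0;
    }

Note also that da.nr_blocks doubles as the old alloc_required flag: zero means the cached dirent can be used with no allocation, while a non-zero value sizes both the in-place reservation and the transaction.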
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 610613fb65b5..9dcb9777a5f8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
551 struct buffer_head *bh = bd->bd_bh; 551 struct buffer_head *bh = bd->bd_bh;
552 struct gfs2_glock *gl = bd->bd_gl; 552 struct gfs2_glock *gl = bd->bd_gl;
553 553
554 gfs2_remove_from_ail(bd);
555 bd->bd_bh = NULL;
556 bh->b_private = NULL; 554 bh->b_private = NULL;
557 bd->bd_blkno = bh->b_blocknr; 555 bd->bd_blkno = bh->b_blocknr;
556 gfs2_remove_from_ail(bd); /* drops ref on bh */
557 bd->bd_bh = NULL;
558 bd->bd_ops = &gfs2_revoke_lops; 558 bd->bd_ops = &gfs2_revoke_lops;
559 sdp->sd_log_num_revoke++; 559 sdp->sd_log_num_revoke++;
560 atomic_inc(&gl->gl_revokes); 560 atomic_inc(&gl->gl_revokes);
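The gfs2_add_revoke() hunk is purely an ordering fix: per the new comment, gfs2_remove_from_ail() drops a reference on the buffer head, so bd_blkno must be copied out of bh->b_blocknr before that call rather than after. The general shape of the bug and its fix:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf { int refcount; long long blocknr; };

    static void put_buf(struct buf *b)    /* may free on the last ref */
    {
        if (--b->refcount == 0)
            free(b);
    }

    int main(void)
    {
        struct buf *bh = malloc(sizeof(*bh));
        long long saved;

        if (bh == NULL)
            return 1;
        bh->refcount = 1;
        bh->blocknr = 42;

        saved = bh->blocknr;   /* copy what we need first... */
        put_buf(bh);           /* ...then drop the ref that may free it */
        /* reading bh->blocknr past this point would be a use-after-free */

        printf("revoke recorded for block %lld\n", saved);
        return 0;
    }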
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 010b9fb9fec6..58f06400b7b8 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); 83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
84 clear_bit(GBF_FULL, &bi->bi_flags); 84 clear_bit(GBF_FULL, &bi->bi_flags);
85 rgd->rd_free_clone = rgd->rd_free; 85 rgd->rd_free_clone = rgd->rd_free;
86 rgd->rd_extfail_pt = rgd->rd_free;
86} 87}
87 88
88/** 89/**
@@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
588static void gfs2_meta_sync(struct gfs2_glock *gl) 589static void gfs2_meta_sync(struct gfs2_glock *gl)
589{ 590{
590 struct address_space *mapping = gfs2_glock2aspace(gl); 591 struct address_space *mapping = gfs2_glock2aspace(gl);
592 struct gfs2_sbd *sdp = gl->gl_sbd;
591 int error; 593 int error;
592 594
595 if (mapping == NULL)
596 mapping = &sdp->sd_aspace;
597
593 filemap_fdatawrite(mapping); 598 filemap_fdatawrite(mapping);
594 error = filemap_fdatawait(mapping); 599 error = filemap_fdatawait(mapping);
595 600
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0650db2541ef..c272e73063de 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
76 76
77 gfs2_str2qstr(&gfs2_qdot, "."); 77 gfs2_str2qstr(&gfs2_qdot, ".");
78 gfs2_str2qstr(&gfs2_qdotdot, ".."); 78 gfs2_str2qstr(&gfs2_qdotdot, "..");
79 gfs2_quota_hash_init();
79 80
80 error = gfs2_sys_init(); 81 error = gfs2_sys_init();
81 if (error) 82 if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 932415050540..c7f24690ed05 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
116 unsigned long index; 116 unsigned long index;
117 unsigned int bufnum; 117 unsigned int bufnum;
118 118
119 if (mapping == NULL)
120 mapping = &sdp->sd_aspace;
121
119 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; 122 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
120 index = blkno >> shift; /* convert block to page */ 123 index = blkno >> shift; /* convert block to page */
121 bufnum = blkno - (index << shift); /* block buf index within page */ 124 bufnum = blkno - (index << shift); /* block buf index within page */
@@ -258,6 +261,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
258 struct address_space *mapping = bh->b_page->mapping; 261 struct address_space *mapping = bh->b_page->mapping;
259 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); 262 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
260 struct gfs2_bufdata *bd = bh->b_private; 263 struct gfs2_bufdata *bd = bh->b_private;
264 int was_pinned = 0;
261 265
262 if (test_clear_buffer_pinned(bh)) { 266 if (test_clear_buffer_pinned(bh)) {
263 trace_gfs2_pin(bd, 0); 267 trace_gfs2_pin(bd, 0);
@@ -273,12 +277,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
273 tr->tr_num_databuf_rm++; 277 tr->tr_num_databuf_rm++;
274 } 278 }
275 tr->tr_touched = 1; 279 tr->tr_touched = 1;
280 was_pinned = 1;
276 brelse(bh); 281 brelse(bh);
277 } 282 }
278 if (bd) { 283 if (bd) {
279 spin_lock(&sdp->sd_ail_lock); 284 spin_lock(&sdp->sd_ail_lock);
280 if (bd->bd_tr) { 285 if (bd->bd_tr) {
281 gfs2_trans_add_revoke(sdp, bd); 286 gfs2_trans_add_revoke(sdp, bd);
287 } else if (was_pinned) {
288 bh->b_private = NULL;
289 kmem_cache_free(gfs2_bufdata_cachep, bd);
282 } 290 }
283 spin_unlock(&sdp->sd_ail_lock); 291 spin_unlock(&sdp->sd_ail_lock);
284 } 292 }
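Both gfs2_meta_sync() in lops.c and gfs2_getbuf() here now substitute the superblock-wide sd_aspace whenever a glock has no private address space, which after the glops.c change means every rgrp glock. The pattern is a plain default at each entry point; a sketch with hypothetical names:

    #include <stdio.h>

    struct mapping { const char *name; };

    static struct mapping sb_wide = { "sd_aspace" };

    /* Hypothetical accessor: NULL for glocks without a private mapping. */
    static struct mapping *glock2aspace(int has_private)
    {
        static struct mapping priv = { "per-glock aspace" };
        return has_private ? &priv : NULL;
    }

    static void meta_sync(int has_private)
    {
        struct mapping *m = glock2aspace(has_private);

        if (m == NULL)
            m = &sb_wide;      /* fall back to the shared mapping */
        printf("syncing via %s\n", m->name);
    }

    int main(void)
    {
        meta_sync(1);          /* inode glock: private mapping */
        meta_sync(0);          /* rgrp glock: shared sd_aspace */
        return 0;
    }

The was_pinned change in gfs2_remove_from_journal() is independent: a pinned buffer with no transaction would previously leak its gfs2_bufdata; now the bufdata is freed once it is known that no revoke is needed.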
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 82303b474958..1e712b566d76 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -36,6 +36,7 @@
36#include "log.h" 36#include "log.h"
37#include "quota.h" 37#include "quota.h"
38#include "dir.h" 38#include "dir.h"
39#include "meta_io.h"
39#include "trace_gfs2.h" 40#include "trace_gfs2.h"
40 41
41#define DO 0 42#define DO 0
@@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
62static struct gfs2_sbd *init_sbd(struct super_block *sb) 63static struct gfs2_sbd *init_sbd(struct super_block *sb)
63{ 64{
64 struct gfs2_sbd *sdp; 65 struct gfs2_sbd *sdp;
66 struct address_space *mapping;
65 67
66 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); 68 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
67 if (!sdp) 69 if (!sdp)
@@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
97 init_waitqueue_head(&sdp->sd_quota_wait); 99 init_waitqueue_head(&sdp->sd_quota_wait);
98 INIT_LIST_HEAD(&sdp->sd_trunc_list); 100 INIT_LIST_HEAD(&sdp->sd_trunc_list);
99 spin_lock_init(&sdp->sd_trunc_lock); 101 spin_lock_init(&sdp->sd_trunc_lock);
102 spin_lock_init(&sdp->sd_bitmap_lock);
103
104 mapping = &sdp->sd_aspace;
105
106 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops;
108 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS);
111 mapping->private_data = NULL;
112 mapping->backing_dev_info = sb->s_bdi;
113 mapping->writeback_index = 0;
100 114
101 spin_lock_init(&sdp->sd_log_lock); 115 spin_lock_init(&sdp->sd_log_lock);
102 atomic_set(&sdp->sd_log_pinned, 0); 116 atomic_set(&sdp->sd_log_pinned, 0);
@@ -217,7 +231,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
217 231
218 page = alloc_page(GFP_NOFS); 232 page = alloc_page(GFP_NOFS);
219 if (unlikely(!page)) 233 if (unlikely(!page))
220 return -ENOBUFS; 234 return -ENOMEM;
221 235
222 ClearPageUptodate(page); 236 ClearPageUptodate(page);
223 ClearPageDirty(page); 237 ClearPageDirty(page);
@@ -956,40 +970,6 @@ fail:
956 return error; 970 return error;
957} 971}
958 972
959static int init_threads(struct gfs2_sbd *sdp, int undo)
960{
961 struct task_struct *p;
962 int error = 0;
963
964 if (undo)
965 goto fail_quotad;
966
967 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
968 if (IS_ERR(p)) {
969 error = PTR_ERR(p);
970 fs_err(sdp, "can't start logd thread: %d\n", error);
971 return error;
972 }
973 sdp->sd_logd_process = p;
974
975 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
976 if (IS_ERR(p)) {
977 error = PTR_ERR(p);
978 fs_err(sdp, "can't start quotad thread: %d\n", error);
979 goto fail;
980 }
981 sdp->sd_quotad_process = p;
982
983 return 0;
984
985
986fail_quotad:
987 kthread_stop(sdp->sd_quotad_process);
988fail:
989 kthread_stop(sdp->sd_logd_process);
990 return error;
991}
992
993static const match_table_t nolock_tokens = { 973static const match_table_t nolock_tokens = {
994 { Opt_jid, "jid=%d\n", }, 974 { Opt_jid, "jid=%d\n", },
995 { Opt_err, NULL }, 975 { Opt_err, NULL },
@@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1254 goto fail_per_node; 1234 goto fail_per_node;
1255 } 1235 }
1256 1236
1257 error = init_threads(sdp, DO);
1258 if (error)
1259 goto fail_per_node;
1260
1261 if (!(sb->s_flags & MS_RDONLY)) { 1237 if (!(sb->s_flags & MS_RDONLY)) {
1262 error = gfs2_make_fs_rw(sdp); 1238 error = gfs2_make_fs_rw(sdp);
1263 if (error) { 1239 if (error) {
1264 fs_err(sdp, "can't make FS RW: %d\n", error); 1240 fs_err(sdp, "can't make FS RW: %d\n", error);
1265 goto fail_threads; 1241 goto fail_per_node;
1266 } 1242 }
1267 } 1243 }
1268 1244
@@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1270 gfs2_online_uevent(sdp); 1246 gfs2_online_uevent(sdp);
1271 return 0; 1247 return 0;
1272 1248
1273fail_threads:
1274 init_threads(sdp, UNDO);
1275fail_per_node: 1249fail_per_node:
1276 init_per_node(sdp, UNDO); 1250 init_per_node(sdp, UNDO);
1277fail_inodes: 1251fail_inodes:
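init_threads() leaves fill_super() here along with its fail_threads unwind label; the logd/quotad start-up is relocated rather than deleted (in GFS2 of this era it ends up in super.c, called from gfs2_make_fs_rw(), though that hunk falls outside this section). Its structure was the standard start-in-order, stop-in-reverse ladder:

    #include <stdio.h>

    /* Hypothetical handles standing in for the logd/quotad kthreads. */
    static int start(const char *name) { printf("start %s\n", name); return 0; }
    static void stop(const char *name) { printf("stop %s\n", name); }

    static int init_threads(void)
    {
        int err;

        err = start("gfs2_logd");
        if (err)
            return err;
        err = start("gfs2_quotad");
        if (err)
            goto fail_logd;       /* unwind in reverse order of start-up */
        return 0;

    fail_logd:
        stop("gfs2_logd");
        return err;
    }

    int main(void) { return init_threads(); }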
@@ -1366,8 +1340,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1366 if (IS_ERR(s)) 1340 if (IS_ERR(s))
1367 goto error_bdev; 1341 goto error_bdev;
1368 1342
1369 if (s->s_root) 1343 if (s->s_root) {
1344 /*
1345 * s_umount nests inside bd_mutex during
1346 * __invalidate_device(). blkdev_put() acquires
1347 * bd_mutex and can't be called under s_umount. Drop
1348 * s_umount temporarily. This is safe as we're
1349 * holding an active reference.
1350 */
1351 up_write(&s->s_umount);
1370 blkdev_put(bdev, mode); 1352 blkdev_put(bdev, mode);
1353 down_write(&s->s_umount);
1354 }
1371 1355
1372 memset(&args, 0, sizeof(args)); 1356 memset(&args, 0, sizeof(args));
1373 args.ar_quota = GFS2_QUOTA_DEFAULT; 1357 args.ar_quota = GFS2_QUOTA_DEFAULT;
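The comment added in gfs2_mount() records a lock-ordering constraint: elsewhere s_umount nests inside bd_mutex, so calling blkdev_put() (which takes bd_mutex) while holding s_umount would invert the order and risk deadlock. Because an active superblock reference is held, s_umount can safely be dropped around the call and retaken. The manoeuvre in pthread form (simplified; the reference that makes the drop safe appears only as a comment):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t s_umount = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t bd_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Established order elsewhere: bd_mutex -> s_umount. Taking bd_mutex
     * while holding s_umount would invert that, so release s_umount first. */
    static void put_blockdev_safely(void)
    {
        pthread_mutex_unlock(&s_umount); /* safe: object pinned by a ref */
        pthread_mutex_lock(&bd_mutex);
        puts("blkdev_put work");
        pthread_mutex_unlock(&bd_mutex);
        pthread_mutex_lock(&s_umount);   /* reacquire before continuing */
    }

    int main(void)
    {
        pthread_mutex_lock(&s_umount);
        put_blockdev_safely();
        pthread_mutex_unlock(&s_umount);
        return 0;
    }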
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 98236d0df3ca..8bec0e3192dd 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -52,6 +52,11 @@
52#include <linux/dqblk_xfs.h> 52#include <linux/dqblk_xfs.h>
53#include <linux/lockref.h> 53#include <linux/lockref.h>
54#include <linux/list_lru.h> 54#include <linux/list_lru.h>
55#include <linux/rcupdate.h>
56#include <linux/rculist_bl.h>
57#include <linux/bit_spinlock.h>
58#include <linux/jhash.h>
59#include <linux/vmalloc.h>
55 60
56#include "gfs2.h" 61#include "gfs2.h"
57#include "incore.h" 62#include "incore.h"
@@ -67,16 +72,44 @@
67#include "inode.h" 72#include "inode.h"
68#include "util.h" 73#include "util.h"
69 74
70struct gfs2_quota_change_host { 75#define GFS2_QD_HASH_SHIFT 12
71 u64 qc_change; 76#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT)
72 u32 qc_flags; /* GFS2_QCF_... */ 77#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
73 struct kqid qc_id;
74};
75 78
76/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ 79/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
80/* -> sd_bitmap_lock */
77static DEFINE_SPINLOCK(qd_lock); 81static DEFINE_SPINLOCK(qd_lock);
78struct list_lru gfs2_qd_lru; 82struct list_lru gfs2_qd_lru;
79 83
84static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
85
86static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
87 const struct kqid qid)
88{
89 unsigned int h;
90
91 h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
92 h = jhash(&qid, sizeof(struct kqid), h);
93
94 return h & GFS2_QD_HASH_MASK;
95}
96
97static inline void spin_lock_bucket(unsigned int hash)
98{
99 hlist_bl_lock(&qd_hash_table[hash]);
100}
101
102static inline void spin_unlock_bucket(unsigned int hash)
103{
104 hlist_bl_unlock(&qd_hash_table[hash]);
105}
106
107static void gfs2_qd_dealloc(struct rcu_head *rcu)
108{
109 struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
110 kmem_cache_free(gfs2_quotad_cachep, qd);
111}
112
80static void gfs2_qd_dispose(struct list_head *list) 113static void gfs2_qd_dispose(struct list_head *list)
81{ 114{
82 struct gfs2_quota_data *qd; 115 struct gfs2_quota_data *qd;
@@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list)
93 list_del(&qd->qd_list); 126 list_del(&qd->qd_list);
94 spin_unlock(&qd_lock); 127 spin_unlock(&qd_lock);
95 128
129 spin_lock_bucket(qd->qd_hash);
130 hlist_bl_del_rcu(&qd->qd_hlist);
131 spin_unlock_bucket(qd->qd_hash);
132
96 gfs2_assert_warn(sdp, !qd->qd_change); 133 gfs2_assert_warn(sdp, !qd->qd_change);
97 gfs2_assert_warn(sdp, !qd->qd_slot_count); 134 gfs2_assert_warn(sdp, !qd->qd_slot_count);
98 gfs2_assert_warn(sdp, !qd->qd_bh_count); 135 gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list)
101 atomic_dec(&sdp->sd_quota_count); 138 atomic_dec(&sdp->sd_quota_count);
102 139
103 /* Delete it from the common reclaim list */ 140 /* Delete it from the common reclaim list */
104 kmem_cache_free(gfs2_quotad_cachep, qd); 141 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
105 } 142 }
106} 143}
107 144
@@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
171 return offset; 208 return offset;
172} 209}
173 210
174static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, 211static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
175 struct gfs2_quota_data **qdp)
176{ 212{
177 struct gfs2_quota_data *qd; 213 struct gfs2_quota_data *qd;
178 int error; 214 int error;
179 215
180 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); 216 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
181 if (!qd) 217 if (!qd)
182 return -ENOMEM; 218 return NULL;
183 219
220 qd->qd_sbd = sdp;
184 qd->qd_lockref.count = 1; 221 qd->qd_lockref.count = 1;
185 spin_lock_init(&qd->qd_lockref.lock); 222 spin_lock_init(&qd->qd_lockref.lock);
186 qd->qd_id = qid; 223 qd->qd_id = qid;
187 qd->qd_slot = -1; 224 qd->qd_slot = -1;
188 INIT_LIST_HEAD(&qd->qd_lru); 225 INIT_LIST_HEAD(&qd->qd_lru);
226 qd->qd_hash = hash;
189 227
190 error = gfs2_glock_get(sdp, qd2index(qd), 228 error = gfs2_glock_get(sdp, qd2index(qd),
191 &gfs2_quota_glops, CREATE, &qd->qd_gl); 229 &gfs2_quota_glops, CREATE, &qd->qd_gl);
192 if (error) 230 if (error)
193 goto fail; 231 goto fail;
194 232
195 *qdp = qd; 233 return qd;
196
197 return 0;
198 234
199fail: 235fail:
200 kmem_cache_free(gfs2_quotad_cachep, qd); 236 kmem_cache_free(gfs2_quotad_cachep, qd);
201 return error; 237 return NULL;
202} 238}
203 239
204static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, 240static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
205 struct gfs2_quota_data **qdp) 241 const struct gfs2_sbd *sdp,
242 struct kqid qid)
206{ 243{
207 struct gfs2_quota_data *qd = NULL, *new_qd = NULL; 244 struct gfs2_quota_data *qd;
208 int error, found; 245 struct hlist_bl_node *h;
209
210 *qdp = NULL;
211 246
212 for (;;) { 247 hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
213 found = 0; 248 if (!qid_eq(qd->qd_id, qid))
214 spin_lock(&qd_lock); 249 continue;
215 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 250 if (qd->qd_sbd != sdp)
216 if (qid_eq(qd->qd_id, qid) && 251 continue;
217 lockref_get_not_dead(&qd->qd_lockref)) { 252 if (lockref_get_not_dead(&qd->qd_lockref)) {
218 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 253 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
219 found = 1; 254 return qd;
220 break;
221 }
222 } 255 }
256 }
223 257
224 if (!found) 258 return NULL;
225 qd = NULL; 259}
226 260
227 if (!qd && new_qd) {
228 qd = new_qd;
229 list_add(&qd->qd_list, &sdp->sd_quota_list);
230 atomic_inc(&sdp->sd_quota_count);
231 new_qd = NULL;
232 }
233 261
234 spin_unlock(&qd_lock); 262static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
263 struct gfs2_quota_data **qdp)
264{
265 struct gfs2_quota_data *qd, *new_qd;
266 unsigned int hash = gfs2_qd_hash(sdp, qid);
235 267
236 if (qd) { 268 rcu_read_lock();
237 if (new_qd) { 269 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
238 gfs2_glock_put(new_qd->qd_gl); 270 rcu_read_unlock();
239 kmem_cache_free(gfs2_quotad_cachep, new_qd);
240 }
241 *qdp = qd;
242 return 0;
243 }
244 271
245 error = qd_alloc(sdp, qid, &new_qd); 272 if (qd)
246 if (error) 273 return 0;
247 return error; 274
275 new_qd = qd_alloc(hash, sdp, qid);
276 if (!new_qd)
277 return -ENOMEM;
278
279 spin_lock(&qd_lock);
280 spin_lock_bucket(hash);
281 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
282 if (qd == NULL) {
283 *qdp = new_qd;
284 list_add(&new_qd->qd_list, &sdp->sd_quota_list);
285 hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
286 atomic_inc(&sdp->sd_quota_count);
248 } 287 }
288 spin_unlock_bucket(hash);
289 spin_unlock(&qd_lock);
290
291 if (qd) {
292 gfs2_glock_put(new_qd->qd_gl);
293 kmem_cache_free(gfs2_quotad_cachep, new_qd);
294 }
295
296 return 0;
249} 297}
250 298
299
251static void qd_hold(struct gfs2_quota_data *qd) 300static void qd_hold(struct gfs2_quota_data *qd)
252{ 301{
253 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 302 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
@@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd)
268 317
269static int slot_get(struct gfs2_quota_data *qd) 318static int slot_get(struct gfs2_quota_data *qd)
270{ 319{
271 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 320 struct gfs2_sbd *sdp = qd->qd_sbd;
272 unsigned int c, o = 0, b; 321 unsigned int bit;
273 unsigned char byte = 0; 322 int error = 0;
274 323
275 spin_lock(&qd_lock); 324 spin_lock(&sdp->sd_bitmap_lock);
325 if (qd->qd_slot_count != 0)
326 goto out;
276 327
277 if (qd->qd_slot_count++) { 328 error = -ENOSPC;
278 spin_unlock(&qd_lock); 329 bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
279 return 0; 330 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit;
333out:
334 qd->qd_slot_count++;
280 } 335 }
336 spin_unlock(&sdp->sd_bitmap_lock);
281 337
282 for (c = 0; c < sdp->sd_quota_chunks; c++) 338 return error;
283 for (o = 0; o < PAGE_SIZE; o++) {
284 byte = sdp->sd_quota_bitmap[c][o];
285 if (byte != 0xFF)
286 goto found;
287 }
288
289 goto fail;
290
291found:
292 for (b = 0; b < 8; b++)
293 if (!(byte & (1 << b)))
294 break;
295 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
296
297 if (qd->qd_slot >= sdp->sd_quota_slots)
298 goto fail;
299
300 sdp->sd_quota_bitmap[c][o] |= 1 << b;
301
302 spin_unlock(&qd_lock);
303
304 return 0;
305
306fail:
307 qd->qd_slot_count--;
308 spin_unlock(&qd_lock);
309 return -ENOSPC;
310} 339}
311 340
312static void slot_hold(struct gfs2_quota_data *qd) 341static void slot_hold(struct gfs2_quota_data *qd)
313{ 342{
314 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 343 struct gfs2_sbd *sdp = qd->qd_sbd;
315 344
316 spin_lock(&qd_lock); 345 spin_lock(&sdp->sd_bitmap_lock);
317 gfs2_assert(sdp, qd->qd_slot_count); 346 gfs2_assert(sdp, qd->qd_slot_count);
318 qd->qd_slot_count++; 347 qd->qd_slot_count++;
319 spin_unlock(&qd_lock); 348 spin_unlock(&sdp->sd_bitmap_lock);
320}
321
322static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
323 unsigned int bit, int new_value)
324{
325 unsigned int c, o, b = bit;
326 int old_value;
327
328 c = b / (8 * PAGE_SIZE);
329 b %= 8 * PAGE_SIZE;
330 o = b / 8;
331 b %= 8;
332
333 old_value = (bitmap[c][o] & (1 << b));
334 gfs2_assert_withdraw(sdp, !old_value != !new_value);
335
336 if (new_value)
337 bitmap[c][o] |= 1 << b;
338 else
339 bitmap[c][o] &= ~(1 << b);
340} 349}
341 350
342static void slot_put(struct gfs2_quota_data *qd) 351static void slot_put(struct gfs2_quota_data *qd)
343{ 352{
344 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 353 struct gfs2_sbd *sdp = qd->qd_sbd;
345 354
346 spin_lock(&qd_lock); 355 spin_lock(&sdp->sd_bitmap_lock);
347 gfs2_assert(sdp, qd->qd_slot_count); 356 gfs2_assert(sdp, qd->qd_slot_count);
348 if (!--qd->qd_slot_count) { 357 if (!--qd->qd_slot_count) {
349 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); 358 BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
350 qd->qd_slot = -1; 359 qd->qd_slot = -1;
351 } 360 }
352 spin_unlock(&qd_lock); 361 spin_unlock(&sdp->sd_bitmap_lock);
353} 362}
354 363
355static int bh_get(struct gfs2_quota_data *qd) 364static int bh_get(struct gfs2_quota_data *qd)
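The slot allocator above shrinks from a hand-rolled scan over a two-level array of bytes to a flat bitmap searched with find_first_zero_bit(), all under the new dedicated sd_bitmap_lock instead of the global qd_lock. A user-space model of the same slot_get()/slot_put() logic (a linear scan stands in for find_first_zero_bit(); the lock is elided):

    #include <stdio.h>

    #define SLOTS 64
    #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

    static unsigned long bitmap[(SLOTS + BITS_PER_LONG - 1) / BITS_PER_LONG];

    /* Linear scan standing in for find_first_zero_bit(). */
    static int first_zero_bit(void)
    {
        for (int b = 0; b < SLOTS; b++)
            if (!(bitmap[b / BITS_PER_LONG] & (1UL << (b % BITS_PER_LONG))))
                return b;
        return SLOTS;             /* "not found", like the kernel helper */
    }

    static int slot_get(int *slot)
    {
        int bit = first_zero_bit();

        if (bit >= SLOTS)
            return -1;            /* -ENOSPC */
        bitmap[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
        *slot = bit;
        return 0;
    }

    static void slot_put(int slot)
    {
        bitmap[slot / BITS_PER_LONG] &= ~(1UL << (slot % BITS_PER_LONG));
    }

    int main(void)
    {
        int s;

        if (slot_get(&s) == 0) {
            printf("got slot %d\n", s);
            slot_put(s);
        }
        return 0;
    }

Note how the new slot_get() keys off qd_slot_count: a qd that already owns a slot just bumps the count, so the bitmap search runs only for the first reference.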
@@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
427 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 436 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
428 set_bit(QDF_LOCKED, &qd->qd_flags); 437 set_bit(QDF_LOCKED, &qd->qd_flags);
429 qd->qd_change_sync = qd->qd_change; 438 qd->qd_change_sync = qd->qd_change;
430 gfs2_assert_warn(sdp, qd->qd_slot_count); 439 slot_hold(qd);
431 qd->qd_slot_count++;
432 return 1; 440 return 1;
433} 441}
434 442
@@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
1214 return error; 1222 return error;
1215} 1223}
1216 1224
1217static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
1218{
1219 const struct gfs2_quota_change *str = buf;
1220
1221 qc->qc_change = be64_to_cpu(str->qc_change);
1222 qc->qc_flags = be32_to_cpu(str->qc_flags);
1223 qc->qc_id = make_kqid(&init_user_ns,
1224 (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
1225 be32_to_cpu(str->qc_id));
1226}
1227
1228int gfs2_quota_init(struct gfs2_sbd *sdp) 1225int gfs2_quota_init(struct gfs2_sbd *sdp)
1229{ 1226{
1230 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1227 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
@@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1232 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; 1229 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1233 unsigned int x, slot = 0; 1230 unsigned int x, slot = 0;
1234 unsigned int found = 0; 1231 unsigned int found = 0;
1232 unsigned int hash;
1233 unsigned int bm_size;
1235 u64 dblock; 1234 u64 dblock;
1236 u32 extlen = 0; 1235 u32 extlen = 0;
1237 int error; 1236 int error;
@@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1240 return -EIO; 1239 return -EIO;
1241 1240
1242 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1241 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1243 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1244 1243 bm_size *= sizeof(unsigned long);
1245 error = -ENOMEM; 1244 error = -ENOMEM;
1246 1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN);
1247 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, 1246 if (sdp->sd_quota_bitmap == NULL)
1248 sizeof(unsigned char *), GFP_NOFS); 1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL);
1249 if (!sdp->sd_quota_bitmap) 1248 if (!sdp->sd_quota_bitmap)
1250 return error; 1249 return error;
1251 1250
1252 for (x = 0; x < sdp->sd_quota_chunks; x++) { 1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1253 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
1254 if (!sdp->sd_quota_bitmap[x])
1255 goto fail;
1256 }
1257 1252
1258 for (x = 0; x < blocks; x++) { 1253 for (x = 0; x < blocks; x++) {
1259 struct buffer_head *bh; 1254 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc;
1260 unsigned int y; 1256 unsigned int y;
1261 1257
1262 if (!extlen) { 1258 if (!extlen) {
@@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1274 goto fail; 1270 goto fail;
1275 } 1271 }
1276 1272
1273 qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
1277 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; 1274 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1278 y++, slot++) { 1275 y++, slot++) {
1279 struct gfs2_quota_change_host qc;
1280 struct gfs2_quota_data *qd; 1276 struct gfs2_quota_data *qd;
1281 1277 s64 qc_change = be64_to_cpu(qc->qc_change);
1282 gfs2_quota_change_in(&qc, bh->b_data + 1278 u32 qc_flags = be32_to_cpu(qc->qc_flags);
1283 sizeof(struct gfs2_meta_header) + 1279 enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
1284 y * sizeof(struct gfs2_quota_change)); 1280 USRQUOTA : GRPQUOTA;
1285 if (!qc.qc_change) 1281 struct kqid qc_id = make_kqid(&init_user_ns, qtype,
1282 be32_to_cpu(qc->qc_id));
1283 qc++;
1284 if (!qc_change)
1286 continue; 1285 continue;
1287 1286
1288 error = qd_alloc(sdp, qc.qc_id, &qd); 1287 hash = gfs2_qd_hash(sdp, qc_id);
1289 if (error) { 1288 qd = qd_alloc(hash, sdp, qc_id);
1289 if (qd == NULL) {
1290 brelse(bh); 1290 brelse(bh);
1291 goto fail; 1291 goto fail;
1292 } 1292 }
1293 1293
1294 set_bit(QDF_CHANGE, &qd->qd_flags); 1294 set_bit(QDF_CHANGE, &qd->qd_flags);
1295 qd->qd_change = qc.qc_change; 1295 qd->qd_change = qc_change;
1296 qd->qd_slot = slot; 1296 qd->qd_slot = slot;
1297 qd->qd_slot_count = 1; 1297 qd->qd_slot_count = 1;
1298 1298
1299 spin_lock(&qd_lock); 1299 spin_lock(&qd_lock);
1300 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); 1300 BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
1301 list_add(&qd->qd_list, &sdp->sd_quota_list); 1301 list_add(&qd->qd_list, &sdp->sd_quota_list);
1302 atomic_inc(&sdp->sd_quota_count); 1302 atomic_inc(&sdp->sd_quota_count);
1303 spin_unlock(&qd_lock); 1303 spin_unlock(&qd_lock);
1304 1304
1305 spin_lock_bucket(hash);
1306 hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
1307 spin_unlock_bucket(hash);
1308
1305 found++; 1309 found++;
1306 } 1310 }
1307 1311
@@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1324{ 1328{
1325 struct list_head *head = &sdp->sd_quota_list; 1329 struct list_head *head = &sdp->sd_quota_list;
1326 struct gfs2_quota_data *qd; 1330 struct gfs2_quota_data *qd;
1327 unsigned int x;
1328 1331
1329 spin_lock(&qd_lock); 1332 spin_lock(&qd_lock);
1330 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1331 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); 1334 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1332 1335
1333 /*
1334 * To be removed in due course... we should be able to
1335 * ensure that all refs to the qd have done by this point
1336 * so that this rather odd test is not required
1337 */
1338 spin_lock(&qd->qd_lockref.lock);
1339 if (qd->qd_lockref.count > 1 ||
1340 (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1341 spin_unlock(&qd->qd_lockref.lock);
1342 list_move(&qd->qd_list, head);
1343 spin_unlock(&qd_lock);
1344 schedule();
1345 spin_lock(&qd_lock);
1346 continue;
1347 }
1348 spin_unlock(&qd->qd_lockref.lock);
1349
1350 list_del(&qd->qd_list); 1336 list_del(&qd->qd_list);
1337
1351 /* Also remove if this qd exists in the reclaim list */ 1338 /* Also remove if this qd exists in the reclaim list */
1352 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 1339 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
1353 atomic_dec(&sdp->sd_quota_count); 1340 atomic_dec(&sdp->sd_quota_count);
1354 spin_unlock(&qd_lock); 1341 spin_unlock(&qd_lock);
1355 1342
1356 if (!qd->qd_lockref.count) { 1343 spin_lock_bucket(qd->qd_hash);
1357 gfs2_assert_warn(sdp, !qd->qd_change); 1344 hlist_bl_del_rcu(&qd->qd_hlist);
1358 gfs2_assert_warn(sdp, !qd->qd_slot_count); 1345 spin_unlock_bucket(qd->qd_hash);
1359 } else 1346
1360 gfs2_assert_warn(sdp, qd->qd_slot_count == 1); 1347 gfs2_assert_warn(sdp, !qd->qd_change);
1348 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1361 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1349 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1362 1350
1363 gfs2_glock_put(qd->qd_gl); 1351 gfs2_glock_put(qd->qd_gl);
1364 kmem_cache_free(gfs2_quotad_cachep, qd); 1352 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
1365 1353
1366 spin_lock(&qd_lock); 1354 spin_lock(&qd_lock);
1367 } 1355 }
@@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1370 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); 1358 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1371 1359
1372 if (sdp->sd_quota_bitmap) { 1360 if (sdp->sd_quota_bitmap) {
1373 for (x = 0; x < sdp->sd_quota_chunks; x++) 1361 if (is_vmalloc_addr(sdp->sd_quota_bitmap))
1374 kfree(sdp->sd_quota_bitmap[x]); 1362 vfree(sdp->sd_quota_bitmap);
1375 kfree(sdp->sd_quota_bitmap); 1363 else
1364 kfree(sdp->sd_quota_bitmap);
1365 sdp->sd_quota_bitmap = NULL;
1376 } 1366 }
1377} 1367}
1378 1368
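Because gfs2_quota_init() now allocates the slot bitmap with kmalloc and falls back to vmalloc for large filesystems, the cleanup path has to pick the matching free routine at runtime. is_vmalloc_addr() is the standard test for such dual-origin buffers; as a standalone sketch (later kernels provide kvfree() for exactly this case):

	/* Sketch: free a buffer that may have come from kmalloc or vmalloc. */
	static void quota_bitmap_free(void *ptr)
	{
		if (is_vmalloc_addr(ptr))
			vfree(ptr);
		else
			kfree(ptr);
	}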
@@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1656 .get_dqblk = gfs2_get_dqblk, 1646 .get_dqblk = gfs2_get_dqblk,
1657 .set_dqblk = gfs2_set_dqblk, 1647 .set_dqblk = gfs2_set_dqblk,
1658}; 1648};
1649
1650void __init gfs2_quota_hash_init(void)
1651{
1652 unsigned i;
1653
1654 for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
1655 INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
1656}
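The new qd_hash_table pairs RCU lookups with per-bucket bit spinlocks: readers walk a bucket under rcu_read_lock(), while writers, as in the hunks above, take spin_lock_bucket() around hlist_bl_add_head_rcu()/hlist_bl_del_rcu(). A hedged sketch of the matching lookup side, with the field names (qd_hlist, qd_id, qd_sbd) assumed from the same patch series:

	static struct gfs2_quota_data *qd_lookup_bucket(unsigned int hash,
							const struct gfs2_sbd *sdp,
							struct kqid qid)
	{
		struct gfs2_quota_data *qd;
		struct hlist_bl_node *h;

		/* caller holds rcu_read_lock() */
		hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
			if (!qid_eq(qd->qd_id, qid))
				continue;
			if (qd->qd_sbd != sdp)
				continue;
			return qd;	/* caller must still take a reference */
		}
		return NULL;
	}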
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 96e4f34a03b0..55d506eb3c4a 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
57extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
58extern struct shrinker gfs2_qd_shrinker; 58extern struct shrinker gfs2_qd_shrinker;
59extern struct list_lru gfs2_qd_lru; 59extern struct list_lru gfs2_qd_lru;
60extern void __init gfs2_quota_hash_init(void);
60 61
61#endif /* __QUOTA_DOT_H__ */ 62#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c8d6161bd682..a1da21349235 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -57,6 +57,11 @@
57 * 3 = Used (metadata) 57 * 3 = Used (metadata)
58 */ 58 */
59 59
60struct gfs2_extent {
61 struct gfs2_rbm rbm;
62 u32 len;
63};
64
60static const char valid_change[16] = { 65static const char valid_change[16] = {
61 /* current */ 66 /* current */
62 /* n */ 0, 1, 1, 1, 67 /* n */ 0, 1, 1, 1,
@@ -65,8 +70,9 @@ static const char valid_change[16] = {
65 1, 0, 0, 0 70 1, 0, 0, 0
66}; 71};
67 72
68static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 73static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
69 const struct gfs2_inode *ip, bool nowrap); 74 const struct gfs2_inode *ip, bool nowrap,
75 const struct gfs2_alloc_parms *ap);
70 76
71 77
72/** 78/**
@@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
635 /* return reserved blocks to the rgrp */ 641 /* return reserved blocks to the rgrp */
636 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); 642 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
637 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; 643 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
644 /* The rgrp extent failure point is likely not to increase;
645 it will only do so if the freed blocks are somehow
646 contiguous with a span of free blocks that follows. Still,
647 it will force the number to be recalculated later. */
648 rgd->rd_extfail_pt += rs->rs_free;
638 rs->rs_free = 0; 649 rs->rs_free = 0;
639 clear_bit(GBF_FULL, &bi->bi_flags); 650 clear_bit(GBF_FULL, &bi->bi_flags);
640 smp_mb__after_clear_bit();
641 } 651 }
642} 652}
643 653
@@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
876static int read_rindex_entry(struct gfs2_inode *ip) 886static int read_rindex_entry(struct gfs2_inode *ip)
877{ 887{
878 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 888 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
889 const unsigned bsize = sdp->sd_sb.sb_bsize;
879 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); 890 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
880 struct gfs2_rindex buf; 891 struct gfs2_rindex buf;
881 int error; 892 int error;
@@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
913 goto fail; 924 goto fail;
914 925
915 rgd->rd_gl->gl_object = rgd; 926 rgd->rd_gl->gl_object = rgd;
927 rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
928 rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
916 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; 929 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
917 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 930 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
918 if (rgd->rd_data > sdp->sd_max_rg_data) 931 if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1126 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 1139 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1127 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); 1140 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1128 rgd->rd_free_clone = rgd->rd_free; 1141 rgd->rd_free_clone = rgd->rd_free;
1142 /* max out the rgrp allocation failure point */
1143 rgd->rd_extfail_pt = rgd->rd_free;
1129 } 1144 }
1130 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { 1145 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1131 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); 1146 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
@@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1184 1199
1185 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) 1200 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1186 return 0; 1201 return 0;
1187 return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); 1202 return gfs2_rgrp_bh_get(rgd);
1188} 1203}
1189 1204
1190/** 1205/**
@@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1455 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) 1470 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1456 return; 1471 return;
1457 1472
1458 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); 1473 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
1459 if (ret == 0) { 1474 if (ret == 0) {
1460 rs->rs_rbm = rbm; 1475 rs->rs_rbm = rbm;
1461 rs->rs_free = extlen; 1476 rs->rs_free = extlen;
@@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1520 * @rbm: The current position in the resource group 1535 * @rbm: The current position in the resource group
1521 * @ip: The inode for which we are searching for blocks 1536 * @ip: The inode for which we are searching for blocks
1522 * @minext: The minimum extent length 1537 * @minext: The minimum extent length
1538 * @maxext: A pointer to the maximum extent structure
1523 * 1539 *
1524 * This checks the current position in the rgrp to see whether there is 1540 * This checks the current position in the rgrp to see whether there is
1525 * a reservation covering this block. If not then this function is a 1541 * a reservation covering this block. If not then this function is a
@@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1532 1548
1533static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, 1549static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1534 const struct gfs2_inode *ip, 1550 const struct gfs2_inode *ip,
1535 u32 minext) 1551 u32 minext,
1552 struct gfs2_extent *maxext)
1536{ 1553{
1537 u64 block = gfs2_rbm_to_block(rbm); 1554 u64 block = gfs2_rbm_to_block(rbm);
1538 u32 extlen = 1; 1555 u32 extlen = 1;
@@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1545 */ 1562 */
1546 if (minext) { 1563 if (minext) {
1547 extlen = gfs2_free_extlen(rbm, minext); 1564 extlen = gfs2_free_extlen(rbm, minext);
1548 nblock = block + extlen; 1565 if (extlen <= maxext->len)
1549 if (extlen < minext)
1550 goto fail; 1566 goto fail;
1551 } 1567 }
1552 1568
@@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1555 * and skip if parts of it are already reserved 1571 * and skip if parts of it are already reserved
1556 */ 1572 */
1557 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); 1573 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1558 if (nblock == block) 1574 if (nblock == block) {
1559 return 0; 1575 if (!minext || extlen >= minext)
1576 return 0;
1577
1578 if (extlen > maxext->len) {
1579 maxext->len = extlen;
1580 maxext->rbm = *rbm;
1581 }
1560fail: 1582fail:
1583 nblock = block + extlen;
1584 }
1561 ret = gfs2_rbm_from_block(rbm, nblock); 1585 ret = gfs2_rbm_from_block(rbm, nblock);
1562 if (ret < 0) 1586 if (ret < 0)
1563 return ret; 1587 return ret;
@@ -1568,30 +1592,38 @@ fail:
1568 * gfs2_rbm_find - Look for blocks of a particular state 1592 * gfs2_rbm_find - Look for blocks of a particular state
1569 * @rbm: Value/result starting position and final position 1593 * @rbm: Value/result starting position and final position
1570 * @state: The state which we want to find 1594 * @state: The state which we want to find
1571 * @minext: The requested extent length (0 for a single block) 1595 * @minext: Pointer to the requested extent length (NULL for a single block)
1596 * This is updated to be the actual reservation size.
1572 * @ip: If set, check for reservations 1597 * @ip: If set, check for reservations
1573 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping 1598 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1574 * around until we've reached the starting point. 1599 * around until we've reached the starting point.
1600 * @ap: the allocation parameters
1575 * 1601 *
1576 * Side effects: 1602 * Side effects:
1577 * - If looking for free blocks, we set GBF_FULL on each bitmap which 1603 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1578 * has no free blocks in it. 1604 * has no free blocks in it.
1605 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1606 * has come up short on a free block search.
1579 * 1607 *
1580 * Returns: 0 on success, -ENOSPC if there is no block of the requested state 1608 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1581 */ 1609 */
1582 1610
1583static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 1611static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1584 const struct gfs2_inode *ip, bool nowrap) 1612 const struct gfs2_inode *ip, bool nowrap,
1613 const struct gfs2_alloc_parms *ap)
1585{ 1614{
1586 struct buffer_head *bh; 1615 struct buffer_head *bh;
1587 int initial_bii; 1616 int initial_bii;
1588 u32 initial_offset; 1617 u32 initial_offset;
1618 int first_bii = rbm->bii;
1619 u32 first_offset = rbm->offset;
1589 u32 offset; 1620 u32 offset;
1590 u8 *buffer; 1621 u8 *buffer;
1591 int n = 0; 1622 int n = 0;
1592 int iters = rbm->rgd->rd_length; 1623 int iters = rbm->rgd->rd_length;
1593 int ret; 1624 int ret;
1594 struct gfs2_bitmap *bi; 1625 struct gfs2_bitmap *bi;
1626 struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1595 1627
1596 /* If we are not starting at the beginning of a bitmap, then we 1628 /* If we are not starting at the beginning of a bitmap, then we
1597 * need to add one to the bitmap count to ensure that we search 1629 * need to add one to the bitmap count to ensure that we search
@@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
1620 return 0; 1652 return 0;
1621 1653
1622 initial_bii = rbm->bii; 1654 initial_bii = rbm->bii;
1623 ret = gfs2_reservation_check_and_update(rbm, ip, minext); 1655 ret = gfs2_reservation_check_and_update(rbm, ip,
1656 minext ? *minext : 0,
1657 &maxext);
1624 if (ret == 0) 1658 if (ret == 0)
1625 return 0; 1659 return 0;
1626 if (ret > 0) { 1660 if (ret > 0) {
@@ -1655,6 +1689,24 @@ next_iter:
1655 break; 1689 break;
1656 } 1690 }
1657 1691
1692 if (minext == NULL || state != GFS2_BLKST_FREE)
1693 return -ENOSPC;
1694
1695 /* If the extent was too small, and it's smaller than the smallest
1696 to have failed before, remember for future reference that it's
1697 useless to search this rgrp again for this amount or more. */
1698 if ((first_offset == 0) && (first_bii == 0) &&
1699 (*minext < rbm->rgd->rd_extfail_pt))
1700 rbm->rgd->rd_extfail_pt = *minext;
1701
1702 /* If the maximum extent we found is big enough to fulfill the
1703 minimum requirements, use it anyway. */
1704 if (maxext.len) {
1705 *rbm = maxext.rbm;
1706 *minext = maxext.len;
1707 return 0;
1708 }
1709
1658 return -ENOSPC; 1710 return -ENOSPC;
1659} 1711}
1660 1712
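With this change *minext is in/out: the caller asks for a minimum extent length, and if no free run that long exists, gfs2_rbm_find() falls back to the largest extent recorded in maxext and reports its length back through the pointer. The bookkeeping, reduced to a userspace sketch over an array of free-run lengths (names are illustrative):

	/* Return the index of the first run >= *minext; otherwise fall back
	 * to the largest run seen and shrink *minext to its length.
	 * Returns -1 only when every run is empty (the -ENOSPC case). */
	static int find_extent(const unsigned int lens[], unsigned int n,
			       unsigned int *minext)
	{
		unsigned int best_len = 0, best_idx = 0, i;

		for (i = 0; i < n; i++) {
			if (lens[i] >= *minext)
				return (int)i;		/* big enough, take it */
			if (lens[i] > best_len) {	/* remember the best so far */
				best_len = lens[i];
				best_idx = i;
			}
		}
		if (best_len) {
			*minext = best_len;		/* settle for the maximum */
			return (int)best_idx;
		}
		return -1;
	}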
@@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1680 1732
1681 while (1) { 1733 while (1) {
1682 down_write(&sdp->sd_log_flush_lock); 1734 down_write(&sdp->sd_log_flush_lock);
1683 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); 1735 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1736 true, NULL);
1684 up_write(&sdp->sd_log_flush_lock); 1737 up_write(&sdp->sd_log_flush_lock);
1685 if (error == -ENOSPC) 1738 if (error == -ENOSPC)
1686 break; 1739 break;
@@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1891 } 1944 }
1892 1945
1893 /* Skip unuseable resource groups */ 1946 /* Skip unuseable resource groups */
1894 if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 1947 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
1948 GFS2_RDF_ERROR)) ||
1949 (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
1895 goto skip_rgrp; 1950 goto skip_rgrp;
1896 1951
1897 if (sdp->sd_args.ar_rgrplvb) 1952 if (sdp->sd_args.ar_rgrplvb)
@@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1911 return 0; 1966 return 0;
1912 } 1967 }
1913 1968
1914 /* Drop reservation, if we couldn't use reserved rgrp */
1915 if (gfs2_rs_active(rs))
1916 gfs2_rs_deltree(rs);
1917check_rgrp: 1969check_rgrp:
1918 /* Check for unlinked inodes which can be reclaimed */ 1970 /* Check for unlinked inodes which can be reclaimed */
1919 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) 1971 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
1920 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, 1972 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
1921 ip->i_no_addr); 1973 ip->i_no_addr);
1922skip_rgrp: 1974skip_rgrp:
1975 /* Drop reservation, if we couldn't use reserved rgrp */
1976 if (gfs2_rs_active(rs))
1977 gfs2_rs_deltree(rs);
1978
1923 /* Unlock rgrp if required */ 1979 /* Unlock rgrp if required */
1924 if (!rg_locked) 1980 if (!rg_locked)
1925 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1981 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2064 * 2120 *
2065 */ 2121 */
2066 2122
2067int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) 2123void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2068{ 2124{
2069 struct gfs2_rgrpd *rgd = gl->gl_object; 2125 struct gfs2_rgrpd *rgd = gl->gl_object;
2070 struct gfs2_blkreserv *trs; 2126 struct gfs2_blkreserv *trs;
2071 const struct rb_node *n; 2127 const struct rb_node *n;
2072 2128
2073 if (rgd == NULL) 2129 if (rgd == NULL)
2074 return 0; 2130 return;
2075 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", 2131 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2076 (unsigned long long)rgd->rd_addr, rgd->rd_flags, 2132 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2077 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, 2133 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2078 rgd->rd_reserved); 2134 rgd->rd_reserved, rgd->rd_extfail_pt);
2079 spin_lock(&rgd->rd_rsspin); 2135 spin_lock(&rgd->rd_rsspin);
2080 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { 2136 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2081 trs = rb_entry(n, struct gfs2_blkreserv, rs_node); 2137 trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2082 dump_rs(seq, trs); 2138 dump_rs(seq, trs);
2083 } 2139 }
2084 spin_unlock(&rgd->rd_rsspin); 2140 spin_unlock(&rgd->rd_rsspin);
2085 return 0;
2086} 2141}
2087 2142
2088static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) 2143static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
@@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2184 int error; 2239 int error;
2185 2240
2186 gfs2_set_alloc_start(&rbm, ip, dinode); 2241 gfs2_set_alloc_start(&rbm, ip, dinode);
2187 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); 2242 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
2188 2243
2189 if (error == -ENOSPC) { 2244 if (error == -ENOSPC) {
2190 gfs2_set_alloc_start(&rbm, ip, dinode); 2245 gfs2_set_alloc_start(&rbm, ip, dinode);
2191 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); 2246 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
2247 NULL);
2192 } 2248 }
2193 2249
2194 /* Since all blocks are reserved in advance, this shouldn't happen */ 2250 /* Since all blocks are reserved in advance, this shouldn't happen */
2195 if (error) { 2251 if (error) {
2196 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", 2252 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2197 (unsigned long long)ip->i_no_addr, error, *nblocks, 2253 (unsigned long long)ip->i_no_addr, error, *nblocks,
2198 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); 2254 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2255 rbm.rgd->rd_extfail_pt);
2199 goto rgrp_error; 2256 goto rgrp_error;
2200 } 2257 }
2201 2258
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3a10d2ffbbe7..463ab2e95d1c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); 70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
71extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); 71extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
73 struct buffer_head *bh, 73 struct buffer_head *bh,
74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 35da5b19c0de..60f60f6181f3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
369 return 0; 369 return 0;
370} 370}
371 371
372static int init_threads(struct gfs2_sbd *sdp)
373{
374 struct task_struct *p;
375 int error = 0;
376
377 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
378 if (IS_ERR(p)) {
379 error = PTR_ERR(p);
380 fs_err(sdp, "can't start logd thread: %d\n", error);
381 return error;
382 }
383 sdp->sd_logd_process = p;
384
385 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
386 if (IS_ERR(p)) {
387 error = PTR_ERR(p);
388 fs_err(sdp, "can't start quotad thread: %d\n", error);
389 goto fail;
390 }
391 sdp->sd_quotad_process = p;
392 return 0;
393
394fail:
395 kthread_stop(sdp->sd_logd_process);
396 return error;
397}
398
372/** 399/**
373 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one 400 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
374 * @sdp: the filesystem 401 * @sdp: the filesystem
@@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
384 struct gfs2_log_header_host head; 411 struct gfs2_log_header_host head;
385 int error; 412 int error;
386 413
387 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); 414 error = init_threads(sdp);
388 if (error) 415 if (error)
389 return error; 416 return error;
390 417
418 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
419 if (error)
420 goto fail_threads;
421
391 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 422 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
392 423
393 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 424 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
417fail: 448fail:
418 t_gh.gh_flags |= GL_NOCACHE; 449 t_gh.gh_flags |= GL_NOCACHE;
419 gfs2_glock_dq_uninit(&t_gh); 450 gfs2_glock_dq_uninit(&t_gh);
420 451fail_threads:
452 kthread_stop(sdp->sd_quotad_process);
453 kthread_stop(sdp->sd_logd_process);
421 return error; 454 return error;
422} 455}
423 456
@@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
800 struct gfs2_holder t_gh; 833 struct gfs2_holder t_gh;
801 int error; 834 int error;
802 835
836 kthread_stop(sdp->sd_quotad_process);
837 kthread_stop(sdp->sd_logd_process);
838
803 flush_workqueue(gfs2_delete_workqueue); 839 flush_workqueue(gfs2_delete_workqueue);
804 gfs2_quota_sync(sdp->sd_vfs, 0); 840 gfs2_quota_sync(sdp->sd_vfs, 0);
805 gfs2_statfs_sync(sdp->sd_vfs, 0); 841 gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -857,9 +893,6 @@ restart:
857 } 893 }
858 spin_unlock(&sdp->sd_jindex_spin); 894 spin_unlock(&sdp->sd_jindex_spin);
859 895
860 kthread_stop(sdp->sd_quotad_process);
861 kthread_stop(sdp->sd_logd_process);
862
863 if (!(sb->s_flags & MS_RDONLY)) { 896 if (!(sb->s_flags & MS_RDONLY)) {
864 error = gfs2_make_fs_ro(sdp); 897 error = gfs2_make_fs_ro(sdp);
865 if (error) 898 if (error)
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2d04f9afafd7..06fe11e0abfa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid)
573#ifdef CONFIG_JBD_DEBUG 573#ifdef CONFIG_JBD_DEBUG
574 spin_lock(&journal->j_state_lock); 574 spin_lock(&journal->j_state_lock);
575 if (!tid_geq(journal->j_commit_request, tid)) { 575 if (!tid_geq(journal->j_commit_request, tid)) {
576 printk(KERN_EMERG 576 printk(KERN_ERR
577 "%s: error: j_commit_request=%d, tid=%d\n", 577 "%s: error: j_commit_request=%d, tid=%d\n",
578 __func__, journal->j_commit_request, tid); 578 __func__, journal->j_commit_request, tid);
579 } 579 }
@@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
604out_unlock: 604out_unlock:
605 spin_unlock(&journal->j_state_lock); 605 spin_unlock(&journal->j_state_lock);
606 606
607 if (unlikely(is_journal_aborted(journal))) { 607 if (unlikely(is_journal_aborted(journal)))
608 printk(KERN_EMERG "journal commit I/O error\n");
609 err = -EIO; 608 err = -EIO;
610 }
611 return err; 609 return err;
612} 610}
613 611
@@ -2136,7 +2134,7 @@ static void __exit journal_exit(void)
2136#ifdef CONFIG_JBD_DEBUG 2134#ifdef CONFIG_JBD_DEBUG
2137 int n = atomic_read(&nr_journal_heads); 2135 int n = atomic_read(&nr_journal_heads);
2138 if (n) 2136 if (n)
2139 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2137 printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
2140#endif 2138#endif
2141 jbd_remove_debugfs_entry(); 2139 jbd_remove_debugfs_entry();
2142 journal_destroy_caches(); 2140 journal_destroy_caches();
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index aa603e017d22..1695ba8334a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -675,7 +675,7 @@ repeat:
675 jbd_alloc(jh2bh(jh)->b_size, 675 jbd_alloc(jh2bh(jh)->b_size,
676 GFP_NOFS); 676 GFP_NOFS);
677 if (!frozen_buffer) { 677 if (!frozen_buffer) {
678 printk(KERN_EMERG 678 printk(KERN_ERR
679 "%s: OOM for frozen_buffer\n", 679 "%s: OOM for frozen_buffer\n",
680 __func__); 680 __func__);
681 JBUFFER_TRACE(jh, "oom!"); 681 JBUFFER_TRACE(jh, "oom!");
@@ -898,7 +898,7 @@ repeat:
898 if (!jh->b_committed_data) { 898 if (!jh->b_committed_data) {
899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); 899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
900 if (!committed_data) { 900 if (!committed_data) {
901 printk(KERN_EMERG "%s: No memory for committed data\n", 901 printk(KERN_ERR "%s: No memory for committed data\n",
902 __func__); 902 __func__);
903 err = -ENOMEM; 903 err = -ENOMEM;
904 goto out; 904 goto out;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 52032647dd4a..5fa344afb49a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
702 read_lock(&journal->j_state_lock); 702 read_lock(&journal->j_state_lock);
703#ifdef CONFIG_JBD2_DEBUG 703#ifdef CONFIG_JBD2_DEBUG
704 if (!tid_geq(journal->j_commit_request, tid)) { 704 if (!tid_geq(journal->j_commit_request, tid)) {
705 printk(KERN_EMERG 705 printk(KERN_ERR
706 "%s: error: j_commit_request=%d, tid=%d\n", 706 "%s: error: j_commit_request=%d, tid=%d\n",
707 __func__, journal->j_commit_request, tid); 707 __func__, journal->j_commit_request, tid);
708 } 708 }
@@ -718,10 +718,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
718 } 718 }
719 read_unlock(&journal->j_state_lock); 719 read_unlock(&journal->j_state_lock);
720 720
721 if (unlikely(is_journal_aborted(journal))) { 721 if (unlikely(is_journal_aborted(journal)))
722 printk(KERN_EMERG "journal commit I/O error\n");
723 err = -EIO; 722 err = -EIO;
724 }
725 return err; 723 return err;
726} 724}
727 725
@@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal)
1527 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && 1525 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
1528 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1526 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
1529 /* Can't have checksum v1 and v2 on at the same time! */ 1527 /* Can't have checksum v1 and v2 on at the same time! */
1530 printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " 1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
1531 "at the same time!\n"); 1529 "at the same time!\n");
1532 goto out; 1530 goto out;
1533 } 1531 }
1534 1532
1535 if (!jbd2_verify_csum_type(journal, sb)) { 1533 if (!jbd2_verify_csum_type(journal, sb)) {
1536 printk(KERN_ERR "JBD: Unknown checksum type\n"); 1534 printk(KERN_ERR "JBD2: Unknown checksum type\n");
1537 goto out; 1535 goto out;
1538 } 1536 }
1539 1537
@@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal)
1541 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1539 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
1542 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 1540 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1543 if (IS_ERR(journal->j_chksum_driver)) { 1541 if (IS_ERR(journal->j_chksum_driver)) {
1544 printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); 1542 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
1545 err = PTR_ERR(journal->j_chksum_driver); 1543 err = PTR_ERR(journal->j_chksum_driver);
1546 journal->j_chksum_driver = NULL; 1544 journal->j_chksum_driver = NULL;
1547 goto out; 1545 goto out;
@@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal)
1550 1548
1551 /* Check superblock checksum */ 1549 /* Check superblock checksum */
1552 if (!jbd2_superblock_csum_verify(journal, sb)) { 1550 if (!jbd2_superblock_csum_verify(journal, sb)) {
1553 printk(KERN_ERR "JBD: journal checksum error\n"); 1551 printk(KERN_ERR "JBD2: journal checksum error\n");
1554 goto out; 1552 goto out;
1555 } 1553 }
1556 1554
@@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1836 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 1834 journal->j_chksum_driver = crypto_alloc_shash("crc32c",
1837 0, 0); 1835 0, 0);
1838 if (IS_ERR(journal->j_chksum_driver)) { 1836 if (IS_ERR(journal->j_chksum_driver)) {
1839 printk(KERN_ERR "JBD: Cannot load crc32c " 1837 printk(KERN_ERR "JBD2: Cannot load crc32c "
1840 "driver.\n"); 1838 "driver.\n");
1841 journal->j_chksum_driver = NULL; 1839 journal->j_chksum_driver = NULL;
1842 return 0; 1840 return 0;
@@ -2645,7 +2643,7 @@ static void __exit journal_exit(void)
2645#ifdef CONFIG_JBD2_DEBUG 2643#ifdef CONFIG_JBD2_DEBUG
2646 int n = atomic_read(&nr_journal_heads); 2644 int n = atomic_read(&nr_journal_heads);
2647 if (n) 2645 if (n)
2648 printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); 2646 printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
2649#endif 2647#endif
2650 jbd2_remove_jbd_stats_proc_entry(); 2648 jbd2_remove_jbd_stats_proc_entry();
2651 jbd2_journal_destroy_caches(); 2649 jbd2_journal_destroy_caches();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3929c50428b1..3b6bb19d60b1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal,
594 be32_to_cpu(tmp->h_sequence))) { 594 be32_to_cpu(tmp->h_sequence))) {
595 brelse(obh); 595 brelse(obh);
596 success = -EIO; 596 success = -EIO;
597 printk(KERN_ERR "JBD: Invalid " 597 printk(KERN_ERR "JBD2: Invalid "
598 "checksum recovering " 598 "checksum recovering "
599 "block %llu in log\n", 599 "block %llu in log\n",
600 blocknr); 600 blocknr);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 7aa9a32573bb..8360674c85bc 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -932,7 +932,7 @@ repeat:
932 jbd2_alloc(jh2bh(jh)->b_size, 932 jbd2_alloc(jh2bh(jh)->b_size,
933 GFP_NOFS); 933 GFP_NOFS);
934 if (!frozen_buffer) { 934 if (!frozen_buffer) {
935 printk(KERN_EMERG 935 printk(KERN_ERR
936 "%s: OOM for frozen_buffer\n", 936 "%s: OOM for frozen_buffer\n",
937 __func__); 937 __func__);
938 JBUFFER_TRACE(jh, "oom!"); 938 JBUFFER_TRACE(jh, "oom!");
@@ -1166,7 +1166,7 @@ repeat:
1166 if (!jh->b_committed_data) { 1166 if (!jh->b_committed_data) {
1167 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 1167 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
1168 if (!committed_data) { 1168 if (!committed_data) {
1169 printk(KERN_EMERG "%s: No memory for committed data\n", 1169 printk(KERN_ERR "%s: No memory for committed data\n",
1170 __func__); 1170 __func__);
1171 err = -ENOMEM; 1171 err = -ENOMEM;
1172 goto out; 1172 goto out;
@@ -1290,7 +1290,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1290 * once a transaction -bzzz 1290 * once a transaction -bzzz
1291 */ 1291 */
1292 jh->b_modified = 1; 1292 jh->b_modified = 1;
1293 J_ASSERT_JH(jh, handle->h_buffer_credits > 0); 1293 if (handle->h_buffer_credits <= 0) {
1294 ret = -ENOSPC;
1295 goto out_unlock_bh;
1296 }
1294 handle->h_buffer_credits--; 1297 handle->h_buffer_credits--;
1295 } 1298 }
1296 1299
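This hunk, together with the WARN_ON removal at the end of the function, softens what used to be a fatal assertion into an error the caller must handle: running out of buffer credits now yields -ENOSPC instead of a J_ASSERT crash. The conversion as a generic sketch (illustrative, not the jbd2 API):

	#include <errno.h>

	/* Before: assert-and-crash on exhausted credits.
	 *	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
	 * After: validate and propagate an error instead. */
	static int consume_credit(int *credits)
	{
		if (*credits <= 0)
			return -ENOSPC;	/* caller decides how to recover */
		(*credits)--;
		return 0;
	}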
@@ -1305,7 +1308,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1305 JBUFFER_TRACE(jh, "fastpath"); 1308 JBUFFER_TRACE(jh, "fastpath");
1306 if (unlikely(jh->b_transaction != 1309 if (unlikely(jh->b_transaction !=
1307 journal->j_running_transaction)) { 1310 journal->j_running_transaction)) {
1308 printk(KERN_EMERG "JBD: %s: " 1311 printk(KERN_ERR "JBD2: %s: "
1309 "jh->b_transaction (%llu, %p, %u) != " 1312 "jh->b_transaction (%llu, %p, %u) != "
1310 "journal->j_running_transaction (%p, %u)", 1313 "journal->j_running_transaction (%p, %u)",
1311 journal->j_devname, 1314 journal->j_devname,
@@ -1332,7 +1335,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1332 JBUFFER_TRACE(jh, "already on other transaction"); 1335 JBUFFER_TRACE(jh, "already on other transaction");
1333 if (unlikely(jh->b_transaction != 1336 if (unlikely(jh->b_transaction !=
1334 journal->j_committing_transaction)) { 1337 journal->j_committing_transaction)) {
1335 printk(KERN_EMERG "JBD: %s: " 1338 printk(KERN_ERR "JBD2: %s: "
1336 "jh->b_transaction (%llu, %p, %u) != " 1339 "jh->b_transaction (%llu, %p, %u) != "
1337 "journal->j_committing_transaction (%p, %u)", 1340 "journal->j_committing_transaction (%p, %u)",
1338 journal->j_devname, 1341 journal->j_devname,
@@ -1345,7 +1348,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1345 ret = -EINVAL; 1348 ret = -EINVAL;
1346 } 1349 }
1347 if (unlikely(jh->b_next_transaction != transaction)) { 1350 if (unlikely(jh->b_next_transaction != transaction)) {
1348 printk(KERN_EMERG "JBD: %s: " 1351 printk(KERN_ERR "JBD2: %s: "
1349 "jh->b_next_transaction (%llu, %p, %u) != " 1352 "jh->b_next_transaction (%llu, %p, %u) != "
1350 "transaction (%p, %u)", 1353 "transaction (%p, %u)",
1351 journal->j_devname, 1354 journal->j_devname,
@@ -1373,7 +1376,6 @@ out_unlock_bh:
1373 jbd2_journal_put_journal_head(jh); 1376 jbd2_journal_put_journal_head(jh);
1374out: 1377out:
1375 JBUFFER_TRACE(jh, "exit"); 1378 JBUFFER_TRACE(jh, "exit");
1376 WARN_ON(ret); /* All errors are bugs, so dump the stack */
1377 return ret; 1379 return ret;
1378} 1380}
1379 1381
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
new file mode 100644
index 000000000000..674337c76673
--- /dev/null
+++ b/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the kernfs pseudo filesystem
3#
4
5obj-y := mount.o inode.o dir.o file.o symlink.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
new file mode 100644
index 000000000000..5104cf5d25c5
--- /dev/null
+++ b/fs/kernfs/dir.c
@@ -0,0 +1,1073 @@
1/*
2 * fs/kernfs/dir.c - kernfs directory implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/namei.h>
13#include <linux/idr.h>
14#include <linux/slab.h>
15#include <linux/security.h>
16#include <linux/hash.h>
17
18#include "kernfs-internal.h"
19
20DEFINE_MUTEX(kernfs_mutex);
21
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23
24/**
25 * kernfs_name_hash
26 * @name: Null terminated string to hash
27 * @ns: Namespace tag to hash
28 *
 29 * Returns a 31-bit hash of ns + name (so it fits in an off_t)
30 */
31static unsigned int kernfs_name_hash(const char *name, const void *ns)
32{
33 unsigned long hash = init_name_hash();
34 unsigned int len = strlen(name);
35 while (len--)
36 hash = partial_name_hash(*name++, hash);
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1)
41 hash += 2;
42 if (hash >= INT_MAX)
43 hash = INT_MAX - 1;
44 return hash;
45}
46
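kernfs_name_hash() folds the namespace pointer into the name hash, masks the result to 31 bits so it fits in an off_t, and then clamps it so the values reserved for magic readdir offsets stay free (kernfs_dir_pos() further down depends on the same reserved range). Note that the hash < 1 test as written lets a hash of 1 through even though the comment reserves it; later kernels tighten the comparison to hash < 2. A standalone sketch of the clamping, with a trivial djb2 hash standing in for partial_name_hash()/hash_ptr():

	#include <limits.h>
	#include <stdint.h>

	/* Toy stand-in; only the masking and clamping mirror kernfs. */
	static unsigned int toy_name_hash(const char *name, const void *ns)
	{
		uint32_t hash = 5381;			/* djb2, illustrative */

		while (*name)
			hash = hash * 33 + (unsigned char)*name++;
		hash ^= (uint32_t)(uintptr_t)ns;	/* fold in the ns tag */
		hash &= 0x7fffffffU;			/* 31 bits */
		if (hash < 2)				/* keep 0 and 1 reserved */
			hash += 2;
		if (hash >= INT_MAX)			/* keep INT_MAX reserved */
			hash = INT_MAX - 1;
		return hash;
	}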
47static int kernfs_name_compare(unsigned int hash, const char *name,
48 const void *ns, const struct kernfs_node *kn)
49{
50 if (hash != kn->hash)
51 return hash - kn->hash;
52 if (ns != kn->ns)
53 return ns - kn->ns;
54 return strcmp(name, kn->name);
55}
56
57static int kernfs_sd_compare(const struct kernfs_node *left,
58 const struct kernfs_node *right)
59{
60 return kernfs_name_compare(left->hash, left->name, left->ns, right);
61}
62
63/**
64 * kernfs_link_sibling - link kernfs_node into sibling rbtree
65 * @kn: kernfs_node of interest
66 *
67 * Link @kn into its sibling rbtree which starts from
68 * @kn->parent->dir.children.
69 *
70 * Locking:
71 * mutex_lock(kernfs_mutex)
72 *
73 * RETURNS:
 74 * 0 on success, -EEXIST on failure.
75 */
76static int kernfs_link_sibling(struct kernfs_node *kn)
77{
78 struct rb_node **node = &kn->parent->dir.children.rb_node;
79 struct rb_node *parent = NULL;
80
81 if (kernfs_type(kn) == KERNFS_DIR)
82 kn->parent->dir.subdirs++;
83
84 while (*node) {
85 struct kernfs_node *pos;
86 int result;
87
88 pos = rb_to_kn(*node);
89 parent = *node;
90 result = kernfs_sd_compare(kn, pos);
91 if (result < 0)
92 node = &pos->rb.rb_left;
93 else if (result > 0)
94 node = &pos->rb.rb_right;
95 else
96 return -EEXIST;
97 }
98 /* add new node and rebalance the tree */
99 rb_link_node(&kn->rb, parent, node);
100 rb_insert_color(&kn->rb, &kn->parent->dir.children);
101 return 0;
102}
103
104/**
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest
107 *
108 * Unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children.
110 *
111 * Locking:
112 * mutex_lock(kernfs_mutex)
113 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn)
115{
116 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--;
118
119 rb_erase(&kn->rb, &kn->parent->dir.children);
120}
121
122/**
123 * kernfs_get_active - get an active reference to kernfs_node
124 * @kn: kernfs_node to get an active reference to
125 *
 126 * Get an active reference to @kn. This function is a no-op if @kn
 127 * is NULL.
128 *
129 * RETURNS:
130 * Pointer to @kn on success, NULL on failure.
131 */
132struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
133{
134 if (unlikely(!kn))
135 return NULL;
136
137 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL;
139
140 if (kn->flags & KERNFS_LOCKDEP)
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn;
143}
144
145/**
146 * kernfs_put_active - put an active reference to kernfs_node
147 * @kn: kernfs_node to put an active reference to
148 *
 149 * Put an active reference to @kn. This function is a no-op if @kn
150 * is NULL.
151 */
152void kernfs_put_active(struct kernfs_node *kn)
153{
154 int v;
155
156 if (unlikely(!kn))
157 return;
158
159 if (kn->flags & KERNFS_LOCKDEP)
160 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS))
163 return;
164
165 /*
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170}
171
172/**
173 * kernfs_deactivate - deactivate kernfs_node
174 * @kn: kernfs_node to deactivate
175 *
176 * Deny new active references and drain existing ones.
177 */
178static void kernfs_deactivate(struct kernfs_node *kn)
179{
180 DECLARE_COMPLETION_ONSTACK(wait);
181 int v;
182
183 BUG_ON(!(kn->flags & KERNFS_REMOVED));
184
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
186 return;
187
188 kn->u.completion = (void *)&wait;
189
190 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
191 /* atomic_add_return() is a mb(), put_active() will always see
192 * the updated kn->u.completion.
193 */
194 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
195
196 if (v != KN_DEACTIVATED_BIAS) {
197 lock_contended(&kn->dep_map, _RET_IP_);
198 wait_for_completion(&wait);
199 }
200
201 lock_acquired(&kn->dep_map, _RET_IP_);
202 rwsem_release(&kn->dep_map, 1, _RET_IP_);
203}
204
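kernfs_get_active()/kernfs_put_active()/kernfs_deactivate() implement a biased refcount drain: once KN_DEACTIVATED_BIAS (a large negative bias) is added, atomic_inc_unless_negative() refuses new references, and the put that brings the counter back to exactly the bias value wakes the waiter. A compressed C11 sketch of the same idea, with a POSIX semaphore standing in for the kernel completion and all names illustrative:

	#include <limits.h>
	#include <semaphore.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	#define DEACTIVATED_BIAS INT_MIN

	struct node {
		atomic_int active;
		sem_t drained;		/* sem_init(&n->drained, 0, 0) at setup */
	};

	static bool node_get_active(struct node *n)
	{
		int v = atomic_load(&n->active);
		do {
			if (v < 0)	/* bias applied: refuse new refs */
				return false;
		} while (!atomic_compare_exchange_weak(&n->active, &v, v + 1));
		return true;
	}

	static void node_put_active(struct node *n)
	{
		/* the put that lands exactly on the bias is the last one out */
		if (atomic_fetch_sub(&n->active, 1) - 1 == DEACTIVATED_BIAS)
			sem_post(&n->drained);
	}

	static void node_deactivate(struct node *n)
	{
		/* apply the bias; wait unless no refs were outstanding */
		if (atomic_fetch_add(&n->active, DEACTIVATED_BIAS) != 0)
			sem_wait(&n->drained);
	}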
205/**
206 * kernfs_get - get a reference count on a kernfs_node
207 * @kn: the target kernfs_node
208 */
209void kernfs_get(struct kernfs_node *kn)
210{
211 if (kn) {
212 WARN_ON(!atomic_read(&kn->count));
213 atomic_inc(&kn->count);
214 }
215}
216EXPORT_SYMBOL_GPL(kernfs_get);
217
218/**
219 * kernfs_put - put a reference count on a kernfs_node
220 * @kn: the target kernfs_node
221 *
222 * Put a reference count of @kn and destroy it if it reached zero.
223 */
224void kernfs_put(struct kernfs_node *kn)
225{
226 struct kernfs_node *parent;
227 struct kernfs_root *root;
228
229 if (!kn || !atomic_dec_and_test(&kn->count))
230 return;
231 root = kernfs_root(kn);
232 repeat:
233 /* Moving/renaming is always done while holding reference.
234 * kn->parent won't change beneath us.
235 */
236 parent = kn->parent;
237
238 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n",
239 parent ? parent->name : "", kn->name);
240
241 if (kernfs_type(kn) == KERNFS_LINK)
242 kernfs_put(kn->symlink.target_kn);
243 if (!(kn->flags & KERNFS_STATIC_NAME))
244 kfree(kn->name);
245 if (kn->iattr) {
246 if (kn->iattr->ia_secdata)
247 security_release_secctx(kn->iattr->ia_secdata,
248 kn->iattr->ia_secdata_len);
249 simple_xattrs_free(&kn->iattr->xattrs);
250 }
251 kfree(kn->iattr);
252 ida_simple_remove(&root->ino_ida, kn->ino);
253 kmem_cache_free(kernfs_node_cache, kn);
254
255 kn = parent;
256 if (kn) {
257 if (atomic_dec_and_test(&kn->count))
258 goto repeat;
259 } else {
260 /* just released the root kn, free @root too */
261 ida_destroy(&root->ino_ida);
262 kfree(root);
263 }
264}
265EXPORT_SYMBOL_GPL(kernfs_put);
266
267static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
268{
269 struct kernfs_node *kn;
270
271 if (flags & LOOKUP_RCU)
272 return -ECHILD;
273
274 /* Always perform fresh lookup for negatives */
275 if (!dentry->d_inode)
276 goto out_bad_unlocked;
277
278 kn = dentry->d_fsdata;
279 mutex_lock(&kernfs_mutex);
280
281 /* The kernfs node has been deleted */
282 if (kn->flags & KERNFS_REMOVED)
283 goto out_bad;
284
285 /* The kernfs node has been moved? */
286 if (dentry->d_parent->d_fsdata != kn->parent)
287 goto out_bad;
288
289 /* The kernfs node has been renamed */
290 if (strcmp(dentry->d_name.name, kn->name) != 0)
291 goto out_bad;
292
293 /* The kernfs node has been moved to a different namespace */
294 if (kn->parent && kernfs_ns_enabled(kn->parent) &&
295 kernfs_info(dentry->d_sb)->ns != kn->ns)
296 goto out_bad;
297
298 mutex_unlock(&kernfs_mutex);
299out_valid:
300 return 1;
301out_bad:
302 mutex_unlock(&kernfs_mutex);
303out_bad_unlocked:
304 /*
305 * @dentry doesn't match the underlying kernfs node, drop the
306 * dentry and force lookup. If we have submounts we must allow the
307 * vfs caches to lie about the state of the filesystem to prevent
308 * leaks and other nasty things, so use check_submounts_and_drop()
309 * instead of d_drop().
310 */
311 if (check_submounts_and_drop(dentry) != 0)
312 goto out_valid;
313
314 return 0;
315}
316
317static void kernfs_dop_release(struct dentry *dentry)
318{
319 kernfs_put(dentry->d_fsdata);
320}
321
322const struct dentry_operations kernfs_dops = {
323 .d_revalidate = kernfs_dop_revalidate,
324 .d_release = kernfs_dop_release,
325};
326
327static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
328 const char *name, umode_t mode,
329 unsigned flags)
330{
331 char *dup_name = NULL;
332 struct kernfs_node *kn;
333 int ret;
334
335 if (!(flags & KERNFS_STATIC_NAME)) {
336 name = dup_name = kstrdup(name, GFP_KERNEL);
337 if (!name)
338 return NULL;
339 }
340
341 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
342 if (!kn)
343 goto err_out1;
344
345 ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
346 if (ret < 0)
347 goto err_out2;
348 kn->ino = ret;
349
350 atomic_set(&kn->count, 1);
351 atomic_set(&kn->active, 0);
352
353 kn->name = name;
354 kn->mode = mode;
355 kn->flags = flags | KERNFS_REMOVED;
356
357 return kn;
358
359 err_out2:
360 kmem_cache_free(kernfs_node_cache, kn);
361 err_out1:
362 kfree(dup_name);
363 return NULL;
364}
365
366struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
367 const char *name, umode_t mode,
368 unsigned flags)
369{
370 struct kernfs_node *kn;
371
372 kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
373 if (kn) {
374 kernfs_get(parent);
375 kn->parent = parent;
376 }
377 return kn;
378}
379
380/**
381 * kernfs_addrm_start - prepare for kernfs_node add/remove
382 * @acxt: pointer to kernfs_addrm_cxt to be used
383 *
384 * This function is called when the caller is about to add or remove
385 * kernfs_node. This function acquires kernfs_mutex. @acxt is used
386 * to keep and pass context to other addrm functions.
387 *
388 * LOCKING:
389 * Kernel thread context (may sleep). kernfs_mutex is locked on
390 * return.
391 */
392void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
393 __acquires(kernfs_mutex)
394{
395 memset(acxt, 0, sizeof(*acxt));
396
397 mutex_lock(&kernfs_mutex);
398}
399
400/**
401 * kernfs_add_one - add kernfs_node to parent without warning
402 * @acxt: addrm context to use
403 * @kn: kernfs_node to be added
404 *
405 * The caller must already have initialized @kn->parent. This
406 * function increments nlink of the parent's inode if @kn is a
 407 * directory and links @kn into the children list of the parent.
408 *
409 * This function should be called between calls to
410 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
411 * the same @acxt as passed to kernfs_addrm_start().
412 *
413 * LOCKING:
414 * Determined by kernfs_addrm_start().
415 *
416 * RETURNS:
417 * 0 on success, -EEXIST if entry with the given name already
418 * exists.
419 */
420int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
421{
422 struct kernfs_node *parent = kn->parent;
423 bool has_ns = kernfs_ns_enabled(parent);
424 struct kernfs_iattrs *ps_iattr;
425 int ret;
426
427 if (has_ns != (bool)kn->ns) {
428 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
429 has_ns ? "required" : "invalid", parent->name, kn->name);
430 return -EINVAL;
431 }
432
433 if (kernfs_type(parent) != KERNFS_DIR)
434 return -EINVAL;
435
436 if (parent->flags & KERNFS_REMOVED)
437 return -ENOENT;
438
439 kn->hash = kernfs_name_hash(kn->name, kn->ns);
440
441 ret = kernfs_link_sibling(kn);
442 if (ret)
443 return ret;
444
445 /* Update timestamps on the parent */
446 ps_iattr = parent->iattr;
447 if (ps_iattr) {
448 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
449 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
450 }
451
452 /* Mark the entry added into directory tree */
453 kn->flags &= ~KERNFS_REMOVED;
454
455 return 0;
456}
457
458/**
459 * kernfs_remove_one - remove kernfs_node from parent
460 * @acxt: addrm context to use
461 * @kn: kernfs_node to be removed
462 *
463 * Mark @kn removed and drop nlink of parent inode if @kn is a
464 * directory. @kn is unlinked from the children list.
465 *
466 * This function should be called between calls to
467 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
468 * passed the same @acxt as passed to kernfs_addrm_start().
469 *
470 * LOCKING:
471 * Determined by kernfs_addrm_start().
472 */
473static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
474 struct kernfs_node *kn)
475{
476 struct kernfs_iattrs *ps_iattr;
477
478 /*
479 * Removal can be called multiple times on the same node. Only the
480 * first invocation is effective and puts the base ref.
481 */
482 if (kn->flags & KERNFS_REMOVED)
483 return;
484
485 if (kn->parent) {
486 kernfs_unlink_sibling(kn);
487
488 /* Update timestamps on the parent */
489 ps_iattr = kn->parent->iattr;
490 if (ps_iattr) {
491 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
492 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
493 }
494 }
495
496 kn->flags |= KERNFS_REMOVED;
497 kn->u.removed_list = acxt->removed;
498 acxt->removed = kn;
499}
500
501/**
502 * kernfs_addrm_finish - finish up kernfs_node add/remove
503 * @acxt: addrm context to finish up
504 *
505 * Finish up kernfs_node add/remove. Resources acquired by
506 * kernfs_addrm_start() are released and removed kernfs_nodes are
507 * cleaned up.
508 *
509 * LOCKING:
510 * kernfs_mutex is released.
511 */
512void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
513 __releases(kernfs_mutex)
514{
515 /* release resources acquired by kernfs_addrm_start() */
516 mutex_unlock(&kernfs_mutex);
517
518 /* kill removed kernfs_nodes */
519 while (acxt->removed) {
520 struct kernfs_node *kn = acxt->removed;
521
522 acxt->removed = kn->u.removed_list;
523
524 kernfs_deactivate(kn);
525 kernfs_unmap_bin_file(kn);
526 kernfs_put(kn);
527 }
528}
529
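The protocol these helpers define is: kernfs_addrm_start() takes kernfs_mutex, any number of kernfs_add_one()/kernfs_remove_one() calls run against the same context, and kernfs_addrm_finish() drops the mutex before deactivating, unmapping and releasing everything queued on acxt->removed. In outline, exactly as kernfs_create_dir_ns() below uses it:

	struct kernfs_addrm_cxt acxt;
	int rc;

	kernfs_addrm_start(&acxt);	/* grabs kernfs_mutex */
	rc = kernfs_add_one(&acxt, kn);	/* may be repeated for a batch */
	kernfs_addrm_finish(&acxt);	/* drops mutex, reaps removals */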
530/**
531 * kernfs_find_ns - find kernfs_node with the given name
532 * @parent: kernfs_node to search under
533 * @name: name to look for
534 * @ns: the namespace tag to use
535 *
536 * Look for kernfs_node with name @name under @parent. Returns pointer to
537 * the found kernfs_node on success, %NULL on failure.
538 */
539static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
540 const unsigned char *name,
541 const void *ns)
542{
543 struct rb_node *node = parent->dir.children.rb_node;
544 bool has_ns = kernfs_ns_enabled(parent);
545 unsigned int hash;
546
547 lockdep_assert_held(&kernfs_mutex);
548
549 if (has_ns != (bool)ns) {
550 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
551 has_ns ? "required" : "invalid", parent->name, name);
552 return NULL;
553 }
554
555 hash = kernfs_name_hash(name, ns);
556 while (node) {
557 struct kernfs_node *kn;
558 int result;
559
560 kn = rb_to_kn(node);
561 result = kernfs_name_compare(hash, name, ns, kn);
562 if (result < 0)
563 node = node->rb_left;
564 else if (result > 0)
565 node = node->rb_right;
566 else
567 return kn;
568 }
569 return NULL;
570}
571
572/**
573 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
574 * @parent: kernfs_node to search under
575 * @name: name to look for
576 * @ns: the namespace tag to use
577 *
578 * Look for kernfs_node with name @name under @parent and get a reference
579 * if found. This function may sleep and returns pointer to the found
580 * kernfs_node on success, %NULL on failure.
581 */
582struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
583 const char *name, const void *ns)
584{
585 struct kernfs_node *kn;
586
587 mutex_lock(&kernfs_mutex);
588 kn = kernfs_find_ns(parent, name, ns);
589 kernfs_get(kn);
590 mutex_unlock(&kernfs_mutex);
591
592 return kn;
593}
594EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
595
596/**
597 * kernfs_create_root - create a new kernfs hierarchy
598 * @kdops: optional directory syscall operations for the hierarchy
599 * @priv: opaque data associated with the new directory
600 *
601 * Returns the root of the new hierarchy on success, ERR_PTR() value on
602 * failure.
603 */
604struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
605{
606 struct kernfs_root *root;
607 struct kernfs_node *kn;
608
609 root = kzalloc(sizeof(*root), GFP_KERNEL);
610 if (!root)
611 return ERR_PTR(-ENOMEM);
612
613 ida_init(&root->ino_ida);
614
615 kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
616 KERNFS_DIR);
617 if (!kn) {
618 ida_destroy(&root->ino_ida);
619 kfree(root);
620 return ERR_PTR(-ENOMEM);
621 }
622
623 kn->flags &= ~KERNFS_REMOVED;
624 kn->priv = priv;
625 kn->dir.root = root;
626
627 root->dir_ops = kdops;
628 root->kn = kn;
629
630 return root;
631}
632
633/**
634 * kernfs_destroy_root - destroy a kernfs hierarchy
635 * @root: root of the hierarchy to destroy
636 *
637 * Destroy the hierarchy anchored at @root by removing all existing
638 * directories and destroying @root.
639 */
640void kernfs_destroy_root(struct kernfs_root *root)
641{
642 kernfs_remove(root->kn); /* will also free @root */
643}
644
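Putting the creation entry points together: kernfs_create_root() above composes with kernfs_create_dir_ns() below as in this hypothetical client sketch (the "example" directory name and the NULL kdops/priv/ns arguments are assumptions for illustration; with no dir_ops, mkdir/rmdir/rename from userspace return -EPERM):

	static int example_setup(void)
	{
		struct kernfs_root *root;
		struct kernfs_node *dir;

		root = kernfs_create_root(NULL, NULL);
		if (IS_ERR(root))
			return PTR_ERR(root);

		dir = kernfs_create_dir_ns(root->kn, "example", 0755, NULL, NULL);
		if (IS_ERR(dir)) {
			kernfs_destroy_root(root);
			return PTR_ERR(dir);
		}
		return 0;
	}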
645/**
646 * kernfs_create_dir_ns - create a directory
647 * @parent: parent in which to create a new directory
648 * @name: name of the new directory
649 * @mode: mode of the new directory
650 * @priv: opaque data associated with the new directory
651 * @ns: optional namespace tag of the directory
652 *
653 * Returns the created node on success, ERR_PTR() value on failure.
654 */
655struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
656 const char *name, umode_t mode,
657 void *priv, const void *ns)
658{
659 struct kernfs_addrm_cxt acxt;
660 struct kernfs_node *kn;
661 int rc;
662
663 /* allocate */
664 kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
665 if (!kn)
666 return ERR_PTR(-ENOMEM);
667
668 kn->dir.root = parent->dir.root;
669 kn->ns = ns;
670 kn->priv = priv;
671
672 /* link in */
673 kernfs_addrm_start(&acxt);
674 rc = kernfs_add_one(&acxt, kn);
675 kernfs_addrm_finish(&acxt);
676
677 if (!rc)
678 return kn;
679
680 kernfs_put(kn);
681 return ERR_PTR(rc);
682}
683
684static struct dentry *kernfs_iop_lookup(struct inode *dir,
685 struct dentry *dentry,
686 unsigned int flags)
687{
688 struct dentry *ret;
689 struct kernfs_node *parent = dentry->d_parent->d_fsdata;
690 struct kernfs_node *kn;
691 struct inode *inode;
692 const void *ns = NULL;
693
694 mutex_lock(&kernfs_mutex);
695
696 if (kernfs_ns_enabled(parent))
697 ns = kernfs_info(dir->i_sb)->ns;
698
699 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
700
701 /* no such entry */
702 if (!kn) {
703 ret = NULL;
704 goto out_unlock;
705 }
706 kernfs_get(kn);
707 dentry->d_fsdata = kn;
708
709 /* attach dentry and inode */
710 inode = kernfs_get_inode(dir->i_sb, kn);
711 if (!inode) {
712 ret = ERR_PTR(-ENOMEM);
713 goto out_unlock;
714 }
715
716 /* instantiate and hash dentry */
717 ret = d_materialise_unique(dentry, inode);
718 out_unlock:
719 mutex_unlock(&kernfs_mutex);
720 return ret;
721}
722
723static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
724 umode_t mode)
725{
726 struct kernfs_node *parent = dir->i_private;
727 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops;
728
729 if (!kdops || !kdops->mkdir)
730 return -EPERM;
731
732 return kdops->mkdir(parent, dentry->d_name.name, mode);
733}
734
735static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
736{
737 struct kernfs_node *kn = dentry->d_fsdata;
738 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
739
740 if (!kdops || !kdops->rmdir)
741 return -EPERM;
742
743 return kdops->rmdir(kn);
744}
745
746static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
747 struct inode *new_dir, struct dentry *new_dentry)
748{
749 struct kernfs_node *kn = old_dentry->d_fsdata;
750 struct kernfs_node *new_parent = new_dir->i_private;
751 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
752
753 if (!kdops || !kdops->rename)
754 return -EPERM;
755
756 return kdops->rename(kn, new_parent, new_dentry->d_name.name);
757}
758
759const struct inode_operations kernfs_dir_iops = {
760 .lookup = kernfs_iop_lookup,
761 .permission = kernfs_iop_permission,
762 .setattr = kernfs_iop_setattr,
763 .getattr = kernfs_iop_getattr,
764 .setxattr = kernfs_iop_setxattr,
765 .removexattr = kernfs_iop_removexattr,
766 .getxattr = kernfs_iop_getxattr,
767 .listxattr = kernfs_iop_listxattr,
768
769 .mkdir = kernfs_iop_mkdir,
770 .rmdir = kernfs_iop_rmdir,
771 .rename = kernfs_iop_rename,
772};
773
774static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
775{
776 struct kernfs_node *last;
777
778 while (true) {
779 struct rb_node *rbn;
780
781 last = pos;
782
783 if (kernfs_type(pos) != KERNFS_DIR)
784 break;
785
786 rbn = rb_first(&pos->dir.children);
787 if (!rbn)
788 break;
789
790 pos = rb_to_kn(rbn);
791 }
792
793 return last;
794}
795
796/**
797 * kernfs_next_descendant_post - find the next descendant for post-order walk
798 * @pos: the current position (%NULL to initiate traversal)
799 * @root: kernfs_node whose descendants to walk
800 *
801 * Find the next descendant to visit for post-order traversal of @root's
 802 * descendants. @root is included in the iteration and is the last node
 803 * to be visited.
804 */
805static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
806 struct kernfs_node *root)
807{
808 struct rb_node *rbn;
809
810 lockdep_assert_held(&kernfs_mutex);
811
812 /* if first iteration, visit leftmost descendant which may be root */
813 if (!pos)
814 return kernfs_leftmost_descendant(root);
815
816 /* if we visited @root, we're done */
817 if (pos == root)
818 return NULL;
819
820 /* if there's an unvisited sibling, visit its leftmost descendant */
821 rbn = rb_next(&pos->rb);
822 if (rbn)
823 return kernfs_leftmost_descendant(rb_to_kn(rbn));
824
825 /* no sibling left, visit parent */
826 return pos->parent;
827}
828
829static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
830 struct kernfs_node *kn)
831{
832 struct kernfs_node *pos, *next;
833
834 if (!kn)
835 return;
836
837 pr_debug("kernfs %s: removing\n", kn->name);
838
839 next = NULL;
840 do {
841 pos = next;
842 next = kernfs_next_descendant_post(pos, kn);
843 if (pos)
844 kernfs_remove_one(acxt, pos);
845 } while (next);
846}
847
848/**
849 * kernfs_remove - remove a kernfs_node recursively
850 * @kn: the kernfs_node to remove
851 *
852 * Remove @kn along with all its subdirectories and files.
853 */
854void kernfs_remove(struct kernfs_node *kn)
855{
856 struct kernfs_addrm_cxt acxt;
857
858 kernfs_addrm_start(&acxt);
859 __kernfs_remove(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861}
862
863/**
864 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
865 * @parent: parent of the target
866 * @name: name of the kernfs_node to remove
867 * @ns: namespace tag of the kernfs_node to remove
868 *
869 * Look for the kernfs_node with @name and @ns under @parent and remove it.
870 * Returns 0 on success, -ENOENT if such entry doesn't exist.
871 */
872int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
873 const void *ns)
874{
875 struct kernfs_addrm_cxt acxt;
876 struct kernfs_node *kn;
877
878 if (!parent) {
879 WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
880 name);
881 return -ENOENT;
882 }
883
884 kernfs_addrm_start(&acxt);
885
886 kn = kernfs_find_ns(parent, name, ns);
887 if (kn)
888 __kernfs_remove(&acxt, kn);
889
890 kernfs_addrm_finish(&acxt);
891
892 if (kn)
893 return 0;
894 else
895 return -ENOENT;
896}
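/*
 * Editorial sketch (not part of the patch): typical callers, with
 * "parent" and "kn" as hypothetical nodes owned by a kernfs user:
 *
 *	kernfs_remove_by_name_ns(parent, "my_attr", NULL);  - one entry
 *	kernfs_remove(kn);                                  - whole subtree
 */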
897
898/**
899 * kernfs_rename_ns - move and rename a kernfs_node
900 * @kn: target node
 901 * @new_parent: new parent to put @kn under
902 * @new_name: new name
903 * @new_ns: new namespace tag
904 */
905int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
906 const char *new_name, const void *new_ns)
907{
908 int error;
909
910 mutex_lock(&kernfs_mutex);
911
912 error = -ENOENT;
913 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED)
914 goto out;
915
916 error = 0;
917 if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
918 (strcmp(kn->name, new_name) == 0))
919 goto out; /* nothing to rename */
920
921 error = -EEXIST;
922 if (kernfs_find_ns(new_parent, new_name, new_ns))
923 goto out;
924
925 /* rename kernfs_node */
926 if (strcmp(kn->name, new_name) != 0) {
927 error = -ENOMEM;
928 new_name = kstrdup(new_name, GFP_KERNEL);
929 if (!new_name)
930 goto out;
931
932 if (kn->flags & KERNFS_STATIC_NAME)
933 kn->flags &= ~KERNFS_STATIC_NAME;
934 else
935 kfree(kn->name);
936
937 kn->name = new_name;
938 }
939
940 /*
941 * Move to the appropriate place in the appropriate directories rbtree.
942 */
943 kernfs_unlink_sibling(kn);
944 kernfs_get(new_parent);
945 kernfs_put(kn->parent);
946 kn->ns = new_ns;
947 kn->hash = kernfs_name_hash(kn->name, kn->ns);
948 kn->parent = new_parent;
949 kernfs_link_sibling(kn);
950
951 error = 0;
952 out:
953 mutex_unlock(&kernfs_mutex);
954 return error;
955}
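/*
 * Editorial sketch (not part of the patch): moving a hypothetical
 * node "kn" under "new_parent" while keeping its name and tag:
 *
 *	error = kernfs_rename_ns(kn, new_parent, kn->name, kn->ns);
 */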
956
 957/* Relationship between kn->mode and the DT_xxx types */
958static inline unsigned char dt_type(struct kernfs_node *kn)
959{
960 return (kn->mode >> 12) & 15;
961}
962
963static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
964{
965 kernfs_put(filp->private_data);
966 return 0;
967}
968
969static struct kernfs_node *kernfs_dir_pos(const void *ns,
970 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
971{
972 if (pos) {
973 int valid = !(pos->flags & KERNFS_REMOVED) &&
974 pos->parent == parent && hash == pos->hash;
975 kernfs_put(pos);
976 if (!valid)
977 pos = NULL;
978 }
979 if (!pos && (hash > 1) && (hash < INT_MAX)) {
980 struct rb_node *node = parent->dir.children.rb_node;
981 while (node) {
982 pos = rb_to_kn(node);
983
984 if (hash < pos->hash)
985 node = node->rb_left;
986 else if (hash > pos->hash)
987 node = node->rb_right;
988 else
989 break;
990 }
991 }
992 /* Skip over entries in the wrong namespace */
993 while (pos && pos->ns != ns) {
994 struct rb_node *node = rb_next(&pos->rb);
995 if (!node)
996 pos = NULL;
997 else
998 pos = rb_to_kn(node);
999 }
1000 return pos;
1001}
1002
1003static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1004 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1005{
1006 pos = kernfs_dir_pos(ns, parent, ino, pos);
1007 if (pos)
1008 do {
1009 struct rb_node *node = rb_next(&pos->rb);
1010 if (!node)
1011 pos = NULL;
1012 else
1013 pos = rb_to_kn(node);
1014 } while (pos && pos->ns != ns);
1015 return pos;
1016}
1017
1018static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
1019{
1020 struct dentry *dentry = file->f_path.dentry;
1021 struct kernfs_node *parent = dentry->d_fsdata;
1022 struct kernfs_node *pos = file->private_data;
1023 const void *ns = NULL;
1024
1025 if (!dir_emit_dots(file, ctx))
1026 return 0;
1027 mutex_lock(&kernfs_mutex);
1028
1029 if (kernfs_ns_enabled(parent))
1030 ns = kernfs_info(dentry->d_sb)->ns;
1031
1032 for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
1033 pos;
1034 pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
1035 const char *name = pos->name;
1036 unsigned int type = dt_type(pos);
1037 int len = strlen(name);
1038 ino_t ino = pos->ino;
1039
1040 ctx->pos = pos->hash;
1041 file->private_data = pos;
1042 kernfs_get(pos);
1043
1044 mutex_unlock(&kernfs_mutex);
1045 if (!dir_emit(ctx, name, len, ino, type))
1046 return 0;
1047 mutex_lock(&kernfs_mutex);
1048 }
1049 mutex_unlock(&kernfs_mutex);
1050 file->private_data = NULL;
1051 ctx->pos = INT_MAX;
1052 return 0;
1053}
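/*
 * Editorial note (not part of the patch): ctx->pos holds the name
 * hash of the last emitted node rather than an index, so a reader
 * that stops and resumes can be repositioned by the rbtree search in
 * kernfs_dir_pos() even if entries were added or removed in between.
 * kernfs_mutex is dropped around dir_emit() because copying the entry
 * to userspace may fault and sleep.
 */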
1054
1055static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
1056 int whence)
1057{
1058 struct inode *inode = file_inode(file);
1059 loff_t ret;
1060
1061 mutex_lock(&inode->i_mutex);
1062 ret = generic_file_llseek(file, offset, whence);
1063 mutex_unlock(&inode->i_mutex);
1064
1065 return ret;
1066}
1067
1068const struct file_operations kernfs_dir_fops = {
1069 .read = generic_read_dir,
1070 .iterate = kernfs_fop_readdir,
1071 .release = kernfs_dir_fop_release,
1072 .llseek = kernfs_dir_fop_llseek,
1073};
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
new file mode 100644
index 000000000000..dbf397bfdff2
--- /dev/null
+++ b/fs/kernfs/file.c
@@ -0,0 +1,867 @@
1/*
2 * fs/kernfs/file.c - kernfs file implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/poll.h>
15#include <linux/pagemap.h>
16#include <linux/sched.h>
17
18#include "kernfs-internal.h"
19
20/*
21 * There's one kernfs_open_file for each open file and one kernfs_open_node
22 * for each kernfs_node with one or more open files.
23 *
24 * kernfs_node->attr.open points to kernfs_open_node. attr.open is
25 * protected by kernfs_open_node_lock.
26 *
27 * filp->private_data points to seq_file whose ->private points to
28 * kernfs_open_file. kernfs_open_files are chained at
29 * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
30 */
31static DEFINE_SPINLOCK(kernfs_open_node_lock);
32static DEFINE_MUTEX(kernfs_open_file_mutex);
33
34struct kernfs_open_node {
35 atomic_t refcnt;
36 atomic_t event;
37 wait_queue_head_t poll;
38 struct list_head files; /* goes through kernfs_open_file.list */
39};
40
41static struct kernfs_open_file *kernfs_of(struct file *file)
42{
43 return ((struct seq_file *)file->private_data)->private;
44}
45
46/*
47 * Determine the kernfs_ops for the given kernfs_node. This function must
48 * be called while holding an active reference.
49 */
50static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
51{
52 if (kn->flags & KERNFS_LOCKDEP)
53 lockdep_assert_held(kn);
54 return kn->attr.ops;
55}
56
57/*
58 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
59 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
60 * a seq_file iteration which is fully initialized with an active reference
61 * or an aborted kernfs_seq_start() due to get_active failure. The
62 * position pointer is the only context for each seq_file iteration and
63 * thus the stop condition should be encoded in it. As the return value is
64 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
65 * choice to indicate get_active failure.
66 *
67 * Unfortunately, this is complicated due to the optional custom seq_file
68 * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop()
69 * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
70 * custom seq_file operations and thus can't decide whether put_active
71 * should be performed or not only on ERR_PTR(-ENODEV).
72 *
73 * This is worked around by factoring out the custom seq_stop() and
74 * put_active part into kernfs_seq_stop_active(), skipping it from
75 * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
76 * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
77 * that kernfs_seq_stop_active() is skipped only after get_active failure.
78 */
79static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
80{
81 struct kernfs_open_file *of = sf->private;
82 const struct kernfs_ops *ops = kernfs_ops(of->kn);
83
84 if (ops->seq_stop)
85 ops->seq_stop(sf, v);
86 kernfs_put_active(of->kn);
87}
88
89static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
90{
91 struct kernfs_open_file *of = sf->private;
92 const struct kernfs_ops *ops;
93
94 /*
95 * @of->mutex nests outside active ref and is just to ensure that
96 * the ops aren't called concurrently for the same open file.
97 */
98 mutex_lock(&of->mutex);
99 if (!kernfs_get_active(of->kn))
100 return ERR_PTR(-ENODEV);
101
102 ops = kernfs_ops(of->kn);
103 if (ops->seq_start) {
104 void *next = ops->seq_start(sf, ppos);
105 /* see the comment above kernfs_seq_stop_active() */
106 if (next == ERR_PTR(-ENODEV))
107 kernfs_seq_stop_active(sf, next);
108 return next;
109 } else {
110 /*
111 * The same behavior and code as single_open(). Returns
112 * !NULL if pos is at the beginning; otherwise, NULL.
113 */
114 return NULL + !*ppos;
115 }
116}
117
118static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
119{
120 struct kernfs_open_file *of = sf->private;
121 const struct kernfs_ops *ops = kernfs_ops(of->kn);
122
123 if (ops->seq_next) {
124 void *next = ops->seq_next(sf, v, ppos);
125 /* see the comment above kernfs_seq_stop_active() */
126 if (next == ERR_PTR(-ENODEV))
127 kernfs_seq_stop_active(sf, next);
128 return next;
129 } else {
130 /*
131 * The same behavior and code as single_open(), always
132 * terminate after the initial read.
133 */
134 ++*ppos;
135 return NULL;
136 }
137}
138
139static void kernfs_seq_stop(struct seq_file *sf, void *v)
140{
141 struct kernfs_open_file *of = sf->private;
142
143 if (v != ERR_PTR(-ENODEV))
144 kernfs_seq_stop_active(sf, v);
145 mutex_unlock(&of->mutex);
146}
147
148static int kernfs_seq_show(struct seq_file *sf, void *v)
149{
150 struct kernfs_open_file *of = sf->private;
151
152 of->event = atomic_read(&of->kn->attr.open->event);
153
154 return of->kn->attr.ops->seq_show(sf, v);
155}
156
157static const struct seq_operations kernfs_seq_ops = {
158 .start = kernfs_seq_start,
159 .next = kernfs_seq_next,
160 .stop = kernfs_seq_stop,
161 .show = kernfs_seq_show,
162};
163
164/*
165 * As reading a bin file can have side-effects, the exact offset and bytes
166 * specified in read(2) call should be passed to the read callback making
167 * it difficult to use seq_file. Implement simplistic custom buffering for
168 * bin files.
169 */
170static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
171 char __user *user_buf, size_t count,
172 loff_t *ppos)
173{
174 ssize_t len = min_t(size_t, count, PAGE_SIZE);
175 const struct kernfs_ops *ops;
176 char *buf;
177
178 buf = kmalloc(len, GFP_KERNEL);
179 if (!buf)
180 return -ENOMEM;
181
182 /*
183 * @of->mutex nests outside active ref and is just to ensure that
184 * the ops aren't called concurrently for the same open file.
185 */
186 mutex_lock(&of->mutex);
187 if (!kernfs_get_active(of->kn)) {
188 len = -ENODEV;
189 mutex_unlock(&of->mutex);
190 goto out_free;
191 }
192
193 ops = kernfs_ops(of->kn);
194 if (ops->read)
195 len = ops->read(of, buf, len, *ppos);
196 else
197 len = -EINVAL;
198
199 kernfs_put_active(of->kn);
200 mutex_unlock(&of->mutex);
201
202 if (len < 0)
203 goto out_free;
204
205 if (copy_to_user(user_buf, buf, len)) {
206 len = -EFAULT;
207 goto out_free;
208 }
209
210 *ppos += len;
211
212 out_free:
213 kfree(buf);
214 return len;
215}
216
217/**
218 * kernfs_fop_read - kernfs vfs read callback
219 * @file: file pointer
 220 * @user_buf: buffer to copy the read data into
221 * @count: number of bytes
222 * @ppos: starting offset
223 */
224static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
225 size_t count, loff_t *ppos)
226{
227 struct kernfs_open_file *of = kernfs_of(file);
228
229 if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
230 return seq_read(file, user_buf, count, ppos);
231 else
232 return kernfs_file_direct_read(of, user_buf, count, ppos);
233}
234
235/**
236 * kernfs_fop_write - kernfs vfs write callback
237 * @file: file pointer
238 * @user_buf: data to write
239 * @count: number of bytes
240 * @ppos: starting offset
241 *
242 * Copy data in from userland and pass it to the matching kernfs write
243 * operation.
244 *
 245 * There is no easy way for us to know if userspace is only doing partial
 246 * writes, so we don't support them. We expect the entire buffer to come
 247 * on the first write. Hint: if you're writing a value, first read the
 248 * file, modify only the value you're changing, then write the entire
 249 * buffer back.
250 */
251static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos)
253{
254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops;
257 char *buf;
258
259 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf)
261 return -ENOMEM;
262
263 if (copy_from_user(buf, user_buf, len)) {
264 len = -EFAULT;
265 goto out_free;
266 }
267 buf[len] = '\0'; /* guarantee string termination */
268
269 /*
270 * @of->mutex nests outside active ref and is just to ensure that
271 * the ops aren't called concurrently for the same open file.
272 */
273 mutex_lock(&of->mutex);
274 if (!kernfs_get_active(of->kn)) {
275 mutex_unlock(&of->mutex);
276 len = -ENODEV;
277 goto out_free;
278 }
279
280 ops = kernfs_ops(of->kn);
281 if (ops->write)
282 len = ops->write(of, buf, len, *ppos);
283 else
284 len = -EINVAL;
285
286 kernfs_put_active(of->kn);
287 mutex_unlock(&of->mutex);
288
289 if (len > 0)
290 *ppos += len;
291out_free:
292 kfree(buf);
293 return len;
294}
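/*
 * Editorial sketch (not part of the patch): the read-modify-write
 * pattern the comment above recommends, as seen from userspace.
 * Hypothetical and without error handling.
 */
#if 0	/* illustrative userspace code, not kernel code */
	char buf[4096];
	int fd = open("/sys/.../my_attr", O_RDWR);
	ssize_t len = read(fd, buf, sizeof(buf) - 1);

	buf[len] = '\0';
	/* ... edit only the value being changed ... */
	lseek(fd, 0, SEEK_SET);
	write(fd, buf, strlen(buf));	/* one write, whole buffer */
	close(fd);
#endif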
295
296static void kernfs_vma_open(struct vm_area_struct *vma)
297{
298 struct file *file = vma->vm_file;
299 struct kernfs_open_file *of = kernfs_of(file);
300
301 if (!of->vm_ops)
302 return;
303
304 if (!kernfs_get_active(of->kn))
305 return;
306
307 if (of->vm_ops->open)
308 of->vm_ops->open(vma);
309
310 kernfs_put_active(of->kn);
311}
312
313static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
314{
315 struct file *file = vma->vm_file;
316 struct kernfs_open_file *of = kernfs_of(file);
317 int ret;
318
319 if (!of->vm_ops)
320 return VM_FAULT_SIGBUS;
321
322 if (!kernfs_get_active(of->kn))
323 return VM_FAULT_SIGBUS;
324
325 ret = VM_FAULT_SIGBUS;
326 if (of->vm_ops->fault)
327 ret = of->vm_ops->fault(vma, vmf);
328
329 kernfs_put_active(of->kn);
330 return ret;
331}
332
333static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
334 struct vm_fault *vmf)
335{
336 struct file *file = vma->vm_file;
337 struct kernfs_open_file *of = kernfs_of(file);
338 int ret;
339
340 if (!of->vm_ops)
341 return VM_FAULT_SIGBUS;
342
343 if (!kernfs_get_active(of->kn))
344 return VM_FAULT_SIGBUS;
345
346 ret = 0;
347 if (of->vm_ops->page_mkwrite)
348 ret = of->vm_ops->page_mkwrite(vma, vmf);
349 else
350 file_update_time(file);
351
352 kernfs_put_active(of->kn);
353 return ret;
354}
355
356static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
357 void *buf, int len, int write)
358{
359 struct file *file = vma->vm_file;
360 struct kernfs_open_file *of = kernfs_of(file);
361 int ret;
362
363 if (!of->vm_ops)
364 return -EINVAL;
365
366 if (!kernfs_get_active(of->kn))
367 return -EINVAL;
368
369 ret = -EINVAL;
370 if (of->vm_ops->access)
371 ret = of->vm_ops->access(vma, addr, buf, len, write);
372
373 kernfs_put_active(of->kn);
374 return ret;
375}
376
377#ifdef CONFIG_NUMA
378static int kernfs_vma_set_policy(struct vm_area_struct *vma,
379 struct mempolicy *new)
380{
381 struct file *file = vma->vm_file;
382 struct kernfs_open_file *of = kernfs_of(file);
383 int ret;
384
385 if (!of->vm_ops)
386 return 0;
387
388 if (!kernfs_get_active(of->kn))
389 return -EINVAL;
390
391 ret = 0;
392 if (of->vm_ops->set_policy)
393 ret = of->vm_ops->set_policy(vma, new);
394
395 kernfs_put_active(of->kn);
396 return ret;
397}
398
399static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
400 unsigned long addr)
401{
402 struct file *file = vma->vm_file;
403 struct kernfs_open_file *of = kernfs_of(file);
404 struct mempolicy *pol;
405
406 if (!of->vm_ops)
407 return vma->vm_policy;
408
409 if (!kernfs_get_active(of->kn))
410 return vma->vm_policy;
411
412 pol = vma->vm_policy;
413 if (of->vm_ops->get_policy)
414 pol = of->vm_ops->get_policy(vma, addr);
415
416 kernfs_put_active(of->kn);
417 return pol;
418}
419
420static int kernfs_vma_migrate(struct vm_area_struct *vma,
421 const nodemask_t *from, const nodemask_t *to,
422 unsigned long flags)
423{
424 struct file *file = vma->vm_file;
425 struct kernfs_open_file *of = kernfs_of(file);
426 int ret;
427
428 if (!of->vm_ops)
429 return 0;
430
431 if (!kernfs_get_active(of->kn))
432 return 0;
433
434 ret = 0;
435 if (of->vm_ops->migrate)
436 ret = of->vm_ops->migrate(vma, from, to, flags);
437
438 kernfs_put_active(of->kn);
439 return ret;
440}
441#endif
442
443static const struct vm_operations_struct kernfs_vm_ops = {
444 .open = kernfs_vma_open,
445 .fault = kernfs_vma_fault,
446 .page_mkwrite = kernfs_vma_page_mkwrite,
447 .access = kernfs_vma_access,
448#ifdef CONFIG_NUMA
449 .set_policy = kernfs_vma_set_policy,
450 .get_policy = kernfs_vma_get_policy,
451 .migrate = kernfs_vma_migrate,
452#endif
453};
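/*
 * Editorial note (not part of the patch): every wrapper above follows
 * the same shape - bail out unless the file was mmapped through a
 * kernfs op, pin the node with kernfs_get_active(), forward to the
 * implementation's vm_operations_struct saved in of->vm_ops, then
 * unpin. This keeps established user mappings from calling into a
 * node that has already been deactivated for removal.
 */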
454
455static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
456{
457 struct kernfs_open_file *of = kernfs_of(file);
458 const struct kernfs_ops *ops;
459 int rc;
460
461 /*
462 * mmap path and of->mutex are prone to triggering spurious lockdep
463 * warnings and we don't want to add spurious locking dependency
464 * between the two. Check whether mmap is actually implemented
465 * without grabbing @of->mutex by testing HAS_MMAP flag. See the
466 * comment in kernfs_file_open() for more details.
467 */
468 if (!(of->kn->flags & KERNFS_HAS_MMAP))
469 return -ENODEV;
470
471 mutex_lock(&of->mutex);
472
473 rc = -ENODEV;
474 if (!kernfs_get_active(of->kn))
475 goto out_unlock;
476
477 ops = kernfs_ops(of->kn);
478 rc = ops->mmap(of, vma);
479
480 /*
481 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
482 * to satisfy versions of X which crash if the mmap fails: that
483 * substitutes a new vm_file, and we don't then want bin_vm_ops.
484 */
485 if (vma->vm_file != file)
486 goto out_put;
487
488 rc = -EINVAL;
489 if (of->mmapped && of->vm_ops != vma->vm_ops)
490 goto out_put;
491
492 /*
493 * It is not possible to successfully wrap close.
494 * So error if someone is trying to use close.
495 */
496 rc = -EINVAL;
497 if (vma->vm_ops && vma->vm_ops->close)
498 goto out_put;
499
500 rc = 0;
501 of->mmapped = 1;
502 of->vm_ops = vma->vm_ops;
503 vma->vm_ops = &kernfs_vm_ops;
504out_put:
505 kernfs_put_active(of->kn);
506out_unlock:
507 mutex_unlock(&of->mutex);
508
509 return rc;
510}
511
512/**
513 * kernfs_get_open_node - get or create kernfs_open_node
514 * @kn: target kernfs_node
515 * @of: kernfs_open_file for this instance of open
516 *
517 * If @kn->attr.open exists, increment its reference count; otherwise,
518 * create one. @of is chained to the files list.
519 *
520 * LOCKING:
521 * Kernel thread context (may sleep).
522 *
523 * RETURNS:
524 * 0 on success, -errno on failure.
525 */
526static int kernfs_get_open_node(struct kernfs_node *kn,
527 struct kernfs_open_file *of)
528{
529 struct kernfs_open_node *on, *new_on = NULL;
530
531 retry:
532 mutex_lock(&kernfs_open_file_mutex);
533 spin_lock_irq(&kernfs_open_node_lock);
534
535 if (!kn->attr.open && new_on) {
536 kn->attr.open = new_on;
537 new_on = NULL;
538 }
539
540 on = kn->attr.open;
541 if (on) {
542 atomic_inc(&on->refcnt);
543 list_add_tail(&of->list, &on->files);
544 }
545
546 spin_unlock_irq(&kernfs_open_node_lock);
547 mutex_unlock(&kernfs_open_file_mutex);
548
549 if (on) {
550 kfree(new_on);
551 return 0;
552 }
553
554 /* not there, initialize a new one and retry */
555 new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
556 if (!new_on)
557 return -ENOMEM;
558
559 atomic_set(&new_on->refcnt, 0);
560 atomic_set(&new_on->event, 1);
561 init_waitqueue_head(&new_on->poll);
562 INIT_LIST_HEAD(&new_on->files);
563 goto retry;
564}
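/*
 * Editorial note (not part of the patch): kernfs_get_open_node() is
 * the usual optimistic-allocation loop - look up under the locks and,
 * if the object is missing, drop the locks, allocate, and retry. A
 * racing opener that installs kn->attr.open first simply wins; the
 * loser's unused allocation is freed by the kfree(new_on) above.
 */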
565
566/**
567 * kernfs_put_open_node - put kernfs_open_node
 568 * @kn: target kernfs_node
569 * @of: associated kernfs_open_file
570 *
571 * Put @kn->attr.open and unlink @of from the files list. If
572 * reference count reaches zero, disassociate and free it.
573 *
574 * LOCKING:
575 * None.
576 */
577static void kernfs_put_open_node(struct kernfs_node *kn,
578 struct kernfs_open_file *of)
579{
580 struct kernfs_open_node *on = kn->attr.open;
581 unsigned long flags;
582
583 mutex_lock(&kernfs_open_file_mutex);
584 spin_lock_irqsave(&kernfs_open_node_lock, flags);
585
586 if (of)
587 list_del(&of->list);
588
589 if (atomic_dec_and_test(&on->refcnt))
590 kn->attr.open = NULL;
591 else
592 on = NULL;
593
594 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
595 mutex_unlock(&kernfs_open_file_mutex);
596
597 kfree(on);
598}
599
600static int kernfs_fop_open(struct inode *inode, struct file *file)
601{
602 struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
603 const struct kernfs_ops *ops;
604 struct kernfs_open_file *of;
605 bool has_read, has_write, has_mmap;
606 int error = -EACCES;
607
608 if (!kernfs_get_active(kn))
609 return -ENODEV;
610
611 ops = kernfs_ops(kn);
612
613 has_read = ops->seq_show || ops->read || ops->mmap;
614 has_write = ops->write || ops->mmap;
615 has_mmap = ops->mmap;
616
617 /* check perms and supported operations */
618 if ((file->f_mode & FMODE_WRITE) &&
619 (!(inode->i_mode & S_IWUGO) || !has_write))
620 goto err_out;
621
622 if ((file->f_mode & FMODE_READ) &&
623 (!(inode->i_mode & S_IRUGO) || !has_read))
624 goto err_out;
625
626 /* allocate a kernfs_open_file for the file */
627 error = -ENOMEM;
628 of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
629 if (!of)
630 goto err_out;
631
632 /*
633 * The following is done to give a different lockdep key to
634 * @of->mutex for files which implement mmap. This is a rather
635 * crude way to avoid false positive lockdep warning around
636 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
637 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
638 * which mm->mmap_sem nests, while holding @of->mutex. As each
639 * open file has a separate mutex, it's okay as long as those don't
640 * happen on the same file. At this point, we can't easily give
641 * each file a separate locking class. Let's differentiate on
642 * whether the file has mmap or not for now.
643 *
644 * Both paths of the branch look the same. They're supposed to
645 * look that way and give @of->mutex different static lockdep keys.
646 */
647 if (has_mmap)
648 mutex_init(&of->mutex);
649 else
650 mutex_init(&of->mutex);
651
652 of->kn = kn;
653 of->file = file;
654
655 /*
656 * Always instantiate seq_file even if read access doesn't use
 657	 * seq_file or is not requested. This unifies private data access,
658 * and readable regular files are the vast majority anyway.
659 */
660 if (ops->seq_show)
661 error = seq_open(file, &kernfs_seq_ops);
662 else
663 error = seq_open(file, NULL);
664 if (error)
665 goto err_free;
666
667 ((struct seq_file *)file->private_data)->private = of;
668
669 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
670 if (file->f_mode & FMODE_WRITE)
671 file->f_mode |= FMODE_PWRITE;
672
673 /* make sure we have open node struct */
674 error = kernfs_get_open_node(kn, of);
675 if (error)
676 goto err_close;
677
678 /* open succeeded, put active references */
679 kernfs_put_active(kn);
680 return 0;
681
682err_close:
683 seq_release(inode, file);
684err_free:
685 kfree(of);
686err_out:
687 kernfs_put_active(kn);
688 return error;
689}
690
691static int kernfs_fop_release(struct inode *inode, struct file *filp)
692{
693 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
694 struct kernfs_open_file *of = kernfs_of(filp);
695
696 kernfs_put_open_node(kn, of);
697 seq_release(inode, filp);
698 kfree(of);
699
700 return 0;
701}
702
703void kernfs_unmap_bin_file(struct kernfs_node *kn)
704{
705 struct kernfs_open_node *on;
706 struct kernfs_open_file *of;
707
708 if (!(kn->flags & KERNFS_HAS_MMAP))
709 return;
710
711 spin_lock_irq(&kernfs_open_node_lock);
712 on = kn->attr.open;
713 if (on)
714 atomic_inc(&on->refcnt);
715 spin_unlock_irq(&kernfs_open_node_lock);
716 if (!on)
717 return;
718
719 mutex_lock(&kernfs_open_file_mutex);
720 list_for_each_entry(of, &on->files, list) {
721 struct inode *inode = file_inode(of->file);
722 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
723 }
724 mutex_unlock(&kernfs_open_file_mutex);
725
726 kernfs_put_open_node(kn, NULL);
727}
728
729/*
730 * Kernfs attribute files are pollable. The idea is that you read
731 * the content and then you use 'poll' or 'select' to wait for
732 * the content to change. When the content changes (assuming the
733 * manager for the kobject supports notification), poll will
734 * return POLLERR|POLLPRI, and select will return the fd whether
735 * it is waiting for read, write, or exceptions.
736 * Once poll/select indicates that the value has changed, you
737 * need to close and re-open the file, or seek to 0 and read again.
738 * Reminder: this only works for attributes which actively support
739 * it, and it is not possible to test an attribute from userspace
 740 * to see if it supports poll (neither 'poll' nor 'select' returns
741 * an appropriate error code). When in doubt, set a suitable timeout value.
742 */
743static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
744{
745 struct kernfs_open_file *of = kernfs_of(filp);
746 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
747 struct kernfs_open_node *on = kn->attr.open;
748
749 /* need parent for the kobj, grab both */
750 if (!kernfs_get_active(kn))
751 goto trigger;
752
753 poll_wait(filp, &on->poll, wait);
754
755 kernfs_put_active(kn);
756
757 if (of->event != atomic_read(&on->event))
758 goto trigger;
759
760 return DEFAULT_POLLMASK;
761
762 trigger:
763 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
764}
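/*
 * Editorial sketch (not part of the patch): the userspace half of the
 * protocol described above; the kernel half signals a change with
 * kernfs_notify() (see below). Hypothetical and without error handling.
 */
#if 0	/* illustrative userspace code, not kernel code */
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
	char buf[4096];

	read(fd, buf, sizeof(buf));	/* consume the current value */
	poll(&pfd, 1, -1);		/* returns when POLLERR|POLLPRI raised */
	lseek(fd, 0, SEEK_SET);
	read(fd, buf, sizeof(buf));	/* re-read the new value */
#endif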
765
766/**
767 * kernfs_notify - notify a kernfs file
768 * @kn: file to notify
769 *
770 * Notify @kn such that poll(2) on @kn wakes up.
771 */
772void kernfs_notify(struct kernfs_node *kn)
773{
774 struct kernfs_open_node *on;
775 unsigned long flags;
776
777 spin_lock_irqsave(&kernfs_open_node_lock, flags);
778
779 if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) {
780 on = kn->attr.open;
781 if (on) {
782 atomic_inc(&on->event);
783 wake_up_interruptible(&on->poll);
784 }
785 }
786
787 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
788}
789EXPORT_SYMBOL_GPL(kernfs_notify);
790
791const struct file_operations kernfs_file_fops = {
792 .read = kernfs_fop_read,
793 .write = kernfs_fop_write,
794 .llseek = generic_file_llseek,
795 .mmap = kernfs_fop_mmap,
796 .open = kernfs_fop_open,
797 .release = kernfs_fop_release,
798 .poll = kernfs_fop_poll,
799};
800
801/**
802 * __kernfs_create_file - kernfs internal function to create a file
803 * @parent: directory to create the file in
804 * @name: name of the file
805 * @mode: mode of the file
806 * @size: size of the file
807 * @ops: kernfs operations for the file
808 * @priv: private data for the file
809 * @ns: optional namespace tag of the file
 810 * @name_is_static: don't copy the file name
811 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
812 *
813 * Returns the created node on success, ERR_PTR() value on error.
814 */
815struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
816 const char *name,
817 umode_t mode, loff_t size,
818 const struct kernfs_ops *ops,
819 void *priv, const void *ns,
820 bool name_is_static,
821 struct lock_class_key *key)
822{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn;
825 unsigned flags;
826 int rc;
827
828 flags = KERNFS_FILE;
829 if (name_is_static)
830 flags |= KERNFS_STATIC_NAME;
831
832 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
833 if (!kn)
834 return ERR_PTR(-ENOMEM);
835
836 kn->attr.ops = ops;
837 kn->attr.size = size;
838 kn->ns = ns;
839 kn->priv = priv;
840
841#ifdef CONFIG_DEBUG_LOCK_ALLOC
842 if (key) {
843 lockdep_init_map(&kn->dep_map, "s_active", key, 0);
844 kn->flags |= KERNFS_LOCKDEP;
845 }
846#endif
847
848 /*
 849	 * kn->attr.ops is accessible only while holding active ref. We
850 * need to know whether some ops are implemented outside active
851 * ref. Cache their existence in flags.
852 */
853 if (ops->seq_show)
854 kn->flags |= KERNFS_HAS_SEQ_SHOW;
855 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP;
857
858 kernfs_addrm_start(&acxt);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) {
863 kernfs_put(kn);
864 return ERR_PTR(rc);
865 }
866 return kn;
867}
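/*
 * Editorial sketch (not part of the patch): a minimal read-only
 * attribute built on the API above; "my_show" and the names used are
 * hypothetical.
 */
#if 0	/* illustrative usage, not part of the file */
static int my_show(struct seq_file *sf, void *v)
{
	seq_printf(sf, "hello\n");
	return 0;
}

static const struct kernfs_ops my_ops = {
	.seq_show	= my_show,
};

/* with a parent directory node in hand: */
static struct kernfs_node *my_create(struct kernfs_node *parent)
{
	return __kernfs_create_file(parent, "hello", 0444, 0, &my_ops,
				    NULL, NULL, false, NULL);
}
#endif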
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
new file mode 100644
index 000000000000..e55126f85bd2
--- /dev/null
+++ b/fs/kernfs/inode.c
@@ -0,0 +1,377 @@
1/*
2 * fs/kernfs/inode.c - kernfs inode implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/pagemap.h>
12#include <linux/backing-dev.h>
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/xattr.h>
17#include <linux/security.h>
18
19#include "kernfs-internal.h"
20
21static const struct address_space_operations kernfs_aops = {
22 .readpage = simple_readpage,
23 .write_begin = simple_write_begin,
24 .write_end = simple_write_end,
25};
26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr,
36 .getattr = kernfs_iop_getattr,
37 .setxattr = kernfs_iop_setxattr,
38 .removexattr = kernfs_iop_removexattr,
39 .getxattr = kernfs_iop_getxattr,
40 .listxattr = kernfs_iop_listxattr,
41};
42
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{
51 struct iattr *iattrs;
52
53 if (kn->iattr)
54 return kn->iattr;
55
56 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
57 if (!kn->iattr)
58 return NULL;
59 iattrs = &kn->iattr->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = kn->mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 simple_xattrs_init(&kn->iattr->xattrs);
68
69 return kn->iattr;
70}
71
72static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
73{
74 struct kernfs_iattrs *attrs;
75 struct iattr *iattrs;
76 unsigned int ia_valid = iattr->ia_valid;
77
78 attrs = kernfs_iattrs(kn);
79 if (!attrs)
80 return -ENOMEM;
81
82 iattrs = &attrs->ia_iattr;
83
84 if (ia_valid & ATTR_UID)
85 iattrs->ia_uid = iattr->ia_uid;
86 if (ia_valid & ATTR_GID)
87 iattrs->ia_gid = iattr->ia_gid;
88 if (ia_valid & ATTR_ATIME)
89 iattrs->ia_atime = iattr->ia_atime;
90 if (ia_valid & ATTR_MTIME)
91 iattrs->ia_mtime = iattr->ia_mtime;
92 if (ia_valid & ATTR_CTIME)
93 iattrs->ia_ctime = iattr->ia_ctime;
94 if (ia_valid & ATTR_MODE) {
95 umode_t mode = iattr->ia_mode;
96 iattrs->ia_mode = kn->mode = mode;
97 }
98 return 0;
99}
100
101/**
102 * kernfs_setattr - set iattr on a node
103 * @kn: target node
104 * @iattr: iattr to set
105 *
106 * Returns 0 on success, -errno on failure.
107 */
108int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
109{
110 int ret;
111
112 mutex_lock(&kernfs_mutex);
113 ret = __kernfs_setattr(kn, iattr);
114 mutex_unlock(&kernfs_mutex);
115 return ret;
116}
117
118int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
119{
120 struct inode *inode = dentry->d_inode;
121 struct kernfs_node *kn = dentry->d_fsdata;
122 int error;
123
124 if (!kn)
125 return -EINVAL;
126
127 mutex_lock(&kernfs_mutex);
128 error = inode_change_ok(inode, iattr);
129 if (error)
130 goto out;
131
132 error = __kernfs_setattr(kn, iattr);
133 if (error)
134 goto out;
135
136 /* this ignores size changes */
137 setattr_copy(inode, iattr);
138
139out:
140 mutex_unlock(&kernfs_mutex);
141 return error;
142}
143
144static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
145 u32 *secdata_len)
146{
147 struct kernfs_iattrs *attrs;
148 void *old_secdata;
149 size_t old_secdata_len;
150
151 attrs = kernfs_iattrs(kn);
152 if (!attrs)
153 return -ENOMEM;
154
155 old_secdata = attrs->ia_secdata;
156 old_secdata_len = attrs->ia_secdata_len;
157
158 attrs->ia_secdata = *secdata;
159 attrs->ia_secdata_len = *secdata_len;
160
161 *secdata = old_secdata;
162 *secdata_len = old_secdata_len;
163 return 0;
164}
165
166int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
167 const void *value, size_t size, int flags)
168{
169 struct kernfs_node *kn = dentry->d_fsdata;
170 struct kernfs_iattrs *attrs;
171 void *secdata;
172 int error;
173 u32 secdata_len = 0;
174
175 attrs = kernfs_iattrs(kn);
176 if (!attrs)
177 return -ENOMEM;
178
179 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
180 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
181 error = security_inode_setsecurity(dentry->d_inode, suffix,
182 value, size, flags);
183 if (error)
184 return error;
185 error = security_inode_getsecctx(dentry->d_inode,
186 &secdata, &secdata_len);
187 if (error)
188 return error;
189
190 mutex_lock(&kernfs_mutex);
191 error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
192 mutex_unlock(&kernfs_mutex);
193
194 if (secdata)
195 security_release_secctx(secdata, secdata_len);
196 return error;
197 } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
198 return simple_xattr_set(&attrs->xattrs, name, value, size,
199 flags);
200 }
201
202 return -EINVAL;
203}
204
205int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
206{
207 struct kernfs_node *kn = dentry->d_fsdata;
208 struct kernfs_iattrs *attrs;
209
210 attrs = kernfs_iattrs(kn);
211 if (!attrs)
212 return -ENOMEM;
213
214 return simple_xattr_remove(&attrs->xattrs, name);
215}
216
217ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
218 size_t size)
219{
220 struct kernfs_node *kn = dentry->d_fsdata;
221 struct kernfs_iattrs *attrs;
222
223 attrs = kernfs_iattrs(kn);
224 if (!attrs)
225 return -ENOMEM;
226
227 return simple_xattr_get(&attrs->xattrs, name, buf, size);
228}
229
230ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
231{
232 struct kernfs_node *kn = dentry->d_fsdata;
233 struct kernfs_iattrs *attrs;
234
235 attrs = kernfs_iattrs(kn);
236 if (!attrs)
237 return -ENOMEM;
238
239 return simple_xattr_list(&attrs->xattrs, buf, size);
240}
241
242static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
243{
244 inode->i_mode = mode;
245 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
246}
247
248static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
249{
250 inode->i_uid = iattr->ia_uid;
251 inode->i_gid = iattr->ia_gid;
252 inode->i_atime = iattr->ia_atime;
253 inode->i_mtime = iattr->ia_mtime;
254 inode->i_ctime = iattr->ia_ctime;
255}
256
257static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
258{
259 struct kernfs_iattrs *attrs = kn->iattr;
260
261 inode->i_mode = kn->mode;
262 if (attrs) {
263 /*
 264		 * kernfs_node has non-default attributes; get them from the
 265		 * persistent copy in kernfs_node.
266 */
267 set_inode_attr(inode, &attrs->ia_iattr);
268 security_inode_notifysecctx(inode, attrs->ia_secdata,
269 attrs->ia_secdata_len);
270 }
271
272 if (kernfs_type(kn) == KERNFS_DIR)
273 set_nlink(inode, kn->dir.subdirs + 2);
274}
275
276int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
277 struct kstat *stat)
278{
279 struct kernfs_node *kn = dentry->d_fsdata;
280 struct inode *inode = dentry->d_inode;
281
282 mutex_lock(&kernfs_mutex);
283 kernfs_refresh_inode(kn, inode);
284 mutex_unlock(&kernfs_mutex);
285
286 generic_fillattr(inode, stat);
287 return 0;
288}
289
290static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
291{
292 kernfs_get(kn);
293 inode->i_private = kn;
294 inode->i_mapping->a_ops = &kernfs_aops;
295 inode->i_mapping->backing_dev_info = &kernfs_bdi;
296 inode->i_op = &kernfs_iops;
297
298 set_default_inode_attr(inode, kn->mode);
299 kernfs_refresh_inode(kn, inode);
300
301 /* initialize inode according to type */
302 switch (kernfs_type(kn)) {
303 case KERNFS_DIR:
304 inode->i_op = &kernfs_dir_iops;
305 inode->i_fop = &kernfs_dir_fops;
306 break;
307 case KERNFS_FILE:
308 inode->i_size = kn->attr.size;
309 inode->i_fop = &kernfs_file_fops;
310 break;
311 case KERNFS_LINK:
312 inode->i_op = &kernfs_symlink_iops;
313 break;
314 default:
315 BUG();
316 }
317
318 unlock_new_inode(inode);
319}
320
321/**
322 * kernfs_get_inode - get inode for kernfs_node
323 * @sb: super block
324 * @kn: kernfs_node to allocate inode for
325 *
326 * Get inode for @kn. If such inode doesn't exist, a new inode is
 327 * allocated and basics are initialized. A new inode is unlocked via
 328 * unlock_new_inode() before being returned.
329 *
330 * LOCKING:
331 * Kernel thread context (may sleep).
332 *
333 * RETURNS:
334 * Pointer to allocated inode on success, NULL on failure.
335 */
336struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
337{
338 struct inode *inode;
339
340 inode = iget_locked(sb, kn->ino);
341 if (inode && (inode->i_state & I_NEW))
342 kernfs_init_inode(kn, inode);
343
344 return inode;
345}
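/*
 * Editorial note (not part of the patch): iget_locked() returns any
 * inode already cached under kn->ino; only a freshly allocated inode
 * carries I_NEW, so kernfs_init_inode() - which also drops the I_NEW
 * lock via unlock_new_inode() - runs exactly once per inode.
 */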
346
347/*
348 * The kernfs_node serves as both an inode and a directory entry for
349 * kernfs. To prevent the kernfs inode numbers from being freed
350 * prematurely we take a reference to kernfs_node from the kernfs inode. A
351 * super_operations.evict_inode() implementation is needed to drop that
352 * reference upon inode destruction.
353 */
354void kernfs_evict_inode(struct inode *inode)
355{
356 struct kernfs_node *kn = inode->i_private;
357
358 truncate_inode_pages(&inode->i_data, 0);
359 clear_inode(inode);
360 kernfs_put(kn);
361}
362
363int kernfs_iop_permission(struct inode *inode, int mask)
364{
365 struct kernfs_node *kn;
366
367 if (mask & MAY_NOT_BLOCK)
368 return -ECHILD;
369
370 kn = inode->i_private;
371
372 mutex_lock(&kernfs_mutex);
373 kernfs_refresh_inode(kn, inode);
374 mutex_unlock(&kernfs_mutex);
375
376 return generic_permission(inode, mask);
377}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
new file mode 100644
index 000000000000..eb536b76374a
--- /dev/null
+++ b/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,122 @@
1/*
2 * fs/kernfs/kernfs-internal.h - kernfs internal header file
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#ifndef __KERNFS_INTERNAL_H
12#define __KERNFS_INTERNAL_H
13
14#include <linux/lockdep.h>
15#include <linux/fs.h>
16#include <linux/mutex.h>
17#include <linux/xattr.h>
18
19#include <linux/kernfs.h>
20
21struct kernfs_iattrs {
22 struct iattr ia_iattr;
23 void *ia_secdata;
24 u32 ia_secdata_len;
25
26 struct simple_xattrs xattrs;
27};
28
29#define KN_DEACTIVATED_BIAS INT_MIN
30
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32
33/**
34 * kernfs_root - find out the kernfs_root a kernfs_node belongs to
35 * @kn: kernfs_node of interest
36 *
37 * Return the kernfs_root @kn belongs to.
38 */
39static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
40{
 41	/* if parent exists, it's always a dir; otherwise, @kn is a dir */
42 if (kn->parent)
43 kn = kn->parent;
44 return kn->dir.root;
45}
46
47/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c
56 */
57struct kernfs_super_info {
58 /*
59 * The root associated with this super_block. Each super_block is
60 * identified by the root and ns it's associated with.
61 */
62 struct kernfs_root *root;
63
64 /*
65 * Each sb is associated with one namespace tag, currently the
66 * network namespace of the task which mounted this kernfs
67 * instance. If multiple tags become necessary, make the following
68 * an array and compare kernfs_node tag against every entry.
69 */
70 const void *ns;
71};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73
74extern struct kmem_cache *kernfs_node_cache;
75
76/*
77 * inode.c
78 */
79struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
80void kernfs_evict_inode(struct inode *inode);
81int kernfs_iop_permission(struct inode *inode, int mask);
82int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
83int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
84 struct kstat *stat);
85int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
86 size_t size, int flags);
87int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92
93/*
94 * dir.c
95 */
96extern struct mutex kernfs_mutex;
97extern const struct dentry_operations kernfs_dops;
98extern const struct file_operations kernfs_dir_fops;
99extern const struct inode_operations kernfs_dir_iops;
100
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode,
108 unsigned flags);
109
110/*
111 * file.c
112 */
113extern const struct file_operations kernfs_file_fops;
114
115void kernfs_unmap_bin_file(struct kernfs_node *kn);
116
117/*
118 * symlink.c
119 */
120extern const struct inode_operations kernfs_symlink_iops;
121
122#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
new file mode 100644
index 000000000000..0d6ce895a9ee
--- /dev/null
+++ b/fs/kernfs/mount.c
@@ -0,0 +1,165 @@
1/*
2 * fs/kernfs/mount.c - kernfs mount implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/mount.h>
13#include <linux/init.h>
14#include <linux/magic.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h>
17
18#include "kernfs-internal.h"
19
20struct kmem_cache *kernfs_node_cache;
21
22static const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode,
26};
27
28static int kernfs_fill_super(struct super_block *sb)
29{
30 struct kernfs_super_info *info = kernfs_info(sb);
31 struct inode *inode;
32 struct dentry *root;
33
34 sb->s_blocksize = PAGE_CACHE_SIZE;
35 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
36 sb->s_magic = SYSFS_MAGIC;
37 sb->s_op = &kernfs_sops;
38 sb->s_time_gran = 1;
39
40 /* get root inode, initialize and unlock it */
41 mutex_lock(&kernfs_mutex);
42 inode = kernfs_get_inode(sb, info->root->kn);
43 mutex_unlock(&kernfs_mutex);
44 if (!inode) {
45 pr_debug("kernfs: could not get root inode\n");
46 return -ENOMEM;
47 }
48
49 /* instantiate and link root dentry */
50 root = d_make_root(inode);
51 if (!root) {
52 pr_debug("%s: could not get root dentry!\n", __func__);
53 return -ENOMEM;
54 }
55 kernfs_get(info->root->kn);
56 root->d_fsdata = info->root->kn;
57 sb->s_root = root;
58 sb->s_d_op = &kernfs_dops;
59 return 0;
60}
61
62static int kernfs_test_super(struct super_block *sb, void *data)
63{
64 struct kernfs_super_info *sb_info = kernfs_info(sb);
65 struct kernfs_super_info *info = data;
66
67 return sb_info->root == info->root && sb_info->ns == info->ns;
68}
69
70static int kernfs_set_super(struct super_block *sb, void *data)
71{
72 int error;
73 error = set_anon_super(sb, data);
74 if (!error)
75 sb->s_fs_info = data;
76 return error;
77}
78
79/**
80 * kernfs_super_ns - determine the namespace tag of a kernfs super_block
81 * @sb: super_block of interest
82 *
83 * Return the namespace tag associated with kernfs super_block @sb.
84 */
85const void *kernfs_super_ns(struct super_block *sb)
86{
87 struct kernfs_super_info *info = kernfs_info(sb);
88
89 return info->ns;
90}
91
92/**
93 * kernfs_mount_ns - kernfs mount helper
94 * @fs_type: file_system_type of the fs being mounted
95 * @flags: mount flags specified for the mount
96 * @root: kernfs_root of the hierarchy being mounted
97 * @ns: optional namespace tag of the mount
98 *
99 * This is to be called from each kernfs user's file_system_type->mount()
100 * implementation, which should pass through the specified @fs_type and
101 * @flags, and specify the hierarchy and namespace tag to mount via @root
102 * and @ns, respectively.
103 *
104 * The return value can be passed to the vfs layer verbatim.
105 */
106struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
107 struct kernfs_root *root, const void *ns)
108{
109 struct super_block *sb;
110 struct kernfs_super_info *info;
111 int error;
112
113 info = kzalloc(sizeof(*info), GFP_KERNEL);
114 if (!info)
115 return ERR_PTR(-ENOMEM);
116
117 info->root = root;
118 info->ns = ns;
119
120 sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
121 if (IS_ERR(sb) || sb->s_fs_info != info)
122 kfree(info);
123 if (IS_ERR(sb))
124 return ERR_CAST(sb);
125 if (!sb->s_root) {
126 error = kernfs_fill_super(sb);
127 if (error) {
128 deactivate_locked_super(sb);
129 return ERR_PTR(error);
130 }
131 sb->s_flags |= MS_ACTIVE;
132 }
133
134 return dget(sb->s_root);
135}
136
137/**
138 * kernfs_kill_sb - kill_sb for kernfs
139 * @sb: super_block being killed
140 *
141 * This can be used directly for file_system_type->kill_sb(). If a kernfs
142 * user needs extra cleanup, it can implement its own kill_sb() and call
143 * this function at the end.
144 */
145void kernfs_kill_sb(struct super_block *sb)
146{
147 struct kernfs_super_info *info = kernfs_info(sb);
148 struct kernfs_node *root_kn = sb->s_root->d_fsdata;
149
150 /*
151 * Remove the superblock from fs_supers/s_instances
152 * so we can't find it, before freeing kernfs_super_info.
153 */
154 kill_anon_super(sb);
155 kfree(info);
156 kernfs_put(root_kn);
157}
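/*
 * Editorial sketch (not part of the patch): how a kernfs user is
 * expected to wire these helpers into a file_system_type; "my_root"
 * is a hypothetical kernfs_root created elsewhere.
 */
#if 0	/* illustrative usage, not part of the file */
static struct dentry *my_mount(struct file_system_type *fs_type,
			       int flags, const char *dev_name, void *data)
{
	return kernfs_mount_ns(fs_type, flags, my_root, NULL);
}

static struct file_system_type my_fs_type = {
	.name		= "myfs",
	.mount		= my_mount,
	.kill_sb	= kernfs_kill_sb,
};
#endif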
158
159void __init kernfs_init(void)
160{
161 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
162 sizeof(struct kernfs_node),
163 0, SLAB_PANIC, NULL);
164 kernfs_inode_init();
165}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
new file mode 100644
index 000000000000..4d457055acb9
--- /dev/null
+++ b/fs/kernfs/symlink.c
@@ -0,0 +1,151 @@
1/*
2 * fs/kernfs/symlink.c - kernfs symlink implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/gfp.h>
13#include <linux/namei.h>
14
15#include "kernfs-internal.h"
16
17/**
18 * kernfs_create_link - create a symlink
19 * @parent: directory to create the symlink in
20 * @name: name of the symlink
21 * @target: target node for the symlink to point to
22 *
23 * Returns the created node on success, ERR_PTR() value on error.
24 */
25struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
26 const char *name,
27 struct kernfs_node *target)
28{
29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error;
32
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
34 if (!kn)
35 return ERR_PTR(-ENOMEM);
36
37 if (kernfs_ns_enabled(parent))
38 kn->ns = target->ns;
39 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */
41
42 kernfs_addrm_start(&acxt);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error)
47 return kn;
48
49 kernfs_put(kn);
50 return ERR_PTR(error);
51}
52
53static int kernfs_get_target_path(struct kernfs_node *parent,
54 struct kernfs_node *target, char *path)
55{
56 struct kernfs_node *base, *kn;
57 char *s = path;
58 int len = 0;
59
60 /* go up to the root, stop at the base */
61 base = parent;
62 while (base->parent) {
63 kn = target->parent;
64 while (kn->parent && base != kn)
65 kn = kn->parent;
66
67 if (base == kn)
68 break;
69
70 strcpy(s, "../");
71 s += 3;
72 base = base->parent;
73 }
74
75 /* determine end of target string for reverse fillup */
76 kn = target;
77 while (kn->parent && kn != base) {
78 len += strlen(kn->name) + 1;
79 kn = kn->parent;
80 }
81
82 /* check limits */
83 if (len < 2)
84 return -EINVAL;
85 len--;
86 if ((s - path) + len > PATH_MAX)
87 return -ENAMETOOLONG;
88
89 /* reverse fillup of target string from target to base */
90 kn = target;
91 while (kn->parent && kn != base) {
92 int slen = strlen(kn->name);
93
94 len -= slen;
95 strncpy(s + len, kn->name, slen);
96 if (len)
97 s[--len] = '/';
98
99 kn = kn->parent;
100 }
101
102 return 0;
103}
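/*
 * Editorial example (not part of the patch): with @parent at /foo/bar
 * and @target at /foo/baz/qux, the first loop climbs @parent to the
 * shared ancestor /foo and emits one "../"; the reverse fillup then
 * appends "baz/qux", so the computed symlink body is "../baz/qux".
 */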
104
105static int kernfs_getlink(struct dentry *dentry, char *path)
106{
107 struct kernfs_node *kn = dentry->d_fsdata;
108 struct kernfs_node *parent = kn->parent;
109 struct kernfs_node *target = kn->symlink.target_kn;
110 int error;
111
112 mutex_lock(&kernfs_mutex);
113 error = kernfs_get_target_path(parent, target, path);
114 mutex_unlock(&kernfs_mutex);
115
116 return error;
117}
118
119static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
120{
121 int error = -ENOMEM;
122 unsigned long page = get_zeroed_page(GFP_KERNEL);
123 if (page) {
124 error = kernfs_getlink(dentry, (char *) page);
125 if (error < 0)
126 free_page((unsigned long)page);
127 }
128 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
129 return NULL;
130}
131
132static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
133 void *cookie)
134{
135 char *page = nd_get_link(nd);
136 if (!IS_ERR(page))
137 free_page((unsigned long)page);
138}
139
140const struct inode_operations kernfs_symlink_iops = {
141 .setxattr = kernfs_iop_setxattr,
142 .removexattr = kernfs_iop_removexattr,
143 .getxattr = kernfs_iop_getxattr,
144 .listxattr = kernfs_iop_listxattr,
145 .readlink = generic_readlink,
146 .follow_link = kernfs_iop_follow_link,
147 .put_link = kernfs_iop_put_link,
148 .setattr = kernfs_iop_setattr,
149 .getattr = kernfs_iop_getattr,
150 .permission = kernfs_iop_permission,
151};
diff --git a/fs/namespace.c b/fs/namespace.c
index ac2ce8a766e1..22e536705c45 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2790,6 +2790,8 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
+	kernfs_init();
+
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2886,7 +2888,7 @@ bool fs_fully_visible(struct file_system_type *type)
 		struct inode *inode = child->mnt_mountpoint->d_inode;
 		if (!S_ISDIR(inode->i_mode))
 			goto next;
-		if (inode->i_nlink != 2)
+		if (inode->i_nlink > 2)
 			goto next;
 	}
 	visible = true;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9f6b486b6c01..a1a191634abc 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 		nilfs_clear_logs(&sci->sc_segbufs);
 
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
 							sci->sc_nfreesegs,
 							NULL);
 			WARN_ON(err); /* do not happen */
+			sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
 		}
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc4..0b9ff4395e6a 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
82 * events. 82 * events.
83 */ 83 */
84static int dnotify_handle_event(struct fsnotify_group *group, 84static int dnotify_handle_event(struct fsnotify_group *group,
85 struct inode *inode,
85 struct fsnotify_mark *inode_mark, 86 struct fsnotify_mark *inode_mark,
86 struct fsnotify_mark *vfsmount_mark, 87 struct fsnotify_mark *vfsmount_mark,
87 struct fsnotify_event *event) 88 u32 mask, void *data, int data_type,
89 const unsigned char *file_name)
88{ 90{
89 struct dnotify_mark *dn_mark; 91 struct dnotify_mark *dn_mark;
90 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
95 96
96 BUG_ON(vfsmount_mark); 97 /* not a dir, dnotify doesn't care */
98 if (!S_ISDIR(inode->i_mode))
99 return 0;
97 100
98 to_tell = event->to_tell; 101 BUG_ON(vfsmount_mark);
99 102
100 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); 103 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
101 104
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group,
122 return 0; 125 return 0;
123} 126}
124 127
125/*
126 * Given an inode and mask determine if dnotify would be interested in sending
127 * userspace notification for that pair.
128 */
129static bool dnotify_should_send_event(struct fsnotify_group *group,
130 struct inode *inode,
131 struct fsnotify_mark *inode_mark,
132 struct fsnotify_mark *vfsmount_mark,
133 __u32 mask, void *data, int data_type)
134{
135 /* not a dir, dnotify doesn't care */
136 if (!S_ISDIR(inode->i_mode))
137 return false;
138
139 return true;
140}
141
142static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) 128static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
143{ 129{
144 struct dnotify_mark *dn_mark = container_of(fsn_mark, 130 struct dnotify_mark *dn_mark = container_of(fsn_mark,
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
152 138
153static struct fsnotify_ops dnotify_fsnotify_ops = { 139static struct fsnotify_ops dnotify_fsnotify_ops = {
154 .handle_event = dnotify_handle_event, 140 .handle_event = dnotify_handle_event,
155 .should_send_event = dnotify_should_send_event,
156 .free_group_priv = NULL,
157 .freeing_mark = NULL,
158 .free_event_priv = NULL,
159}; 141};
160 142
161/* 143/*
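
With should_send_event() removed, a dnotify-style backend does its own filtering at the top of a single handle_event() callback that now receives the raw (inode, mask, data, file_name) tuple. A simplified userspace sketch of the merged shape (types and names here are illustrative, not the fsnotify API):

#include <stdio.h>
#include <sys/stat.h>

struct mark { unsigned mask; };

/* One callback instead of should_send_event() + handle_event():
 * uninteresting notifications are rejected with an early return. */
static int handle_event(const struct stat *st, struct mark *m,
			unsigned mask, const char *file_name)
{
	if (!S_ISDIR(st->st_mode))   /* not a dir: this backend doesn't care */
		return 0;
	if (!(mask & m->mask))       /* event not in the watch's mask */
		return 0;
	printf("deliver %#x for %s\n", mask, file_name ? file_name : "(none)");
	return 0;
}

int main(void)
{
	struct stat st = { .st_mode = S_IFDIR };
	struct mark m = { .mask = 0x2 };
	return handle_event(&st, &m, 0x2, "subdir");
}
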
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b262..58772623f02a 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,31 +9,27 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11 11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) 12#include "fanotify.h"
13
14static bool should_merge(struct fsnotify_event *old_fsn,
15 struct fsnotify_event *new_fsn)
13{ 16{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new); 17 struct fanotify_event_info *old, *new;
15 18
16 if (old->to_tell == new->to_tell &&
17 old->data_type == new->data_type &&
18 old->tgid == new->tgid) {
19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_PATH):
21#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 19#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
22 /* don't merge two permission events */ 20 /* don't merge two permission events */
23 if ((old->mask & FAN_ALL_PERM_EVENTS) && 21 if ((old_fsn->mask & FAN_ALL_PERM_EVENTS) &&
24 (new->mask & FAN_ALL_PERM_EVENTS)) 22 (new_fsn->mask & FAN_ALL_PERM_EVENTS))
25 return false; 23 return false;
26#endif 24#endif
27 if ((old->path.mnt == new->path.mnt) && 25 pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
28 (old->path.dentry == new->path.dentry)) 26 old = FANOTIFY_E(old_fsn);
29 return true; 27 new = FANOTIFY_E(new_fsn);
30 break; 28
31 case (FSNOTIFY_EVENT_NONE): 29 if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
32 return true; 30 old->path.mnt == new->path.mnt &&
33 default: 31 old->path.dentry == new->path.dentry)
34 BUG(); 32 return true;
35 };
36 }
37 return false; 33 return false;
38} 34}
39 35
@@ -41,59 +37,28 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
41static struct fsnotify_event *fanotify_merge(struct list_head *list, 37static struct fsnotify_event *fanotify_merge(struct list_head *list,
42 struct fsnotify_event *event) 38 struct fsnotify_event *event)
43{ 39{
44 struct fsnotify_event_holder *test_holder; 40 struct fsnotify_event *test_event;
45 struct fsnotify_event *test_event = NULL; 41 bool do_merge = false;
46 struct fsnotify_event *new_event;
47 42
48 pr_debug("%s: list=%p event=%p\n", __func__, list, event); 43 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
49 44
50 45 list_for_each_entry_reverse(test_event, list, list) {
51 list_for_each_entry_reverse(test_holder, list, event_list) { 46 if (should_merge(test_event, event)) {
52 if (should_merge(test_holder->event, event)) { 47 do_merge = true;
53 test_event = test_holder->event;
54 break; 48 break;
55 } 49 }
56 } 50 }
57 51
58 if (!test_event) 52 if (!do_merge)
59 return NULL; 53 return NULL;
60 54
61 fsnotify_get_event(test_event); 55 test_event->mask |= event->mask;
62 56 return test_event;
63 /* if they are exactly the same we are done */
64 if (test_event->mask == event->mask)
65 return test_event;
66
67 /*
68 * if the refcnt == 2 this is the only queue
69 * for this event and so we can update the mask
70 * in place.
71 */
72 if (atomic_read(&test_event->refcnt) == 2) {
73 test_event->mask |= event->mask;
74 return test_event;
75 }
76
77 new_event = fsnotify_clone_event(test_event);
78
79 /* done with test_event */
80 fsnotify_put_event(test_event);
81
82 /* couldn't allocate memory, merge was not possible */
83 if (unlikely(!new_event))
84 return ERR_PTR(-ENOMEM);
85
86 /* build new event and replace it on the list */
87 new_event->mask = (test_event->mask | event->mask);
88 fsnotify_replace_event(test_holder, new_event);
89
90 /* we hold a reference on new_event from clone_event */
91 return new_event;
92} 57}
93 58
94#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 59#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
95static int fanotify_get_response_from_access(struct fsnotify_group *group, 60static int fanotify_get_response_from_access(struct fsnotify_group *group,
96 struct fsnotify_event *event) 61 struct fanotify_event_info *event)
97{ 62{
98 int ret; 63 int ret;
99 64
@@ -106,7 +71,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
106 return 0; 71 return 0;
107 72
108 /* userspace responded, convert to something usable */ 73 /* userspace responded, convert to something usable */
109 spin_lock(&event->lock);
110 switch (event->response) { 74 switch (event->response) {
111 case FAN_ALLOW: 75 case FAN_ALLOW:
112 ret = 0; 76 ret = 0;
@@ -116,7 +80,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
116 ret = -EPERM; 80 ret = -EPERM;
117 } 81 }
118 event->response = 0; 82 event->response = 0;
119 spin_unlock(&event->lock);
120 83
121 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 84 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
122 group, event, ret); 85 group, event, ret);
@@ -125,58 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
125} 88}
126#endif 89#endif
127 90
128static int fanotify_handle_event(struct fsnotify_group *group, 91static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
129 struct fsnotify_mark *inode_mark,
130 struct fsnotify_mark *fanotify_mark,
131 struct fsnotify_event *event)
132{
133 int ret = 0;
134 struct fsnotify_event *notify_event = NULL;
135
136 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
137 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
138 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
139 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
140 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
141 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
142 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
143 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
144 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
145 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
146
147 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
148
149 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
150 if (IS_ERR(notify_event))
151 return PTR_ERR(notify_event);
152
153#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
154 if (event->mask & FAN_ALL_PERM_EVENTS) {
155 /* if we merged we need to wait on the new event */
156 if (notify_event)
157 event = notify_event;
158 ret = fanotify_get_response_from_access(group, event);
159 }
160#endif
161
162 if (notify_event)
163 fsnotify_put_event(notify_event);
164
165 return ret;
166}
167
168static bool fanotify_should_send_event(struct fsnotify_group *group,
169 struct inode *to_tell,
170 struct fsnotify_mark *inode_mark,
171 struct fsnotify_mark *vfsmnt_mark, 92 struct fsnotify_mark *vfsmnt_mark,
172 __u32 event_mask, void *data, int data_type) 93 u32 event_mask,
94 void *data, int data_type)
173{ 95{
174 __u32 marks_mask, marks_ignored_mask; 96 __u32 marks_mask, marks_ignored_mask;
175 struct path *path = data; 97 struct path *path = data;
176 98
177 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 99 pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
178 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 100 " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
179 inode_mark, vfsmnt_mark, event_mask, data, data_type); 101 event_mask, data, data_type);
180 102
181 /* if we don't have enough info to send an event to userspace say no */ 103 /* if we don't have enough info to send an event to userspace say no */
182 if (data_type != FSNOTIFY_EVENT_PATH) 104 if (data_type != FSNOTIFY_EVENT_PATH)
@@ -217,6 +139,74 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
217 return false; 139 return false;
218} 140}
219 141
142static int fanotify_handle_event(struct fsnotify_group *group,
143 struct inode *inode,
144 struct fsnotify_mark *inode_mark,
145 struct fsnotify_mark *fanotify_mark,
146 u32 mask, void *data, int data_type,
147 const unsigned char *file_name)
148{
149 int ret = 0;
150 struct fanotify_event_info *event;
151 struct fsnotify_event *fsn_event;
152 struct fsnotify_event *notify_fsn_event;
153
154 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
155 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
156 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
157 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
158 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
159 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
160 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
161 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
162 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
163 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
164
165 if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
166 data_type))
167 return 0;
168
169 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
170 mask);
171
172 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
173 if (unlikely(!event))
174 return -ENOMEM;
175
176 fsn_event = &event->fse;
177 fsnotify_init_event(fsn_event, inode, mask);
178 event->tgid = get_pid(task_tgid(current));
179 if (data_type == FSNOTIFY_EVENT_PATH) {
180 struct path *path = data;
181 event->path = *path;
182 path_get(&event->path);
183 } else {
184 event->path.mnt = NULL;
185 event->path.dentry = NULL;
186 }
187#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
188 event->response = 0;
189#endif
190
191 notify_fsn_event = fsnotify_add_notify_event(group, fsn_event,
192 fanotify_merge);
193 if (notify_fsn_event) {
194 /* Our event wasn't used in the end. Free it. */
195 fsnotify_destroy_event(group, fsn_event);
196 if (IS_ERR(notify_fsn_event))
197 return PTR_ERR(notify_fsn_event);
198 /* We need to ask about a different event after a merge... */
199 event = FANOTIFY_E(notify_fsn_event);
200 fsn_event = notify_fsn_event;
201 }
202
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (fsn_event->mask & FAN_ALL_PERM_EVENTS)
205 ret = fanotify_get_response_from_access(group, event);
206#endif
207 return ret;
208}
209
220static void fanotify_free_group_priv(struct fsnotify_group *group) 210static void fanotify_free_group_priv(struct fsnotify_group *group)
221{ 211{
222 struct user_struct *user; 212 struct user_struct *user;
@@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
226 free_uid(user); 216 free_uid(user);
227} 217}
228 218
219static void fanotify_free_event(struct fsnotify_event *fsn_event)
220{
221 struct fanotify_event_info *event;
222
223 event = FANOTIFY_E(fsn_event);
224 path_put(&event->path);
225 put_pid(event->tgid);
226 kmem_cache_free(fanotify_event_cachep, event);
227}
228
229const struct fsnotify_ops fanotify_fsnotify_ops = { 229const struct fsnotify_ops fanotify_fsnotify_ops = {
230 .handle_event = fanotify_handle_event, 230 .handle_event = fanotify_handle_event,
231 .should_send_event = fanotify_should_send_event,
232 .free_group_priv = fanotify_free_group_priv, 231 .free_group_priv = fanotify_free_group_priv,
233 .free_event_priv = NULL, 232 .free_event = fanotify_free_event,
234 .freeing_mark = NULL,
235}; 233};
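
The rewritten fanotify path allocates a private event per group, offers it to the notification queue, and frees its own copy if the queue merged it into an already-pending event. A hedged userspace sketch of that ownership pattern (hypothetical names, simplified merge logic):

#include <stdlib.h>
#include <stdio.h>

struct event { unsigned mask; };

/* Pretend queue: merges when a compatible event is already pending.
 * Returns the merged event, or NULL if ours was queued as-is. */
static struct event *try_queue(struct event *pending, struct event *ev)
{
	if (pending) {
		pending->mask |= ev->mask;   /* merge in place */
		return pending;
	}
	return NULL;
}

static int handle(struct event *pending, unsigned mask)
{
	struct event *ev = malloc(sizeof(*ev));
	struct event *merged;

	if (!ev)
		return -1;
	ev->mask = mask;

	merged = try_queue(pending, ev);
	if (merged) {
		free(ev);       /* our copy wasn't used in the end */
		ev = merged;    /* follow-up work targets the merged event */
	}
	printf("queued event mask %#x\n", ev->mask);
	return 0;
}

int main(void)
{
	struct event pending = { .mask = 0x1 };
	return handle(&pending, 0x2);
}
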
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000000..0e90174a116a
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,23 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/path.h>
3#include <linux/slab.h>
4
5extern struct kmem_cache *fanotify_event_cachep;
6
7struct fanotify_event_info {
8 struct fsnotify_event fse;
9 /*
10 * We hold ref to this path so it may be dereferenced at any point
11 * during this object's lifetime
12 */
13 struct path path;
14 struct pid *tgid;
15#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
16 u32 response; /* userspace answer to question */
17#endif
18};
19
20static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
21{
22 return container_of(fse, struct fanotify_event_info, fse);
23}
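
The new header relies on the usual embedding idiom: struct fanotify_event_info places the generic struct fsnotify_event inside itself, and FANOTIFY_E() recovers the container with container_of(), so the core queues only the generic part while the backend keeps its private fields alongside. A self-contained sketch of the idiom in plain C (not the kernel headers):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct generic_event { unsigned mask; };

struct private_event {
	struct generic_event base;   /* what the core sees and queues */
	int tgid;                    /* backend-private payload */
};

static struct private_event *PRIV_E(struct generic_event *ge)
{
	return container_of(ge, struct private_event, base);
}

int main(void)
{
	struct private_event ev = { .base = { .mask = 1 }, .tgid = 42 };
	struct generic_event *ge = &ev.base;     /* handed to the core */

	printf("tgid=%d\n", PRIV_E(ge)->tgid);   /* recovered by the backend */
	return 0;
}
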
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e44cb6427df3..57d7c083cb4b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -19,6 +19,7 @@
19 19
20#include "../../mount.h" 20#include "../../mount.h"
21#include "../fdinfo.h" 21#include "../fdinfo.h"
22#include "fanotify.h"
22 23
23#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 24#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
24#define FANOTIFY_DEFAULT_MAX_MARKS 8192 25#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops;
28 29
29static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
30static struct kmem_cache *fanotify_response_event_cache __read_mostly; 31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly;
31 33
32struct fanotify_response_event { 34struct fanotify_response_event {
33 struct list_head list; 35 struct list_head list;
34 __s32 fd; 36 __s32 fd;
35 struct fsnotify_event *event; 37 struct fanotify_event_info *event;
36}; 38};
37 39
38/* 40/*
@@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
61} 63}
62 64
63static int create_fd(struct fsnotify_group *group, 65static int create_fd(struct fsnotify_group *group,
64 struct fsnotify_event *event, 66 struct fanotify_event_info *event,
65 struct file **file) 67 struct file **file)
66{ 68{
67 int client_fd; 69 int client_fd;
68 struct file *new_file; 70 struct file *new_file;
@@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group,
73 if (client_fd < 0) 75 if (client_fd < 0)
74 return client_fd; 76 return client_fd;
75 77
76 if (event->data_type != FSNOTIFY_EVENT_PATH) {
77 WARN_ON(1);
78 put_unused_fd(client_fd);
79 return -EINVAL;
80 }
81
82 /* 78 /*
83 * we need a new file handle for the userspace program so it can read even if it was 79 * we need a new file handle for the userspace program so it can read even if it was
84 * originally opened O_WRONLY. 80 * originally opened O_WRONLY.
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group,
109} 105}
110 106
111static int fill_event_metadata(struct fsnotify_group *group, 107static int fill_event_metadata(struct fsnotify_group *group,
112 struct fanotify_event_metadata *metadata, 108 struct fanotify_event_metadata *metadata,
113 struct fsnotify_event *event, 109 struct fsnotify_event *fsn_event,
114 struct file **file) 110 struct file **file)
115{ 111{
116 int ret = 0; 112 int ret = 0;
113 struct fanotify_event_info *event;
117 114
118 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, 115 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
119 group, metadata, event); 116 group, metadata, fsn_event);
120 117
121 *file = NULL; 118 *file = NULL;
119 event = container_of(fsn_event, struct fanotify_event_info, fse);
122 metadata->event_len = FAN_EVENT_METADATA_LEN; 120 metadata->event_len = FAN_EVENT_METADATA_LEN;
123 metadata->metadata_len = FAN_EVENT_METADATA_LEN; 121 metadata->metadata_len = FAN_EVENT_METADATA_LEN;
124 metadata->vers = FANOTIFY_METADATA_VERSION; 122 metadata->vers = FANOTIFY_METADATA_VERSION;
125 metadata->reserved = 0; 123 metadata->reserved = 0;
126 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; 124 metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
127 metadata->pid = pid_vnr(event->tgid); 125 metadata->pid = pid_vnr(event->tgid);
128 if (unlikely(event->mask & FAN_Q_OVERFLOW)) 126 if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
129 metadata->fd = FAN_NOFD; 127 metadata->fd = FAN_NOFD;
130 else { 128 else {
131 metadata->fd = create_fd(group, event, file); 129 metadata->fd = create_fd(group, event, file);
@@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
209 if (!re) 207 if (!re)
210 return -ENOMEM; 208 return -ENOMEM;
211 209
212 re->event = event; 210 re->event = FANOTIFY_E(event);
213 re->fd = fd; 211 re->fd = fd;
214 212
215 mutex_lock(&group->fanotify_data.access_mutex); 213 mutex_lock(&group->fanotify_data.access_mutex);
@@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
217 if (atomic_read(&group->fanotify_data.bypass_perm)) { 215 if (atomic_read(&group->fanotify_data.bypass_perm)) {
218 mutex_unlock(&group->fanotify_data.access_mutex); 216 mutex_unlock(&group->fanotify_data.access_mutex);
219 kmem_cache_free(fanotify_response_event_cache, re); 217 kmem_cache_free(fanotify_response_event_cache, re);
220 event->response = FAN_ALLOW; 218 FANOTIFY_E(event)->response = FAN_ALLOW;
221 return 0; 219 return 0;
222 } 220 }
223 221
@@ -273,7 +271,7 @@ out_close_fd:
273out: 271out:
274#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 272#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
275 if (event->mask & FAN_ALL_PERM_EVENTS) { 273 if (event->mask & FAN_ALL_PERM_EVENTS) {
276 event->response = FAN_DENY; 274 FANOTIFY_E(event)->response = FAN_DENY;
277 wake_up(&group->fanotify_data.access_waitq); 275 wake_up(&group->fanotify_data.access_waitq);
278 } 276 }
279#endif 277#endif
@@ -321,7 +319,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
321 if (IS_ERR(kevent)) 319 if (IS_ERR(kevent))
322 break; 320 break;
323 ret = copy_event_to_user(group, kevent, buf); 321 ret = copy_event_to_user(group, kevent, buf);
324 fsnotify_put_event(kevent); 322 fsnotify_destroy_event(group, kevent);
325 if (ret < 0) 323 if (ret < 0)
326 break; 324 break;
327 buf += ret; 325 buf += ret;
@@ -409,7 +407,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
409static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 407static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
410{ 408{
411 struct fsnotify_group *group; 409 struct fsnotify_group *group;
412 struct fsnotify_event_holder *holder; 410 struct fsnotify_event *fsn_event;
413 void __user *p; 411 void __user *p;
414 int ret = -ENOTTY; 412 int ret = -ENOTTY;
415 size_t send_len = 0; 413 size_t send_len = 0;
@@ -421,7 +419,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
421 switch (cmd) { 419 switch (cmd) {
422 case FIONREAD: 420 case FIONREAD:
423 mutex_lock(&group->notification_mutex); 421 mutex_lock(&group->notification_mutex);
424 list_for_each_entry(holder, &group->notification_list, event_list) 422 list_for_each_entry(fsn_event, &group->notification_list, list)
425 send_len += FAN_EVENT_METADATA_LEN; 423 send_len += FAN_EVENT_METADATA_LEN;
426 mutex_unlock(&group->notification_mutex); 424 mutex_unlock(&group->notification_mutex);
427 ret = put_user(send_len, (int __user *) p); 425 ret = put_user(send_len, (int __user *) p);
@@ -906,6 +904,7 @@ static int __init fanotify_user_setup(void)
906 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 904 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
907 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, 905 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
908 SLAB_PANIC); 906 SLAB_PANIC);
907 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
909 908
910 return 0; 909 return 0;
911} 910}
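
Since fanotify records are fixed-size, the FIONREAD loop above reduces to queue length times FAN_EVENT_METADATA_LEN, one term per queued fsnotify_event. A trivial sketch of that accounting (the constant below is illustrative, not the ABI value):

#include <stddef.h>
#include <stdio.h>

#define FAN_EVENT_METADATA_LEN 24   /* illustrative, not the ABI value */

/* With fixed-size fanotify records, FIONREAD reduces to a count. */
static size_t pending_bytes(size_t queued_events)
{
	return queued_events * FAN_EVENT_METADATA_LEN;
}

int main(void)
{
	printf("%zu\n", pending_bytes(3));   /* 72 */
	return 0;
}
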
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4bb21d67d9b1..1d4e1ea2f37c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell,
128 struct fsnotify_mark *vfsmount_mark, 128 struct fsnotify_mark *vfsmount_mark,
129 __u32 mask, void *data, 129 __u32 mask, void *data,
130 int data_is, u32 cookie, 130 int data_is, u32 cookie,
131 const unsigned char *file_name, 131 const unsigned char *file_name)
132 struct fsnotify_event **event)
133{ 132{
134 struct fsnotify_group *group = NULL; 133 struct fsnotify_group *group = NULL;
135 __u32 inode_test_mask = 0; 134 __u32 inode_test_mask = 0;
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell,
170 169
171 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" 170 pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
172 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" 171 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
173 " data=%p data_is=%d cookie=%d event=%p\n", 172 " data=%p data_is=%d cookie=%d\n",
174 __func__, group, to_tell, mask, inode_mark, 173 __func__, group, to_tell, mask, inode_mark,
175 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, 174 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
176 data_is, cookie, *event); 175 data_is, cookie);
177 176
178 if (!inode_test_mask && !vfsmount_test_mask) 177 if (!inode_test_mask && !vfsmount_test_mask)
179 return 0; 178 return 0;
180 179
181 if (group->ops->should_send_event(group, to_tell, inode_mark, 180 return group->ops->handle_event(group, to_tell, inode_mark,
182 vfsmount_mark, mask, data, 181 vfsmount_mark, mask, data, data_is,
183 data_is) == false) 182 file_name);
184 return 0;
185
186 if (!*event) {
187 *event = fsnotify_create_event(to_tell, mask, data,
188 data_is, file_name,
189 cookie, GFP_KERNEL);
190 if (!*event)
191 return -ENOMEM;
192 }
193 return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
194} 183}
195 184
196/* 185/*
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
205 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; 194 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
206 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; 195 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
207 struct fsnotify_group *inode_group, *vfsmount_group; 196 struct fsnotify_group *inode_group, *vfsmount_group;
208 struct fsnotify_event *event = NULL;
209 struct mount *mnt; 197 struct mount *mnt;
210 int idx, ret = 0; 198 int idx, ret = 0;
211 /* global tests shouldn't care about events on child only the specific event */ 199 /* global tests shouldn't care about events on child only the specific event */
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
258 246
259 if (inode_group > vfsmount_group) { 247 if (inode_group > vfsmount_group) {
260 /* handle inode */ 248 /* handle inode */
261 ret = send_to_group(to_tell, inode_mark, NULL, mask, data, 249 ret = send_to_group(to_tell, inode_mark, NULL, mask,
262 data_is, cookie, file_name, &event); 250 data, data_is, cookie, file_name);
263 /* we didn't use the vfsmount_mark */ 251 /* we didn't use the vfsmount_mark */
264 vfsmount_group = NULL; 252 vfsmount_group = NULL;
265 } else if (vfsmount_group > inode_group) { 253 } else if (vfsmount_group > inode_group) {
266 ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, 254 ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
267 data_is, cookie, file_name, &event); 255 data, data_is, cookie, file_name);
268 inode_group = NULL; 256 inode_group = NULL;
269 } else { 257 } else {
270 ret = send_to_group(to_tell, inode_mark, vfsmount_mark, 258 ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
271 mask, data, data_is, cookie, file_name, 259 mask, data, data_is, cookie,
272 &event); 260 file_name);
273 } 261 }
274 262
275 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) 263 if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
285 ret = 0; 273 ret = 0;
286out: 274out:
287 srcu_read_unlock(&fsnotify_mark_srcu, idx); 275 srcu_read_unlock(&fsnotify_mark_srcu, idx);
288 /*
289 * fsnotify_create_event() took a reference so the event can't be cleaned
290 * up while we are still trying to add it to lists, drop that one.
291 */
292 if (event)
293 fsnotify_put_event(event);
294 276
295 return ret; 277 return ret;
296} 278}
diff --git a/fs/notify/group.c b/fs/notify/group.c
index bd2625bd88b4..ee674fe2cec7 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
99 INIT_LIST_HEAD(&group->marks_list); 99 INIT_LIST_HEAD(&group->marks_list);
100 100
101 group->ops = ops; 101 group->ops = ops;
102 fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW);
102 103
103 return group; 104 return group;
104} 105}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index b6642e4de4bf..485eef3f4407 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -2,11 +2,12 @@
2#include <linux/inotify.h> 2#include <linux/inotify.h>
3#include <linux/slab.h> /* struct kmem_cache */ 3#include <linux/slab.h> /* struct kmem_cache */
4 4
5extern struct kmem_cache *event_priv_cachep; 5struct inotify_event_info {
6 6 struct fsnotify_event fse;
7struct inotify_event_private_data {
8 struct fsnotify_event_private_data fsnotify_event_priv_data;
9 int wd; 7 int wd;
8 u32 sync_cookie;
9 int name_len;
10 char name[];
10}; 11};
11 12
12struct inotify_inode_mark { 13struct inotify_inode_mark {
@@ -14,8 +15,18 @@ struct inotify_inode_mark {
14 int wd; 15 int wd;
15}; 16};
16 17
18static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
19{
20 return container_of(fse, struct inotify_event_info, fse);
21}
22
17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, 23extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
18 struct fsnotify_group *group); 24 struct fsnotify_group *group);
19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 25extern int inotify_handle_event(struct fsnotify_group *group,
26 struct inode *inode,
27 struct fsnotify_mark *inode_mark,
28 struct fsnotify_mark *vfsmount_mark,
29 u32 mask, void *data, int data_type,
30 const unsigned char *file_name);
20 31
21extern const struct fsnotify_ops inotify_fsnotify_ops; 32extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 4216308b81b4..aad1a35e9af1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -34,100 +34,87 @@
34#include "inotify.h" 34#include "inotify.h"
35 35
36/* 36/*
37 * Check if 2 events contain the same information. We do not compare private data 37 * Check if 2 events contain the same information.
38 * but at this moment that isn't a problem for any know fsnotify listeners.
39 */ 38 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) 39static bool event_compare(struct fsnotify_event *old_fsn,
40 struct fsnotify_event *new_fsn)
41{ 41{
42 if ((old->mask == new->mask) && 42 struct inotify_event_info *old, *new;
43 (old->to_tell == new->to_tell) && 43
44 (old->data_type == new->data_type) && 44 if (old_fsn->mask & FS_IN_IGNORED)
45 (old->name_len == new->name_len)) { 45 return false;
46 switch (old->data_type) { 46 old = INOTIFY_E(old_fsn);
47 case (FSNOTIFY_EVENT_INODE): 47 new = INOTIFY_E(new_fsn);
48 /* remember, after old was put on the wait_q we aren't 48 if ((old_fsn->mask == new_fsn->mask) &&
49 * allowed to look at the inode any more, only thing 49 (old_fsn->inode == new_fsn->inode) &&
50 * left to check was if the file_name is the same */ 50 (old->name_len == new->name_len) &&
51 if (!old->name_len || 51 (!old->name_len || !strcmp(old->name, new->name)))
52 !strcmp(old->file_name, new->file_name)) 52 return true;
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->path.mnt == new->path.mnt) &&
57 (old->path.dentry == new->path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false; 53 return false;
69} 54}
70 55
71static struct fsnotify_event *inotify_merge(struct list_head *list, 56static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event) 57 struct fsnotify_event *event)
73{ 58{
74 struct fsnotify_event_holder *last_holder;
75 struct fsnotify_event *last_event; 59 struct fsnotify_event *last_event;
76 60
77 /* and the list better be locked by something too */ 61 last_event = list_entry(list->prev, struct fsnotify_event, list);
78 spin_lock(&event->lock); 62 if (!event_compare(last_event, event))
79 63 return NULL;
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event; 64 return last_event;
90} 65}
91 66
92static int inotify_handle_event(struct fsnotify_group *group, 67int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark, 68 struct inode *inode,
94 struct fsnotify_mark *vfsmount_mark, 69 struct fsnotify_mark *inode_mark,
95 struct fsnotify_event *event) 70 struct fsnotify_mark *vfsmount_mark,
71 u32 mask, void *data, int data_type,
72 const unsigned char *file_name)
96{ 73{
97 struct inotify_inode_mark *i_mark; 74 struct inotify_inode_mark *i_mark;
98 struct inode *to_tell; 75 struct inotify_event_info *event;
99 struct inotify_event_private_data *event_priv;
100 struct fsnotify_event_private_data *fsn_event_priv;
101 struct fsnotify_event *added_event; 76 struct fsnotify_event *added_event;
102 int wd, ret = 0; 77 struct fsnotify_event *fsn_event;
78 int ret = 0;
79 int len = 0;
80 int alloc_len = sizeof(struct inotify_event_info);
103 81
104 BUG_ON(vfsmount_mark); 82 BUG_ON(vfsmount_mark);
105 83
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, 84 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
107 event, event->to_tell, event->mask); 85 (data_type == FSNOTIFY_EVENT_PATH)) {
86 struct path *path = data;
108 87
109 to_tell = event->to_tell; 88 if (d_unlinked(path->dentry))
89 return 0;
90 }
91 if (file_name) {
92 len = strlen(file_name);
93 alloc_len += len + 1;
94 }
95
96 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
97 mask);
110 98
111 i_mark = container_of(inode_mark, struct inotify_inode_mark, 99 i_mark = container_of(inode_mark, struct inotify_inode_mark,
112 fsn_mark); 100 fsn_mark);
113 wd = i_mark->wd;
114 101
115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 102 event = kmalloc(alloc_len, GFP_KERNEL);
116 if (unlikely(!event_priv)) 103 if (unlikely(!event))
117 return -ENOMEM; 104 return -ENOMEM;
118 105
119 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 106 fsn_event = &event->fse;
120 107 fsnotify_init_event(fsn_event, inode, mask);
121 fsnotify_get_group(group); 108 event->wd = i_mark->wd;
122 fsn_event_priv->group = group; 109 event->name_len = len;
123 event_priv->wd = wd; 110 if (len)
111 strcpy(event->name, file_name);
124 112
125 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); 113 added_event = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
126 if (added_event) { 114 if (added_event) {
127 inotify_free_event_priv(fsn_event_priv); 115 /* Our event wasn't used in the end. Free it. */
128 if (!IS_ERR(added_event)) 116 fsnotify_destroy_event(group, fsn_event);
129 fsnotify_put_event(added_event); 117 if (IS_ERR(added_event))
130 else
131 ret = PTR_ERR(added_event); 118 ret = PTR_ERR(added_event);
132 } 119 }
133 120
@@ -142,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
142 inotify_ignored_and_remove_idr(fsn_mark, group); 129 inotify_ignored_and_remove_idr(fsn_mark, group);
143} 130}
144 131
145static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
146 struct fsnotify_mark *inode_mark,
147 struct fsnotify_mark *vfsmount_mark,
148 __u32 mask, void *data, int data_type)
149{
150 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
151 (data_type == FSNOTIFY_EVENT_PATH)) {
152 struct path *path = data;
153
154 if (d_unlinked(path->dentry))
155 return false;
156 }
157
158 return true;
159}
160
161/* 132/*
162 * This is NEVER supposed to be called. Inotify marks should either have been 133 * This is NEVER supposed to be called. Inotify marks should either have been
163 * removed from the idr when the watch was removed or in the 134 * removed from the idr when the watch was removed or in the
@@ -202,22 +173,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
202 free_uid(group->inotify_data.user); 173 free_uid(group->inotify_data.user);
203} 174}
204 175
205void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 176static void inotify_free_event(struct fsnotify_event *fsn_event)
206{ 177{
207 struct inotify_event_private_data *event_priv; 178 kfree(INOTIFY_E(fsn_event));
208
209
210 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
211 fsnotify_event_priv_data);
212
213 fsnotify_put_group(fsn_event_priv->group);
214 kmem_cache_free(event_priv_cachep, event_priv);
215} 179}
216 180
217const struct fsnotify_ops inotify_fsnotify_ops = { 181const struct fsnotify_ops inotify_fsnotify_ops = {
218 .handle_event = inotify_handle_event, 182 .handle_event = inotify_handle_event,
219 .should_send_event = inotify_should_send_event,
220 .free_group_priv = inotify_free_group_priv, 183 .free_group_priv = inotify_free_group_priv,
221 .free_event_priv = inotify_free_event_priv, 184 .free_event = inotify_free_event,
222 .freeing_mark = inotify_freeing_mark, 185 .freeing_mark = inotify_freeing_mark,
223}; 186};
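
inotify events are now allocated in one shot: the file name lives in a flexible array member at the end of inotify_event_info, so alloc_len is the struct size plus the name length plus one byte for the NUL. A compilable sketch of that sizing with illustrative types:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct event {
	unsigned mask;
	int name_len;
	char name[];          /* flexible array member: name stored inline */
};

static struct event *alloc_event(unsigned mask, const char *file_name)
{
	size_t len = file_name ? strlen(file_name) : 0;
	/* one allocation covers header and name (+1 for the NUL) */
	struct event *ev = malloc(sizeof(*ev) + (len ? len + 1 : 0));

	if (!ev)
		return NULL;
	ev->mask = mask;
	ev->name_len = (int)len;
	if (len)
		strcpy(ev->name, file_name);
	return ev;
}

int main(void)
{
	struct event *ev = alloc_event(0x100, "newfile");

	if (!ev)
		return 1;
	printf("%s (%d)\n", ev->name_len ? ev->name : "", ev->name_len);
	free(ev);
	return 0;
}
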
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 60f954a891ab..497395c8274b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly;
50static int inotify_max_user_watches __read_mostly; 50static int inotify_max_user_watches __read_mostly;
51 51
52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 52static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
53struct kmem_cache *event_priv_cachep __read_mostly;
54 53
55#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
56 55
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
124 return ret; 123 return ret;
125} 124}
126 125
126static int round_event_name_len(struct fsnotify_event *fsn_event)
127{
128 struct inotify_event_info *event;
129
130 event = INOTIFY_E(fsn_event);
131 if (!event->name_len)
132 return 0;
133 return roundup(event->name_len + 1, sizeof(struct inotify_event));
134}
135
127/* 136/*
128 * Get an inotify_kernel_event if one exists and is small 137 * Get an inotify_kernel_event if one exists and is small
129 * enough to fit in "count". Return an error pointer if 138 * enough to fit in "count". Return an error pointer if
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 153
145 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 154 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
146 155
147 if (event->name_len) 156 event_size += round_event_name_len(event);
148 event_size += roundup(event->name_len + 1, event_size);
149
150 if (event_size > count) 157 if (event_size > count)
151 return ERR_PTR(-EINVAL); 158 return ERR_PTR(-EINVAL);
152 159
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
164 * buffer we had in "get_one_event()" above. 171 * buffer we had in "get_one_event()" above.
165 */ 172 */
166static ssize_t copy_event_to_user(struct fsnotify_group *group, 173static ssize_t copy_event_to_user(struct fsnotify_group *group,
167 struct fsnotify_event *event, 174 struct fsnotify_event *fsn_event,
168 char __user *buf) 175 char __user *buf)
169{ 176{
170 struct inotify_event inotify_event; 177 struct inotify_event inotify_event;
171 struct fsnotify_event_private_data *fsn_priv; 178 struct inotify_event_info *event;
172 struct inotify_event_private_data *priv;
173 size_t event_size = sizeof(struct inotify_event); 179 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 180 size_t name_len;
175 181 size_t pad_name_len;
176 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
177 182
178 /* we get the inotify watch descriptor from the event private data */ 183 pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
179 spin_lock(&event->lock);
180 fsn_priv = fsnotify_remove_priv_from_event(group, event);
181 spin_unlock(&event->lock);
182
183 if (!fsn_priv)
184 inotify_event.wd = -1;
185 else {
186 priv = container_of(fsn_priv, struct inotify_event_private_data,
187 fsnotify_event_priv_data);
188 inotify_event.wd = priv->wd;
189 inotify_free_event_priv(fsn_priv);
190 }
191 184
185 event = INOTIFY_E(fsn_event);
186 name_len = event->name_len;
192 /* 187 /*
193 * round up event->name_len so it is a multiple of event_size 188 * round up name length so it is a multiple of event_size
194 * plus an extra byte for the terminating '\0'. 189 * plus an extra byte for the terminating '\0'.
195 */ 190 */
196 if (event->name_len) 191 pad_name_len = round_event_name_len(fsn_event);
197 name_len = roundup(event->name_len + 1, event_size); 192 inotify_event.len = pad_name_len;
198 inotify_event.len = name_len; 193 inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
199 194 inotify_event.wd = event->wd;
200 inotify_event.mask = inotify_mask_to_arg(event->mask);
201 inotify_event.cookie = event->sync_cookie; 195 inotify_event.cookie = event->sync_cookie;
202 196
203 /* send the main event */ 197 /* send the main event */
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
209 /* 203 /*
210 * fsnotify only stores the pathname, so here we have to send the pathname 204 * fsnotify only stores the pathname, so here we have to send the pathname
211 * and then pad that pathname out to a multiple of sizeof(inotify_event) 205 * and then pad that pathname out to a multiple of sizeof(inotify_event)
212 * with zeros. I get my zeros from the nul_inotify_event. 206 * with zeros.
213 */ 207 */
214 if (name_len) { 208 if (pad_name_len) {
215 unsigned int len_to_zero = name_len - event->name_len;
216 /* copy the path name */ 209 /* copy the path name */
217 if (copy_to_user(buf, event->file_name, event->name_len)) 210 if (copy_to_user(buf, event->name, name_len))
218 return -EFAULT; 211 return -EFAULT;
219 buf += event->name_len; 212 buf += name_len;
220 213
221 /* fill userspace with 0's */ 214 /* fill userspace with 0's */
222 if (clear_user(buf, len_to_zero)) 215 if (clear_user(buf, pad_name_len - name_len))
223 return -EFAULT; 216 return -EFAULT;
224 buf += len_to_zero; 217 event_size += pad_name_len;
225 event_size += name_len;
226 } 218 }
227 219
228 return event_size; 220 return event_size;
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
254 if (IS_ERR(kevent)) 246 if (IS_ERR(kevent))
255 break; 247 break;
256 ret = copy_event_to_user(group, kevent, buf); 248 ret = copy_event_to_user(group, kevent, buf);
257 fsnotify_put_event(kevent); 249 fsnotify_destroy_event(group, kevent);
258 if (ret < 0) 250 if (ret < 0)
259 break; 251 break;
260 buf += ret; 252 buf += ret;
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
297 unsigned long arg) 289 unsigned long arg)
298{ 290{
299 struct fsnotify_group *group; 291 struct fsnotify_group *group;
300 struct fsnotify_event_holder *holder; 292 struct fsnotify_event *fsn_event;
301 struct fsnotify_event *event;
302 void __user *p; 293 void __user *p;
303 int ret = -ENOTTY; 294 int ret = -ENOTTY;
304 size_t send_len = 0; 295 size_t send_len = 0;
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
311 switch (cmd) { 302 switch (cmd) {
312 case FIONREAD: 303 case FIONREAD:
313 mutex_lock(&group->notification_mutex); 304 mutex_lock(&group->notification_mutex);
314 list_for_each_entry(holder, &group->notification_list, event_list) { 305 list_for_each_entry(fsn_event, &group->notification_list,
315 event = holder->event; 306 list) {
316 send_len += sizeof(struct inotify_event); 307 send_len += sizeof(struct inotify_event);
317 if (event->name_len) 308 send_len += round_event_name_len(fsn_event);
318 send_len += roundup(event->name_len + 1,
319 sizeof(struct inotify_event));
320 } 309 }
321 mutex_unlock(&group->notification_mutex); 310 mutex_unlock(&group->notification_mutex);
322 ret = put_user(send_len, (int __user *) p); 311 ret = put_user(send_len, (int __user *) p);
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
503 struct fsnotify_group *group) 492 struct fsnotify_group *group)
504{ 493{
505 struct inotify_inode_mark *i_mark; 494 struct inotify_inode_mark *i_mark;
506 struct fsnotify_event *ignored_event, *notify_event;
507 struct inotify_event_private_data *event_priv;
508 struct fsnotify_event_private_data *fsn_event_priv;
509 int ret;
510
511 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
512
513 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
514 FSNOTIFY_EVENT_NONE, NULL, 0,
515 GFP_NOFS);
516 if (!ignored_event)
517 goto skip_send_ignore;
518
519 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
520 if (unlikely(!event_priv))
521 goto skip_send_ignore;
522
523 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
524
525 fsnotify_get_group(group);
526 fsn_event_priv->group = group;
527 event_priv->wd = i_mark->wd;
528
529 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
530 if (notify_event) {
531 if (IS_ERR(notify_event))
532 ret = PTR_ERR(notify_event);
533 else
534 fsnotify_put_event(notify_event);
535 inotify_free_event_priv(fsn_event_priv);
536 }
537 495
538skip_send_ignore: 496 /* Queue ignore event for the watch */
539 /* matches the reference taken when the event was created */ 497 inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
540 if (ignored_event) 498 NULL, FSNOTIFY_EVENT_NONE, NULL);
541 fsnotify_put_event(ignored_event);
542 499
500 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
543 /* remove this mark from the idr */ 501 /* remove this mark from the idr */
544 inotify_remove_from_idr(group, i_mark); 502 inotify_remove_from_idr(group, i_mark);
545 503
@@ -836,7 +794,6 @@ static int __init inotify_user_setup(void)
836 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); 794 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
837 795
838 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); 796 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
839 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
840 797
841 inotify_max_queued_events = 16384; 798 inotify_max_queued_events = 16384;
842 inotify_max_user_instances = 128; 799 inotify_max_user_instances = 128;
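
round_event_name_len() pads the stored name, plus its terminating NUL, up to the next multiple of sizeof(struct inotify_event); copy_event_to_user() then writes the name bytes and zero-fills the remainder. For example, with a 16-byte inotify_event and a 7-character name, roundup(7 + 1, 16) = 16, so 7 name bytes and 9 zeros go out. A sketch of the arithmetic (16 stands in for the real struct size):

#include <stdio.h>

#define EVSIZE 16   /* stand-in for sizeof(struct inotify_event) */

/* Round name_len + 1 (room for the NUL) up to a multiple of EVSIZE;
 * a zero-length name needs no padding at all. */
static unsigned pad_name_len(unsigned name_len)
{
	if (!name_len)
		return 0;
	return (name_len + 1 + EVSIZE - 1) / EVSIZE * EVSIZE;
}

int main(void)
{
	printf("%u %u %u\n",
	       pad_name_len(0),    /* 0  */
	       pad_name_len(7),    /* 16 */
	       pad_name_len(16));  /* 32 */
	return 0;
}
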
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7b51b05f160c..952237b8e2d2 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -48,15 +48,6 @@
48#include <linux/fsnotify_backend.h> 48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h" 49#include "fsnotify.h"
50 50
51static struct kmem_cache *fsnotify_event_cachep;
52static struct kmem_cache *fsnotify_event_holder_cachep;
53/*
54 * This is a magic event we send when the q is too full. Since it doesn't
55 * hold real event information we just keep one system wide and use it any time
56 * it is needed. It's refcnt is set 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); 51static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61 52
62/** 53/**
@@ -76,60 +67,14 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
76 return list_empty(&group->notification_list) ? true : false; 67 return list_empty(&group->notification_list) ? true : false;
77} 68}
78 69
79void fsnotify_get_event(struct fsnotify_event *event) 70void fsnotify_destroy_event(struct fsnotify_group *group,
71 struct fsnotify_event *event)
80{ 72{
81 atomic_inc(&event->refcnt); 73 /* Overflow events are per-group and we don't want to free them */
82} 74 if (!event || event->mask == FS_Q_OVERFLOW)
83
84void fsnotify_put_event(struct fsnotify_event *event)
85{
86 if (!event)
87 return; 75 return;
88 76
89 if (atomic_dec_and_test(&event->refcnt)) { 77 group->ops->free_event(event);
90 pr_debug("%s: event=%p\n", __func__, event);
91
92 if (event->data_type == FSNOTIFY_EVENT_PATH)
93 path_put(&event->path);
94
95 BUG_ON(!list_empty(&event->private_data_list));
96
97 kfree(event->file_name);
98 put_pid(event->tgid);
99 kmem_cache_free(fsnotify_event_cachep, event);
100 }
101}
102
103struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
104{
105 return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
106}
107
108void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
109{
110 if (holder)
111 kmem_cache_free(fsnotify_event_holder_cachep, holder);
112}
113
114/*
115 * Find the private data that the group previously attached to this event when
116 * the group added the event to the notification queue (fsnotify_add_notify_event)
117 */
118struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
119{
120 struct fsnotify_event_private_data *lpriv;
121 struct fsnotify_event_private_data *priv = NULL;
122
123 assert_spin_locked(&event->lock);
124
125 list_for_each_entry(lpriv, &event->private_data_list, event_list) {
126 if (lpriv->group == group) {
127 priv = lpriv;
128 list_del(&priv->event_list);
129 break;
130 }
131 }
132 return priv;
133} 78}
134 79
135/* 80/*
@@ -137,91 +82,35 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
137 * event off the queue to deal with. If the event is successfully added to the 82 * event off the queue to deal with. If the event is successfully added to the
138 * group's notification queue, a reference is taken on event. 83 * group's notification queue, a reference is taken on event.
139 */ 84 */
140struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 85struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group,
141 struct fsnotify_event_private_data *priv, 86 struct fsnotify_event *event,
142 struct fsnotify_event *(*merge)(struct list_head *, 87 struct fsnotify_event *(*merge)(struct list_head *,
143 struct fsnotify_event *)) 88 struct fsnotify_event *))
144{ 89{
145 struct fsnotify_event *return_event = NULL; 90 struct fsnotify_event *return_event = NULL;
146 struct fsnotify_event_holder *holder = NULL;
147 struct list_head *list = &group->notification_list; 91 struct list_head *list = &group->notification_list;
148 92
149 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); 93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
150
151 /*
152 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
153 * Check if we expect to be able to use that holder. If not alloc a new
154 * holder.
155 * For the overflow event it's possible that something will use the in
156 * event holder before we get the lock so we may need to jump back and
157 * alloc a new holder, this can't happen for most events...
158 */
159 if (!list_empty(&event->holder.event_list)) {
160alloc_holder:
161 holder = fsnotify_alloc_event_holder();
162 if (!holder)
163 return ERR_PTR(-ENOMEM);
164 }
165 94
166 mutex_lock(&group->notification_mutex); 95 mutex_lock(&group->notification_mutex);
167 96
168 if (group->q_len >= group->max_events) { 97 if (group->q_len >= group->max_events) {
169 event = q_overflow_event; 98 /* Queue overflow event only if it isn't already queued */
170 99 if (list_empty(&group->overflow_event.list))
171 /* 100 event = &group->overflow_event;
172 * we need to return the overflow event
173 * which means we need a ref
174 */
175 fsnotify_get_event(event);
176 return_event = event; 101 return_event = event;
177
178 /* sorry, no private data on the overflow event */
179 priv = NULL;
180 } 102 }
181 103
182 if (!list_empty(list) && merge) { 104 if (!list_empty(list) && merge) {
183 struct fsnotify_event *tmp; 105 return_event = merge(list, event);
184
185 tmp = merge(list, event);
186 if (tmp) {
187 mutex_unlock(&group->notification_mutex);
188
189 if (return_event)
190 fsnotify_put_event(return_event);
191 if (holder != &event->holder)
192 fsnotify_destroy_event_holder(holder);
193 return tmp;
194 }
195 }
196
197 spin_lock(&event->lock);
198
199 if (list_empty(&event->holder.event_list)) {
200 if (unlikely(holder))
201 fsnotify_destroy_event_holder(holder);
202 holder = &event->holder;
203 } else if (unlikely(!holder)) {
204 /* between the time we checked above and got the lock the in
205 * event holder was used, go back and get a new one */
206 spin_unlock(&event->lock);
207 mutex_unlock(&group->notification_mutex);
208
209 if (return_event) { 106 if (return_event) {
210 fsnotify_put_event(return_event); 107 mutex_unlock(&group->notification_mutex);
211 return_event = NULL; 108 return return_event;
212 } 109 }
213
214 goto alloc_holder;
215 } 110 }
216 111
217 group->q_len++; 112 group->q_len++;
218 holder->event = event; 113 list_add_tail(&event->list, list);
219
220 fsnotify_get_event(event);
221 list_add_tail(&holder->event_list, list);
222 if (priv)
223 list_add_tail(&priv->event_list, &event->private_data_list);
224 spin_unlock(&event->lock);
225 mutex_unlock(&group->notification_mutex); 114 mutex_unlock(&group->notification_mutex);
226 115
227 wake_up(&group->notification_waitq); 116 wake_up(&group->notification_waitq);
@@ -230,32 +119,20 @@ alloc_holder:
230} 119}
231 120
232/* 121/*
233 * Remove and return the first event from the notification list. There is a 122 * Remove and return the first event from the notification list. It is the
234 * reference held on this event since it was on the list. It is the responsibility 123 * responsibility of the caller to destroy the obtained event
235 * of the caller to drop this reference.
236 */ 124 */
237struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) 125struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
238{ 126{
239 struct fsnotify_event *event; 127 struct fsnotify_event *event;
240 struct fsnotify_event_holder *holder;
241 128
242 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 129 BUG_ON(!mutex_is_locked(&group->notification_mutex));
243 130
244 pr_debug("%s: group=%p\n", __func__, group); 131 pr_debug("%s: group=%p\n", __func__, group);
245 132
246 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 133 event = list_first_entry(&group->notification_list,
247 134 struct fsnotify_event, list);
248 event = holder->event; 135 list_del(&event->list);
249
250 spin_lock(&event->lock);
251 holder->event = NULL;
252 list_del_init(&holder->event_list);
253 spin_unlock(&event->lock);
254
255 /* event == holder means we are referenced through the in event holder */
256 if (holder != &event->holder)
257 fsnotify_destroy_event_holder(holder);
258
259 group->q_len--; 136 group->q_len--;
260 137
261 return event; 138 return event;
@@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
266 */ 143 */
267struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) 144struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
268{ 145{
269 struct fsnotify_event *event;
270 struct fsnotify_event_holder *holder;
271
272 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 146 BUG_ON(!mutex_is_locked(&group->notification_mutex));
273 147
274 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 148 return list_first_entry(&group->notification_list,
275 event = holder->event; 149 struct fsnotify_event, list);
276
277 return event;
278} 150}
279 151
280/* 152/*
@@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
284void fsnotify_flush_notify(struct fsnotify_group *group) 156void fsnotify_flush_notify(struct fsnotify_group *group)
285{ 157{
286 struct fsnotify_event *event; 158 struct fsnotify_event *event;
287 struct fsnotify_event_private_data *priv;
288 159
289 mutex_lock(&group->notification_mutex); 160 mutex_lock(&group->notification_mutex);
290 while (!fsnotify_notify_queue_is_empty(group)) { 161 while (!fsnotify_notify_queue_is_empty(group)) {
291 event = fsnotify_remove_notify_event(group); 162 event = fsnotify_remove_notify_event(group);
292 /* if they don't implement free_event_priv they better not have attached any */ 163 fsnotify_destroy_event(group, event);
293 if (group->ops->free_event_priv) {
294 spin_lock(&event->lock);
295 priv = fsnotify_remove_priv_from_event(group, event);
296 spin_unlock(&event->lock);
297 if (priv)
298 group->ops->free_event_priv(priv);
299 }
300 fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
301 } 164 }
302 mutex_unlock(&group->notification_mutex); 165 mutex_unlock(&group->notification_mutex);
303} 166}
304 167
305static void initialize_event(struct fsnotify_event *event)
306{
307 INIT_LIST_HEAD(&event->holder.event_list);
308 atomic_set(&event->refcnt, 1);
309
310 spin_lock_init(&event->lock);
311
312 INIT_LIST_HEAD(&event->private_data_list);
313}
314
315/*
316 * Caller damn well better be holding whatever mutex is protecting the
317 * old_holder->event_list and the new_event must be a clean event which
318 * cannot be found anywhere else in the kernel.
319 */
320int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
321 struct fsnotify_event *new_event)
322{
323 struct fsnotify_event *old_event = old_holder->event;
324 struct fsnotify_event_holder *new_holder = &new_event->holder;
325
326 enum event_spinlock_class {
327 SPINLOCK_OLD,
328 SPINLOCK_NEW,
329 };
330
331 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
332
333 /*
334 * if the new_event's embedded holder is in use someone
335 * screwed up and didn't give us a clean new event.
336 */
337 BUG_ON(!list_empty(&new_holder->event_list));
338
339 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
340 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
341
342 new_holder->event = new_event;
343 list_replace_init(&old_holder->event_list, &new_holder->event_list);
344
345 spin_unlock(&new_event->lock);
346 spin_unlock(&old_event->lock);
347
348 /* event == holder means we are referenced through the in event holder */
349 if (old_holder != &old_event->holder)
350 fsnotify_destroy_event_holder(old_holder);
351
352 fsnotify_get_event(new_event); /* on the list take reference */
353 fsnotify_put_event(old_event); /* off the list, drop reference */
354
355 return 0;
356}
357
358struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
359{
360 struct fsnotify_event *event;
361
362 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
363 if (!event)
364 return NULL;
365
366 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
367
368 memcpy(event, old_event, sizeof(*event));
369 initialize_event(event);
370
371 if (event->name_len) {
372 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
373 if (!event->file_name) {
374 kmem_cache_free(fsnotify_event_cachep, event);
375 return NULL;
376 }
377 }
378 event->tgid = get_pid(old_event->tgid);
379 if (event->data_type == FSNOTIFY_EVENT_PATH)
380 path_get(&event->path);
381
382 return event;
383}
384
385/* 168/*
386 * fsnotify_create_event - Allocate a new event which will be sent to each 169 * fsnotify_create_event - Allocate a new event which will be sent to each
387 * group's handle_event function if the group was interested in this 170 * group's handle_event function if the group was interested in this
388 * particular event. 171 * particular event.
389 * 172 *
390 * @to_tell the inode which is supposed to receive the event (sometimes a 173 * @inode the inode which is supposed to receive the event (sometimes a
391 * parent of the inode to which the event happened. 174 * parent of the inode to which the event happened.
392 * @mask what actually happened. 175 * @mask what actually happened.
393 * @data pointer to the object which was actually affected 176 * @data pointer to the object which was actually affected
394 * @data_type flag indicating whether the data is a file, path, inode, or nothing 177 * @data_type flag indicating whether the data is a file, path, inode, or nothing
395 * @name the filename, if available 178 * @name the filename, if available
396 */ 179 */
397struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 180void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
398 int data_type, const unsigned char *name, 181 u32 mask)
399 u32 cookie, gfp_t gfp)
400{ 182{
401 struct fsnotify_event *event; 183 INIT_LIST_HEAD(&event->list);
402 184 event->inode = inode;
403 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
404 if (!event)
405 return NULL;
406
407 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
408 __func__, event, to_tell, mask, data, data_type);
409
410 initialize_event(event);
411
412 if (name) {
413 event->file_name = kstrdup(name, gfp);
414 if (!event->file_name) {
415 kmem_cache_free(fsnotify_event_cachep, event);
416 return NULL;
417 }
418 event->name_len = strlen(event->file_name);
419 }
420
421 event->tgid = get_pid(task_tgid(current));
422 event->sync_cookie = cookie;
423 event->to_tell = to_tell;
424 event->data_type = data_type;
425
426 switch (data_type) {
427 case FSNOTIFY_EVENT_PATH: {
428 struct path *path = data;
429 event->path.dentry = path->dentry;
430 event->path.mnt = path->mnt;
431 path_get(&event->path);
432 break;
433 }
434 case FSNOTIFY_EVENT_INODE:
435 event->inode = data;
436 break;
437 case FSNOTIFY_EVENT_NONE:
438 event->inode = NULL;
439 event->path.dentry = NULL;
440 event->path.mnt = NULL;
441 break;
442 default:
443 BUG();
444 }
445
446 event->mask = mask; 185 event->mask = mask;
447
448 return event;
449}
450
451static __init int fsnotify_notification_init(void)
452{
453 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
454 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
455
456 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
457 FSNOTIFY_EVENT_NONE, NULL, 0,
458 GFP_KERNEL);
459 if (!q_overflow_event)
460 panic("unable to allocate fsnotify q_overflow_event\n");
461
462 return 0;
463} 186}
464subsys_initcall(fsnotify_notification_init);
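The net effect of the hunks above is that event setup collapses into fsnotify_init_event(), with allocation and any extra state left to each backend. A minimal sketch of how a backend might wrap the slimmed-down event after this change (my_event_info, my_cookie and my_alloc_event are hypothetical names, not part of the patch):

#include <linux/fsnotify_backend.h>
#include <linux/slab.h>

struct my_event_info {
	struct fsnotify_event fse;	/* must be first so container_of() works */
	u32 my_cookie;			/* hypothetical backend-private state */
};

static struct my_event_info *my_alloc_event(struct inode *inode, u32 mask)
{
	struct my_event_info *ev = kmalloc(sizeof(*ev), GFP_KERNEL);

	if (!ev)
		return NULL;
	fsnotify_init_event(&ev->fse, inode, mask);	/* initializer from the hunk above */
	ev->my_cookie = 0;
	return ev;
}

The refcount, private-data list and per-event spinlock that initialize_event() used to set up are gone from the core; a backend that still needs such state now carries it in its own wrapper.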
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b32989..ce210d4951a1 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -38,7 +38,6 @@ ocfs2-objs := \
38 symlink.o \ 38 symlink.o \
39 sysfile.o \ 39 sysfile.o \
40 uptodate.o \ 40 uptodate.o \
41 ver.o \
42 quota_local.o \ 41 quota_local.o \
43 quota_global.o \ 42 quota_global.o \
44 xattr.o \ 43 xattr.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc7411fe185d..8750ae1b8636 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7260 start = range->start >> osb->s_clustersize_bits; 7260 start = range->start >> osb->s_clustersize_bits;
7261 len = range->len >> osb->s_clustersize_bits; 7261 len = range->len >> osb->s_clustersize_bits;
7262 minlen = range->minlen >> osb->s_clustersize_bits; 7262 minlen = range->minlen >> osb->s_clustersize_bits;
7263 trimmed = 0;
7264
7265 if (!len) {
7266 range->len = 0;
7267 return 0;
7268 }
7269 7263
7270 if (minlen >= osb->bitmap_cpg) 7264 if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7271 return -EINVAL; 7265 return -EINVAL;
7272 7266
7273 main_bm_inode = ocfs2_get_system_file_inode(osb, 7267 main_bm_inode = ocfs2_get_system_file_inode(osb,
@@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7293 goto out_unlock; 7287 goto out_unlock;
7294 } 7288 }
7295 7289
7290 len = range->len >> osb->s_clustersize_bits;
7296 if (start + len > le32_to_cpu(main_bm->i_clusters)) 7291 if (start + len > le32_to_cpu(main_bm->i_clusters))
7297 len = le32_to_cpu(main_bm->i_clusters) - start; 7292 len = le32_to_cpu(main_bm->i_clusters) - start;
7298 7293
@@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7307 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); 7302 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7308 last_bit = osb->bitmap_cpg; 7303 last_bit = osb->bitmap_cpg;
7309 7304
7305 trimmed = 0;
7310 for (group = first_group; group <= last_group;) { 7306 for (group = first_group; group <= last_group;) {
7311 if (first_bit + len >= osb->bitmap_cpg) 7307 if (first_bit + len >= osb->bitmap_cpg)
7312 last_bit = osb->bitmap_cpg; 7308 last_bit = osb->bitmap_cpg;
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d8608..1aefc0350ec3 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o 1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2 2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ 3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o netdebug.o ver.o 4 quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5f..441c84e169e6 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -29,7 +29,6 @@
29#include "heartbeat.h" 29#include "heartbeat.h"
30#include "masklog.h" 30#include "masklog.h"
31#include "sys.h" 31#include "sys.h"
32#include "ver.h"
33 32
34/* for now we operate under the assertion that there can be only one 33/* for now we operate under the assertion that there can be only one
35 * cluster active at a time. Changing this will require trickling 34 * cluster active at a time. Changing this will require trickling
@@ -945,8 +944,6 @@ static int __init init_o2nm(void)
945{ 944{
946 int ret = -1; 945 int ret = -1;
947 946
948 cluster_print_version();
949
950 ret = o2hb_init(); 947 ret = o2hb_init();
951 if (ret) 948 if (ret)
952 goto out; 949 goto out;
@@ -984,6 +981,7 @@ out:
984 981
985MODULE_AUTHOR("Oracle"); 982MODULE_AUTHOR("Oracle");
986MODULE_LICENSE("GPL"); 983MODULE_LICENSE("GPL");
984MODULE_DESCRIPTION("OCFS2 cluster management");
987 985
988module_init(init_o2nm) 986module_init(init_o2nm)
989module_exit(exit_o2nm) 987module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad3..000000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "ver.h"
30
31#define CLUSTER_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34
35void cluster_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c2..000000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef O2CLUSTER_VER_H
27#define O2CLUSTER_VER_H
28
29void cluster_print_version(void);
30
31#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb15..bd1aab1f49a4 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
7 7
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8b3382abf840..33660a4a52fa 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
43#include "dlmdomain.h" 43#include "dlmdomain.h"
44#include "dlmdebug.h" 44#include "dlmdebug.h"
45 45
46#include "dlmver.h"
47
48#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) 46#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
49#include "cluster/masklog.h" 47#include "cluster/masklog.h"
50 48
@@ -2328,8 +2326,6 @@ static int __init dlm_init(void)
2328{ 2326{
2329 int status; 2327 int status;
2330 2328
2331 dlm_print_version();
2332
2333 status = dlm_init_mle_cache(); 2329 status = dlm_init_mle_cache();
2334 if (status) { 2330 if (status) {
2335 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); 2331 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void)
2379 2375
2380MODULE_AUTHOR("Oracle"); 2376MODULE_AUTHOR("Oracle");
2381MODULE_LICENSE("GPL"); 2377MODULE_LICENSE("GPL");
2378MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
2382 2379
2383module_init(dlm_init); 2380module_init(dlm_init);
2384module_exit(dlm_exit); 2381module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158d..000000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmver.h"
30
31#define DLM_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34
35void dlm_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a16..000000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLM_VER_H
27#define DLM_VER_H
28
29void dlm_print_version(void);
30
31#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a6701..eed3db8c5b49 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4 4
5ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o 5ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index efa2b3d339e3..09b7d9dac71d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
49 49
50#include "stackglue.h" 50#include "stackglue.h"
51#include "userdlm.h" 51#include "userdlm.h"
52#include "dlmfsver.h"
53 52
54#define MLOG_MASK_PREFIX ML_DLMFS 53#define MLOG_MASK_PREFIX ML_DLMFS
55#include "cluster/masklog.h" 54#include "cluster/masklog.h"
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)
644 int status; 643 int status;
645 int cleanup_inode = 0, cleanup_worker = 0; 644 int cleanup_inode = 0, cleanup_worker = 0;
646 645
647 dlmfs_print_version();
648
649 status = bdi_init(&dlmfs_backing_dev_info); 646 status = bdi_init(&dlmfs_backing_dev_info);
650 if (status) 647 if (status)
651 return status; 648 return status;
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
701 698
702MODULE_AUTHOR("Oracle"); 699MODULE_AUTHOR("Oracle");
703MODULE_LICENSE("GPL"); 700MODULE_LICENSE("GPL");
701MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
704 702
705module_init(init_dlmfs_fs) 703module_init(init_dlmfs_fs)
706module_exit(exit_dlmfs_fs) 704module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmfsver.h"
30
31#define DLM_BUILD_VERSION "1.5.0"
32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34
35void dlmfs_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLMFS_VER_H
27#define DLMFS_VER_H
28
29void dlmfs_print_version(void);
30
31#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3407b2c62b21..19986959d149 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2996 2996
2997 /* for now, uuid == domain */ 2997 /* for now, uuid == domain */
2998 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 2998 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2999 osb->osb_cluster_name,
3000 strlen(osb->osb_cluster_name),
2999 osb->uuid_str, 3001 osb->uuid_str,
3000 strlen(osb->uuid_str), 3002 strlen(osb->uuid_str),
3001 &lproto, ocfs2_do_node_down, osb, 3003 &lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
3005 goto bail; 3007 goto bail;
3006 } 3008 }
3007 3009
3008 status = ocfs2_cluster_this_node(&osb->node_num); 3010 status = ocfs2_cluster_this_node(conn, &osb->node_num);
3009 if (status < 0) { 3011 if (status < 0) {
3010 mlog_errno(status); 3012 mlog_errno(status);
3011 mlog(ML_ERROR, 3013 mlog(ML_ERROR,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6fff128cad16..f42eecef6478 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1869 } 1869 }
1870 size = sr->l_start + sr->l_len; 1870 size = sr->l_start + sr->l_len;
1871 1871
1872 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { 1872 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
1873 cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
1873 if (sr->l_len <= 0) { 1874 if (sr->l_len <= 0) {
1874 ret = -EINVAL; 1875 ret = -EINVAL;
1875 goto out_inode_unlock; 1876 goto out_inode_unlock;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index fa32ce9b455d..8ca3c29accbf 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/blkdev.h>
10#include <linux/compat.h> 11#include <linux/compat.h>
11 12
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
966 case FITRIM: 967 case FITRIM:
967 { 968 {
968 struct super_block *sb = inode->i_sb; 969 struct super_block *sb = inode->i_sb;
970 struct request_queue *q = bdev_get_queue(sb->s_bdev);
969 struct fstrim_range range; 971 struct fstrim_range range;
970 int ret = 0; 972 int ret = 0;
971 973
972 if (!capable(CAP_SYS_ADMIN)) 974 if (!capable(CAP_SYS_ADMIN))
973 return -EPERM; 975 return -EPERM;
974 976
977 if (!blk_queue_discard(q))
978 return -EOPNOTSUPP;
979
975 if (copy_from_user(&range, argp, sizeof(range))) 980 if (copy_from_user(&range, argp, sizeof(range)))
976 return -EFAULT; 981 return -EFAULT;
977 982
983 range.minlen = max_t(u64, q->limits.discard_granularity,
984 range.minlen);
978 ret = ocfs2_trim_fs(sb, &range); 985 ret = ocfs2_trim_fs(sb, &range);
979 if (ret < 0) 986 if (ret < 0)
980 return ret; 987 return ret;
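With the checks above in place, FITRIM on ocfs2 behaves like on other filesystems: the request queue must support discard, and minlen is raised to the discard granularity before ocfs2_trim_fs() runs. A small standalone userspace test, assuming a mounted ocfs2 volume (EOPNOTSUPP is the expected failure when the underlying device lacks discard support):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = UINT64_MAX;	/* trim the whole filesystem */
	range.minlen = 0;	/* the kernel raises this to the discard granularity */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}

On success the kernel writes the number of trimmed bytes back into range.len, which is why the struct is passed by pointer.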
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 631a98213474..64c304d668f0 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
561 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 561 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
562} 562}
563 563
564static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
565 handle_t *handle,
566 struct buffer_head *di_bh,
567 u32 num_bits,
568 u16 chain)
569{
570 int ret;
571 u32 tmp_used;
572 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
573 struct ocfs2_chain_list *cl =
574 (struct ocfs2_chain_list *) &di->id2.i_chain;
575
576 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
577 OCFS2_JOURNAL_ACCESS_WRITE);
578 if (ret < 0) {
579 mlog_errno(ret);
580 goto out;
581 }
582
583 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
584 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
585 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
586 ocfs2_journal_dirty(handle, di_bh);
587
588out:
589 return ret;
590}
591
592static inline int ocfs2_block_group_set_bits(handle_t *handle,
593 struct inode *alloc_inode,
594 struct ocfs2_group_desc *bg,
595 struct buffer_head *group_bh,
596 unsigned int bit_off,
597 unsigned int num_bits)
598{
599 int status;
600 void *bitmap = bg->bg_bitmap;
601 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
602
603 /* All callers get the descriptor via
604 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
605 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
606 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
607
608 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
609 num_bits);
610
611 if (ocfs2_is_cluster_bitmap(alloc_inode))
612 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
613
614 status = ocfs2_journal_access_gd(handle,
615 INODE_CACHE(alloc_inode),
616 group_bh,
617 journal_type);
618 if (status < 0) {
619 mlog_errno(status);
620 goto bail;
621 }
622
623 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
624 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
625 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
626 " count %u but claims %u are freed. num_bits %d",
627 (unsigned long long)le64_to_cpu(bg->bg_blkno),
628 le16_to_cpu(bg->bg_bits),
629 le16_to_cpu(bg->bg_free_bits_count), num_bits);
630 return -EROFS;
631 }
632 while (num_bits--)
633 ocfs2_set_bit(bit_off++, bitmap);
634
635 ocfs2_journal_dirty(handle, group_bh);
636
637bail:
638 return status;
639}
640
641static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 564static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
642 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 565 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
643 u32 len, int ext_flags) 566 u32 len, int ext_flags)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c794..553f53cc73ae 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -387,6 +387,7 @@ struct ocfs2_super
387 u8 osb_stackflags; 387 u8 osb_stackflags;
388 388
389 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 389 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
390 char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
390 struct ocfs2_cluster_connection *cconn; 391 struct ocfs2_cluster_connection *cconn;
391 struct ocfs2_lock_res osb_super_lockres; 392 struct ocfs2_lock_res osb_super_lockres;
392 struct ocfs2_lock_res osb_rename_lockres; 393 struct ocfs2_lock_res osb_rename_lockres;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456f..1724d43d3da1 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
398 return 0; 398 return 0;
399} 399}
400 400
401static int o2cb_cluster_this_node(unsigned int *node) 401static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
402 unsigned int *node)
402{ 403{
403 int node_num; 404 int node_num;
404 405
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..13a8537d8e8b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/sched.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28#include "stackglue.h" 29#include "stackglue.h"
@@ -102,6 +103,12 @@
102#define OCFS2_TEXT_UUID_LEN 32 103#define OCFS2_TEXT_UUID_LEN 32
103#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 104#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
104#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 105#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
106#define VERSION_LOCK "version_lock"
107
108enum ocfs2_connection_type {
109 WITH_CONTROLD,
110 NO_CONTROLD
111};
105 112
106/* 113/*
107 * ocfs2_live_connection is refcounted because the filesystem and 114 * ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
110struct ocfs2_live_connection { 117struct ocfs2_live_connection {
111 struct list_head oc_list; 118 struct list_head oc_list;
112 struct ocfs2_cluster_connection *oc_conn; 119 struct ocfs2_cluster_connection *oc_conn;
120 enum ocfs2_connection_type oc_type;
121 atomic_t oc_this_node;
122 int oc_our_slot;
123 struct dlm_lksb oc_version_lksb;
124 char oc_lvb[DLM_LVB_LEN];
125 struct completion oc_sync_wait;
126 wait_queue_head_t oc_wait;
113}; 127};
114 128
115struct ocfs2_control_private { 129struct ocfs2_control_private {
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
198 * mount path. Since the VFS prevents multiple calls to 212 * mount path. Since the VFS prevents multiple calls to
199 * fill_super(), we can't get dupes here. 213 * fill_super(), we can't get dupes here.
200 */ 214 */
201static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, 215static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
202 struct ocfs2_live_connection **c_ret) 216 struct ocfs2_live_connection *c)
203{ 217{
204 int rc = 0; 218 int rc = 0;
205 struct ocfs2_live_connection *c;
206
207 c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
208 if (!c)
209 return -ENOMEM;
210 219
211 mutex_lock(&ocfs2_control_lock); 220 mutex_lock(&ocfs2_control_lock);
212 c->oc_conn = conn; 221 c->oc_conn = conn;
213 222
214 if (atomic_read(&ocfs2_control_opened)) 223 if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
215 list_add(&c->oc_list, &ocfs2_live_connection_list); 224 list_add(&c->oc_list, &ocfs2_live_connection_list);
216 else { 225 else {
217 printk(KERN_ERR 226 printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
220 } 229 }
221 230
222 mutex_unlock(&ocfs2_control_lock); 231 mutex_unlock(&ocfs2_control_lock);
223
224 if (!rc)
225 *c_ret = c;
226 else
227 kfree(c);
228
229 return rc; 232 return rc;
230} 233}
231 234
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
799 return 0; 802 return 0;
800} 803}
801 804
805static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
806{
807 struct ocfs2_protocol_version *pv =
808 (struct ocfs2_protocol_version *)lvb;
809 /*
810 * ocfs2_protocol_version has two u8 variables, so we don't
811 * need any endian conversion.
812 */
813 ver->pv_major = pv->pv_major;
814 ver->pv_minor = pv->pv_minor;
815}
816
817static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
818{
819 struct ocfs2_protocol_version *pv =
820 (struct ocfs2_protocol_version *)lvb;
821 /*
822 * ocfs2_protocol_version has two u8 variables, so we don't
823 * need any endian conversion.
824 */
825 pv->pv_major = ver->pv_major;
826 pv->pv_minor = ver->pv_minor;
827}
828
829static void sync_wait_cb(void *arg)
830{
831 struct ocfs2_cluster_connection *conn = arg;
832 struct ocfs2_live_connection *lc = conn->cc_private;
833 complete(&lc->oc_sync_wait);
834}
835
836static int sync_unlock(struct ocfs2_cluster_connection *conn,
837 struct dlm_lksb *lksb, char *name)
838{
839 int error;
840 struct ocfs2_live_connection *lc = conn->cc_private;
841
842 error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
843 if (error) {
844 printk(KERN_ERR "%s lkid %x error %d\n",
845 name, lksb->sb_lkid, error);
846 return error;
847 }
848
849 wait_for_completion(&lc->oc_sync_wait);
850
851 if (lksb->sb_status != -DLM_EUNLOCK) {
852 printk(KERN_ERR "%s lkid %x status %d\n",
853 name, lksb->sb_lkid, lksb->sb_status);
854 return -1;
855 }
856 return 0;
857}
858
859static int sync_lock(struct ocfs2_cluster_connection *conn,
860 int mode, uint32_t flags,
861 struct dlm_lksb *lksb, char *name)
862{
863 int error, status;
864 struct ocfs2_live_connection *lc = conn->cc_private;
865
866 error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
867 name, strlen(name),
868 0, sync_wait_cb, conn, NULL);
869 if (error) {
870 printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
871 name, lksb->sb_lkid, flags, mode, error);
872 return error;
873 }
874
875 wait_for_completion(&lc->oc_sync_wait);
876
877 status = lksb->sb_status;
878
879 if (status && status != -EAGAIN) {
880 printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
881 name, lksb->sb_lkid, flags, mode, status);
882 }
883
884 return status;
885}
886
887
888static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
889 int flags)
890{
891 struct ocfs2_live_connection *lc = conn->cc_private;
892 return sync_lock(conn, mode, flags,
893 &lc->oc_version_lksb, VERSION_LOCK);
894}
895
896static int version_unlock(struct ocfs2_cluster_connection *conn)
897{
898 struct ocfs2_live_connection *lc = conn->cc_private;
899 return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
900}
901
902/* get_protocol_version()
903 *
904 * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
905 * The algorithm is:
906 * 1. Attempt to take the lock in EX mode (non-blocking).
907 * 2. If successful (which means it is the first mount), write the
908 * version number and downconvert to PR lock.
909 * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
910 * taking the PR lock.
911 */
912
913static int get_protocol_version(struct ocfs2_cluster_connection *conn)
914{
915 int ret;
916 struct ocfs2_live_connection *lc = conn->cc_private;
917 struct ocfs2_protocol_version pv;
918
919 running_proto.pv_major =
920 ocfs2_user_plugin.sp_max_proto.pv_major;
921 running_proto.pv_minor =
922 ocfs2_user_plugin.sp_max_proto.pv_minor;
923
924 lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
925 ret = version_lock(conn, DLM_LOCK_EX,
926 DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
927 if (!ret) {
928 conn->cc_version.pv_major = running_proto.pv_major;
929 conn->cc_version.pv_minor = running_proto.pv_minor;
930 version_to_lvb(&running_proto, lc->oc_lvb);
931 version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
932 } else if (ret == -EAGAIN) {
933 ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
934 if (ret)
935 goto out;
936 lvb_to_version(lc->oc_lvb, &pv);
937
938 if ((pv.pv_major != running_proto.pv_major) ||
939 (pv.pv_minor > running_proto.pv_minor)) {
940 ret = -EINVAL;
941 goto out;
942 }
943
944 conn->cc_version.pv_major = pv.pv_major;
945 conn->cc_version.pv_minor = pv.pv_minor;
946 }
947out:
948 return ret;
949}
950
951static void user_recover_prep(void *arg)
952{
953}
954
955static void user_recover_slot(void *arg, struct dlm_slot *slot)
956{
957 struct ocfs2_cluster_connection *conn = arg;
958 printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
959 slot->nodeid, slot->slot);
960 conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
961
962}
963
964static void user_recover_done(void *arg, struct dlm_slot *slots,
965 int num_slots, int our_slot,
966 uint32_t generation)
967{
968 struct ocfs2_cluster_connection *conn = arg;
969 struct ocfs2_live_connection *lc = conn->cc_private;
970 int i;
971
972 for (i = 0; i < num_slots; i++)
973 if (slots[i].slot == our_slot) {
974 atomic_set(&lc->oc_this_node, slots[i].nodeid);
975 break;
976 }
977
978 lc->oc_our_slot = our_slot;
979 wake_up(&lc->oc_wait);
980}
981
982static const struct dlm_lockspace_ops ocfs2_ls_ops = {
983 .recover_prep = user_recover_prep,
984 .recover_slot = user_recover_slot,
985 .recover_done = user_recover_done,
986};
987
988static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
989{
990 version_unlock(conn);
991 dlm_release_lockspace(conn->cc_lockspace, 2);
992 conn->cc_lockspace = NULL;
993 ocfs2_live_connection_drop(conn->cc_private);
994 conn->cc_private = NULL;
995 return 0;
996}
997
802static int user_cluster_connect(struct ocfs2_cluster_connection *conn) 998static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
803{ 999{
804 dlm_lockspace_t *fsdlm; 1000 dlm_lockspace_t *fsdlm;
805 struct ocfs2_live_connection *uninitialized_var(control); 1001 struct ocfs2_live_connection *lc;
806 int rc = 0; 1002 int rc, ops_rv;
807 1003
808 BUG_ON(conn == NULL); 1004 BUG_ON(conn == NULL);
809 1005
810 rc = ocfs2_live_connection_new(conn, &control); 1006 lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
1007 if (!lc) {
1008 rc = -ENOMEM;
1009 goto out;
1010 }
1011
1012 init_waitqueue_head(&lc->oc_wait);
1013 init_completion(&lc->oc_sync_wait);
1014 atomic_set(&lc->oc_this_node, 0);
1015 conn->cc_private = lc;
1016 lc->oc_type = NO_CONTROLD;
1017
1018 rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
1019 DLM_LSFL_FS, DLM_LVB_LEN,
1020 &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
1021 if (rc)
1022 goto out;
1023
1024 if (ops_rv == -EOPNOTSUPP) {
1025 lc->oc_type = WITH_CONTROLD;
1026 printk(KERN_NOTICE "ocfs2: You seem to be using an older "
1027 "version of dlm_controld and/or ocfs2-tools."
1028 " Please consider upgrading.\n");
1029 } else if (ops_rv) {
1030 rc = ops_rv;
1031 goto out;
1032 }
1033 conn->cc_lockspace = fsdlm;
1034
1035 rc = ocfs2_live_connection_attach(conn, lc);
811 if (rc) 1036 if (rc)
812 goto out; 1037 goto out;
813 1038
1039 if (lc->oc_type == NO_CONTROLD) {
1040 rc = get_protocol_version(conn);
1041 if (rc) {
1042 printk(KERN_ERR "ocfs2: Could not determine"
1043 " locking version\n");
1044 user_cluster_disconnect(conn);
1045 goto out;
1046 }
1047 wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
1048 }
1049
814 /* 1050 /*
815 * running_proto must have been set before we allowed any mounts 1051 * running_proto must have been set before we allowed any mounts
816 * to proceed. 1052 * to proceed.
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
818 if (fs_protocol_compare(&running_proto, &conn->cc_version)) { 1054 if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
819 printk(KERN_ERR 1055 printk(KERN_ERR
820 "Unable to mount with fs locking protocol version " 1056 "Unable to mount with fs locking protocol version "
821 "%u.%u because the userspace control daemon has " 1057 "%u.%u because negotiated protocol is %u.%u\n",
822 "negotiated %u.%u\n",
823 conn->cc_version.pv_major, conn->cc_version.pv_minor, 1058 conn->cc_version.pv_major, conn->cc_version.pv_minor,
824 running_proto.pv_major, running_proto.pv_minor); 1059 running_proto.pv_major, running_proto.pv_minor);
825 rc = -EPROTO; 1060 rc = -EPROTO;
826 ocfs2_live_connection_drop(control); 1061 ocfs2_live_connection_drop(lc);
827 goto out; 1062 lc = NULL;
828 }
829
830 rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
831 NULL, NULL, NULL, &fsdlm);
832 if (rc) {
833 ocfs2_live_connection_drop(control);
834 goto out;
835 } 1063 }
836 1064
837 conn->cc_private = control;
838 conn->cc_lockspace = fsdlm;
839out: 1065out:
1066 if (rc && lc)
1067 kfree(lc);
840 return rc; 1068 return rc;
841} 1069}
842 1070
843static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
844{
845 dlm_release_lockspace(conn->cc_lockspace, 2);
846 conn->cc_lockspace = NULL;
847 ocfs2_live_connection_drop(conn->cc_private);
848 conn->cc_private = NULL;
849 return 0;
850}
851 1071
852static int user_cluster_this_node(unsigned int *this_node) 1072static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
1073 unsigned int *this_node)
853{ 1074{
854 int rc; 1075 int rc;
1076 struct ocfs2_live_connection *lc = conn->cc_private;
1077
1078 if (lc->oc_type == WITH_CONTROLD)
1079 rc = ocfs2_control_get_this_node();
1080 else if (lc->oc_type == NO_CONTROLD)
1081 rc = atomic_read(&lc->oc_this_node);
1082 else
1083 rc = -EINVAL;
855 1084
856 rc = ocfs2_control_get_this_node();
857 if (rc < 0) 1085 if (rc < 0)
858 return rc; 1086 return rc;
859 1087
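The negotiation rule buried in get_protocol_version() above is worth stating on its own: the first mount publishes its protocol version in the lock value block under an EX lock; every later mount takes the lock in PR mode, must match the published major version and offer at least the published minor, and then runs at the published minor. A userspace model of just that comparison (struct proto_ver and negotiate() are illustrative names, not kernel API):

#include <stdint.h>
#include <stdio.h>

struct proto_ver {
	uint8_t major;
	uint8_t minor;
};

/* Mirror of the check in get_protocol_version(): reject on major mismatch
 * or when the published minor exceeds ours, otherwise adopt the published
 * (lowest common) version. */
static int negotiate(struct proto_ver published, struct proto_ver ours,
		     struct proto_ver *result)
{
	if (published.major != ours.major || published.minor > ours.minor)
		return -1;	/* the kernel returns -EINVAL here */
	*result = published;
	return 0;
}

int main(void)
{
	struct proto_ver first = { 1, 0 }, later = { 1, 2 }, got;

	if (!negotiate(first, later, &got))
		printf("negotiated %u.%u\n", got.major, got.minor);	/* 1.0 */
	return 0;
}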
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index cb7ec0b63ddc..1324e6600e57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
309EXPORT_SYMBOL_GPL(ocfs2_plock); 309EXPORT_SYMBOL_GPL(ocfs2_plock);
310 310
311int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
312 const char *cluster_name,
313 int cluster_name_len,
312 const char *group, 314 const char *group,
313 int grouplen, 315 int grouplen,
314 struct ocfs2_locking_protocol *lproto, 316 struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name,
342 goto out; 344 goto out;
343 } 345 }
344 346
345 memcpy(new_conn->cc_name, group, grouplen); 347 strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
346 new_conn->cc_namelen = grouplen; 348 new_conn->cc_namelen = grouplen;
349 strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1);
350 new_conn->cc_cluster_name_len = cluster_name_len;
347 new_conn->cc_recovery_handler = recovery_handler; 351 new_conn->cc_recovery_handler = recovery_handler;
348 new_conn->cc_recovery_data = recovery_data; 352 new_conn->cc_recovery_data = recovery_data;
349 353
@@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
386 390
387 if (cluster_stack_name[0]) 391 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name; 392 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, 393 return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
390 recovery_handler, recovery_data, conn); 394 lproto, recovery_handler, recovery_data,
395 conn);
391} 396}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); 397EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393 398
@@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
460} 465}
461EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); 466EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
462 467
463int ocfs2_cluster_this_node(unsigned int *node) 468int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
469 unsigned int *node)
464{ 470{
465 return active_stack->sp_ops->this_node(node); 471 return active_stack->sp_ops->this_node(conn, node);
466} 472}
467EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); 473EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
468 474
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0d..66334a30cea8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
45 */ 45 */
46#define GROUP_NAME_MAX 64 46#define GROUP_NAME_MAX 64
47 47
48/* This shadows OCFS2_CLUSTER_NAME_LEN */
49#define CLUSTER_NAME_MAX 16
50
48 51
49/* 52/*
50 * ocfs2_protocol_version changes when ocfs2 does something different in 53 * ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
97 * locking compatibility. 100 * locking compatibility.
98 */ 101 */
99struct ocfs2_cluster_connection { 102struct ocfs2_cluster_connection {
100 char cc_name[GROUP_NAME_MAX]; 103 char cc_name[GROUP_NAME_MAX + 1];
101 int cc_namelen; 104 int cc_namelen;
105 char cc_cluster_name[CLUSTER_NAME_MAX + 1];
106 int cc_cluster_name_len;
102 struct ocfs2_protocol_version cc_version; 107 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto; 108 struct ocfs2_locking_protocol *cc_proto;
104 void (*cc_recovery_handler)(int node_num, void *recovery_data); 109 void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
152 * ->this_node() returns the cluster's unique identifier for the 157 * ->this_node() returns the cluster's unique identifier for the
153 * local node. 158 * local node.
154 */ 159 */
155 int (*this_node)(unsigned int *node); 160 int (*this_node)(struct ocfs2_cluster_connection *conn,
161 unsigned int *node);
156 162
157 /* 163 /*
158 * Call the underlying dlm lock function. The ->dlm_lock() 164 * Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
239 245
240/* Used by the filesystem */ 246/* Used by the filesystem */
241int ocfs2_cluster_connect(const char *stack_name, 247int ocfs2_cluster_connect(const char *stack_name,
248 const char *cluster_name,
249 int cluster_name_len,
242 const char *group, 250 const char *group,
243 int grouplen, 251 int grouplen,
244 struct ocfs2_locking_protocol *lproto, 252 struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 268int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
261 int hangup_pending); 269 int hangup_pending);
262void ocfs2_cluster_hangup(const char *group, int grouplen); 270void ocfs2_cluster_hangup(const char *group, int grouplen);
263int ocfs2_cluster_this_node(unsigned int *node); 271int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
272 unsigned int *node);
264 273
265struct ocfs2_lock_res; 274struct ocfs2_lock_res;
266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 275int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
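Passing the connection into ->this_node lets a stack derive the local node from per-connection state instead of a global, which is exactly what the fs/dlm-based path in stack_user.c above needs. A sketch of a plugin-side implementation under the new signature (my_stack_this_node and my_stack_private are hypothetical):

struct my_stack_private {
	unsigned int local_node;	/* filled in during connect */
};

static int my_stack_this_node(struct ocfs2_cluster_connection *conn,
			      unsigned int *node)
{
	struct my_stack_private *priv = conn->cc_private;

	*node = priv->local_node;
	return 0;
}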
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2c91452c4047..47ae2663a6f5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
113 struct ocfs2_suballoc_result *res); 113 struct ocfs2_suballoc_result *res);
114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115 int nr); 115 int nr);
116static inline int ocfs2_block_group_set_bits(handle_t *handle,
117 struct inode *alloc_inode,
118 struct ocfs2_group_desc *bg,
119 struct buffer_head *group_bh,
120 unsigned int bit_off,
121 unsigned int num_bits);
122static int ocfs2_relink_block_group(handle_t *handle, 116static int ocfs2_relink_block_group(handle_t *handle,
123 struct inode *alloc_inode, 117 struct inode *alloc_inode,
124 struct buffer_head *fe_bh, 118 struct buffer_head *fe_bh,
@@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1343 return status; 1337 return status;
1344} 1338}
1345 1339
1346static inline int ocfs2_block_group_set_bits(handle_t *handle, 1340int ocfs2_block_group_set_bits(handle_t *handle,
1347 struct inode *alloc_inode, 1341 struct inode *alloc_inode,
1348 struct ocfs2_group_desc *bg, 1342 struct ocfs2_group_desc *bg,
1349 struct buffer_head *group_bh, 1343 struct buffer_head *group_bh,
@@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1388 ocfs2_journal_dirty(handle, group_bh); 1382 ocfs2_journal_dirty(handle, group_bh);
1389 1383
1390bail: 1384bail:
1391 if (status)
1392 mlog_errno(status);
1393 return status; 1385 return status;
1394} 1386}
1395 1387
@@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1588 return ret; 1580 return ret;
1589} 1581}
1590 1582
1591static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1583int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1592 handle_t *handle, 1584 handle_t *handle,
1593 struct buffer_head *di_bh, 1585 struct buffer_head *di_bh,
1594 u32 num_bits, 1586 u32 num_bits,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa50911..218d8036b3e7 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
86 u32 bits_wanted, 86 u32 bits_wanted,
87 struct ocfs2_alloc_context **ac); 87 struct ocfs2_alloc_context **ac);
88 88
89int ocfs2_alloc_dinode_update_counts(struct inode *inode,
90 handle_t *handle,
91 struct buffer_head *di_bh,
92 u32 num_bits,
93 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg,
97 struct buffer_head *group_bh,
98 unsigned int bit_off,
99 unsigned int num_bits);
100
89int ocfs2_claim_metadata(handle_t *handle, 101int ocfs2_claim_metadata(handle_t *handle,
90 struct ocfs2_alloc_context *ac, 102 struct ocfs2_alloc_context *ac,
91 u32 bits_wanted, 103 u32 bits_wanted,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c41492957aa5..49d84f80f36c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,6 @@
68#include "super.h" 68#include "super.h"
69#include "sysfile.h" 69#include "sysfile.h"
70#include "uptodate.h" 70#include "uptodate.h"
71#include "ver.h"
72#include "xattr.h" 71#include "xattr.h"
73#include "quota.h" 72#include "quota.h"
74#include "refcounttree.h" 73#include "refcounttree.h"
@@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL;
90 89
91MODULE_AUTHOR("Oracle"); 90MODULE_AUTHOR("Oracle");
92MODULE_LICENSE("GPL"); 91MODULE_LICENSE("GPL");
92MODULE_DESCRIPTION("OCFS2 cluster file system");
93 93
94struct mount_options 94struct mount_options
95{ 95{
@@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void)
1618{ 1618{
1619 int status, i; 1619 int status, i;
1620 1620
1621 ocfs2_print_version();
1622
1623 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) 1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1624 init_waitqueue_head(&ocfs2__ioend_wq[i]); 1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1625 1623
@@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1947 1945
1948 ocfs2_shutdown_local_alloc(osb); 1946 ocfs2_shutdown_local_alloc(osb);
1949 1947
1950 ocfs2_truncate_log_shutdown(osb);
1951
1952 /* This will disable recovery and flush any recovery work. */ 1948 /* This will disable recovery and flush any recovery work. */
1953 ocfs2_recovery_exit(osb); 1949 ocfs2_recovery_exit(osb);
1954 1950
1951 /*
1952 * During dismount, recovering another node can call ocfs2_recover_orphans()
1953 * and queue the delayed work osb_truncate_log_wq, so shut the truncate log down only after recovery has exited.
1954 */
1955 ocfs2_truncate_log_shutdown(osb);
1956
1955 ocfs2_journal_shutdown(osb); 1957 ocfs2_journal_shutdown(osb);
1956 1958
1957 ocfs2_sync_blockdev(sb); 1959 ocfs2_sync_blockdev(sb);
@@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2225 if (ocfs2_clusterinfo_valid(osb)) { 2227 if (ocfs2_clusterinfo_valid(osb)) {
2226 osb->osb_stackflags = 2228 osb->osb_stackflags =
2227 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2228 memcpy(osb->osb_cluster_stack, 2230 strlcpy(osb->osb_cluster_stack,
2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2231 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2230 OCFS2_STACK_LABEL_LEN); 2232 OCFS2_STACK_LABEL_LEN + 1);
2231 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
2232 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2233 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
2233 mlog(ML_ERROR, 2234 mlog(ML_ERROR,
2234 "couldn't mount because of an invalid " 2235 "couldn't mount because of an invalid "
@@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2237 status = -EINVAL; 2238 status = -EINVAL;
2238 goto bail; 2239 goto bail;
2239 } 2240 }
2241 strlcpy(osb->osb_cluster_name,
2242 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
2243 OCFS2_CLUSTER_NAME_LEN + 1);
2240 } else { 2244 } else {
2241 /* The empty string is identical with classic tools that 2245 /* The empty string is identical with classic tools that
2242 * don't know about s_cluster_info. */ 2246 * don't know about s_cluster_info. */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a2..000000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.5.0"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2f..000000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
29void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8bd2135b7f82..021e7c069b86 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,11 +22,80 @@
22 22
23#include <linux/errno.h> 23#include <linux/errno.h>
24 24
25EXPORT_SYMBOL(posix_acl_init); 25struct posix_acl **acl_by_type(struct inode *inode, int type)
26EXPORT_SYMBOL(posix_acl_alloc); 26{
27EXPORT_SYMBOL(posix_acl_valid); 27 switch (type) {
28EXPORT_SYMBOL(posix_acl_equiv_mode); 28 case ACL_TYPE_ACCESS:
29EXPORT_SYMBOL(posix_acl_from_mode); 29 return &inode->i_acl;
30 case ACL_TYPE_DEFAULT:
31 return &inode->i_default_acl;
32 default:
33 BUG();
34 }
35}
36EXPORT_SYMBOL(acl_by_type);
37
38struct posix_acl *get_cached_acl(struct inode *inode, int type)
39{
40 struct posix_acl **p = acl_by_type(inode, type);
41 struct posix_acl *acl = ACCESS_ONCE(*p);
42 if (acl) {
43 spin_lock(&inode->i_lock);
44 acl = *p;
45 if (acl != ACL_NOT_CACHED)
46 acl = posix_acl_dup(acl);
47 spin_unlock(&inode->i_lock);
48 }
49 return acl;
50}
51EXPORT_SYMBOL(get_cached_acl);
52
53struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
54{
55 return rcu_dereference(*acl_by_type(inode, type));
56}
57EXPORT_SYMBOL(get_cached_acl_rcu);
58
59void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
60{
61 struct posix_acl **p = acl_by_type(inode, type);
62 struct posix_acl *old;
63 spin_lock(&inode->i_lock);
64 old = *p;
65 rcu_assign_pointer(*p, posix_acl_dup(acl));
66 spin_unlock(&inode->i_lock);
67 if (old != ACL_NOT_CACHED)
68 posix_acl_release(old);
69}
70EXPORT_SYMBOL(set_cached_acl);
71
72void forget_cached_acl(struct inode *inode, int type)
73{
74 struct posix_acl **p = acl_by_type(inode, type);
75 struct posix_acl *old;
76 spin_lock(&inode->i_lock);
77 old = *p;
78 *p = ACL_NOT_CACHED;
79 spin_unlock(&inode->i_lock);
80 if (old != ACL_NOT_CACHED)
81 posix_acl_release(old);
82}
83EXPORT_SYMBOL(forget_cached_acl);
84
85void forget_all_cached_acls(struct inode *inode)
86{
87 struct posix_acl *old_access, *old_default;
88 spin_lock(&inode->i_lock);
89 old_access = inode->i_acl;
90 old_default = inode->i_default_acl;
91 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
92 spin_unlock(&inode->i_lock);
93 if (old_access != ACL_NOT_CACHED)
94 posix_acl_release(old_access);
95 if (old_default != ACL_NOT_CACHED)
96 posix_acl_release(old_default);
97}
98EXPORT_SYMBOL(forget_all_cached_acls);
30 99
31/* 100/*
32 * Init a fresh posix_acl 101 * Init a fresh posix_acl
@@ -37,6 +106,7 @@ posix_acl_init(struct posix_acl *acl, int count)
37 atomic_set(&acl->a_refcount, 1); 106 atomic_set(&acl->a_refcount, 1);
38 acl->a_count = count; 107 acl->a_count = count;
39} 108}
109EXPORT_SYMBOL(posix_acl_init);
40 110
41/* 111/*
42 * Allocate a new ACL with the specified number of entries. 112 * Allocate a new ACL with the specified number of entries.
@@ -51,6 +121,7 @@ posix_acl_alloc(int count, gfp_t flags)
51 posix_acl_init(acl, count); 121 posix_acl_init(acl, count);
52 return acl; 122 return acl;
53} 123}
124EXPORT_SYMBOL(posix_acl_alloc);
54 125
55/* 126/*
56 * Clone an ACL. 127 * Clone an ACL.
@@ -146,6 +217,7 @@ posix_acl_valid(const struct posix_acl *acl)
146 return 0; 217 return 0;
147 return -EINVAL; 218 return -EINVAL;
148} 219}
220EXPORT_SYMBOL(posix_acl_valid);
149 221
150/* 222/*
151 * Returns 0 if the acl can be exactly represented in the traditional 223 * Returns 0 if the acl can be exactly represented in the traditional
@@ -186,6 +258,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
186 *mode_p = (*mode_p & ~S_IRWXUGO) | mode; 258 *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
187 return not_equiv; 259 return not_equiv;
188} 260}
261EXPORT_SYMBOL(posix_acl_equiv_mode);
189 262
190/* 263/*
191 * Create an ACL representing the file mode permission bits of an inode. 264 * Create an ACL representing the file mode permission bits of an inode.
@@ -207,6 +280,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
207 acl->a_entries[2].e_perm = (mode & S_IRWXO); 280 acl->a_entries[2].e_perm = (mode & S_IRWXO);
208 return acl; 281 return acl;
209} 282}
283EXPORT_SYMBOL(posix_acl_from_mode);
210 284
211/* 285/*
212 * Return 0 if current is granted want access to the inode 286 * Return 0 if current is granted want access to the inode
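Moving these helpers out of line and exporting them lets every filesystem's ->get_acl path share one cache discipline: consult the cache first, fall back to on-disk metadata, then populate the cache for the next caller. A hedged sketch of that consumer pattern (myfs_get_acl and myfs_read_acl_from_disk are hypothetical; the shape matches how filesystems already use these helpers):

#include <linux/fs.h>
#include <linux/posix_acl.h>

static struct posix_acl *myfs_get_acl(struct inode *inode, int type)
{
	struct posix_acl *acl;

	acl = get_cached_acl(inode, type);	/* returns its own reference */
	if (acl != ACL_NOT_CACHED)
		return acl;

	acl = myfs_read_acl_from_disk(inode, type);	/* hypothetical I/O */
	if (!IS_ERR(acl))
		set_cached_acl(inode, type, acl);	/* may cache NULL: "no ACL" */
	return acl;
}

Note the fast path in get_cached_acl() above: the unlocked ACCESS_ONCE() read only decides whether taking i_lock is worthwhile; the pointer is re-read under the lock before being duplicated.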
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a77d2b299199..24270eceddbf 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
26 unsigned long committed; 26 unsigned long committed;
27 struct vmalloc_info vmi; 27 struct vmalloc_info vmi;
28 long cached; 28 long cached;
29 long available;
30 unsigned long pagecache;
31 unsigned long wmark_low = 0;
29 unsigned long pages[NR_LRU_LISTS]; 32 unsigned long pages[NR_LRU_LISTS];
33 struct zone *zone;
30 int lru; 34 int lru;
31 35
32/* 36/*
@@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
47 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 51 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
48 pages[lru] = global_page_state(NR_LRU_BASE + lru); 52 pages[lru] = global_page_state(NR_LRU_BASE + lru);
49 53
54 for_each_zone(zone)
55 wmark_low += zone->watermark[WMARK_LOW];
56
57 /*
58 * Estimate the amount of memory available for userspace allocations,
59 * without causing swapping.
60 *
61 * Free memory cannot be taken below the low watermark, before the
62 * system starts swapping.
63 */
64 available = i.freeram - wmark_low;
65
66 /*
67 * Not all the page cache can be freed, otherwise the system will
68 * start swapping. Assume at least half of the page cache, or the
69 * low watermark worth of cache, needs to stay.
70 */
71 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
72 pagecache -= min(pagecache / 2, wmark_low);
73 available += pagecache;
74
75 /*
76 * Part of the reclaimable swap consists of items that are in use,
77 * and cannot be freed. Cap this estimate at the low watermark.
78 */
79 available += global_page_state(NR_SLAB_RECLAIMABLE) -
80 min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
81
82 if (available < 0)
83 available = 0;
84
50 /* 85 /*
51 * Tagged format, for easy grepping and expansion. 86 * Tagged format, for easy grepping and expansion.
52 */ 87 */
53 seq_printf(m, 88 seq_printf(m,
54 "MemTotal: %8lu kB\n" 89 "MemTotal: %8lu kB\n"
55 "MemFree: %8lu kB\n" 90 "MemFree: %8lu kB\n"
91 "MemAvailable: %8lu kB\n"
56 "Buffers: %8lu kB\n" 92 "Buffers: %8lu kB\n"
57 "Cached: %8lu kB\n" 93 "Cached: %8lu kB\n"
58 "SwapCached: %8lu kB\n" 94 "SwapCached: %8lu kB\n"
@@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
105 , 141 ,
106 K(i.totalram), 142 K(i.totalram),
107 K(i.freeram), 143 K(i.freeram),
144 K(available),
108 K(i.bufferram), 145 K(i.bufferram),
109 K(cached), 146 K(cached),
110 K(total_swapcache_pages()), 147 K(total_swapcache_pages()),
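Restated outside the diff, the MemAvailable heuristic combines three terms: free pages above the low watermark, roughly half the file page cache, and roughly half the reclaimable slab. A self-contained sketch of the same arithmetic (not the kernel's actual function; min() is the helper from <linux/kernel.h>):

#include <linux/kernel.h>	/* min() */

static unsigned long estimate_available(unsigned long freeram,
					unsigned long active_file,
					unsigned long inactive_file,
					unsigned long slab_reclaimable,
					unsigned long wmark_low)
{
	unsigned long pagecache;
	long available;

	/* Free pages above the low watermark are usable outright. */
	available = freeram - wmark_low;

	/* Assume at least half the file cache, or the low watermark's
	 * worth of it, must stay resident. */
	pagecache = active_file + inactive_file;
	pagecache -= min(pagecache / 2, wmark_low);
	available += pagecache;

	/* Same capping rule for reclaimable slab. */
	available += slab_reclaimable -
		     min(slab_reclaimable / 2, wmark_low);

	return available < 0 ? 0 : available;
}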
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b8e93a40a5d3..78c3c2097787 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi)
443 pstore_get_records(0); 443 pstore_get_records(0);
444 444
445 kmsg_dump_register(&pstore_dumper); 445 kmsg_dump_register(&pstore_dumper);
446 pstore_register_console(); 446
447 pstore_register_ftrace(); 447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
448 pstore_register_console();
449 pstore_register_ftrace();
450 }
448 451
449 if (pstore_update_ms >= 0) { 452 if (pstore_update_ms >= 0) {
450 pstore_timer.expires = jiffies + 453 pstore_timer.expires = jiffies +
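A backend sets PSTORE_FLAGS_FRAGILE when its write path is too slow or unsafe to be called from the console and ftrace hot paths. A hypothetical registration sketch (the "example" backend and its callbacks are invented for illustration):

static struct pstore_info example_pstore = {
	.owner	= THIS_MODULE,
	.name	= "example",
	.flags	= PSTORE_FLAGS_FRAGILE,	/* skip console/ftrace front ends */
	.open	= example_open,		/* hypothetical callbacks */
	.close	= example_close,
	.read	= example_read,
	.write	= example_write,
	.erase	= example_erase,
};

static int __init example_pstore_init(void)
{
	/* pstore_register() still hooks up the kmsg dumper, but with
	 * the flag set it skips pstore_register_console() and
	 * pstore_register_ftrace() for this backend. */
	return pstore_register(&example_pstore);
}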
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 39d14659a8d3..6a3e2c420180 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -275,4 +275,4 @@ int __init init_ramfs_fs(void)
275 275
276 return err; 276 return err;
277} 277}
278module_init(init_ramfs_fs) 278fs_initcall(init_ramfs_fs);
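module_init() on built-in code expands to device_initcall() (level 6), while fs_initcall() runs earlier at level 5, so ramfs/rootfs is now registered before device-level initcalls that may depend on it. A sketch of the pattern, assuming a hypothetical example_fs_type:

/* Initcall levels, per include/linux/init.h:
 *   core(1) postcore(2) arch(3) subsys(4) fs(5) rootfs device(6) late(7)
 * module_init() on built-in code is device_initcall(), i.e. level 6.
 */
static int __init example_fs_init(void)
{
	return register_filesystem(&example_fs_type);	/* hypothetical */
}
fs_initcall(example_fs_init);	/* level 5: ahead of device_initcall users */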
diff --git a/fs/read_write.c b/fs/read_write.c
index 58e440df1bc6..1193ffd03565 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
901 io_fn_t fn; 901 io_fn_t fn;
902 iov_fn_t fnv; 902 iov_fn_t fnv;
903 903
904 ret = -EFAULT;
905 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
906 goto out;
907
908 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, 904 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
909 UIO_FASTIOV, iovstack, &iov); 905 UIO_FASTIOV, iovstack, &iov);
910 if (ret <= 0) 906 if (ret <= 0)
diff --git a/fs/splice.c b/fs/splice.c
index 46a08f772d7d..12028fa41def 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = {
555 .get = generic_pipe_buf_get, 555 .get = generic_pipe_buf_get,
556}; 556};
557 557
558static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
559 struct pipe_buffer *buf)
560{
561 return 1;
562}
563
564/* Pipe buffer operations for a socket and similar. */
565const struct pipe_buf_operations nosteal_pipe_buf_ops = {
566 .can_merge = 0,
567 .map = generic_pipe_buf_map,
568 .unmap = generic_pipe_buf_unmap,
569 .confirm = generic_pipe_buf_confirm,
570 .release = generic_pipe_buf_release,
571 .steal = generic_pipe_buf_nosteal,
572 .get = generic_pipe_buf_get,
573};
574EXPORT_SYMBOL(nosteal_pipe_buf_ops);
575
558static ssize_t kernel_readv(struct file *file, const struct iovec *vec, 576static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
559 unsigned long vlen, loff_t offset) 577 unsigned long vlen, loff_t offset)
560{ 578{
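nosteal_pipe_buf_ops is for pipe buffers whose pages the producer still owns (a socket, for instance): returning 1 from ->steal forces splice to copy the data rather than take the page. A sketch modeled loosely on splice_to_pipe(), assuming the caller holds the pipe lock and has checked for a free slot (the helper name is invented):

static void example_queue_page(struct pipe_inode_info *pipe,
			       struct page *page, unsigned int offset,
			       unsigned int len)
{
	int slot = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
	struct pipe_buffer *buf = pipe->bufs + slot;

	get_page(page);			/* the pipe keeps its own reference */
	buf->page = page;
	buf->offset = offset;
	buf->len = len;
	buf->ops = &nosteal_pipe_buf_ops;	/* ->steal returns 1: copy, don't take */
	pipe->nrbufs++;
}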
diff --git a/fs/super.c b/fs/super.c
index e5f6c2cfac38..cecd780e0f44 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
166 if (!s) 166 if (!s)
167 return NULL; 167 return NULL;
168 168
169 INIT_LIST_HEAD(&s->s_mounts);
170
169 if (security_sb_alloc(s)) 171 if (security_sb_alloc(s))
170 goto fail; 172 goto fail;
171 173
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
188 if (list_lru_init(&s->s_inode_lru)) 190 if (list_lru_init(&s->s_inode_lru))
189 goto fail; 191 goto fail;
190 192
191 INIT_LIST_HEAD(&s->s_mounts);
192 init_rwsem(&s->s_umount); 193 init_rwsem(&s->s_umount);
193 lockdep_set_class(&s->s_umount, &type->s_umount_key); 194 lockdep_set_class(&s->s_umount, &type->s_umount_key);
194 /* 195 /*
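Moving INIT_LIST_HEAD(&s->s_mounts) ahead of the first failure point matters because the error path runs list checks on the superblock: on a kzalloc'd list head, next is NULL rather than self-pointing, so list_empty() reports non-empty and teardown can warn or walk garbage. A minimal sketch of the hazard (example_destroy is hypothetical):

static void example_destroy(struct super_block *s)
{
	/* Safe only if INIT_LIST_HEAD(&s->s_mounts) already ran: a
	 * zeroed head has next == NULL, so list_empty() is false and
	 * the WARN fires for a superblock that never held a mount. */
	WARN_ON(!list_empty(&s->s_mounts));
	kfree(s);
}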
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 8876ac183373..6eff6e1205a5 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,4 +2,4 @@
2# Makefile for the sysfs virtual filesystem 2# Makefile for the sysfs virtual filesystem
3# 3#
4 4
5obj-y := inode.o file.o dir.o symlink.o mount.o group.o 5obj-y := file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5e73d6626e50..ee0d761c3179 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -13,465 +13,31 @@
13#undef DEBUG 13#undef DEBUG
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h>
17#include <linux/module.h>
18#include <linux/kobject.h> 16#include <linux/kobject.h>
19#include <linux/namei.h>
20#include <linux/idr.h>
21#include <linux/completion.h>
22#include <linux/mutex.h>
23#include <linux/slab.h> 17#include <linux/slab.h>
24#include <linux/security.h>
25#include <linux/hash.h>
26#include "sysfs.h" 18#include "sysfs.h"
27 19
28DEFINE_MUTEX(sysfs_mutex);
29DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
30 21
31#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
32
33static DEFINE_SPINLOCK(sysfs_ino_lock);
34static DEFINE_IDA(sysfs_ino_ida);
35
36/**
37 * sysfs_name_hash
38 * @name: Null terminated string to hash
39 * @ns: Namespace tag to hash
40 *
 41 * Returns a 31-bit hash of ns + name (so it fits in an off_t)
42 */
43static unsigned int sysfs_name_hash(const char *name, const void *ns)
44{
45 unsigned long hash = init_name_hash();
46 unsigned int len = strlen(name);
47 while (len--)
48 hash = partial_name_hash(*name++, hash);
49 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
50 hash &= 0x7fffffffU;
51 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
52 if (hash < 1)
53 hash += 2;
54 if (hash >= INT_MAX)
55 hash = INT_MAX - 1;
56 return hash;
57}
58
59static int sysfs_name_compare(unsigned int hash, const char *name,
60 const void *ns, const struct sysfs_dirent *sd)
61{
62 if (hash != sd->s_hash)
63 return hash - sd->s_hash;
64 if (ns != sd->s_ns)
65 return ns - sd->s_ns;
66 return strcmp(name, sd->s_name);
67}
68
69static int sysfs_sd_compare(const struct sysfs_dirent *left,
70 const struct sysfs_dirent *right)
71{
72 return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
73 right);
74}
75
76/**
77 * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
78 * @sd: sysfs_dirent of interest
79 *
80 * Link @sd into its sibling rbtree which starts from
81 * sd->s_parent->s_dir.children.
82 *
83 * Locking:
84 * mutex_lock(sysfs_mutex)
85 *
86 * RETURNS:
 87 * 0 on success, -EEXIST on failure.
88 */
89static int sysfs_link_sibling(struct sysfs_dirent *sd)
90{
91 struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
92 struct rb_node *parent = NULL;
93
94 if (sysfs_type(sd) == SYSFS_DIR)
95 sd->s_parent->s_dir.subdirs++;
96
97 while (*node) {
98 struct sysfs_dirent *pos;
99 int result;
100
101 pos = to_sysfs_dirent(*node);
102 parent = *node;
103 result = sysfs_sd_compare(sd, pos);
104 if (result < 0)
105 node = &pos->s_rb.rb_left;
106 else if (result > 0)
107 node = &pos->s_rb.rb_right;
108 else
109 return -EEXIST;
110 }
111 /* add new node and rebalance the tree */
112 rb_link_node(&sd->s_rb, parent, node);
113 rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
114 return 0;
115}
116
117/**
118 * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
119 * @sd: sysfs_dirent of interest
120 *
121 * Unlink @sd from its sibling rbtree which starts from
122 * sd->s_parent->s_dir.children.
123 *
124 * Locking:
125 * mutex_lock(sysfs_mutex)
126 */
127static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
128{
129 if (sysfs_type(sd) == SYSFS_DIR)
130 sd->s_parent->s_dir.subdirs--;
131
132 rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
133}
134
135/**
136 * sysfs_get_active - get an active reference to sysfs_dirent
137 * @sd: sysfs_dirent to get an active reference to
138 *
 139 * Get an active reference to @sd. This function is a noop if @sd
140 * is NULL.
141 *
142 * RETURNS:
143 * Pointer to @sd on success, NULL on failure.
144 */
145struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
146{
147 if (unlikely(!sd))
148 return NULL;
149
150 if (!atomic_inc_unless_negative(&sd->s_active))
151 return NULL;
152
153 if (likely(!sysfs_ignore_lockdep(sd)))
154 rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
155 return sd;
156}
157
158/**
159 * sysfs_put_active - put an active reference to sysfs_dirent
160 * @sd: sysfs_dirent to put an active reference to
161 *
 162 * Put an active reference to @sd. This function is a noop if @sd
163 * is NULL.
164 */
165void sysfs_put_active(struct sysfs_dirent *sd)
166{
167 int v;
168
169 if (unlikely(!sd))
170 return;
171
172 if (likely(!sysfs_ignore_lockdep(sd)))
173 rwsem_release(&sd->dep_map, 1, _RET_IP_);
174 v = atomic_dec_return(&sd->s_active);
175 if (likely(v != SD_DEACTIVATED_BIAS))
176 return;
177
178 /* atomic_dec_return() is a mb(), we'll always see the updated
179 * sd->u.completion.
180 */
181 complete(sd->u.completion);
182}
183
184/**
185 * sysfs_deactivate - deactivate sysfs_dirent
186 * @sd: sysfs_dirent to deactivate
187 *
188 * Deny new active references and drain existing ones.
189 */
190static void sysfs_deactivate(struct sysfs_dirent *sd)
191{
192 DECLARE_COMPLETION_ONSTACK(wait);
193 int v;
194
195 BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
196
197 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
198 return;
199
200 sd->u.completion = (void *)&wait;
201
202 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
203 /* atomic_add_return() is a mb(), put_active() will always see
204 * the updated sd->u.completion.
205 */
206 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
207
208 if (v != SD_DEACTIVATED_BIAS) {
209 lock_contended(&sd->dep_map, _RET_IP_);
210 wait_for_completion(&wait);
211 }
212
213 lock_acquired(&sd->dep_map, _RET_IP_);
214 rwsem_release(&sd->dep_map, 1, _RET_IP_);
215}
216
217static int sysfs_alloc_ino(unsigned int *pino)
218{
219 int ino, rc;
220
221 retry:
222 spin_lock(&sysfs_ino_lock);
223 rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
224 spin_unlock(&sysfs_ino_lock);
225
226 if (rc == -EAGAIN) {
227 if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
228 goto retry;
229 rc = -ENOMEM;
230 }
231
232 *pino = ino;
233 return rc;
234}
235
236static void sysfs_free_ino(unsigned int ino)
237{
238 spin_lock(&sysfs_ino_lock);
239 ida_remove(&sysfs_ino_ida, ino);
240 spin_unlock(&sysfs_ino_lock);
241}
242
243void release_sysfs_dirent(struct sysfs_dirent *sd)
244{
245 struct sysfs_dirent *parent_sd;
246
247 repeat:
 248 /* Moving/renaming is always done while holding a reference.
249 * sd->s_parent won't change beneath us.
250 */
251 parent_sd = sd->s_parent;
252
253 WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
254 "sysfs: free using entry: %s/%s\n",
255 parent_sd ? parent_sd->s_name : "", sd->s_name);
256
257 if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
258 sysfs_put(sd->s_symlink.target_sd);
259 if (sysfs_type(sd) & SYSFS_COPY_NAME)
260 kfree(sd->s_name);
261 if (sd->s_iattr && sd->s_iattr->ia_secdata)
262 security_release_secctx(sd->s_iattr->ia_secdata,
263 sd->s_iattr->ia_secdata_len);
264 kfree(sd->s_iattr);
265 sysfs_free_ino(sd->s_ino);
266 kmem_cache_free(sysfs_dir_cachep, sd);
267
268 sd = parent_sd;
269 if (sd && atomic_dec_and_test(&sd->s_count))
270 goto repeat;
271}
272
273static int sysfs_dentry_delete(const struct dentry *dentry)
274{
275 struct sysfs_dirent *sd = dentry->d_fsdata;
276 return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
277}
278
279static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
280{
281 struct sysfs_dirent *sd;
282 int type;
283
284 if (flags & LOOKUP_RCU)
285 return -ECHILD;
286
287 sd = dentry->d_fsdata;
288 mutex_lock(&sysfs_mutex);
289
290 /* The sysfs dirent has been deleted */
291 if (sd->s_flags & SYSFS_FLAG_REMOVED)
292 goto out_bad;
293
294 /* The sysfs dirent has been moved? */
295 if (dentry->d_parent->d_fsdata != sd->s_parent)
296 goto out_bad;
297
298 /* The sysfs dirent has been renamed */
299 if (strcmp(dentry->d_name.name, sd->s_name) != 0)
300 goto out_bad;
301
302 /* The sysfs dirent has been moved to a different namespace */
303 type = KOBJ_NS_TYPE_NONE;
304 if (sd->s_parent) {
305 type = sysfs_ns_type(sd->s_parent);
306 if (type != KOBJ_NS_TYPE_NONE &&
307 sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
308 goto out_bad;
309 }
310
311 mutex_unlock(&sysfs_mutex);
312out_valid:
313 return 1;
314out_bad:
315 /* Remove the dentry from the dcache hashes.
316 * If this is a deleted dentry we use d_drop instead of d_delete
317 * so sysfs doesn't need to cope with negative dentries.
318 *
319 * If this is a dentry that has simply been renamed we
320 * use d_drop to remove it from the dcache lookup on its
 321 * old parent. If this dentry persists, it will be re-added
 322 * to the dcache hashes when a lookup is later performed at
 323 * its new name.
324 */
325 mutex_unlock(&sysfs_mutex);
326
327 /* If we have submounts we must allow the vfs caches
328 * to lie about the state of the filesystem to prevent
329 * leaks and other nasty things.
330 */
331 if (check_submounts_and_drop(dentry) != 0)
332 goto out_valid;
333
334 return 0;
335}
336
337static void sysfs_dentry_release(struct dentry *dentry)
338{
339 sysfs_put(dentry->d_fsdata);
340}
341
342const struct dentry_operations sysfs_dentry_ops = {
343 .d_revalidate = sysfs_dentry_revalidate,
344 .d_delete = sysfs_dentry_delete,
345 .d_release = sysfs_dentry_release,
346};
347
348struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
349{
350 char *dup_name = NULL;
351 struct sysfs_dirent *sd;
352
353 if (type & SYSFS_COPY_NAME) {
354 name = dup_name = kstrdup(name, GFP_KERNEL);
355 if (!name)
356 return NULL;
357 }
358
359 sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
360 if (!sd)
361 goto err_out1;
362
363 if (sysfs_alloc_ino(&sd->s_ino))
364 goto err_out2;
365
366 atomic_set(&sd->s_count, 1);
367 atomic_set(&sd->s_active, 0);
368
369 sd->s_name = name;
370 sd->s_mode = mode;
371 sd->s_flags = type | SYSFS_FLAG_REMOVED;
372
373 return sd;
374
375 err_out2:
376 kmem_cache_free(sysfs_dir_cachep, sd);
377 err_out1:
378 kfree(dup_name);
379 return NULL;
380}
381
382/**
383 * sysfs_addrm_start - prepare for sysfs_dirent add/remove
384 * @acxt: pointer to sysfs_addrm_cxt to be used
385 *
386 * This function is called when the caller is about to add or remove
387 * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used
388 * to keep and pass context to other addrm functions.
389 *
390 * LOCKING:
391 * Kernel thread context (may sleep). sysfs_mutex is locked on
392 * return.
393 */
394void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
395 __acquires(sysfs_mutex)
396{
397 memset(acxt, 0, sizeof(*acxt));
398
399 mutex_lock(&sysfs_mutex);
400}
401
402/**
403 * __sysfs_add_one - add sysfs_dirent to parent without warning
404 * @acxt: addrm context to use
405 * @sd: sysfs_dirent to be added
406 * @parent_sd: the parent sysfs_dirent to add @sd to
407 *
408 * Get @parent_sd and set @sd->s_parent to it and increment nlink of
409 * the parent inode if @sd is a directory and link into the children
410 * list of the parent.
411 *
412 * This function should be called between calls to
413 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
414 * passed the same @acxt as passed to sysfs_addrm_start().
415 *
416 * LOCKING:
417 * Determined by sysfs_addrm_start().
418 *
419 * RETURNS:
420 * 0 on success, -EEXIST if entry with the given name already
421 * exists.
422 */
423int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
424 struct sysfs_dirent *parent_sd)
425{
426 struct sysfs_inode_attrs *ps_iattr;
427 int ret;
428
429 if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
430 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
431 sysfs_ns_type(parent_sd) ? "required" : "invalid",
432 parent_sd->s_name, sd->s_name);
433 return -EINVAL;
434 }
435
436 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
437 sd->s_parent = sysfs_get(parent_sd);
438
439 ret = sysfs_link_sibling(sd);
440 if (ret)
441 return ret;
442
443 /* Update timestamps on the parent */
444 ps_iattr = parent_sd->s_iattr;
445 if (ps_iattr) {
446 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
447 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
448 }
449
450 /* Mark the entry added into directory tree */
451 sd->s_flags &= ~SYSFS_FLAG_REMOVED;
452
453 return 0;
454}
455
456/** 22/**
457 * sysfs_pathname - return full path to sysfs dirent 23 * sysfs_pathname - return full path to sysfs dirent
458 * @sd: sysfs_dirent whose path we want 24 * @kn: kernfs_node whose path we want
459 * @path: caller allocated buffer of size PATH_MAX 25 * @path: caller allocated buffer of size PATH_MAX
460 * 26 *
461 * Gives the name "/" to the sysfs_root entry; any path returned 27 * Gives the name "/" to the sysfs_root entry; any path returned
462 * is relative to wherever sysfs is mounted. 28 * is relative to wherever sysfs is mounted.
463 */ 29 */
464static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) 30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
465{ 31{
466 if (sd->s_parent) { 32 if (kn->parent) {
467 sysfs_pathname(sd->s_parent, path); 33 sysfs_pathname(kn->parent, path);
468 strlcat(path, "/", PATH_MAX); 34 strlcat(path, "/", PATH_MAX);
469 } 35 }
470 strlcat(path, sd->s_name, PATH_MAX); 36 strlcat(path, kn->name, PATH_MAX);
471 return path; 37 return path;
472} 38}
473 39
474void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) 40void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
475{ 41{
476 char *path; 42 char *path;
477 43
@@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
489} 55}
490 56
491/** 57/**
492 * sysfs_add_one - add sysfs_dirent to parent
493 * @acxt: addrm context to use
494 * @sd: sysfs_dirent to be added
495 * @parent_sd: the parent sysfs_dirent to add @sd to
496 *
497 * Get @parent_sd and set @sd->s_parent to it and increment nlink of
498 * the parent inode if @sd is a directory and link into the children
499 * list of the parent.
500 *
501 * This function should be called between calls to
502 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
503 * passed the same @acxt as passed to sysfs_addrm_start().
504 *
505 * LOCKING:
506 * Determined by sysfs_addrm_start().
507 *
508 * RETURNS:
509 * 0 on success, -EEXIST if entry with the given name already
510 * exists.
511 */
512int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
513 struct sysfs_dirent *parent_sd)
514{
515 int ret;
516
517 ret = __sysfs_add_one(acxt, sd, parent_sd);
518
519 if (ret == -EEXIST)
520 sysfs_warn_dup(parent_sd, sd->s_name);
521 return ret;
522}
523
524/**
525 * sysfs_remove_one - remove sysfs_dirent from parent
526 * @acxt: addrm context to use
527 * @sd: sysfs_dirent to be removed
528 *
529 * Mark @sd removed and drop nlink of parent inode if @sd is a
530 * directory. @sd is unlinked from the children list.
531 *
532 * This function should be called between calls to
533 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
534 * passed the same @acxt as passed to sysfs_addrm_start().
535 *
536 * LOCKING:
537 * Determined by sysfs_addrm_start().
538 */
539static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
540 struct sysfs_dirent *sd)
541{
542 struct sysfs_inode_attrs *ps_iattr;
543
544 /*
545 * Removal can be called multiple times on the same node. Only the
546 * first invocation is effective and puts the base ref.
547 */
548 if (sd->s_flags & SYSFS_FLAG_REMOVED)
549 return;
550
551 sysfs_unlink_sibling(sd);
552
553 /* Update timestamps on the parent */
554 ps_iattr = sd->s_parent->s_iattr;
555 if (ps_iattr) {
556 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
557 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
558 }
559
560 sd->s_flags |= SYSFS_FLAG_REMOVED;
561 sd->u.removed_list = acxt->removed;
562 acxt->removed = sd;
563}
564
565/**
566 * sysfs_addrm_finish - finish up sysfs_dirent add/remove
567 * @acxt: addrm context to finish up
568 *
569 * Finish up sysfs_dirent add/remove. Resources acquired by
570 * sysfs_addrm_start() are released and removed sysfs_dirents are
571 * cleaned up.
572 *
573 * LOCKING:
574 * sysfs_mutex is released.
575 */
576void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
577 __releases(sysfs_mutex)
578{
579 /* release resources acquired by sysfs_addrm_start() */
580 mutex_unlock(&sysfs_mutex);
581
582 /* kill removed sysfs_dirents */
583 while (acxt->removed) {
584 struct sysfs_dirent *sd = acxt->removed;
585
586 acxt->removed = sd->u.removed_list;
587
588 sysfs_deactivate(sd);
589 sysfs_unmap_bin_file(sd);
590 sysfs_put(sd);
591 }
592}
593
594/**
595 * sysfs_find_dirent - find sysfs_dirent with the given name
596 * @parent_sd: sysfs_dirent to search under
597 * @name: name to look for
598 * @ns: the namespace tag to use
599 *
600 * Look for sysfs_dirent with name @name under @parent_sd.
601 *
602 * LOCKING:
603 * mutex_lock(sysfs_mutex)
604 *
605 * RETURNS:
606 * Pointer to sysfs_dirent if found, NULL if not.
607 */
608struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
609 const unsigned char *name,
610 const void *ns)
611{
612 struct rb_node *node = parent_sd->s_dir.children.rb_node;
613 unsigned int hash;
614
615 if (!!sysfs_ns_type(parent_sd) != !!ns) {
616 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
617 sysfs_ns_type(parent_sd) ? "required" : "invalid",
618 parent_sd->s_name, name);
619 return NULL;
620 }
621
622 hash = sysfs_name_hash(name, ns);
623 while (node) {
624 struct sysfs_dirent *sd;
625 int result;
626
627 sd = to_sysfs_dirent(node);
628 result = sysfs_name_compare(hash, name, ns, sd);
629 if (result < 0)
630 node = node->rb_left;
631 else if (result > 0)
632 node = node->rb_right;
633 else
634 return sd;
635 }
636 return NULL;
637}
638
639/**
640 * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
641 * @parent_sd: sysfs_dirent to search under
642 * @name: name to look for
643 * @ns: the namespace tag to use
644 *
645 * Look for sysfs_dirent with name @name under @parent_sd and get
646 * it if found.
647 *
648 * LOCKING:
649 * Kernel thread context (may sleep). Grabs sysfs_mutex.
650 *
651 * RETURNS:
652 * Pointer to sysfs_dirent if found, NULL if not.
653 */
654struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
655 const unsigned char *name,
656 const void *ns)
657{
658 struct sysfs_dirent *sd;
659
660 mutex_lock(&sysfs_mutex);
661 sd = sysfs_find_dirent(parent_sd, name, ns);
662 sysfs_get(sd);
663 mutex_unlock(&sysfs_mutex);
664
665 return sd;
666}
667EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
668
669static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
670 enum kobj_ns_type type,
671 const char *name, const void *ns,
672 struct sysfs_dirent **p_sd)
673{
674 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
675 struct sysfs_addrm_cxt acxt;
676 struct sysfs_dirent *sd;
677 int rc;
678
679 /* allocate */
680 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
681 if (!sd)
682 return -ENOMEM;
683
684 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
685 sd->s_ns = ns;
686 sd->s_dir.kobj = kobj;
687
688 /* link in */
689 sysfs_addrm_start(&acxt);
690 rc = sysfs_add_one(&acxt, sd, parent_sd);
691 sysfs_addrm_finish(&acxt);
692
693 if (rc == 0)
694 *p_sd = sd;
695 else
696 sysfs_put(sd);
697
698 return rc;
699}
700
701int sysfs_create_subdir(struct kobject *kobj, const char *name,
702 struct sysfs_dirent **p_sd)
703{
704 return create_dir(kobj, kobj->sd,
705 KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
706}
707
708/**
709 * sysfs_read_ns_type: return associated ns_type
710 * @kobj: the kobject being queried
711 *
712 * Each kobject can be tagged with exactly one namespace type
713 * (i.e. network or user). Return the ns_type associated with
714 * this object if any
715 */
716static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
717{
718 const struct kobj_ns_type_operations *ops;
719 enum kobj_ns_type type;
720
721 ops = kobj_child_ns_ops(kobj);
722 if (!ops)
723 return KOBJ_NS_TYPE_NONE;
724
725 type = ops->type;
726 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
727 BUG_ON(type >= KOBJ_NS_TYPES);
728 BUG_ON(!kobj_ns_type_registered(type));
729
730 return type;
731}
732
733/**
734 * sysfs_create_dir_ns - create a directory for an object with a namespace tag 58 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
735 * @kobj: object we're creating directory for 59 * @kobj: object we're creating directory for
736 * @ns: the namespace tag to use 60 * @ns: the namespace tag to use
737 */ 61 */
738int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) 62int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
739{ 63{
740 enum kobj_ns_type type; 64 struct kernfs_node *parent, *kn;
741 struct sysfs_dirent *parent_sd, *sd;
742 int error = 0;
743 65
744 BUG_ON(!kobj); 66 BUG_ON(!kobj);
745 67
746 if (kobj->parent) 68 if (kobj->parent)
747 parent_sd = kobj->parent->sd; 69 parent = kobj->parent->sd;
748 else 70 else
749 parent_sd = &sysfs_root; 71 parent = sysfs_root_kn;
750 72
751 if (!parent_sd) 73 if (!parent)
752 return -ENOENT; 74 return -ENOENT;
753 75
754 type = sysfs_read_ns_type(kobj); 76 kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
755 77 S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
756 error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); 78 if (IS_ERR(kn)) {
757 if (!error) 79 if (PTR_ERR(kn) == -EEXIST)
758 kobj->sd = sd; 80 sysfs_warn_dup(parent, kobject_name(kobj));
759 return error; 81 return PTR_ERR(kn);
760}
761
762static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
763 unsigned int flags)
764{
765 struct dentry *ret = NULL;
766 struct dentry *parent = dentry->d_parent;
767 struct sysfs_dirent *parent_sd = parent->d_fsdata;
768 struct sysfs_dirent *sd;
769 struct inode *inode;
770 enum kobj_ns_type type;
771 const void *ns;
772
773 mutex_lock(&sysfs_mutex);
774
775 type = sysfs_ns_type(parent_sd);
776 ns = sysfs_info(dir->i_sb)->ns[type];
777
778 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
779
780 /* no such entry */
781 if (!sd) {
782 ret = ERR_PTR(-ENOENT);
783 goto out_unlock;
784 }
785 dentry->d_fsdata = sysfs_get(sd);
786
787 /* attach dentry and inode */
788 inode = sysfs_get_inode(dir->i_sb, sd);
789 if (!inode) {
790 ret = ERR_PTR(-ENOMEM);
791 goto out_unlock;
792 }
793
794 /* instantiate and hash dentry */
795 ret = d_materialise_unique(dentry, inode);
796 out_unlock:
797 mutex_unlock(&sysfs_mutex);
798 return ret;
799}
800
801const struct inode_operations sysfs_dir_inode_operations = {
802 .lookup = sysfs_lookup,
803 .permission = sysfs_permission,
804 .setattr = sysfs_setattr,
805 .getattr = sysfs_getattr,
806 .setxattr = sysfs_setxattr,
807};
808
809static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
810{
811 struct sysfs_dirent *last;
812
813 while (true) {
814 struct rb_node *rbn;
815
816 last = pos;
817
818 if (sysfs_type(pos) != SYSFS_DIR)
819 break;
820
821 rbn = rb_first(&pos->s_dir.children);
822 if (!rbn)
823 break;
824
825 pos = to_sysfs_dirent(rbn);
826 }
827
828 return last;
829}
830
831/**
832 * sysfs_next_descendant_post - find the next descendant for post-order walk
833 * @pos: the current position (%NULL to initiate traversal)
834 * @root: sysfs_dirent whose descendants to walk
835 *
836 * Find the next descendant to visit for post-order traversal of @root's
837 * descendants. @root is included in the iteration and the last node to be
838 * visited.
839 */
840static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
841 struct sysfs_dirent *root)
842{
843 struct rb_node *rbn;
844
845 lockdep_assert_held(&sysfs_mutex);
846
847 /* if first iteration, visit leftmost descendant which may be root */
848 if (!pos)
849 return sysfs_leftmost_descendant(root);
850
851 /* if we visited @root, we're done */
852 if (pos == root)
853 return NULL;
854
855 /* if there's an unvisited sibling, visit its leftmost descendant */
856 rbn = rb_next(&pos->s_rb);
857 if (rbn)
858 return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
859
860 /* no sibling left, visit parent */
861 return pos->s_parent;
862}
863
864static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
865 struct sysfs_dirent *sd)
866{
867 struct sysfs_dirent *pos, *next;
868
869 if (!sd)
870 return;
871
872 pr_debug("sysfs %s: removing\n", sd->s_name);
873
874 next = NULL;
875 do {
876 pos = next;
877 next = sysfs_next_descendant_post(pos, sd);
878 if (pos)
879 sysfs_remove_one(acxt, pos);
880 } while (next);
881}
882
883/**
884 * sysfs_remove - remove a sysfs_dirent recursively
885 * @sd: the sysfs_dirent to remove
886 *
887 * Remove @sd along with all its subdirectories and files.
888 */
889void sysfs_remove(struct sysfs_dirent *sd)
890{
891 struct sysfs_addrm_cxt acxt;
892
893 sysfs_addrm_start(&acxt);
894 __sysfs_remove(&acxt, sd);
895 sysfs_addrm_finish(&acxt);
896}
897
898/**
899 * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
900 * @dir_sd: parent of the target
901 * @name: name of the sysfs_dirent to remove
902 * @ns: namespace tag of the sysfs_dirent to remove
903 *
904 * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
905 * it. Returns 0 on success, -ENOENT if such entry doesn't exist.
906 */
907int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
908 const void *ns)
909{
910 struct sysfs_addrm_cxt acxt;
911 struct sysfs_dirent *sd;
912
913 if (!dir_sd) {
914 WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
915 name);
916 return -ENOENT;
917 } 82 }
918 83
919 sysfs_addrm_start(&acxt); 84 kobj->sd = kn;
920 85 return 0;
921 sd = sysfs_find_dirent(dir_sd, name, ns);
922 if (sd)
923 __sysfs_remove(&acxt, sd);
924
925 sysfs_addrm_finish(&acxt);
926
927 if (sd)
928 return 0;
929 else
930 return -ENOENT;
931} 86}
932 87
933/** 88/**
@@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
940 */ 95 */
941void sysfs_remove_dir(struct kobject *kobj) 96void sysfs_remove_dir(struct kobject *kobj)
942{ 97{
943 struct sysfs_dirent *sd = kobj->sd; 98 struct kernfs_node *kn = kobj->sd;
944 99
945 /* 100 /*
 946 * In general, kobject owner is responsible for ensuring removal 101
947 * doesn't race with other operations and sysfs doesn't provide any 102 * doesn't race with other operations and sysfs doesn't provide any
948 * protection; however, when @kobj is used as a symlink target, the 103 * protection; however, when @kobj is used as a symlink target, the
949 * symlinking entity usually doesn't own @kobj and thus has no 104 * symlinking entity usually doesn't own @kobj and thus has no
950 * control over removal. @kobj->sd may be removed anytime and 105 * control over removal. @kobj->sd may be removed anytime
951 * symlink code may end up dereferencing an already freed sd. 106 * and symlink code may end up dereferencing an already freed node.
952 * 107 *
953 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation 108 * sysfs_symlink_target_lock synchronizes @kobj->sd
954 * against symlink operations so that symlink code can safely 109 * disassociation against symlink operations so that symlink code
955 * dereference @kobj->sd. 110 * can safely dereference @kobj->sd.
956 */ 111 */
957 spin_lock(&sysfs_symlink_target_lock); 112 spin_lock(&sysfs_symlink_target_lock);
958 kobj->sd = NULL; 113 kobj->sd = NULL;
959 spin_unlock(&sysfs_symlink_target_lock); 114 spin_unlock(&sysfs_symlink_target_lock);
960 115
961 if (sd) { 116 if (kn) {
962 WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); 117 WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
963 sysfs_remove(sd); 118 kernfs_remove(kn);
964 } 119 }
965} 120}
966 121
967int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
968 const char *new_name, const void *new_ns)
969{
970 int error;
971
972 mutex_lock(&sysfs_mutex);
973
974 error = 0;
975 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
976 (strcmp(sd->s_name, new_name) == 0))
977 goto out; /* nothing to rename */
978
979 error = -EEXIST;
980 if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
981 goto out;
982
983 /* rename sysfs_dirent */
984 if (strcmp(sd->s_name, new_name) != 0) {
985 error = -ENOMEM;
986 new_name = kstrdup(new_name, GFP_KERNEL);
987 if (!new_name)
988 goto out;
989
990 kfree(sd->s_name);
991 sd->s_name = new_name;
992 }
993
994 /*
995 * Move to the appropriate place in the appropriate directories rbtree.
996 */
997 sysfs_unlink_sibling(sd);
998 sysfs_get(new_parent_sd);
999 sysfs_put(sd->s_parent);
1000 sd->s_ns = new_ns;
1001 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
1002 sd->s_parent = new_parent_sd;
1003 sysfs_link_sibling(sd);
1004
1005 error = 0;
1006 out:
1007 mutex_unlock(&sysfs_mutex);
1008 return error;
1009}
1010
1011int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
1012 const void *new_ns) 123 const void *new_ns)
1013{ 124{
1014 struct sysfs_dirent *parent_sd = kobj->sd->s_parent; 125 struct kernfs_node *parent = kobj->sd->parent;
1015 126
1016 return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); 127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
1017} 128}
1018 129
1019int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
1020 const void *new_ns) 131 const void *new_ns)
1021{ 132{
1022 struct sysfs_dirent *sd = kobj->sd; 133 struct kernfs_node *kn = kobj->sd;
1023 struct sysfs_dirent *new_parent_sd; 134 struct kernfs_node *new_parent;
1024 135
1025 BUG_ON(!sd->s_parent); 136 BUG_ON(!kn->parent);
1026 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 137 new_parent = new_parent_kobj && new_parent_kobj->sd ?
1027 new_parent_kobj->sd : &sysfs_root; 138 new_parent_kobj->sd : sysfs_root_kn;
1028 139
1029 return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); 140 return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
1030} 141}
1031
1032/* Relationship between s_mode and the DT_xxx types */
1033static inline unsigned char dt_type(struct sysfs_dirent *sd)
1034{
1035 return (sd->s_mode >> 12) & 15;
1036}
1037
1038static int sysfs_dir_release(struct inode *inode, struct file *filp)
1039{
1040 sysfs_put(filp->private_data);
1041 return 0;
1042}
1043
1044static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
1045 struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos)
1046{
1047 if (pos) {
1048 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
1049 pos->s_parent == parent_sd &&
1050 hash == pos->s_hash;
1051 sysfs_put(pos);
1052 if (!valid)
1053 pos = NULL;
1054 }
1055 if (!pos && (hash > 1) && (hash < INT_MAX)) {
1056 struct rb_node *node = parent_sd->s_dir.children.rb_node;
1057 while (node) {
1058 pos = to_sysfs_dirent(node);
1059
1060 if (hash < pos->s_hash)
1061 node = node->rb_left;
1062 else if (hash > pos->s_hash)
1063 node = node->rb_right;
1064 else
1065 break;
1066 }
1067 }
1068 /* Skip over entries in the wrong namespace */
1069 while (pos && pos->s_ns != ns) {
1070 struct rb_node *node = rb_next(&pos->s_rb);
1071 if (!node)
1072 pos = NULL;
1073 else
1074 pos = to_sysfs_dirent(node);
1075 }
1076 return pos;
1077}
1078
1079static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
1080 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
1081{
1082 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
1083 if (pos)
1084 do {
1085 struct rb_node *node = rb_next(&pos->s_rb);
1086 if (!node)
1087 pos = NULL;
1088 else
1089 pos = to_sysfs_dirent(node);
1090 } while (pos && pos->s_ns != ns);
1091 return pos;
1092}
1093
1094static int sysfs_readdir(struct file *file, struct dir_context *ctx)
1095{
1096 struct dentry *dentry = file->f_path.dentry;
1097 struct sysfs_dirent *parent_sd = dentry->d_fsdata;
1098 struct sysfs_dirent *pos = file->private_data;
1099 enum kobj_ns_type type;
1100 const void *ns;
1101
1102 type = sysfs_ns_type(parent_sd);
1103 ns = sysfs_info(dentry->d_sb)->ns[type];
1104
1105 if (!dir_emit_dots(file, ctx))
1106 return 0;
1107 mutex_lock(&sysfs_mutex);
1108 for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
1109 pos;
1110 pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
1111 const char *name = pos->s_name;
1112 unsigned int type = dt_type(pos);
1113 int len = strlen(name);
1114 ino_t ino = pos->s_ino;
1115 ctx->pos = pos->s_hash;
1116 file->private_data = sysfs_get(pos);
1117
1118 mutex_unlock(&sysfs_mutex);
1119 if (!dir_emit(ctx, name, len, ino, type))
1120 return 0;
1121 mutex_lock(&sysfs_mutex);
1122 }
1123 mutex_unlock(&sysfs_mutex);
1124 file->private_data = NULL;
1125 ctx->pos = INT_MAX;
1126 return 0;
1127}
1128
1129static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
1130{
1131 struct inode *inode = file_inode(file);
1132 loff_t ret;
1133
1134 mutex_lock(&inode->i_mutex);
1135 ret = generic_file_llseek(file, offset, whence);
1136 mutex_unlock(&inode->i_mutex);
1137
1138 return ret;
1139}
1140
1141const struct file_operations sysfs_dir_operations = {
1142 .read = generic_read_dir,
1143 .iterate = sysfs_readdir,
1144 .release = sysfs_dir_release,
1145 .llseek = sysfs_dir_llseek,
1146};
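Everything deleted above now lives, in generalized form, in fs/kernfs/; what remains of sysfs is a set of thin wrappers that translate kobject-level calls into kernfs calls, as sysfs_create_dir_ns() shows. A sketch of the same wrapper pattern (example_mkdir is invented, not part of the patch):

static int example_mkdir(struct kernfs_node *parent, const char *name,
			 void *priv)
{
	struct kernfs_node *kn;

	kn = kernfs_create_dir_ns(parent, name,
				  S_IRWXU | S_IRUGO | S_IXUGO, priv, NULL);
	if (IS_ERR(kn)) {
		if (PTR_ERR(kn) == -EEXIST)
			sysfs_warn_dup(parent, name);
		return PTR_ERR(kn);
	}
	return 0;
}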
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b94f93685093..810cf6e613e5 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,70 +14,23 @@
14#include <linux/kobject.h> 14#include <linux/kobject.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/fsnotify.h>
18#include <linux/namei.h>
19#include <linux/poll.h>
20#include <linux/list.h> 17#include <linux/list.h>
21#include <linux/mutex.h> 18#include <linux/mutex.h>
22#include <linux/limits.h>
23#include <linux/uaccess.h>
24#include <linux/seq_file.h> 19#include <linux/seq_file.h>
25#include <linux/mm.h>
26 20
27#include "sysfs.h" 21#include "sysfs.h"
22#include "../kernfs/kernfs-internal.h"
28 23
29/* 24/*
30 * There's one sysfs_open_file for each open file and one sysfs_open_dirent 25 * Determine ktype->sysfs_ops for the given kernfs_node. This function
31 * for each sysfs_dirent with one or more open files.
32 *
33 * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is
34 * protected by sysfs_open_dirent_lock.
35 *
36 * filp->private_data points to seq_file whose ->private points to
37 * sysfs_open_file. sysfs_open_files are chained at
38 * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
39 */
40static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
41static DEFINE_MUTEX(sysfs_open_file_mutex);
42
43struct sysfs_open_dirent {
44 atomic_t refcnt;
45 atomic_t event;
46 wait_queue_head_t poll;
47 struct list_head files; /* goes through sysfs_open_file.list */
48};
49
50struct sysfs_open_file {
51 struct sysfs_dirent *sd;
52 struct file *file;
53 struct mutex mutex;
54 int event;
55 struct list_head list;
56
57 bool mmapped;
58 const struct vm_operations_struct *vm_ops;
59};
60
61static bool sysfs_is_bin(struct sysfs_dirent *sd)
62{
63 return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
64}
65
66static struct sysfs_open_file *sysfs_of(struct file *file)
67{
68 return ((struct seq_file *)file->private_data)->private;
69}
70
71/*
72 * Determine ktype->sysfs_ops for the given sysfs_dirent. This function
73 * must be called while holding an active reference. 26 * must be called while holding an active reference.
74 */ 27 */
75static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) 28static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
76{ 29{
77 struct kobject *kobj = sd->s_parent->s_dir.kobj; 30 struct kobject *kobj = kn->parent->priv;
78 31
79 if (!sysfs_ignore_lockdep(sd)) 32 if (kn->flags & KERNFS_LOCKDEP)
80 lockdep_assert_held(sd); 33 lockdep_assert_held(kn);
81 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; 34 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
82} 35}
83 36
@@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
86 * details like buffering and seeking. The following function pipes 39 * details like buffering and seeking. The following function pipes
87 * sysfs_ops->show() result through seq_file. 40 * sysfs_ops->show() result through seq_file.
88 */ 41 */
89static int sysfs_seq_show(struct seq_file *sf, void *v) 42static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
90{ 43{
91 struct sysfs_open_file *of = sf->private; 44 struct kernfs_open_file *of = sf->private;
92 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 45 struct kobject *kobj = of->kn->parent->priv;
93 const struct sysfs_ops *ops; 46 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
94 char *buf;
95 ssize_t count; 47 ssize_t count;
48 char *buf;
96 49
97 /* acquire buffer and ensure that it's >= PAGE_SIZE */ 50 /* acquire buffer and ensure that it's >= PAGE_SIZE */
98 count = seq_get_buf(sf, &buf); 51 count = seq_get_buf(sf, &buf);
@@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
102 } 55 }
103 56
104 /* 57 /*
105 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex 58 * Invoke show(). Control may reach here via seq file lseek even
106 * nests outside active ref and is just to ensure that the ops 59 * if @ops->show() isn't implemented.
107 * aren't called concurrently for the same open file.
108 */ 60 */
109 mutex_lock(&of->mutex); 61 if (ops->show) {
110 if (!sysfs_get_active(of->sd)) { 62 count = ops->show(kobj, of->kn->priv, buf);
111 mutex_unlock(&of->mutex); 63 if (count < 0)
112 return -ENODEV; 64 return count;
113 } 65 }
114 66
115 of->event = atomic_read(&of->sd->s_attr.open->event);
116
117 /*
118 * Lookup @ops and invoke show(). Control may reach here via seq
119 * file lseek even if @ops->show() isn't implemented.
120 */
121 ops = sysfs_file_ops(of->sd);
122 if (ops->show)
123 count = ops->show(kobj, of->sd->s_attr.attr, buf);
124 else
125 count = 0;
126
127 sysfs_put_active(of->sd);
128 mutex_unlock(&of->mutex);
129
130 if (count < 0)
131 return count;
132
133 /* 67 /*
134 * The code works fine with PAGE_SIZE return but it's likely to 68 * The code works fine with PAGE_SIZE return but it's likely to
135 * indicate truncated result or overflow in normal use cases. 69 * indicate truncated result or overflow in normal use cases.
@@ -144,728 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
144 return 0; 78 return 0;
145} 79}
146 80
147/* 81static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
148 * Read method for bin files. As reading a bin file can have side-effects, 82 size_t count, loff_t pos)
149 * the exact offset and bytes specified in read(2) call should be passed to
150 * the read callback making it difficult to use seq_file. Implement
151 * simplistic custom buffering for bin files.
152 */
153static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
154 size_t bytes, loff_t *off)
155{ 83{
156 struct sysfs_open_file *of = sysfs_of(file); 84 struct bin_attribute *battr = of->kn->priv;
157 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 85 struct kobject *kobj = of->kn->parent->priv;
158 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 86 loff_t size = file_inode(of->file)->i_size;
159 loff_t size = file_inode(file)->i_size;
160 int count = min_t(size_t, bytes, PAGE_SIZE);
161 loff_t offs = *off;
162 char *buf;
163 87
164 if (!bytes) 88 if (!count)
165 return 0; 89 return 0;
166 90
167 if (size) { 91 if (size) {
168 if (offs > size) 92 if (pos > size)
169 return 0; 93 return 0;
170 if (offs + count > size) 94 if (pos + count > size)
171 count = size - offs; 95 count = size - pos;
172 }
173
174 buf = kmalloc(count, GFP_KERNEL);
175 if (!buf)
176 return -ENOMEM;
177
178 /* need of->sd for battr, its parent for kobj */
179 mutex_lock(&of->mutex);
180 if (!sysfs_get_active(of->sd)) {
181 count = -ENODEV;
182 mutex_unlock(&of->mutex);
183 goto out_free;
184 }
185
186 if (battr->read)
187 count = battr->read(file, kobj, battr, buf, offs, count);
188 else
189 count = -EIO;
190
191 sysfs_put_active(of->sd);
192 mutex_unlock(&of->mutex);
193
194 if (count < 0)
195 goto out_free;
196
197 if (copy_to_user(userbuf, buf, count)) {
198 count = -EFAULT;
199 goto out_free;
200 } 96 }
201 97
202 pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); 98 if (!battr->read)
203 99 return -EIO;
204 *off = offs + count;
205 100
206 out_free: 101 return battr->read(of->file, kobj, battr, buf, pos, count);
207 kfree(buf);
208 return count;
209} 102}
210 103
211/** 104/* kernfs write callback for regular sysfs files */
212 * flush_write_buffer - push buffer to kobject 105static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
213 * @of: open file 106 size_t count, loff_t pos)
214 * @buf: data buffer for file
215 * @off: file offset to write to
216 * @count: number of bytes
217 *
218 * Get the correct pointers for the kobject and the attribute we're dealing
219 * with, then call the store() method for it with @buf.
220 */
221static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
222 size_t count)
223{ 107{
224 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 108 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
225 int rc = 0; 109 struct kobject *kobj = of->kn->parent->priv;
226
227 /*
228 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex
229 * nests outside active ref and is just to ensure that the ops
230 * aren't called concurrently for the same open file.
231 */
232 mutex_lock(&of->mutex);
233 if (!sysfs_get_active(of->sd)) {
234 mutex_unlock(&of->mutex);
235 return -ENODEV;
236 }
237 110
238 if (sysfs_is_bin(of->sd)) { 111 if (!count)
239 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 112 return 0;
240
241 rc = -EIO;
242 if (battr->write)
243 rc = battr->write(of->file, kobj, battr, buf, off,
244 count);
245 } else {
246 const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
247
248 rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
249 }
250
251 sysfs_put_active(of->sd);
252 mutex_unlock(&of->mutex);
253 113
254 return rc; 114 return ops->store(kobj, of->kn->priv, buf, count);
255} 115}
256 116
257/** 117/* kernfs write callback for bin sysfs files */
258 * sysfs_write_file - write an attribute 118static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
259 * @file: file pointer 119 size_t count, loff_t pos)
260 * @user_buf: data to write
261 * @count: number of bytes
262 * @ppos: starting offset
263 *
264 * Copy data in from userland and pass it to the matching
265 * sysfs_ops->store() by invoking flush_write_buffer().
266 *
267 * There is no easy way for us to know if userspace is only doing a partial
268 * write, so we don't support them. We expect the entire buffer to come on
269 * the first write. Hint: if you're writing a value, first read the file,
 270 * modify only the value you're changing, then write the entire buffer
271 * back.
272 */
273static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
274 size_t count, loff_t *ppos)
275{ 120{
276 struct sysfs_open_file *of = sysfs_of(file); 121 struct bin_attribute *battr = of->kn->priv;
277 ssize_t len = min_t(size_t, count, PAGE_SIZE); 122 struct kobject *kobj = of->kn->parent->priv;
278 loff_t size = file_inode(file)->i_size; 123 loff_t size = file_inode(of->file)->i_size;
279 char *buf;
280 124
281 if (sysfs_is_bin(of->sd) && size) { 125 if (size) {
282 if (size <= *ppos) 126 if (size <= pos)
283 return 0; 127 return 0;
284 len = min_t(ssize_t, len, size - *ppos); 128 count = min_t(ssize_t, count, size - pos);
285 } 129 }
286 130 if (!count)
287 if (!len)
288 return 0; 131 return 0;
289 132
290 buf = kmalloc(len + 1, GFP_KERNEL); 133 if (!battr->write)
291 if (!buf) 134 return -EIO;
292 return -ENOMEM;
293 135
294 if (copy_from_user(buf, user_buf, len)) { 136 return battr->write(of->file, kobj, battr, buf, pos, count);
295 len = -EFAULT;
296 goto out_free;
297 }
298 buf[len] = '\0'; /* guarantee string termination */
299
300 len = flush_write_buffer(of, buf, *ppos, len);
301 if (len > 0)
302 *ppos += len;
303out_free:
304 kfree(buf);
305 return len;
306}
307
308static void sysfs_bin_vma_open(struct vm_area_struct *vma)
309{
310 struct file *file = vma->vm_file;
311 struct sysfs_open_file *of = sysfs_of(file);
312
313 if (!of->vm_ops)
314 return;
315
316 if (!sysfs_get_active(of->sd))
317 return;
318
319 if (of->vm_ops->open)
320 of->vm_ops->open(vma);
321
322 sysfs_put_active(of->sd);
323} 137}
324 138
325static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 139static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
140 struct vm_area_struct *vma)
326{ 141{
327 struct file *file = vma->vm_file; 142 struct bin_attribute *battr = of->kn->priv;
328 struct sysfs_open_file *of = sysfs_of(file); 143 struct kobject *kobj = of->kn->parent->priv;
329 int ret;
330 144
331 if (!of->vm_ops) 145 return battr->mmap(of->file, kobj, battr, vma);
332 return VM_FAULT_SIGBUS;
333
334 if (!sysfs_get_active(of->sd))
335 return VM_FAULT_SIGBUS;
336
337 ret = VM_FAULT_SIGBUS;
338 if (of->vm_ops->fault)
339 ret = of->vm_ops->fault(vma, vmf);
340
341 sysfs_put_active(of->sd);
342 return ret;
343} 146}
344 147
345static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, 148void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
346 struct vm_fault *vmf)
347{ 149{
348 struct file *file = vma->vm_file; 150 struct kernfs_node *kn = kobj->sd, *tmp;
349 struct sysfs_open_file *of = sysfs_of(file);
350 int ret;
351
352 if (!of->vm_ops)
353 return VM_FAULT_SIGBUS;
354 151
355 if (!sysfs_get_active(of->sd)) 152 if (kn && dir)
356 return VM_FAULT_SIGBUS; 153 kn = kernfs_find_and_get(kn, dir);
357
358 ret = 0;
359 if (of->vm_ops->page_mkwrite)
360 ret = of->vm_ops->page_mkwrite(vma, vmf);
361 else 154 else
362 file_update_time(file); 155 kernfs_get(kn);
363
364 sysfs_put_active(of->sd);
365 return ret;
366}
367
368static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
369 void *buf, int len, int write)
370{
371 struct file *file = vma->vm_file;
372 struct sysfs_open_file *of = sysfs_of(file);
373 int ret;
374
375 if (!of->vm_ops)
376 return -EINVAL;
377
378 if (!sysfs_get_active(of->sd))
379 return -EINVAL;
380
381 ret = -EINVAL;
382 if (of->vm_ops->access)
383 ret = of->vm_ops->access(vma, addr, buf, len, write);
384
385 sysfs_put_active(of->sd);
386 return ret;
387}
388
389#ifdef CONFIG_NUMA
390static int sysfs_bin_set_policy(struct vm_area_struct *vma,
391 struct mempolicy *new)
392{
393 struct file *file = vma->vm_file;
394 struct sysfs_open_file *of = sysfs_of(file);
395 int ret;
396
397 if (!of->vm_ops)
398 return 0;
399
400 if (!sysfs_get_active(of->sd))
401 return -EINVAL;
402
403 ret = 0;
404 if (of->vm_ops->set_policy)
405 ret = of->vm_ops->set_policy(vma, new);
406
407 sysfs_put_active(of->sd);
408 return ret;
409}
410
411static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
412 unsigned long addr)
413{
414 struct file *file = vma->vm_file;
415 struct sysfs_open_file *of = sysfs_of(file);
416 struct mempolicy *pol;
417
418 if (!of->vm_ops)
419 return vma->vm_policy;
420
421 if (!sysfs_get_active(of->sd))
422 return vma->vm_policy;
423
424 pol = vma->vm_policy;
425 if (of->vm_ops->get_policy)
426 pol = of->vm_ops->get_policy(vma, addr);
427
428 sysfs_put_active(of->sd);
429 return pol;
430}
431
432static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
433 const nodemask_t *to, unsigned long flags)
434{
435 struct file *file = vma->vm_file;
436 struct sysfs_open_file *of = sysfs_of(file);
437 int ret;
438
439 if (!of->vm_ops)
440 return 0;
441
442 if (!sysfs_get_active(of->sd))
443 return 0;
444
445 ret = 0;
446 if (of->vm_ops->migrate)
447 ret = of->vm_ops->migrate(vma, from, to, flags);
448
449 sysfs_put_active(of->sd);
450 return ret;
451}
452#endif
453
454static const struct vm_operations_struct sysfs_bin_vm_ops = {
455 .open = sysfs_bin_vma_open,
456 .fault = sysfs_bin_fault,
457 .page_mkwrite = sysfs_bin_page_mkwrite,
458 .access = sysfs_bin_access,
459#ifdef CONFIG_NUMA
460 .set_policy = sysfs_bin_set_policy,
461 .get_policy = sysfs_bin_get_policy,
462 .migrate = sysfs_bin_migrate,
463#endif
464};
465
466static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
467{
468 struct sysfs_open_file *of = sysfs_of(file);
469 struct bin_attribute *battr = of->sd->s_attr.bin_attr;
470 struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
471 int rc;
472
473 mutex_lock(&of->mutex);
474
475 /* need of->sd for battr, its parent for kobj */
476 rc = -ENODEV;
477 if (!sysfs_get_active(of->sd))
478 goto out_unlock;
479
480 if (!battr->mmap)
481 goto out_put;
482
483 rc = battr->mmap(file, kobj, battr, vma);
484 if (rc)
485 goto out_put;
486
487 /*
488 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
489 * to satisfy versions of X which crash if the mmap fails: that
490 * substitutes a new vm_file, and we don't then want bin_vm_ops.
491 */
492 if (vma->vm_file != file)
493 goto out_put;
494
495 rc = -EINVAL;
496 if (of->mmapped && of->vm_ops != vma->vm_ops)
497 goto out_put;
498
499 /*
500 * It is not possible to successfully wrap close.
501 * So error if someone is trying to use close.
502 */
503 rc = -EINVAL;
504 if (vma->vm_ops && vma->vm_ops->close)
505 goto out_put;
506
507 rc = 0;
508 of->mmapped = 1;
509 of->vm_ops = vma->vm_ops;
510 vma->vm_ops = &sysfs_bin_vm_ops;
511out_put:
512 sysfs_put_active(of->sd);
513out_unlock:
514 mutex_unlock(&of->mutex);
515
516 return rc;
517}
518
519/**
520 * sysfs_get_open_dirent - get or create sysfs_open_dirent
521 * @sd: target sysfs_dirent
522 * @of: sysfs_open_file for this instance of open
523 *
524 * If @sd->s_attr.open exists, increment its reference count;
525 * otherwise, create one. @of is chained to the files list.
526 *
527 * LOCKING:
528 * Kernel thread context (may sleep).
529 *
530 * RETURNS:
531 * 0 on success, -errno on failure.
532 */
533static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
534 struct sysfs_open_file *of)
535{
536 struct sysfs_open_dirent *od, *new_od = NULL;
537
538 retry:
539 mutex_lock(&sysfs_open_file_mutex);
540 spin_lock_irq(&sysfs_open_dirent_lock);
541
542 if (!sd->s_attr.open && new_od) {
543 sd->s_attr.open = new_od;
544 new_od = NULL;
545 }
546
547 od = sd->s_attr.open;
548 if (od) {
549 atomic_inc(&od->refcnt);
550 list_add_tail(&of->list, &od->files);
551 }
552
553 spin_unlock_irq(&sysfs_open_dirent_lock);
554 mutex_unlock(&sysfs_open_file_mutex);
555
556 if (od) {
557 kfree(new_od);
558 return 0;
559 }
560
561 /* not there, initialize a new one and retry */
562 new_od = kmalloc(sizeof(*new_od), GFP_KERNEL);
563 if (!new_od)
564 return -ENOMEM;
565
566 atomic_set(&new_od->refcnt, 0);
567 atomic_set(&new_od->event, 1);
568 init_waitqueue_head(&new_od->poll);
569 INIT_LIST_HEAD(&new_od->files);
570 goto retry;
571}

156
157 if (kn && attr) {
158 tmp = kernfs_find_and_get(kn, attr);
159 kernfs_put(kn);
160 kn = tmp;
161 }
162
163 if (kn) {
164 kernfs_notify(kn);
165 kernfs_put(kn);
166 }
167}
168EXPORT_SYMBOL_GPL(sysfs_notify);
169
170static const struct kernfs_ops sysfs_file_kfops_empty = {
171};
172
173static const struct kernfs_ops sysfs_file_kfops_ro = {
174 .seq_show = sysfs_kf_seq_show,
175};
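sysfs_get_open_dirent() above uses a common kernel idiom: try under the locks first, and if the object is missing, drop the locks, allocate with GFP_KERNEL (which may sleep), and retry; the allocation is simply freed if another opener won the race. Reduced to its skeleton (a sketch only; lock(), take_ref() and struct foo are hypothetical stand-ins):

	static int get_or_create(struct foo **slot)
	{
		struct foo *obj, *new = NULL;
	retry:
		lock();
		if (!*slot && new) {	/* install our preallocated copy */
			*slot = new;
			new = NULL;
		}
		obj = *slot;
		if (obj)
			take_ref(obj);
		unlock();

		if (obj) {
			kfree(new);	/* lost the race; allocation unused */
			return 0;
		}

		new = kmalloc(sizeof(*new), GFP_KERNEL);	/* no locks held */
		if (!new)
			return -ENOMEM;
		goto retry;
	}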
572
573/**
574 * sysfs_put_open_dirent - put sysfs_open_dirent
575 * @sd: target sysfs_dirent
576 * @of: associated sysfs_open_file
577 *
578 * Put @sd->s_attr.open and unlink @of from the files list. If
579 * reference count reaches zero, disassociate and free it.
580 *
581 * LOCKING:
582 * None.
583 */
584static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
585 struct sysfs_open_file *of)
586{
587 struct sysfs_open_dirent *od = sd->s_attr.open;
588 unsigned long flags;
589
590 mutex_lock(&sysfs_open_file_mutex);
591 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
592
593 if (of)
594 list_del(&of->list);
595
596 if (atomic_dec_and_test(&od->refcnt))
597 sd->s_attr.open = NULL;
598 else
599 od = NULL;
600
601 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
602 mutex_unlock(&sysfs_open_file_mutex);
603
604 kfree(od);
605}
606

176
177static const struct kernfs_ops sysfs_file_kfops_wo = {
178 .write = sysfs_kf_write,
179};
180
181static const struct kernfs_ops sysfs_file_kfops_rw = {
182 .seq_show = sysfs_kf_seq_show,
183 .write = sysfs_kf_write,
184};
185
186static const struct kernfs_ops sysfs_bin_kfops_ro = {
187 .read = sysfs_kf_bin_read,
188};
189
190static const struct kernfs_ops sysfs_bin_kfops_wo = {
191 .write = sysfs_kf_bin_write,
192};
193
194static const struct kernfs_ops sysfs_bin_kfops_rw = {
195 .read = sysfs_kf_bin_read,
196 .write = sysfs_kf_bin_write,
197};
198
199static const struct kernfs_ops sysfs_bin_kfops_mmap = {
200 .read = sysfs_kf_bin_read,
201 .write = sysfs_kf_bin_write,
202 .mmap = sysfs_kf_bin_mmap,
203};
204
607static int sysfs_open_file(struct inode *inode, struct file *file)
608{
609 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
610 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
611 struct sysfs_open_file *of;
612 bool has_read, has_write, has_mmap;
613 int error = -EACCES;
614
615 /* need attr_sd for attr and ops, its parent for kobj */
616 if (!sysfs_get_active(attr_sd))
617 return -ENODEV;
618
619 if (sysfs_is_bin(attr_sd)) {
620 struct bin_attribute *battr = attr_sd->s_attr.bin_attr;
621
622 has_read = battr->read || battr->mmap;
623 has_write = battr->write || battr->mmap;
624 has_mmap = battr->mmap;
625 } else {
626 const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
627
628 /* every kobject with an attribute needs a ktype assigned */
629 if (WARN(!ops, KERN_ERR
630 "missing sysfs attribute operations for kobject: %s\n",
631 kobject_name(kobj)))
632 goto err_out;
633
634 has_read = ops->show;
635 has_write = ops->store;
636 has_mmap = false;
637 }
638
639 /* check perms and supported operations */
640 if ((file->f_mode & FMODE_WRITE) &&
641 (!(inode->i_mode & S_IWUGO) || !has_write))
642 goto err_out;
643
644 if ((file->f_mode & FMODE_READ) &&
645 (!(inode->i_mode & S_IRUGO) || !has_read))
646 goto err_out;
647
648 /* allocate a sysfs_open_file for the file */
649 error = -ENOMEM;
650 of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
651 if (!of)
652 goto err_out;
653
654 /*
655 * The following is done to give a different lockdep key to
656 * @of->mutex for files which implement mmap. This is a rather
657 * crude way to avoid false positive lockdep warning around
658 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
659 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
660 * which mm->mmap_sem nests, while holding @of->mutex. As each
661 * open file has a separate mutex, it's okay as long as those don't
662 * happen on the same file. At this point, we can't easily give
663 * each file a separate locking class. Let's differentiate on
664 * whether the file has mmap or not for now.
665 */
666 if (has_mmap)
667 mutex_init(&of->mutex);
668 else
669 mutex_init(&of->mutex);
670
671 of->sd = attr_sd;
672 of->file = file;
673
674 /*
675 * Always instantiate seq_file even if read access doesn't use
676 * seq_file or is not requested. This unifies private data access
677 * and readable regular files are the vast majority anyway.
678 */
679 if (sysfs_is_bin(attr_sd))
680 error = single_open(file, NULL, of);
681 else
682 error = single_open(file, sysfs_seq_show, of);
683 if (error)
684 goto err_free;
685
686 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
687 if (file->f_mode & FMODE_WRITE)
688 file->f_mode |= FMODE_PWRITE;
689
690 /* make sure we have open dirent struct */
691 error = sysfs_get_open_dirent(attr_sd, of);
692 if (error)
693 goto err_close;
694
695 /* open succeeded, put active references */
696 sysfs_put_active(attr_sd);
697 return 0;
698
699err_close:
700 single_release(inode, file);
701err_free:
702 kfree(of);
703err_out:
704 sysfs_put_active(attr_sd);
705 return error;
706}

205int sysfs_add_file_mode_ns(struct kernfs_node *parent,
206 const struct attribute *attr, bool is_bin,
207 umode_t mode, const void *ns)
208{
209 struct lock_class_key *key = NULL;
210 const struct kernfs_ops *ops;
211 struct kernfs_node *kn;
212 loff_t size;
213
214 if (!is_bin) {
215 struct kobject *kobj = parent->priv;
216 const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
217
218 /* every kobject with an attribute needs a ktype assigned */
219 if (WARN(!sysfs_ops, KERN_ERR
220 "missing sysfs attribute operations for kobject: %s\n",
221 kobject_name(kobj)))
222 return -EINVAL;
223
224 if (sysfs_ops->show && sysfs_ops->store)
225 ops = &sysfs_file_kfops_rw;
226 else if (sysfs_ops->show)
227 ops = &sysfs_file_kfops_ro;
228 else if (sysfs_ops->store)
229 ops = &sysfs_file_kfops_wo;
230 else
231 ops = &sysfs_file_kfops_empty;
232
233 size = PAGE_SIZE;
234 } else {
235 struct bin_attribute *battr = (void *)attr;
236
237 if (battr->mmap)
238 ops = &sysfs_bin_kfops_mmap;
239 else if (battr->read && battr->write)
240 ops = &sysfs_bin_kfops_rw;
241 else if (battr->read)
242 ops = &sysfs_bin_kfops_ro;
243 else if (battr->write)
244 ops = &sysfs_bin_kfops_wo;
245 else
246 ops = &sysfs_file_kfops_empty;
247
248 size = battr->size;
249 }
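The identical-looking branches around mutex_init() above are deliberate, not a copy-paste bug: mutex_init() is a macro that declares a static lock_class_key at each call site, so the two expansions hand @of->mutex two distinct lockdep classes, one for mmap-capable files and one for the rest. From include/linux/mutex.h (simplified):

	#define mutex_init(mutex)				\
	do {							\
		static struct lock_class_key __key;		\
								\
		__mutex_init((mutex), #mutex, &__key);		\
	} while (0)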
707
708static int sysfs_release(struct inode *inode, struct file *filp)
709{
710 struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
711 struct sysfs_open_file *of = sysfs_of(filp);
712
713 sysfs_put_open_dirent(sd, of);
714 single_release(inode, filp);
715 kfree(of);
716
717 return 0;
718}
719
720void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
721{
722 struct sysfs_open_dirent *od;
723 struct sysfs_open_file *of;
724
725 if (!sysfs_is_bin(sd))
726 return;
727
728 spin_lock_irq(&sysfs_open_dirent_lock);
729 od = sd->s_attr.open;
730 if (od)
731 atomic_inc(&od->refcnt);
732 spin_unlock_irq(&sysfs_open_dirent_lock);
733 if (!od)
734 return;
735
736 mutex_lock(&sysfs_open_file_mutex);
737 list_for_each_entry(of, &od->files, list) {
738 struct inode *inode = file_inode(of->file);
739 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
740 }
741 mutex_unlock(&sysfs_open_file_mutex);
742
743 sysfs_put_open_dirent(sd, NULL);
744}
745
746/* Sysfs attribute files are pollable. The idea is that you read
747 * the content and then you use 'poll' or 'select' to wait for
748 * the content to change. When the content changes (assuming the
749 * manager for the kobject supports notification), poll will
750 * return POLLERR|POLLPRI, and select will return the fd whether
751 * it is waiting for read, write, or exceptions.
752 * Once poll/select indicates that the value has changed, you
753 * need to close and re-open the file, or seek to 0 and read again.
754 * Reminder: this only works for attributes which actively support
755 * it, and it is not possible to test an attribute from userspace
756 * to see if it supports poll (Neither 'poll' nor 'select' return
757 * an appropriate error code). When in doubt, set a suitable timeout value.
758 */
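The protocol this comment describes (sysfs_poll() itself follows below), as a self-contained userspace sketch; the attribute path is only an example and must be one whose driver actually calls sysfs_notify(), and the 30-second timeout follows the advice above:

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd = open("/sys/class/power_supply/BAT0/status", O_RDONLY);
		struct pollfd pfd;

		if (fd < 0)
			return 1;
		n = read(fd, buf, sizeof(buf));		/* prime: consume current value */
		pfd.fd = fd;
		pfd.events = POLLPRI;			/* change shows up as POLLPRI|POLLERR */
		while (poll(&pfd, 1, 30000) > 0) {
			lseek(fd, 0, SEEK_SET);		/* "seek to 0 and read again" */
			n = read(fd, buf, sizeof(buf));
			if (n > 0)
				fwrite(buf, 1, n, stdout);
		}
		close(fd);
		return 0;
	}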
759static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
760{
761 struct sysfs_open_file *of = sysfs_of(filp);
762 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
763 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
764
765 /* need parent for the kobj, grab both */
766 if (!sysfs_get_active(attr_sd))
767 goto trigger;
768
769 poll_wait(filp, &od->poll, wait);
770
771 sysfs_put_active(attr_sd);
772
773 if (of->event != atomic_read(&od->event))
774 goto trigger;
775
776 return DEFAULT_POLLMASK;
777
778 trigger:
779 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
780}

250
251#ifdef CONFIG_DEBUG_LOCK_ALLOC
252 if (!attr->ignore_lockdep)
253 key = attr->key ?: (struct lock_class_key *)&attr->skey;
254#endif
255 kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
256 (void *)attr, ns, true, key);
257 if (IS_ERR(kn)) {
258 if (PTR_ERR(kn) == -EEXIST)
259 sysfs_warn_dup(parent, attr->name);
260 return PTR_ERR(kn);
781
782void sysfs_notify_dirent(struct sysfs_dirent *sd)
783{
784 struct sysfs_open_dirent *od;
785 unsigned long flags;
786
787 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
788
789 if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
790 od = sd->s_attr.open;
791 if (od) {
792 atomic_inc(&od->event);
793 wake_up_interruptible(&od->poll);
794 }
795 }
796
261 }
262 return 0;
797 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
798}
799EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
800
801void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
802{
803 struct sysfs_dirent *sd = k->sd;
804
805 mutex_lock(&sysfs_mutex);
806
807 if (sd && dir)
808 sd = sysfs_find_dirent(sd, dir, NULL);
809 if (sd && attr)
810 sd = sysfs_find_dirent(sd, attr, NULL);
811 if (sd)
812 sysfs_notify_dirent(sd);
813
814 mutex_unlock(&sysfs_mutex);
815}
816EXPORT_SYMBOL_GPL(sysfs_notify);
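Driver usage is unchanged by the kernfs conversion; a typical (hypothetical) call after updating the value behind an attribute:

	/* wake up anyone poll()ing <kobj>/state */
	sysfs_notify(&dev->kobj, NULL, "state");

	/* or, for an attribute living in a named group */
	sysfs_notify(&dev->kobj, "stats", "errors");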
817
818const struct file_operations sysfs_file_operations = {
819 .read = seq_read,
820 .write = sysfs_write_file,
821 .llseek = generic_file_llseek,
822 .open = sysfs_open_file,
823 .release = sysfs_release,
824 .poll = sysfs_poll,
825};
826
827const struct file_operations sysfs_bin_operations = {
828 .read = sysfs_bin_read,
829 .write = sysfs_write_file,
830 .llseek = generic_file_llseek,
831 .mmap = sysfs_bin_mmap,
832 .open = sysfs_open_file,
833 .release = sysfs_release,
834 .poll = sysfs_poll,
835};
836
837int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
838 const struct attribute *attr, int type,
839 umode_t amode, const void *ns)
840{
841 umode_t mode = (amode & S_IALLUGO) | S_IFREG;
842 struct sysfs_addrm_cxt acxt;
843 struct sysfs_dirent *sd;
844 int rc;
845
846 sd = sysfs_new_dirent(attr->name, mode, type);
847 if (!sd)
848 return -ENOMEM;
849
850 sd->s_ns = ns;
851 sd->s_attr.attr = (void *)attr;
852 sysfs_dirent_init_lockdep(sd);
853
854 sysfs_addrm_start(&acxt);
855 rc = sysfs_add_one(&acxt, sd, dir_sd);
856 sysfs_addrm_finish(&acxt);
857
858 if (rc)
859 sysfs_put(sd);
860
861 return rc;
862}
863
864
865int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
866 int type)
867{
868 return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL);
869}

263}
264
265int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
266 bool is_bin)
267{
268 return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
269}
870 270
871/** 271/**
@@ -879,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
879{ 279{
880 BUG_ON(!kobj || !kobj->sd || !attr); 280 BUG_ON(!kobj || !kobj->sd || !attr);
881 281
882 return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, 282 return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
883 attr->mode, ns);
884 283
885} 284}
886EXPORT_SYMBOL_GPL(sysfs_create_file_ns); 285EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
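Note that the new sysfs_add_file_mode_ns() selects the kernfs_ops per ktype, not per attribute: it inspects kobj->ktype->sysfs_ops, and since the device ktype implements both show and store, in practice every non-bin device attribute gets sysfs_file_kfops_rw, with the file mode alone gating access. A hypothetical driver-side attribute for illustration (names and the delay_ms variable are assumptions, not code from this patch):

	static int delay_ms;

	static ssize_t delay_show(struct device *dev, struct device_attribute *attr,
				  char *buf)
	{
		return sprintf(buf, "%d\n", delay_ms);
	}

	static ssize_t delay_store(struct device *dev, struct device_attribute *attr,
				   const char *buf, size_t count)
	{
		int err = kstrtoint(buf, 0, &delay_ms);

		return err ? err : count;
	}
	static DEVICE_ATTR_RW(delay);	/* mode S_IRUGO | S_IWUSR, i.e. 0644 */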
@@ -908,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files);
908int sysfs_add_file_to_group(struct kobject *kobj, 307int sysfs_add_file_to_group(struct kobject *kobj,
909 const struct attribute *attr, const char *group) 308 const struct attribute *attr, const char *group)
910{ 309{
911 struct sysfs_dirent *dir_sd; 310 struct kernfs_node *parent;
912 int error; 311 int error;
913 312
914 if (group) 313 if (group) {
915 dir_sd = sysfs_get_dirent(kobj->sd, group); 314 parent = kernfs_find_and_get(kobj->sd, group);
916 else 315 } else {
917 dir_sd = sysfs_get(kobj->sd); 316 parent = kobj->sd;
317 kernfs_get(parent);
318 }
918 319
919 if (!dir_sd) 320 if (!parent)
920 return -ENOENT; 321 return -ENOENT;
921 322
922 error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); 323 error = sysfs_add_file(parent, attr, false);
923 sysfs_put(dir_sd); 324 kernfs_put(parent);
924 325
925 return error; 326 return error;
926} 327}
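For reference, a hypothetical caller; the function resolves the named group directory, adds one file there, and drops the directory reference again (pdev and dev_attr_errors are assumed names):

	/* publish an extra counter under the existing "stats" group */
	err = sysfs_add_file_to_group(&pdev->dev.kobj,
				      &dev_attr_errors.attr, "stats");
	if (err)
		dev_warn(&pdev->dev, "stats/errors not created: %d\n", err);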
@@ -936,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
936int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, 337int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
937 umode_t mode) 338 umode_t mode)
938{ 339{
939 struct sysfs_dirent *sd; 340 struct kernfs_node *kn;
940 struct iattr newattrs; 341 struct iattr newattrs;
941 int rc; 342 int rc;
942 343
943 mutex_lock(&sysfs_mutex); 344 kn = kernfs_find_and_get(kobj->sd, attr->name);
944 345 if (!kn)
945 rc = -ENOENT; 346 return -ENOENT;
946 sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
947 if (!sd)
948 goto out;
949 347
950 newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); 348 newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
951 newattrs.ia_valid = ATTR_MODE; 349 newattrs.ia_valid = ATTR_MODE;
952 rc = sysfs_sd_setattr(sd, &newattrs);
953 350
954 out: 351 rc = kernfs_setattr(kn, &newattrs);
955 mutex_unlock(&sysfs_mutex); 352
353 kernfs_put(kn);
956 return rc; 354 return rc;
957} 355}
958EXPORT_SYMBOL_GPL(sysfs_chmod_file); 356EXPORT_SYMBOL_GPL(sysfs_chmod_file);
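Only the permission bits change: the new code masks the caller's mode with S_IALLUGO and keeps the file-type bits from kn->mode. A hypothetical call (dev and dev_attr_enable are assumed names):

	/* relax "enable" from 0200 to 0644 once probing has succeeded */
	err = sysfs_chmod_file(&dev->kobj, &dev_attr_enable.attr,
			       S_IRUGO | S_IWUSR);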
@@ -968,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
968void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, 366void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
969 const void *ns) 367 const void *ns)
970{ 368{
971 struct sysfs_dirent *dir_sd = kobj->sd; 369 struct kernfs_node *parent = kobj->sd;
972 370
973 sysfs_hash_and_remove(dir_sd, attr->name, ns); 371 kernfs_remove_by_name_ns(parent, attr->name, ns);
974} 372}
975EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
976 374
@@ -991,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files);
991void sysfs_remove_file_from_group(struct kobject *kobj, 389void sysfs_remove_file_from_group(struct kobject *kobj,
992 const struct attribute *attr, const char *group) 390 const struct attribute *attr, const char *group)
993{ 391{
994 struct sysfs_dirent *dir_sd; 392 struct kernfs_node *parent;
995 393
996 if (group) 394 if (group) {
997 dir_sd = sysfs_get_dirent(kobj->sd, group); 395 parent = kernfs_find_and_get(kobj->sd, group);
998 else 396 } else {
999 dir_sd = sysfs_get(kobj->sd); 397 parent = kobj->sd;
1000 if (dir_sd) { 398 kernfs_get(parent);
1001 sysfs_hash_and_remove(dir_sd, attr->name, NULL); 399 }
1002 sysfs_put(dir_sd); 400
401 if (parent) {
402 kernfs_remove_by_name(parent, attr->name);
403 kernfs_put(parent);
1003 } 404 }
1004} 405}
1005EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); 406EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
@@ -1014,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
1014{ 415{
1015 BUG_ON(!kobj || !kobj->sd || !attr); 416 BUG_ON(!kobj || !kobj->sd || !attr);
1016 417
1017 return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); 418 return sysfs_add_file(kobj->sd, &attr->attr, true);
1018} 419}
1019EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 420EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1020 421
@@ -1026,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1026void sysfs_remove_bin_file(struct kobject *kobj, 427void sysfs_remove_bin_file(struct kobject *kobj,
1027 const struct bin_attribute *attr) 428 const struct bin_attribute *attr)
1028{ 429{
1029 sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); 430 kernfs_remove_by_name(kobj->sd, attr->attr.name);
1030} 431}
1031EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); 432EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
1032 433
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1898a10e38ce..6b579387c67a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
18#include "sysfs.h" 18#include "sysfs.h"
19 19
20 20
21static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 21static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
22 const struct attribute_group *grp) 22 const struct attribute_group *grp)
23{ 23{
24 struct attribute *const *attr; 24 struct attribute *const *attr;
@@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
26 26
27 if (grp->attrs) 27 if (grp->attrs)
28 for (attr = grp->attrs; *attr; attr++) 28 for (attr = grp->attrs; *attr; attr++)
29 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 29 kernfs_remove_by_name(parent, (*attr)->name);
30 if (grp->bin_attrs) 30 if (grp->bin_attrs)
31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) 31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
32 sysfs_remove_bin_file(kobj, *bin_attr); 32 sysfs_remove_bin_file(kobj, *bin_attr);
33} 33}
34 34
35static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 35static int create_files(struct kernfs_node *parent, struct kobject *kobj,
36 const struct attribute_group *grp, int update) 36 const struct attribute_group *grp, int update)
37{ 37{
38 struct attribute *const *attr; 38 struct attribute *const *attr;
@@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
49 * re-adding (if required) the file. 49 * re-adding (if required) the file.
50 */ 50 */
51 if (update) 51 if (update)
52 sysfs_hash_and_remove(dir_sd, (*attr)->name, 52 kernfs_remove_by_name(parent, (*attr)->name);
53 NULL);
54 if (grp->is_visible) { 53 if (grp->is_visible) {
55 mode = grp->is_visible(kobj, *attr, i); 54 mode = grp->is_visible(kobj, *attr, i);
56 if (!mode) 55 if (!mode)
57 continue; 56 continue;
58 } 57 }
59 error = sysfs_add_file_mode_ns(dir_sd, *attr, 58 error = sysfs_add_file_mode_ns(parent, *attr, false,
60 SYSFS_KOBJ_ATTR,
61 (*attr)->mode | mode, 59 (*attr)->mode | mode,
62 NULL); 60 NULL);
63 if (unlikely(error)) 61 if (unlikely(error))
64 break; 62 break;
65 } 63 }
66 if (error) { 64 if (error) {
67 remove_files(dir_sd, kobj, grp); 65 remove_files(parent, kobj, grp);
68 goto exit; 66 goto exit;
69 } 67 }
70 } 68 }
@@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
78 break; 76 break;
79 } 77 }
80 if (error) 78 if (error)
81 remove_files(dir_sd, kobj, grp); 79 remove_files(parent, kobj, grp);
82 } 80 }
83exit: 81exit:
84 return error; 82 return error;
@@ -88,7 +86,7 @@ exit:
88static int internal_create_group(struct kobject *kobj, int update, 86static int internal_create_group(struct kobject *kobj, int update,
89 const struct attribute_group *grp) 87 const struct attribute_group *grp)
90{ 88{
91 struct sysfs_dirent *sd; 89 struct kernfs_node *kn;
92 int error; 90 int error;
93 91
94 BUG_ON(!kobj || (!update && !kobj->sd)); 92 BUG_ON(!kobj || (!update && !kobj->sd));
@@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update,
102 return -EINVAL; 100 return -EINVAL;
103 } 101 }
104 if (grp->name) { 102 if (grp->name) {
105 error = sysfs_create_subdir(kobj, grp->name, &sd); 103 kn = kernfs_create_dir(kobj->sd, grp->name,
106 if (error) 104 S_IRWXU | S_IRUGO | S_IXUGO, kobj);
107 return error; 105 if (IS_ERR(kn)) {
106 if (PTR_ERR(kn) == -EEXIST)
107 sysfs_warn_dup(kobj->sd, grp->name);
108 return PTR_ERR(kn);
109 }
108 } else 110 } else
109 sd = kobj->sd; 111 kn = kobj->sd;
110 sysfs_get(sd); 112 kernfs_get(kn);
111 error = create_files(sd, kobj, grp, update); 113 error = create_files(kn, kobj, grp, update);
112 if (error) { 114 if (error) {
113 if (grp->name) 115 if (grp->name)
114 sysfs_remove(sd); 116 kernfs_remove(kn);
115 } 117 }
116 sysfs_put(sd); 118 kernfs_put(kn);
117 return error; 119 return error;
118} 120}
119 121
@@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group);
203void sysfs_remove_group(struct kobject *kobj, 205void sysfs_remove_group(struct kobject *kobj,
204 const struct attribute_group *grp) 206 const struct attribute_group *grp)
205{ 207{
206 struct sysfs_dirent *dir_sd = kobj->sd; 208 struct kernfs_node *parent = kobj->sd;
207 struct sysfs_dirent *sd; 209 struct kernfs_node *kn;
208 210
209 if (grp->name) { 211 if (grp->name) {
210 sd = sysfs_get_dirent(dir_sd, grp->name); 212 kn = kernfs_find_and_get(parent, grp->name);
211 if (!sd) { 213 if (!kn) {
212 WARN(!sd, KERN_WARNING 214 WARN(!kn, KERN_WARNING
213 "sysfs group %p not found for kobject '%s'\n", 215 "sysfs group %p not found for kobject '%s'\n",
214 grp, kobject_name(kobj)); 216 grp, kobject_name(kobj));
215 return; 217 return;
216 } 218 }
217 } else 219 } else {
218 sd = sysfs_get(dir_sd); 220 kn = parent;
221 kernfs_get(kn);
222 }
219 223
220 remove_files(sd, kobj, grp); 224 remove_files(kn, kobj, grp);
221 if (grp->name) 225 if (grp->name)
222 sysfs_remove(sd); 226 kernfs_remove(kn);
223 227
224 sysfs_put(sd); 228 kernfs_put(kn);
225} 229}
226EXPORT_SYMBOL_GPL(sysfs_remove_group); 230EXPORT_SYMBOL_GPL(sysfs_remove_group);
227 231
@@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups);
257int sysfs_merge_group(struct kobject *kobj, 261int sysfs_merge_group(struct kobject *kobj,
258 const struct attribute_group *grp) 262 const struct attribute_group *grp)
259{ 263{
260 struct sysfs_dirent *dir_sd; 264 struct kernfs_node *parent;
261 int error = 0; 265 int error = 0;
262 struct attribute *const *attr; 266 struct attribute *const *attr;
263 int i; 267 int i;
264 268
265 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 269 parent = kernfs_find_and_get(kobj->sd, grp->name);
266 if (!dir_sd) 270 if (!parent)
267 return -ENOENT; 271 return -ENOENT;
268 272
269 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) 273 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
270 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); 274 error = sysfs_add_file(parent, *attr, false);
271 if (error) { 275 if (error) {
272 while (--i >= 0) 276 while (--i >= 0)
273 sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); 277 kernfs_remove_by_name(parent, (*--attr)->name);
274 } 278 }
275 sysfs_put(dir_sd); 279 kernfs_put(parent);
276 280
277 return error; 281 return error;
278} 282}
@@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group);
286void sysfs_unmerge_group(struct kobject *kobj, 290void sysfs_unmerge_group(struct kobject *kobj,
287 const struct attribute_group *grp) 291 const struct attribute_group *grp)
288{ 292{
289 struct sysfs_dirent *dir_sd; 293 struct kernfs_node *parent;
290 struct attribute *const *attr; 294 struct attribute *const *attr;
291 295
292 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 296 parent = kernfs_find_and_get(kobj->sd, grp->name);
293 if (dir_sd) { 297 if (parent) {
294 for (attr = grp->attrs; *attr; ++attr) 298 for (attr = grp->attrs; *attr; ++attr)
295 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 299 kernfs_remove_by_name(parent, (*attr)->name);
296 sysfs_put(dir_sd); 300 kernfs_put(parent);
297 } 301 }
298} 302}
299EXPORT_SYMBOL_GPL(sysfs_unmerge_group); 303EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
@@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
308int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, 312int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
309 struct kobject *target, const char *link_name) 313 struct kobject *target, const char *link_name)
310{ 314{
311 struct sysfs_dirent *dir_sd; 315 struct kernfs_node *parent;
312 int error = 0; 316 int error = 0;
313 317
314 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 318 parent = kernfs_find_and_get(kobj->sd, group_name);
315 if (!dir_sd) 319 if (!parent)
316 return -ENOENT; 320 return -ENOENT;
317 321
318 error = sysfs_create_link_sd(dir_sd, target, link_name); 322 error = sysfs_create_link_sd(parent, target, link_name);
319 sysfs_put(dir_sd); 323 kernfs_put(parent);
320 324
321 return error; 325 return error;
322} 326}
@@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
331void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, 335void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
332 const char *link_name) 336 const char *link_name)
333{ 337{
334 struct sysfs_dirent *dir_sd; 338 struct kernfs_node *parent;
335 339
336 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 340 parent = kernfs_find_and_get(kobj->sd, group_name);
337 if (dir_sd) { 341 if (parent) {
338 sysfs_hash_and_remove(dir_sd, link_name, NULL); 342 kernfs_remove_by_name(parent, link_name);
339 sysfs_put(dir_sd); 343 kernfs_put(parent);
340 } 344 }
341} 345}
342EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); 346EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
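All of the helpers above operate on struct attribute_group. For orientation, a hypothetical group definition (names assumed); a non-NULL .name makes internal_create_group() create the subdirectory via kernfs_create_dir() as shown earlier:

	static struct attribute *foo_attrs[] = {
		&dev_attr_delay.attr,
		&dev_attr_enable.attr,
		NULL,			/* the array must be NULL-terminated */
	};

	static const struct attribute_group foo_group = {
		.name	= "foo",	/* omit to place files in the kobject dir itself */
		.attrs	= foo_attrs,
	};

	/* in probe: */
	err = sysfs_create_group(&dev->kobj, &foo_group);
	/* in remove: */
	sysfs_remove_group(&dev->kobj, &foo_group);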
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
deleted file mode 100644
index 1750f790af3b..000000000000
--- a/fs/sysfs/inode.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * fs/sysfs/inode.c - basic sysfs inode and dentry operations
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
11 */
12
13#undef DEBUG
14
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/backing-dev.h>
18#include <linux/capability.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/slab.h>
22#include <linux/sysfs.h>
23#include <linux/xattr.h>
24#include <linux/security.h>
25#include "sysfs.h"
26
27static const struct address_space_operations sysfs_aops = {
28 .readpage = simple_readpage,
29 .write_begin = simple_write_begin,
30 .write_end = simple_write_end,
31};
32
33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
35 .ra_pages = 0, /* No readahead */
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
37};
38
39static const struct inode_operations sysfs_inode_operations = {
40 .permission = sysfs_permission,
41 .setattr = sysfs_setattr,
42 .getattr = sysfs_getattr,
43 .setxattr = sysfs_setxattr,
44};
45
46int __init sysfs_inode_init(void)
47{
48 return bdi_init(&sysfs_backing_dev_info);
49}
50
51static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
52{
53 struct sysfs_inode_attrs *attrs;
54 struct iattr *iattrs;
55
56 attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
57 if (!attrs)
58 return NULL;
59 iattrs = &attrs->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = sd->s_mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 return attrs;
68}
69
70int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
71{
72 struct sysfs_inode_attrs *sd_attrs;
73 struct iattr *iattrs;
74 unsigned int ia_valid = iattr->ia_valid;
75
76 sd_attrs = sd->s_iattr;
77
78 if (!sd_attrs) {
79 /* setting attributes for the first time, allocate now */
80 sd_attrs = sysfs_init_inode_attrs(sd);
81 if (!sd_attrs)
82 return -ENOMEM;
83 sd->s_iattr = sd_attrs;
84 }
85 /* attributes were changed at least once in past */
86 iattrs = &sd_attrs->ia_iattr;
87
88 if (ia_valid & ATTR_UID)
89 iattrs->ia_uid = iattr->ia_uid;
90 if (ia_valid & ATTR_GID)
91 iattrs->ia_gid = iattr->ia_gid;
92 if (ia_valid & ATTR_ATIME)
93 iattrs->ia_atime = iattr->ia_atime;
94 if (ia_valid & ATTR_MTIME)
95 iattrs->ia_mtime = iattr->ia_mtime;
96 if (ia_valid & ATTR_CTIME)
97 iattrs->ia_ctime = iattr->ia_ctime;
98 if (ia_valid & ATTR_MODE) {
99 umode_t mode = iattr->ia_mode;
100 iattrs->ia_mode = sd->s_mode = mode;
101 }
102 return 0;
103}
104
105int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
106{
107 struct inode *inode = dentry->d_inode;
108 struct sysfs_dirent *sd = dentry->d_fsdata;
109 int error;
110
111 if (!sd)
112 return -EINVAL;
113
114 mutex_lock(&sysfs_mutex);
115 error = inode_change_ok(inode, iattr);
116 if (error)
117 goto out;
118
119 error = sysfs_sd_setattr(sd, iattr);
120 if (error)
121 goto out;
122
123 /* this ignores size changes */
124 setattr_copy(inode, iattr);
125
126out:
127 mutex_unlock(&sysfs_mutex);
128 return error;
129}
130
131static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
132 u32 *secdata_len)
133{
134 struct sysfs_inode_attrs *iattrs;
135 void *old_secdata;
136 size_t old_secdata_len;
137
138 if (!sd->s_iattr) {
139 sd->s_iattr = sysfs_init_inode_attrs(sd);
140 if (!sd->s_iattr)
141 return -ENOMEM;
142 }
143
144 iattrs = sd->s_iattr;
145 old_secdata = iattrs->ia_secdata;
146 old_secdata_len = iattrs->ia_secdata_len;
147
148 iattrs->ia_secdata = *secdata;
149 iattrs->ia_secdata_len = *secdata_len;
150
151 *secdata = old_secdata;
152 *secdata_len = old_secdata_len;
153 return 0;
154}
155
156int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
157 size_t size, int flags)
158{
159 struct sysfs_dirent *sd = dentry->d_fsdata;
160 void *secdata;
161 int error;
162 u32 secdata_len = 0;
163
164 if (!sd)
165 return -EINVAL;
166
167 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
168 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
169 error = security_inode_setsecurity(dentry->d_inode, suffix,
170 value, size, flags);
171 if (error)
172 goto out;
173 error = security_inode_getsecctx(dentry->d_inode,
174 &secdata, &secdata_len);
175 if (error)
176 goto out;
177
178 mutex_lock(&sysfs_mutex);
179 error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
180 mutex_unlock(&sysfs_mutex);
181
182 if (secdata)
183 security_release_secctx(secdata, secdata_len);
184 } else
185 return -EINVAL;
186out:
187 return error;
188}
189
190static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
191{
192 inode->i_mode = mode;
193 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
194}
195
196static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
197{
198 inode->i_uid = iattr->ia_uid;
199 inode->i_gid = iattr->ia_gid;
200 inode->i_atime = iattr->ia_atime;
201 inode->i_mtime = iattr->ia_mtime;
202 inode->i_ctime = iattr->ia_ctime;
203}
204
205static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
206{
207 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
208
209 inode->i_mode = sd->s_mode;
210 if (iattrs) {
211 /* sysfs_dirent has non-default attributes
212 * get them from persistent copy in sysfs_dirent
213 */
214 set_inode_attr(inode, &iattrs->ia_iattr);
215 security_inode_notifysecctx(inode,
216 iattrs->ia_secdata,
217 iattrs->ia_secdata_len);
218 }
219
220 if (sysfs_type(sd) == SYSFS_DIR)
221 set_nlink(inode, sd->s_dir.subdirs + 2);
222}
223
224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
225 struct kstat *stat)
226{
227 struct sysfs_dirent *sd = dentry->d_fsdata;
228 struct inode *inode = dentry->d_inode;
229
230 mutex_lock(&sysfs_mutex);
231 sysfs_refresh_inode(sd, inode);
232 mutex_unlock(&sysfs_mutex);
233
234 generic_fillattr(inode, stat);
235 return 0;
236}
237
238static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
239{
240 struct bin_attribute *bin_attr;
241
242 inode->i_private = sysfs_get(sd);
243 inode->i_mapping->a_ops = &sysfs_aops;
244 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
245 inode->i_op = &sysfs_inode_operations;
246
247 set_default_inode_attr(inode, sd->s_mode);
248 sysfs_refresh_inode(sd, inode);
249
250 /* initialize inode according to type */
251 switch (sysfs_type(sd)) {
252 case SYSFS_DIR:
253 inode->i_op = &sysfs_dir_inode_operations;
254 inode->i_fop = &sysfs_dir_operations;
255 break;
256 case SYSFS_KOBJ_ATTR:
257 inode->i_size = PAGE_SIZE;
258 inode->i_fop = &sysfs_file_operations;
259 break;
260 case SYSFS_KOBJ_BIN_ATTR:
261 bin_attr = sd->s_attr.bin_attr;
262 inode->i_size = bin_attr->size;
263 inode->i_fop = &sysfs_bin_operations;
264 break;
265 case SYSFS_KOBJ_LINK:
266 inode->i_op = &sysfs_symlink_inode_operations;
267 break;
268 default:
269 BUG();
270 }
271
272 unlock_new_inode(inode);
273}
274
275/**
276 * sysfs_get_inode - get inode for sysfs_dirent
277 * @sb: super block
278 * @sd: sysfs_dirent to allocate inode for
279 *
280 * Get inode for @sd. If such inode doesn't exist, a new inode
281 * is allocated and basics are initialized. New inode is
282 * returned locked.
283 *
284 * LOCKING:
285 * Kernel thread context (may sleep).
286 *
287 * RETURNS:
288 * Pointer to allocated inode on success, NULL on failure.
289 */
290struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
291{
292 struct inode *inode;
293
294 inode = iget_locked(sb, sd->s_ino);
295 if (inode && (inode->i_state & I_NEW))
296 sysfs_init_inode(sd, inode);
297
298 return inode;
299}
300
301/*
302 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
303 * To prevent the sysfs inode numbers from being freed prematurely we take a
304 * reference to sysfs_dirent from the sysfs inode. A
305 * super_operations.evict_inode() implementation is needed to drop that
306 * reference upon inode destruction.
307 */
308void sysfs_evict_inode(struct inode *inode)
309{
310 struct sysfs_dirent *sd = inode->i_private;
311
312 truncate_inode_pages(&inode->i_data, 0);
313 clear_inode(inode);
314 sysfs_put(sd);
315}
316
317int sysfs_permission(struct inode *inode, int mask)
318{
319 struct sysfs_dirent *sd;
320
321 if (mask & MAY_NOT_BLOCK)
322 return -ECHILD;
323
324 sd = inode->i_private;
325
326 mutex_lock(&sysfs_mutex);
327 sysfs_refresh_inode(sd, inode);
328 mutex_unlock(&sysfs_mutex);
329
330 return generic_permission(inode, mask);
331}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a3..6211230814fd 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -14,146 +14,41 @@
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/pagemap.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/magic.h>
21#include <linux/slab.h>
22#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
23 19
24#include "sysfs.h" 20#include "sysfs.h"
25 21
26 22static struct kernfs_root *sysfs_root;
27static struct vfsmount *sysfs_mnt; 23struct kernfs_node *sysfs_root_kn;
28struct kmem_cache *sysfs_dir_cachep;
29
30static const struct super_operations sysfs_ops = {
31 .statfs = simple_statfs,
32 .drop_inode = generic_delete_inode,
33 .evict_inode = sysfs_evict_inode,
34};
35
36struct sysfs_dirent sysfs_root = {
37 .s_name = "",
38 .s_count = ATOMIC_INIT(1),
39 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
40 .s_mode = S_IFDIR | S_IRUGO | S_IXUGO,
41 .s_ino = 1,
42};
43
44static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
45{
46 struct inode *inode;
47 struct dentry *root;
48
49 sb->s_blocksize = PAGE_CACHE_SIZE;
50 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
51 sb->s_magic = SYSFS_MAGIC;
52 sb->s_op = &sysfs_ops;
53 sb->s_time_gran = 1;
54
55 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex);
59 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n");
61 return -ENOMEM;
62 }
63
64 /* instantiate and link root dentry */
65 root = d_make_root(inode);
66 if (!root) {
67 pr_debug("%s: could not get root dentry!\n", __func__);
68 return -ENOMEM;
69 }
70 root->d_fsdata = &sysfs_root;
71 sb->s_root = root;
72 sb->s_d_op = &sysfs_dentry_ops;
73 return 0;
74}
75
76static int sysfs_test_super(struct super_block *sb, void *data)
77{
78 struct sysfs_super_info *sb_info = sysfs_info(sb);
79 struct sysfs_super_info *info = data;
80 enum kobj_ns_type type;
81 int found = 1;
82
83 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
84 if (sb_info->ns[type] != info->ns[type])
85 found = 0;
86 }
87 return found;
88}
89
90static int sysfs_set_super(struct super_block *sb, void *data)
91{
92 int error;
93 error = set_anon_super(sb, data);
94 if (!error)
95 sb->s_fs_info = data;
96 return error;
97}
98
99static void free_sysfs_super_info(struct sysfs_super_info *info)
100{
101 int type;
102 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
103 kobj_ns_drop(type, info->ns[type]);
104 kfree(info);
105}
106 24
107static struct dentry *sysfs_mount(struct file_system_type *fs_type, 25static struct dentry *sysfs_mount(struct file_system_type *fs_type,
108 int flags, const char *dev_name, void *data) 26 int flags, const char *dev_name, void *data)
109{ 27{
110 struct sysfs_super_info *info; 28 struct dentry *root;
111 enum kobj_ns_type type; 29 void *ns;
112 struct super_block *sb;
113 int error;
114 30
115 if (!(flags & MS_KERNMOUNT)) { 31 if (!(flags & MS_KERNMOUNT)) {
116 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) 32 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
117 return ERR_PTR(-EPERM); 33 return ERR_PTR(-EPERM);
118 34
119 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { 35 if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
120 if (!kobj_ns_current_may_mount(type)) 36 return ERR_PTR(-EPERM);
121 return ERR_PTR(-EPERM);
122 }
123 }
124
125 info = kzalloc(sizeof(*info), GFP_KERNEL);
126 if (!info)
127 return ERR_PTR(-ENOMEM);
128
129 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
130 info->ns[type] = kobj_ns_grab_current(type);
131
132 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
133 if (IS_ERR(sb) || sb->s_fs_info != info)
134 free_sysfs_super_info(info);
135 if (IS_ERR(sb))
136 return ERR_CAST(sb);
137 if (!sb->s_root) {
138 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
139 if (error) {
140 deactivate_locked_super(sb);
141 return ERR_PTR(error);
142 }
143 sb->s_flags |= MS_ACTIVE;
144 } 37 }
145 38
146 return dget(sb->s_root); 39 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
40 root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns);
41 if (IS_ERR(root))
42 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
43 return root;
147} 44}
148 45
149static void sysfs_kill_sb(struct super_block *sb) 46static void sysfs_kill_sb(struct super_block *sb)
150{ 47{
151 struct sysfs_super_info *info = sysfs_info(sb); 48 void *ns = (void *)kernfs_super_ns(sb);
152 /* Remove the superblock from fs_supers/s_instances 49
153 * so we can't find it, before freeing sysfs_super_info. 50 kernfs_kill_sb(sb);
154 */ 51 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
155 kill_anon_super(sb);
156 free_sysfs_super_info(info);
157} 52}
158 53
159static struct file_system_type sysfs_fs_type = { 54static struct file_system_type sysfs_fs_type = {
@@ -165,48 +60,19 @@ static struct file_system_type sysfs_fs_type = {
165 60
166int __init sysfs_init(void) 61int __init sysfs_init(void)
167{ 62{
168 int err = -ENOMEM; 63 int err;
169 64
170 sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", 65 sysfs_root = kernfs_create_root(NULL, NULL);
171 sizeof(struct sysfs_dirent), 66 if (IS_ERR(sysfs_root))
172 0, 0, NULL); 67 return PTR_ERR(sysfs_root);
173 if (!sysfs_dir_cachep)
174 goto out;
175 68
176 err = sysfs_inode_init(); 69 sysfs_root_kn = sysfs_root->kn;
177 if (err)
178 goto out_err;
179 70
180 err = register_filesystem(&sysfs_fs_type); 71 err = register_filesystem(&sysfs_fs_type);
181 if (!err) { 72 if (err) {
182 sysfs_mnt = kern_mount(&sysfs_fs_type); 73 kernfs_destroy_root(sysfs_root);
183 if (IS_ERR(sysfs_mnt)) { 74 return err;
184 printk(KERN_ERR "sysfs: could not mount!\n"); 75 }
185 err = PTR_ERR(sysfs_mnt);
186 sysfs_mnt = NULL;
187 unregister_filesystem(&sysfs_fs_type);
188 goto out_err;
189 }
190 } else
191 goto out_err;
192out:
193 return err;
194out_err:
195 kmem_cache_destroy(sysfs_dir_cachep);
196 sysfs_dir_cachep = NULL;
197 goto out;
198}
199
200#undef sysfs_get
201struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
202{
203 return __sysfs_get(sd);
204}
205EXPORT_SYMBOL_GPL(sysfs_get);
206 76
207#undef sysfs_put 77 return 0;
208void sysfs_put(struct sysfs_dirent *sd)
209{
210 __sysfs_put(sd);
211} 78}
212EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3ae3f1bf1a09..aecb15f84557 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,109 +11,73 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
15#include <linux/mount.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <linux/kobject.h> 15#include <linux/kobject.h>
18#include <linux/namei.h>
19#include <linux/mutex.h> 16#include <linux/mutex.h>
20#include <linux/security.h> 17#include <linux/security.h>
21 18
22#include "sysfs.h" 19#include "sysfs.h"
23 20
24static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, 21static int sysfs_do_create_link_sd(struct kernfs_node *parent,
25 struct kobject *target, 22 struct kobject *target_kobj,
26 const char *name, int warn) 23 const char *name, int warn)
27{ 24{
28 struct sysfs_dirent *target_sd = NULL; 25 struct kernfs_node *kn, *target = NULL;
29 struct sysfs_dirent *sd = NULL;
30 struct sysfs_addrm_cxt acxt;
31 enum kobj_ns_type ns_type;
32 int error;
33 26
34 BUG_ON(!name || !parent_sd); 27 BUG_ON(!name || !parent);
35 28
36 /* 29 /*
37 * We don't own @target and it may be removed at any time. 30 * We don't own @target_kobj and it may be removed at any time.
38 * Synchronize using sysfs_symlink_target_lock. See 31 * Synchronize using sysfs_symlink_target_lock. See
39 * sysfs_remove_dir() for details. 32 * sysfs_remove_dir() for details.
40 */ 33 */
41 spin_lock(&sysfs_symlink_target_lock); 34 spin_lock(&sysfs_symlink_target_lock);
42 if (target->sd) 35 if (target_kobj->sd) {
43 target_sd = sysfs_get(target->sd); 36 target = target_kobj->sd;
37 kernfs_get(target);
38 }
44 spin_unlock(&sysfs_symlink_target_lock); 39 spin_unlock(&sysfs_symlink_target_lock);
45 40
46 error = -ENOENT; 41 if (!target)
47 if (!target_sd) 42 return -ENOENT;
48 goto out_put;
49
50 error = -ENOMEM;
51 sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
52 if (!sd)
53 goto out_put;
54 43
55 ns_type = sysfs_ns_type(parent_sd); 44 kn = kernfs_create_link(parent, name, target);
56 if (ns_type) 45 kernfs_put(target);
57 sd->s_ns = target_sd->s_ns;
58 sd->s_symlink.target_sd = target_sd;
59 target_sd = NULL; /* reference is now owned by the symlink */
60
61 sysfs_addrm_start(&acxt);
62 /* Symlinks must be between directories with the same ns_type */
63 if (!ns_type ||
64 (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
65 if (warn)
66 error = sysfs_add_one(&acxt, sd, parent_sd);
67 else
68 error = __sysfs_add_one(&acxt, sd, parent_sd);
69 } else {
70 error = -EINVAL;
71 WARN(1, KERN_WARNING
72 "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
73 parent_sd->s_name,
74 sd->s_name,
75 sd->s_symlink.target_sd->s_parent->s_name,
76 sd->s_symlink.target_sd->s_name);
77 }
78 sysfs_addrm_finish(&acxt);
79 46
80 if (error) 47 if (!IS_ERR(kn))
81 goto out_put; 48 return 0;
82 49
83 return 0; 50 if (warn && PTR_ERR(kn) == -EEXIST)
84 51 sysfs_warn_dup(parent, name);
85 out_put: 52 return PTR_ERR(kn);
86 sysfs_put(target_sd);
87 sysfs_put(sd);
88 return error;
89} 53}
90 54
91/** 55/**
92 * sysfs_create_link_sd - create symlink to a given object. 56 * sysfs_create_link_sd - create symlink to a given object.
93 * @sd: directory we're creating the link in. 57 * @kn: directory we're creating the link in.
94 * @target: object we're pointing to. 58 * @target: object we're pointing to.
95 * @name: name of the symlink. 59 * @name: name of the symlink.
96 */ 60 */
97int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, 61int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
98 const char *name) 62 const char *name)
99{ 63{
100 return sysfs_do_create_link_sd(sd, target, name, 1); 64 return sysfs_do_create_link_sd(kn, target, name, 1);
101} 65}
102 66
103static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, 67static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
104 const char *name, int warn) 68 const char *name, int warn)
105{ 69{
106 struct sysfs_dirent *parent_sd = NULL; 70 struct kernfs_node *parent = NULL;
107 71
108 if (!kobj) 72 if (!kobj)
109 parent_sd = &sysfs_root; 73 parent = sysfs_root_kn;
110 else 74 else
111 parent_sd = kobj->sd; 75 parent = kobj->sd;
112 76
113 if (!parent_sd) 77 if (!parent)
114 return -EFAULT; 78 return -EFAULT;
115 79
116 return sysfs_do_create_link_sd(parent_sd, target, name, warn); 80 return sysfs_do_create_link_sd(parent, target, name, warn);
117} 81}
118 82
119/** 83/**
@@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
164 * sysfs_remove_dir() for details. 128 * sysfs_remove_dir() for details.
165 */ 129 */
166 spin_lock(&sysfs_symlink_target_lock); 130 spin_lock(&sysfs_symlink_target_lock);
167 if (targ->sd && sysfs_ns_type(kobj->sd)) 131 if (targ->sd && kernfs_ns_enabled(kobj->sd))
168 ns = targ->sd->s_ns; 132 ns = targ->sd->ns;
169 spin_unlock(&sysfs_symlink_target_lock); 133 spin_unlock(&sysfs_symlink_target_lock);
170 sysfs_hash_and_remove(kobj->sd, name, ns); 134 kernfs_remove_by_name_ns(kobj->sd, name, ns);
171} 135}
172 136
173/** 137/**
@@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
177 */ 141 */
178void sysfs_remove_link(struct kobject *kobj, const char *name) 142void sysfs_remove_link(struct kobject *kobj, const char *name)
179{ 143{
180 struct sysfs_dirent *parent_sd = NULL; 144 struct kernfs_node *parent = NULL;
181 145
182 if (!kobj) 146 if (!kobj)
183 parent_sd = &sysfs_root; 147 parent = sysfs_root_kn;
184 else 148 else
185 parent_sd = kobj->sd; 149 parent = kobj->sd;
186 150
187 sysfs_hash_and_remove(parent_sd, name, NULL); 151 kernfs_remove_by_name(parent, name);
188} 152}
189EXPORT_SYMBOL_GPL(sysfs_remove_link); 153EXPORT_SYMBOL_GPL(sysfs_remove_link);
190 154
@@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link);
201int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, 165int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
202 const char *old, const char *new, const void *new_ns) 166 const char *old, const char *new, const void *new_ns)
203{ 167{
204 struct sysfs_dirent *parent_sd, *sd = NULL; 168 struct kernfs_node *parent, *kn = NULL;
205 const void *old_ns = NULL; 169 const void *old_ns = NULL;
206 int result; 170 int result;
207 171
208 if (!kobj) 172 if (!kobj)
209 parent_sd = &sysfs_root; 173 parent = sysfs_root_kn;
210 else 174 else
211 parent_sd = kobj->sd; 175 parent = kobj->sd;
212 176
213 if (targ->sd) 177 if (targ->sd)
214 old_ns = targ->sd->s_ns; 178 old_ns = targ->sd->ns;
215 179
216 result = -ENOENT; 180 result = -ENOENT;
217 sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); 181 kn = kernfs_find_and_get_ns(parent, old, old_ns);
218 if (!sd) 182 if (!kn)
219 goto out; 183 goto out;
220 184
221 result = -EINVAL; 185 result = -EINVAL;
222 if (sysfs_type(sd) != SYSFS_KOBJ_LINK) 186 if (kernfs_type(kn) != KERNFS_LINK)
223 goto out; 187 goto out;
224 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 188 if (kn->symlink.target_kn->priv != targ)
225 goto out; 189 goto out;
226 190
227 result = sysfs_rename(sd, parent_sd, new, new_ns); 191 result = kernfs_rename_ns(kn, parent, new, new_ns);
228 192
229out: 193out:
230 sysfs_put(sd); 194 kernfs_put(kn);
231 return result; 195 return result;
232} 196}
233EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); 197EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
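The public entry points sit on top of these helpers; typical (hypothetical) driver usage pairs creation at bind time with removal at unbind time:

	/* /sys/.../<dev>/controller -> <ctrl>'s sysfs directory */
	err = sysfs_create_link(&dev->kobj, &ctrl->kobj, "controller");

	/* later, on teardown */
	sysfs_remove_link(&dev->kobj, "controller");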
234
235static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
236 struct sysfs_dirent *target_sd, char *path)
237{
238 struct sysfs_dirent *base, *sd;
239 char *s = path;
240 int len = 0;
241
242 /* go up to the root, stop at the base */
243 base = parent_sd;
244 while (base->s_parent) {
245 sd = target_sd->s_parent;
246 while (sd->s_parent && base != sd)
247 sd = sd->s_parent;
248
249 if (base == sd)
250 break;
251
252 strcpy(s, "../");
253 s += 3;
254 base = base->s_parent;
255 }
256
257 /* determine end of target string for reverse fillup */
258 sd = target_sd;
259 while (sd->s_parent && sd != base) {
260 len += strlen(sd->s_name) + 1;
261 sd = sd->s_parent;
262 }
263
264 /* check limits */
265 if (len < 2)
266 return -EINVAL;
267 len--;
268 if ((s - path) + len > PATH_MAX)
269 return -ENAMETOOLONG;
270
271 /* reverse fillup of target string from target to base */
272 sd = target_sd;
273 while (sd->s_parent && sd != base) {
274 int slen = strlen(sd->s_name);
275
276 len -= slen;
277 strncpy(s + len, sd->s_name, slen);
278 if (len)
279 s[--len] = '/';
280
281 sd = sd->s_parent;
282 }
283
284 return 0;
285}
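Worked example for the function above (this logic now lives in kernfs): linking from parent /sys/devices/foo to target /sys/class/block/sda, the first loop walks up from foo emitting "../" per level until it reaches a common ancestor of the target (here the root, so "../../"), and the reverse fillup then writes the target components back-to-front, yielding "../../class/block/sda" in the buffer.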
286
287static int sysfs_getlink(struct dentry *dentry, char *path)
288{
289 struct sysfs_dirent *sd = dentry->d_fsdata;
290 struct sysfs_dirent *parent_sd = sd->s_parent;
291 struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
292 int error;
293
294 mutex_lock(&sysfs_mutex);
295 error = sysfs_get_target_path(parent_sd, target_sd, path);
296 mutex_unlock(&sysfs_mutex);
297
298 return error;
299}
300
301static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
302{
303 int error = -ENOMEM;
304 unsigned long page = get_zeroed_page(GFP_KERNEL);
305 if (page) {
306 error = sysfs_getlink(dentry, (char *) page);
307 if (error < 0)
308 free_page((unsigned long)page);
309 }
310 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
311 return NULL;
312}
313
314static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
315 void *cookie)
316{
317 char *page = nd_get_link(nd);
318 if (!IS_ERR(page))
319 free_page((unsigned long)page);
320}
321
322const struct inode_operations sysfs_symlink_inode_operations = {
323 .setxattr = sysfs_setxattr,
324 .readlink = generic_readlink,
325 .follow_link = sysfs_follow_link,
326 .put_link = sysfs_put_link,
327 .setattr = sysfs_setattr,
328 .getattr = sysfs_getattr,
329 .permission = sysfs_permission,
330};
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0af09fbfb3f6..0e2f1cccb812 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,248 +8,36 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#ifndef __SYSFS_INTERNAL_H
12#include <linux/kobject_ns.h> 12#define __SYSFS_INTERNAL_H
13#include <linux/fs.h>
14#include <linux/rbtree.h>
15 13
16struct sysfs_open_dirent; 14#include <linux/sysfs.h>
17
18/* type-specific structures for sysfs_dirent->s_* union members */
19struct sysfs_elem_dir {
20 struct kobject *kobj;
21
22 unsigned long subdirs;
23 /* children rbtree starts here and goes through sd->s_rb */
24 struct rb_root children;
25};
26
27struct sysfs_elem_symlink {
28 struct sysfs_dirent *target_sd;
29};
30
31struct sysfs_elem_attr {
32 union {
33 struct attribute *attr;
34 struct bin_attribute *bin_attr;
35 };
36 struct sysfs_open_dirent *open;
37};
38
39struct sysfs_inode_attrs {
40 struct iattr ia_iattr;
41 void *ia_secdata;
42 u32 ia_secdata_len;
43};
44
45/*
46 * sysfs_dirent - the building block of sysfs hierarchy. Each and
47 * every sysfs node is represented by single sysfs_dirent.
48 *
49 * As long as s_count reference is held, the sysfs_dirent itself is
50 * accessible. Dereferencing s_elem or any other outer entity
51 * requires s_active reference.
52 */
53struct sysfs_dirent {
54 atomic_t s_count;
55 atomic_t s_active;
56#ifdef CONFIG_DEBUG_LOCK_ALLOC
57 struct lockdep_map dep_map;
58#endif
59 struct sysfs_dirent *s_parent;
60 const char *s_name;
61
62 struct rb_node s_rb;
63
64 union {
65 struct completion *completion;
66 struct sysfs_dirent *removed_list;
67 } u;
68
69 const void *s_ns; /* namespace tag */
70 unsigned int s_hash; /* ns + name hash */
71 union {
72 struct sysfs_elem_dir s_dir;
73 struct sysfs_elem_symlink s_symlink;
74 struct sysfs_elem_attr s_attr;
75 };
76
77 unsigned short s_flags;
78 umode_t s_mode;
79 unsigned int s_ino;
80 struct sysfs_inode_attrs *s_iattr;
81};
82
83#define SD_DEACTIVATED_BIAS INT_MIN
84
85#define SYSFS_TYPE_MASK 0x00ff
86#define SYSFS_DIR 0x0001
87#define SYSFS_KOBJ_ATTR 0x0002
88#define SYSFS_KOBJ_BIN_ATTR 0x0004
89#define SYSFS_KOBJ_LINK 0x0008
90#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
91#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
92
93/* identify any namespace tag on sysfs_dirents */
94#define SYSFS_NS_TYPE_MASK 0xf00
95#define SYSFS_NS_TYPE_SHIFT 8
96
97#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
98#define SYSFS_FLAG_REMOVED 0x02000
99
100static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
101{
102 return sd->s_flags & SYSFS_TYPE_MASK;
103}
104
105/*
106 * Return any namespace tags on this dirent.
107 * enum kobj_ns_type is defined in linux/kobject.h
108 */
-static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
-{
-	return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-#define sysfs_dirent_init_lockdep(sd)				\
-do {								\
-	struct attribute *attr = sd->s_attr.attr;		\
-	struct lock_class_key *key = attr->key;			\
-	if (!key)						\
-		key = &attr->skey;				\
-								\
-	lockdep_init_map(&sd->dep_map, "s_active", key, 0);	\
-} while (0)
-
-/* Test for attributes that want to ignore lockdep for read-locking */
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	int type = sysfs_type(sd);
-
-	return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
-		sd->s_attr.attr->ignore_lockdep;
-}
-
-#else
-
-#define sysfs_dirent_init_lockdep(sd) do {} while (0)
-
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return true;
-}
-
-#endif
-
-/*
- * Context structure to be used while adding/removing nodes.
- */
-struct sysfs_addrm_cxt {
-	struct sysfs_dirent	*removed;
-};
 
 /*
  * mount.c
  */
-
-/*
- * Each sb is associated with a set of namespace tags (i.e.
- * the network namespace of the task which mounted this sysfs
- * instance).
- */
-struct sysfs_super_info {
-	void *ns[KOBJ_NS_TYPES];
-};
-#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
-extern struct sysfs_dirent sysfs_root;
-extern struct kmem_cache *sysfs_dir_cachep;
+extern struct kernfs_node *sysfs_root_kn;
 
 /*
  * dir.c
  */
-extern struct mutex sysfs_mutex;
 extern spinlock_t sysfs_symlink_target_lock;
-extern const struct dentry_operations sysfs_dentry_ops;
-
-extern const struct file_operations sysfs_dir_operations;
-extern const struct inode_operations sysfs_dir_inode_operations;
 
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-void sysfs_put_active(struct sysfs_dirent *sd);
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
-void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		    struct sysfs_dirent *parent_sd);
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		  struct sysfs_dirent *parent_sd);
-void sysfs_remove(struct sysfs_dirent *sd);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns);
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
-
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const unsigned char *name,
-				       const void *ns);
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
-
-void release_sysfs_dirent(struct sysfs_dirent *sd);
-
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
-			struct sysfs_dirent **p_sd);
-
-int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const char *new_name, const void *new_ns);
-
-static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		WARN_ON(!atomic_read(&sd->s_count));
-		atomic_inc(&sd->s_count);
-	}
-	return sd;
-}
-#define sysfs_get(sd) __sysfs_get(sd)
-
-static inline void __sysfs_put(struct sysfs_dirent *sd)
-{
-	if (sd && atomic_dec_and_test(&sd->s_count))
-		release_sysfs_dirent(sd);
-}
-#define sysfs_put(sd) __sysfs_put(sd)
-
-/*
- * inode.c
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
-void sysfs_evict_inode(struct inode *inode);
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat);
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags);
-int sysfs_inode_init(void);
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
 
 /*
  * file.c
  */
-extern const struct file_operations sysfs_file_operations;
-extern const struct file_operations sysfs_bin_operations;
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd,
-		   const struct attribute *attr, int type);
-
-int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-			   const struct attribute *attr, int type,
+int sysfs_add_file(struct kernfs_node *parent,
+		   const struct attribute *attr, bool is_bin);
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
 			   umode_t amode, const void *ns);
-void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
 
 /*
  * symlink.c
  */
-extern const struct inode_operations sysfs_symlink_inode_operations;
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
 			 const char *name);
+
+#endif	/* __SYSFS_INTERNAL_H */
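The removed __sysfs_get()/__sysfs_put() helpers above are a textbook atomic reference count: take a reference only on a live object, free storage when the last reference drops. A minimal userspace sketch of the same idiom follows; the names and types here are illustrative, not the kernel API.

#include <stdatomic.h>
#include <stdlib.h>

struct node {
	atomic_int count;		/* starts at 1 for the creator */
	char name[32];
};

static struct node *node_get(struct node *n)
{
	if (n) {
		/* taking a reference on a dead object is a bug */
		if (atomic_load(&n->count) == 0)
			abort();
		atomic_fetch_add(&n->count, 1);
	}
	return n;
}

static void node_put(struct node *n)
{
	/* fetch_sub returns the old value; 1 means we were last */
	if (n && atomic_fetch_sub(&n->count, 1) == 1)
		free(n);
}

int main(void)
{
	struct node *n = calloc(1, sizeof(*n));

	if (!n)
		return 1;
	atomic_init(&n->count, 1);
	node_get(n);	/* second reference */
	node_put(n);	/* drop it again */
	node_put(n);	/* creator's reference; frees n */
	return 0;
}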
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 5f6fc17d6bc5..9737cba1357d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	else
 		udf_truncate_tail_extent(inode);
 	mark_inode_dirty(inode);
+	up_write(&iinfo->i_data_sem);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (!fi)
@@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
 	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 		mark_inode_dirty(dir);
-	up_write(&iinfo->i_data_sem);
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 71c8c9d2b882..a26739451b53 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1217,7 +1217,7 @@ __xfs_get_blocks(
 		lockmode = XFS_ILOCK_EXCL;
 		xfs_ilock(ip, lockmode);
 	} else {
-		lockmode = xfs_ilock_map_shared(ip);
+		lockmode = xfs_ilock_data_map_shared(ip);
 	}
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b86127072ac3..01b6a0102fbd 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -164,6 +164,7 @@ xfs_attr_get(
 {
 	int			error;
 	struct xfs_name		xname;
+	uint			lock_mode;
 
 	XFS_STATS_INC(xs_attr_get);
 
@@ -174,9 +175,9 @@ xfs_attr_get(
 	if (error)
 		return error;
 
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	lock_mode = xfs_ilock_attr_map_shared(ip);
 	error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	xfs_iunlock(ip, lock_mode);
 	return(error);
 }
 
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d174b128153..01db96f60cf0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -507,17 +507,17 @@ xfs_attr_list_int(
 {
 	int		error;
 	xfs_inode_t	*dp = context->dp;
+	uint		lock_mode;
 
 	XFS_STATS_INC(xs_attr_list);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return EIO;
 
-	xfs_ilock(dp, XFS_ILOCK_SHARED);
-
 	/*
 	 * Decide on what work routines to call based on the inode size.
 	 */
+	lock_mode = xfs_ilock_attr_map_shared(dp);
 	if (!xfs_inode_hasattr(dp)) {
 		error = 0;
 	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
@@ -527,9 +527,7 @@ xfs_attr_list_int(
 	} else {
 		error = xfs_attr_node_list(context);
 	}
-
-	xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
+	xfs_iunlock(dp, lock_mode);
 	return error;
 }
 
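The helpers introduced by this series, such as xfs_ilock_attr_map_shared(), return the lock mode they actually took, and that value is what the caller hands back to xfs_iunlock(). A small standalone sketch of the idiom, under invented names:

#include <stdbool.h>

enum lock_mode { LOCK_SHARED, LOCK_EXCL };

struct inode_like {
	bool need_map_read;	/* extent map not yet in memory */
};

static void take_lock(struct inode_like *ip, enum lock_mode m)
{
	(void)ip; (void)m;	/* real locking elided in this sketch */
}

static void drop_lock(struct inode_like *ip, enum lock_mode m)
{
	(void)ip; (void)m;
}

/* take exclusive when the map may have to be read in; return the
 * mode that was chosen so the unlock can match it exactly */
static enum lock_mode ilock_attr_map_shared(struct inode_like *ip)
{
	enum lock_mode m = ip->need_map_read ? LOCK_EXCL : LOCK_SHARED;

	take_lock(ip, m);
	return m;
}

int main(void)
{
	struct inode_like ip = { .need_map_read = true };
	enum lock_mode m = ilock_attr_map_shared(&ip);

	/* ... inspect attributes ... */
	drop_lock(&ip, m);	/* unlock with the mode we were given */
	return 0;
}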
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 739e0a52deda..5549d69ddb45 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -110,7 +110,7 @@ xfs_attr3_rmt_verify(
 	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
 		return false;
 	if (be32_to_cpu(rmt->rm_offset) +
-	    be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
+	    be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
 		return false;
 	if (rmt->rm_owner == 0)
 		return false;
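The one-character fix above (>= tightened to >) is an off-by-one in a bounds check: an attribute that ends exactly at XATTR_SIZE_MAX is legal and was being rejected by the verifier. The same check in isolation, where LIMIT is a stand-in; a full check would also guard against off + len wrapping around:

#include <assert.h>

#define LIMIT 65536u	/* stand-in for XATTR_SIZE_MAX */

/* valid when the region ends at or before the limit */
static int in_bounds(unsigned int off, unsigned int len)
{
	return off + len <= LIMIT;	/* equivalently: !(off + len > LIMIT) */
}

int main(void)
{
	assert(in_bounds(LIMIT - 4, 4));	/* ends exactly at LIMIT: ok */
	assert(!in_bounds(LIMIT - 4, 5));	/* one byte past: rejected */
	return 0;
}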
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3ef11b22e750..152543c4ca70 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1635,7 +1635,7 @@ xfs_bmap_last_extent(
  * blocks at the end of the file which do not start at the previous data block,
  * we will try to align the new blocks at stripe unit boundaries.
  *
- * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
  * at, or past the EOF.
  */
 STATIC int
@@ -1650,9 +1650,14 @@ xfs_bmap_isaeof(
 	bma->aeof = 0;
 	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
 				     &is_empty);
-	if (error || is_empty)
+	if (error)
 		return error;
 
+	if (is_empty) {
+		bma->aeof = 1;
+		return 0;
+	}
+
 	/*
 	 * Check if we are allocation or past the last extent, or at least into
 	 * the last delayed allocated extent.
@@ -3643,10 +3648,19 @@ xfs_bmap_btalloc(
 	int		isaligned;
 	int		tryagain;
 	int		error;
+	int		stripe_align;
 
 	ASSERT(ap->length);
 
 	mp = ap->ip->i_mount;
+
+	/* stripe alignment for allocation is determined by mount parameters */
+	stripe_align = 0;
+	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+		stripe_align = mp->m_swidth;
+	else if (mp->m_dalign)
+		stripe_align = mp->m_dalign;
+
 	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
 	if (unlikely(align)) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3655,6 +3669,8 @@ xfs_bmap_btalloc(
 		ASSERT(!error);
 		ASSERT(ap->length);
 	}
+
+
 	nullfb = *ap->firstblock == NULLFSBLOCK;
 	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	if (nullfb) {
@@ -3730,7 +3746,7 @@ xfs_bmap_btalloc(
 	 */
 	if (!ap->flist->xbf_low && ap->aeof) {
 		if (!ap->offset) {
-			args.alignment = mp->m_dalign;
+			args.alignment = stripe_align;
 			atype = args.type;
 			isaligned = 1;
 			/*
@@ -3755,13 +3771,13 @@ xfs_bmap_btalloc(
 			 * of minlen+alignment+slop doesn't go up
 			 * between the calls.
 			 */
-			if (blen > mp->m_dalign && blen <= args.maxlen)
-				nextminlen = blen - mp->m_dalign;
+			if (blen > stripe_align && blen <= args.maxlen)
+				nextminlen = blen - stripe_align;
 			else
 				nextminlen = args.minlen;
-			if (nextminlen + mp->m_dalign > args.minlen + 1)
+			if (nextminlen + stripe_align > args.minlen + 1)
 				args.minalignslop =
-					nextminlen + mp->m_dalign -
+					nextminlen + stripe_align -
 					args.minlen - 1;
 			else
 				args.minalignslop = 0;
@@ -3783,7 +3799,7 @@ xfs_bmap_btalloc(
 			 */
 			args.type = atype;
 			args.fsbno = ap->blkno;
-			args.alignment = mp->m_dalign;
+			args.alignment = stripe_align;
 			args.minlen = nextminlen;
 			args.minalignslop = 0;
 			isaligned = 1;
@@ -3997,6 +4013,7 @@ xfs_bmapi_read(
 	ASSERT(*nmap >= 1);
 	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
 			   XFS_BMAPI_IGSTATE)));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4191,6 +4208,7 @@ xfs_bmapi_delay(
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
 	ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
@@ -4484,6 +4502,7 @@ xfs_bmapi_write(
 	ASSERT(tp != NULL);
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5035,6 +5054,7 @@ xfs_bunmapi(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(len > 0);
 	ASSERT(nexts >= 0);
 
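The xfs_bmap.c hunks above replace hard-coded uses of mp->m_dalign with a stripe_align value chosen once per allocation: the full stripe width when the swalloc mount option is set, otherwise the stripe unit. That selection logic, lifted into a standalone hedged sketch (the struct is illustrative, not the real xfs_mount):

#include <stdio.h>

struct mount_like {
	unsigned int swidth;	/* stripe width, in fs blocks */
	unsigned int dalign;	/* stripe unit, in fs blocks */
	int	     swalloc;	/* stand-in for XFS_MOUNT_SWALLOC */
};

/* prefer the full stripe width when swalloc is set, else the unit */
static unsigned int pick_stripe_align(const struct mount_like *mp)
{
	if (mp->swidth && mp->swalloc)
		return mp->swidth;
	if (mp->dalign)
		return mp->dalign;
	return 0;		/* no alignment constraint */
}

int main(void)
{
	struct mount_like mp = { .swidth = 64, .dalign = 16, .swalloc = 1 };

	printf("align=%u\n", pick_stripe_align(&mp));	/* prints 64 */
	return 0;
}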
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5887e41c0323..f264616080ca 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -287,6 +287,7 @@ xfs_bmapi_allocate(
 	INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
 	queue_work(xfs_alloc_wq, &args->work);
 	wait_for_completion(&done);
+	destroy_work_on_stack(&args->work);
 	return args->result;
 }
 
@@ -617,22 +618,27 @@ xfs_getbmap(
 		return XFS_ERROR(ENOMEM);
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
-		if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
+	if (whichfork == XFS_DATA_FORK) {
+		if (!(iflags & BMV_IF_DELALLOC) &&
+		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
 			error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
 			if (error)
 				goto out_unlock_iolock;
+
+			/*
+			 * Even after flushing the inode, there can still be
+			 * delalloc blocks on the inode beyond EOF due to
+			 * speculative preallocation. These are not removed
+			 * until the release function is called or the inode
+			 * is inactivated. Hence we cannot assert here that
+			 * ip->i_delayed_blks == 0.
+			 */
 		}
-		/*
-		 * even after flushing the inode, there can still be delalloc
-		 * blocks on the inode beyond EOF due to speculative
-		 * preallocation. These are not removed until the release
-		 * function is called or the inode is inactivated. Hence we
-		 * cannot assert here that ip->i_delayed_blks == 0.
-		 */
-	}
 
-	lock = xfs_ilock_map_shared(ip);
+		lock = xfs_ilock_data_map_shared(ip);
+	} else {
+		lock = xfs_ilock_attr_map_shared(ip);
+	}
 
 	/*
 	 * Don't let nex be bigger than the number of extents
@@ -737,7 +743,7 @@ xfs_getbmap(
 out_free_map:
 	kmem_free(map);
 out_unlock_ilock:
-	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, lock);
 out_unlock_iolock:
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
@@ -1168,9 +1174,15 @@ xfs_zero_remaining_bytes(
 	xfs_buf_unlock(bp);
 
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+		uint lock_mode;
+
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
+
+		lock_mode = xfs_ilock_data_map_shared(ip);
 		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+		xfs_iunlock(ip, lock_mode);
+
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -1187,7 +1199,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNWRITE(bp);
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
@@ -1200,7 +1217,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNDONE(bp);
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c7f0b77dcb00..9fccfb594291 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -698,7 +698,11 @@ xfs_buf_read_uncached(
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
-	xfsbdstrat(target->bt_mount, bp);
+	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+		xfs_buf_relse(bp);
+		return NULL;
+	}
+	xfs_buf_iorequest(bp);
 	xfs_buf_iowait(bp);
 	return bp;
 }
@@ -1089,7 +1093,7 @@ xfs_bioerror(
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
-STATIC int
+int
 xfs_bioerror_relse(
 	struct xfs_buf	*bp)
 {
@@ -1152,7 +1156,7 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));
 
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
 
 	xfs_bdstrat_cb(bp);
 
@@ -1164,25 +1168,6 @@ xfs_bwrite(
 	return error;
 }
 
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		xfs_bioerror_relse(bp);
-		return;
-	}
-
-	xfs_buf_iorequest(bp);
-}
-
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t	*bp,
@@ -1516,6 +1501,12 @@ xfs_wait_buftarg(
 			struct xfs_buf *bp;
 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 			list_del_init(&bp->b_lru);
+			if (bp->b_flags & XBF_WRITE_FAIL) {
+				xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+					(long long)bp->b_bn);
+			}
 			xfs_buf_rele(bp);
 		}
 		if (loop++ != 0)
@@ -1602,12 +1593,11 @@ xfs_free_buftarg(
 	kmem_free(btp);
 }
 
-STATIC int
-xfs_setsize_buftarg_flags(
+int
+xfs_setsize_buftarg(
 	xfs_buftarg_t		*btp,
 	unsigned int		blocksize,
-	unsigned int		sectorsize,
-	int			verbose)
+	unsigned int		sectorsize)
 {
 	btp->bt_bsize = blocksize;
 	btp->bt_sshift = ffs(sectorsize) - 1;
@@ -1628,26 +1618,17 @@ xfs_setsize_buftarg_flags(
 }
 
 /*
  * When allocating the initial buffer target we have not yet
  * read in the superblock, so don't know what sized sectors
  * are being used at this early stage.  Play safe.
  */
 STATIC int
 xfs_setsize_buftarg_early(
 	xfs_buftarg_t		*btp,
 	struct block_device	*bdev)
 {
-	return xfs_setsize_buftarg_flags(btp,
-			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
-	xfs_buftarg_t		*btp,
-	unsigned int		blocksize,
-	unsigned int		sectorsize)
-{
-	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+	return xfs_setsize_buftarg(btp, PAGE_SIZE,
+				   bdev_logical_block_size(bdev));
 }
 
 xfs_buftarg_t *
@@ -1799,7 +1780,7 @@ __xfs_buf_delwri_submit(
 
 	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, io_list, b_list) {
-		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
 		bp->b_flags |= XBF_WRITE;
 
 		if (!wait) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e65683361017..1cf21a4a9f22 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -45,6 +45,7 @@ typedef enum {
 #define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL	 (1 << 24)/* async writes have failed on this buffer */
 
 /* I/O hints for the BIO layer */
 #define XBF_SYNCIO	 (1 << 10)/* treat this buffer as synchronous I/O */
@@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_ASYNC,		"ASYNC" }, \
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
+	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
 	{ XBF_SYNCIO,		"SYNCIO" }, \
 	{ XBF_FUA,		"FUA" }, \
 	{ XBF_FLUSH,		"FLUSH" }, \
@@ -80,6 +82,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	{ _XBF_COMPOUND,	"COMPOUND" }
 
+
 /*
  * Internal state flags.
  */
@@ -269,9 +272,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_buf *bp);
-
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-
 extern void xfs_buf_ioend(xfs_buf_t *, int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
@@ -282,6 +282,8 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 #define xfs_buf_zero(bp, off, len) \
 	xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
+extern int xfs_bioerror_relse(struct xfs_buf *);
+
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
 	return bp ? bp->b_error : ENOMEM;
@@ -301,7 +303,8 @@ extern void xfs_buf_terminate(void);
 
 #define XFS_BUF_ZEROFLAGS(bp) \
 	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
-			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
+			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+			    XBF_WRITE_FAIL))
 
 void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
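XBF_WRITE_FAIL is threaded through three places in xfs_buf.h: the flag definition, the flag-name table used by tracing, and the state-clearing mask. A generic sketch of keeping a bit flag and its printable name in sync, with invented names:

#include <stdio.h>

#define FL_READ		(1u << 0)
#define FL_WRITE	(1u << 1)
#define FL_WRITE_FAIL	(1u << 24)	/* async writes failed */

struct flag_name {
	unsigned int	bit;
	const char	*name;
};

/* every new flag needs a matching entry, as in XFS_BUF_FLAGS */
static const struct flag_name flag_names[] = {
	{ FL_READ,	 "READ" },
	{ FL_WRITE,	 "WRITE" },
	{ FL_WRITE_FAIL, "WRITE_FAIL" },
};

static void print_flags(unsigned int flags)
{
	for (size_t i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++)
		if (flags & flag_names[i].bit)
			printf("%s ", flag_names[i].name);
	printf("\n");
}

int main(void)
{
	print_flags(FL_WRITE | FL_WRITE_FAIL);	/* "WRITE WRITE_FAIL" */
	return 0;
}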
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a64f67ba25d3..33149113e333 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -182,21 +182,47 @@ xfs_buf_item_size(
 	trace_xfs_buf_item_size(bip);
 }
 
-static struct xfs_log_iovec *
+static inline void
+xfs_buf_item_copy_iovec(
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp,
+	struct xfs_buf		*bp,
+	uint			offset,
+	int			first_bit,
+	uint			nbits)
+{
+	offset += first_bit * XFS_BLF_CHUNK;
+	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
+			xfs_buf_offset(bp, offset),
+			nbits * XFS_BLF_CHUNK);
+}
+
+static inline bool
+xfs_buf_item_straddle(
+	struct xfs_buf		*bp,
+	uint			offset,
+	int			next_bit,
+	int			last_bit)
+{
+	return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
+		(xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
+		 XFS_BLF_CHUNK);
+}
+
+static void
 xfs_buf_item_format_segment(
 	struct xfs_buf_log_item	*bip,
-	struct xfs_log_iovec	*vecp,
+	struct xfs_log_vec	*lv,
+	struct xfs_log_iovec	**vecp,
 	uint			offset,
 	struct xfs_buf_log_format *blfp)
 {
 	struct xfs_buf		*bp = bip->bli_buf;
 	uint			base_size;
-	uint			nvecs;
 	int			first_bit;
 	int			last_bit;
 	int			next_bit;
 	uint			nbits;
-	uint			buffer_offset;
 
 	/* copy the flags across from the base format item */
 	blfp->blf_flags = bip->__bli_format.blf_flags;
@@ -208,21 +234,17 @@ xfs_buf_item_format_segment(
 	 */
 	base_size = xfs_buf_log_format_size(blfp);
 
-	nvecs = 0;
 	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
 	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
 		/*
 		 * If the map is not be dirty in the transaction, mark
 		 * the size as zero and do not advance the vector pointer.
 		 */
-		goto out;
+		return;
 	}
 
-	vecp->i_addr = blfp;
-	vecp->i_len = base_size;
-	vecp->i_type = XLOG_REG_TYPE_BFORMAT;
-	vecp++;
-	nvecs = 1;
+	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
+	blfp->blf_size = 1;
 
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		/*
@@ -232,14 +254,13 @@ xfs_buf_item_format_segment(
 		 */
 		trace_xfs_buf_item_format_stale(bip);
 		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
-		goto out;
+		return;
 	}
 
 
 	/*
 	 * Fill in an iovec for each set of contiguous chunks.
 	 */
-
 	last_bit = first_bit;
 	nbits = 1;
 	for (;;) {
@@ -252,42 +273,22 @@ xfs_buf_item_format_segment(
 		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
 					(uint)last_bit + 1);
 		/*
-		 * If we run out of bits fill in the last iovec and get
-		 * out of the loop.
-		 * Else if we start a new set of bits then fill in the
-		 * iovec for the series we were looking at and start
-		 * counting the bits in the new one.
-		 * Else we're still in the same set of bits so just
-		 * keep counting and scanning.
+		 * If we run out of bits fill in the last iovec and get out of
+		 * the loop.  Else if we start a new set of bits then fill in
+		 * the iovec for the series we were looking at and start
+		 * counting the bits in the new one.  Else we're still in the
+		 * same set of bits so just keep counting and scanning.
 		 */
 		if (next_bit == -1) {
-			buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
-			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLF_CHUNK;
-			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-			nvecs++;
+			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+						first_bit, nbits);
+			blfp->blf_size++;
 			break;
-		} else if (next_bit != last_bit + 1) {
-			buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
-			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLF_CHUNK;
-			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-			nvecs++;
-			vecp++;
-			first_bit = next_bit;
-			last_bit = next_bit;
-			nbits = 1;
-		} else if (xfs_buf_offset(bp, offset +
-					  (next_bit << XFS_BLF_SHIFT)) !=
-			   (xfs_buf_offset(bp, offset +
-					   (last_bit << XFS_BLF_SHIFT)) +
-			    XFS_BLF_CHUNK)) {
-			buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
-			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLF_CHUNK;
-			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-			nvecs++;
-			vecp++;
+		} else if (next_bit != last_bit + 1 ||
+			   xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
+			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+						first_bit, nbits);
+			blfp->blf_size++;
 			first_bit = next_bit;
 			last_bit = next_bit;
 			nbits = 1;
@@ -296,9 +297,6 @@ xfs_buf_item_format_segment(
 			nbits++;
 		}
 	}
-out:
-	blfp->blf_size = nvecs;
-	return vecp;
 }
 
 /*
@@ -310,10 +308,11 @@ out:
 STATIC void
 xfs_buf_item_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*vecp)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 	struct xfs_buf		*bp = bip->bli_buf;
+	struct xfs_log_iovec	*vecp = NULL;
 	uint			offset = 0;
 	int			i;
 
@@ -354,8 +353,8 @@ xfs_buf_item_format(
 	}
 
 	for (i = 0; i < bip->bli_format_count; i++) {
-		vecp = xfs_buf_item_format_segment(bip, vecp, offset,
-						&bip->bli_formats[i]);
+		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+					    &bip->bli_formats[i]);
 		offset += bp->b_maps[i].bm_len;
 	}
 
@@ -496,6 +495,14 @@ xfs_buf_item_unpin(
 	}
 }
 
+/*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
+ */
+
+DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
 STATIC uint
 xfs_buf_item_push(
 	struct xfs_log_item	*lip,
@@ -524,6 +531,14 @@ xfs_buf_item_push(
 
 	trace_xfs_buf_item_push(bip);
 
+	/* has a previous flush failed due to IO errors? */
+	if ((bp->b_flags & XBF_WRITE_FAIL) &&
+	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+		xfs_warn(bp->b_target->bt_mount,
+"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+			 (long long)bp->b_bn);
+	}
+
 	if (!xfs_buf_delwri_queue(bp, buffer_list))
 		rval = XFS_ITEM_FLUSHING;
 	xfs_buf_unlock(bp);
@@ -1096,8 +1111,9 @@ xfs_buf_iodone_callbacks(
 
 	xfs_buf_ioerror(bp, 0);	/* errno of 0 unsets the flag */
 
-	if (!XFS_BUF_ISSTALE(bp)) {
-		bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+	if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+			       XBF_DONE | XBF_WRITE_FAIL;
 		xfs_buf_iorequest(bp);
 	} else {
 		xfs_buf_relse(bp);
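The new warning in xfs_buf_item_push() is gated by DEFINE_RATELIMIT_STATE(..., 30 * HZ, 10), i.e. at most 10 messages per 30-second window. A userspace approximation of that windowed rate limiter, illustrative only and not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit {
	time_t window_start;
	int interval;	/* seconds per window */
	int burst;	/* messages allowed per window */
	int emitted;	/* messages emitted in the current window */
};

static bool ratelimit_ok(struct ratelimit *rl)
{
	time_t now = time(NULL);

	if (now - rl->window_start >= rl->interval) {
		rl->window_start = now;	/* start a fresh window */
		rl->emitted = 0;
	}
	return rl->emitted++ < rl->burst;
}

int main(void)
{
	struct ratelimit rl = { .interval = 30, .burst = 10 };

	for (int i = 0; i < 100; i++)
		if (ratelimit_ok(&rl))
			printf("warning %d\n", i);	/* only 10 print */
	return 0;
}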
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 56369d4509d5..48c7d18f68c3 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -2067,12 +2067,12 @@ xfs_dir2_node_lookup(
  */
 int						/* error */
 xfs_dir2_node_removename(
-	xfs_da_args_t		*args)		/* operation arguments */
+	struct xfs_da_args	*args)		/* operation arguments */
 {
-	xfs_da_state_blk_t	*blk;		/* leaf block */
+	struct xfs_da_state_blk	*blk;		/* leaf block */
 	int			error;		/* error return value */
 	int			rval;		/* operation return value */
-	xfs_da_state_t		*state;		/* btree cursor */
+	struct xfs_da_state	*state;		/* btree cursor */
 
 	trace_xfs_dir2_node_removename(args);
 
@@ -2084,19 +2084,18 @@ xfs_dir2_node_removename(
 	state->mp = args->dp->i_mount;
 	state->blocksize = state->mp->m_dirblksize;
 	state->node_ents = state->mp->m_dir_node_ents;
-	/*
-	 * Look up the entry we're deleting, set up the cursor.
-	 */
+
+	/* Look up the entry we're deleting, set up the cursor. */
 	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
-		rval = error;
-	/*
-	 * Didn't find it, upper layer screwed up.
-	 */
+		goto out_free;
+
+	/* Didn't find it, upper layer screwed up. */
 	if (rval != EEXIST) {
-		xfs_da_state_free(state);
-		return rval;
+		error = rval;
+		goto out_free;
 	}
+
 	blk = &state->path.blk[state->path.active - 1];
 	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
 	ASSERT(state->extravalid);
@@ -2107,7 +2106,7 @@ xfs_dir2_node_removename(
 	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
 				      &state->extrablk, &rval);
 	if (error)
-		return error;
+		goto out_free;
 	/*
 	 * Fix the hash values up the btree.
 	 */
@@ -2122,6 +2121,7 @@ xfs_dir2_node_removename(
 	 */
 	if (!error)
 		error = xfs_dir2_node_to_leaf(state);
+out_free:
 	xfs_da_state_free(state);
 	return error;
 }
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index c4e50c6ed584..aead369e1c30 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -674,6 +674,7 @@ xfs_readdir(
 {
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
+	uint		lock_mode;
 
 	trace_xfs_readdir(dp);
 
@@ -683,6 +684,7 @@ xfs_readdir(
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
 	XFS_STATS_INC(xs_dir_getdents);
 
+	lock_mode = xfs_ilock_data_map_shared(dp);
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_getdents(dp, ctx);
 	else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
@@ -691,5 +693,7 @@ xfs_readdir(
 		rval = xfs_dir2_block_getdents(dp, ctx);
 	else
 		rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
+	xfs_iunlock(dp, lock_mode);
+
 	return rval;
 }
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index aafc6e46cb58..3725fb1b902b 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -170,6 +170,7 @@ xfs_dir2_block_to_sf(
 	char			*ptr;		/* current data pointer */
 	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
 	xfs_dir2_sf_hdr_t	*sfp;		/* shortform directory header */
+	xfs_dir2_sf_hdr_t	*dst;		/* temporary data buffer */
 
 	trace_xfs_dir2_block_to_sf(args);
 
@@ -177,35 +178,20 @@ xfs_dir2_block_to_sf(
 	mp = dp->i_mount;
 
 	/*
-	 * Make a copy of the block data, so we can shrink the inode
-	 * and add local data.
+	 * allocate a temporary destination buffer the size of the inode
+	 * to format the data into. Once we have formatted the data, we
+	 * can free the block and copy the formatted data into the inode literal
+	 * area.
 	 */
-	hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
-	memcpy(hdr, bp->b_addr, mp->m_dirblksize);
-	logflags = XFS_ILOG_CORE;
-	if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
-		ASSERT(error != ENOSPC);
-		goto out;
-	}
+	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+	hdr = bp->b_addr;
 
 	/*
-	 * The buffer is now unconditionally gone, whether
-	 * xfs_dir2_shrink_inode worked or not.
-	 *
-	 * Convert the inode to local format.
-	 */
-	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
-	dp->i_df.if_flags |= XFS_IFINLINE;
-	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-	ASSERT(dp->i_df.if_bytes == 0);
-	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-	logflags |= XFS_ILOG_DDATA;
-	/*
 	 * Copy the header into the newly allocate local space.
 	 */
-	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	sfp = (xfs_dir2_sf_hdr_t *)dst;
 	memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
-	dp->i_d.di_size = size;
+
 	/*
 	 * Set up to loop over the block's entries.
 	 */
@@ -258,10 +244,34 @@ xfs_dir2_block_to_sf(
 		ptr += dp->d_ops->data_entsize(dep->namelen);
 	}
 	ASSERT((char *)sfep - (char *)sfp == size);
+
+	/* now we are done with the block, we can shrink the inode */
+	logflags = XFS_ILOG_CORE;
+	error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp);
+	if (error) {
+		ASSERT(error != ENOSPC);
+		goto out;
+	}
+
+	/*
+	 * The buffer is now unconditionally gone, whether
+	 * xfs_dir2_shrink_inode worked or not.
+	 *
+	 * Convert the inode to local format and copy the data in.
+	 */
+	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+	dp->i_df.if_flags |= XFS_IFINLINE;
+	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+	logflags |= XFS_ILOG_DDATA;
+	memcpy(dp->i_df.if_u1.if_data, dst, size);
+	dp->i_d.di_size = size;
 	xfs_dir2_sf_check(args);
 out:
 	xfs_trans_log_inode(args->trans, dp, logflags);
-	kmem_free(hdr);
+	kmem_free(dst);
 	return error;
 }
 
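xfs_dir2_block_to_sf() now formats the short-form directory into a scratch buffer first, frees the directory block, and only then installs the result in the inode, so a failed shrink no longer leaves a half-converted inode behind. A hedged sketch of that reordering with hypothetical names:

#include <stdlib.h>
#include <string.h>

static int shrink_source(void) { return 0; }	/* may fail */

static int convert(char *inode_area, const char *block, size_t size)
{
	char *dst = malloc(size);
	int error;

	if (!dst)
		return -1;
	memcpy(dst, block, size);	/* 1. format into scratch space */

	error = shrink_source();	/* 2. give back the old block */
	if (error)
		goto out;		/* inode never touched on failure */

	memcpy(inode_area, dst, size);	/* 3. install the new format */
out:
	free(dst);
	return error;
}

int main(void)
{
	char inode_area[16], block[16] = "dir-entries";

	return convert(inode_area, block, sizeof(block));
}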
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 6b1e695caf0e..7aeb4c895b32 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -469,16 +469,17 @@ xfs_qm_dqtobp(
 	struct xfs_mount	*mp = dqp->q_mount;
 	xfs_dqid_t		id = be32_to_cpu(dqp->q_core.d_id);
 	struct xfs_trans	*tp = (tpp ? *tpp : NULL);
+	uint			lock_mode;
 
 	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
-	xfs_ilock(quotip, XFS_ILOCK_SHARED);
+	lock_mode = xfs_ilock_data_map_shared(quotip);
 	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		/*
 		 * Return if this type of quotas is turned off while we
 		 * didn't have the quota inode lock.
 		 */
-		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		xfs_iunlock(quotip, lock_mode);
 		return ESRCH;
 	}
 
@@ -488,7 +489,7 @@ xfs_qm_dqtobp(
 	error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
 			       XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
 
-	xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+	xfs_iunlock(quotip, lock_mode);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 92e5f62eefc6..f33fbaaa4d8a 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -57,20 +57,24 @@ xfs_qm_dquot_logitem_size(
 STATIC void
 xfs_qm_dquot_logitem_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*logvec)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
-
-	logvec->i_addr = &qlip->qli_format;
-	logvec->i_len  = sizeof(xfs_dq_logformat_t);
-	logvec->i_type = XLOG_REG_TYPE_QFORMAT;
-	logvec++;
-	logvec->i_addr = &qlip->qli_dquot->q_core;
-	logvec->i_len  = sizeof(xfs_disk_dquot_t);
-	logvec->i_type = XLOG_REG_TYPE_DQUOT;
-
-	qlip->qli_format.qlf_size = 2;
-
+	struct xfs_log_iovec	*vecp = NULL;
+	struct xfs_dq_logformat *qlf;
+
+	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
+	qlf->qlf_type = XFS_LI_DQUOT;
+	qlf->qlf_size = 2;
+	qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+	qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
+	qlf->qlf_len = 1;
+	qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
+	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
+			&qlip->qli_dquot->q_core,
+			sizeof(struct xfs_disk_dquot));
 }
 
 /*
@@ -257,18 +261,6 @@ xfs_qm_dquot_logitem_init(
 	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
 					&xfs_dquot_item_ops);
 	lp->qli_dquot = dqp;
-	lp->qli_format.qlf_type = XFS_LI_DQUOT;
-	lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
-	lp->qli_format.qlf_blkno = dqp->q_blkno;
-	lp->qli_format.qlf_len = 1;
-	/*
-	 * This is just the offset of this dquot within its buffer
-	 * (which is currently 1 FSB and probably won't change).
-	 * Hence 32 bits for this offset should be just fine.
-	 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
-	 * here, and recompute it at recovery time.
-	 */
-	lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
 }
 
 /*------------------ QUOTAOFF LOG ITEMS -------------------*/
@@ -294,26 +286,20 @@ xfs_qm_qoff_logitem_size(
 	*nbytes += sizeof(struct xfs_qoff_logitem);
 }
 
-/*
- * This is called to fill in the vector of log iovecs for the
- * given quotaoff log item. We use only 1 iovec, and we point that
- * at the quotaoff_log_format structure embedded in the quotaoff item.
- * It is at this point that we assert that all of the extent
- * slots in the quotaoff item have been filled.
- */
 STATIC void
 xfs_qm_qoff_logitem_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*log_vector)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_qoff_logitem	*qflip = QOFF_ITEM(lip);
-
-	ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
-
-	log_vector->i_addr = &qflip->qql_format;
-	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
-	log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
-	qflip->qql_format.qf_size = 1;
+	struct xfs_log_iovec	*vecp = NULL;
+	struct xfs_qoff_logformat *qlf;
+
+	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
+	qlf->qf_type = XFS_LI_QUOTAOFF;
+	qlf->qf_size = 1;
+	qlf->qf_flags = qflip->qql_flags;
+	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
 }
 
 /*
@@ -453,8 +439,7 @@ xfs_qm_qoff_logitem_init(
 	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
 			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
 	qf->qql_item.li_mountp = mp;
-	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
-	qf->qql_format.qf_flags = flags;
 	qf->qql_start_lip = start;
+	qf->qql_flags = flags;
 	return qf;
 }
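The dquot and quotaoff formatters now build log iovecs through xlog_prepare_iovec()/xlog_finish_iovec() (reserve space, format in place, seal with the final length) and xlog_copy_iovec() (copy a finished blob as one vector). A self-contained sketch of that builder shape; the names mirror the patch but the implementation here is invented:

#include <stddef.h>
#include <string.h>

struct iovec_like { void *base; size_t len; int type; };

struct vec_builder {
	char buf[256];		/* backing store for formatted data */
	size_t off;
	struct iovec_like v[8];
	int used;
};

/* reserve a region; the caller formats into the returned pointer
 * (no bounds checking here, this is only a sketch) */
static void *vec_prepare(struct vec_builder *b, int type)
{
	struct iovec_like *iv = &b->v[b->used];

	iv->base = b->buf + b->off;
	iv->type = type;
	return iv->base;
}

/* seal the reserved region once its final length is known */
static void vec_finish(struct vec_builder *b, size_t len)
{
	b->v[b->used].len = len;
	b->off += len;
	b->used++;
}

/* convenience: copy an already-formatted blob as one vector */
static void vec_copy(struct vec_builder *b, int type,
		     const void *data, size_t len)
{
	memcpy(vec_prepare(b, type), data, len);
	vec_finish(b, len);
}

int main(void)
{
	struct vec_builder b = { .off = 0, .used = 0 };
	int hdr = 42;

	vec_copy(&b, 1 /* type */, &hdr, sizeof(hdr));
	return b.used == 1 ? 0 : 1;
}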
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70b..502e9464634a 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem {
 	xfs_log_item_t		 qli_item;	/* common portion */
 	struct xfs_dquot	*qli_dquot;	/* dquot ptr */
 	xfs_lsn_t		 qli_flush_lsn;	/* lsn at last flush */
-	xfs_dq_logformat_t	 qli_format;	/* logged structure */
 } xfs_dq_logitem_t;
 
 typedef struct xfs_qoff_logitem {
 	xfs_log_item_t		 qql_item;	/* common portion */
 	struct xfs_qoff_logitem *qql_start_lip;	/* qoff-start logitem, if any */
-	xfs_qoff_logformat_t	 qql_format;	/* logged structure */
+	unsigned int		 qql_flags;
 } xfs_qoff_logitem_t;
 
 
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3680d04f973f..fb7a4c1ce1c5 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -26,6 +26,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_buf_item.h"
 #include "xfs_extfree_item.h"
+#include "xfs_log.h"
 
 
 kmem_zone_t	*xfs_efi_zone;
@@ -101,9 +102,10 @@ xfs_efi_item_size(
 STATIC void
 xfs_efi_item_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*log_vector)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
 
 	ASSERT(atomic_read(&efip->efi_next_extent) ==
 				efip->efi_format.efi_nextents);
@@ -111,10 +113,9 @@ xfs_efi_item_format(
 	efip->efi_format.efi_type = XFS_LI_EFI;
 	efip->efi_format.efi_size = 1;
 
-	log_vector->i_addr = &efip->efi_format;
-	log_vector->i_len = xfs_efi_item_sizeof(efip);
-	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
-	ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
+			&efip->efi_format,
+			xfs_efi_item_sizeof(efip));
 }
 
 
@@ -368,19 +369,19 @@ xfs_efd_item_size(
 STATIC void
 xfs_efd_item_format(
 	struct xfs_log_item	*lip,
-	struct xfs_log_iovec	*log_vector)
+	struct xfs_log_vec	*lv)
 {
 	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
 
 	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
 
 	efdp->efd_format.efd_type = XFS_LI_EFD;
 	efdp->efd_format.efd_size = 1;
 
-	log_vector->i_addr = &efdp->efd_format;
-	log_vector->i_len = xfs_efd_item_sizeof(efdp);
-	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
-	ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
+			&efdp->efd_format,
+			xfs_efd_item_sizeof(efdp));
 }
 
 /*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52c91e143725..e00121592632 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -912,7 +912,7 @@ xfs_dir_open(
 	 * If there are any blocks, read-ahead block 0 as we're almost
 	 * certain to have the next operation be a read there.
 	 */
-	mode = xfs_ilock_map_shared(ip);
+	mode = xfs_ilock_data_map_shared(ip);
 	if (ip->i_d.di_nextents > 0)
 		xfs_dir3_data_readahead(NULL, ip, 0, -1);
 	xfs_iunlock(ip, mode);
@@ -1215,7 +1215,7 @@ xfs_seek_data(
 	uint			lock;
 	int			error;
 
-	lock = xfs_ilock_map_shared(ip);
+	lock = xfs_ilock_data_map_shared(ip);
 
 	isize = i_size_read(inode);
 	if (start >= isize) {
@@ -1294,7 +1294,7 @@ out:
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
-	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, lock);
 
 	if (error)
 		return -error;
@@ -1319,7 +1319,7 @@ xfs_seek_hole(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -XFS_ERROR(EIO);
 
-	lock = xfs_ilock_map_shared(ip);
+	lock = xfs_ilock_data_map_shared(ip);
 
 	isize = i_size_read(inode);
 	if (start >= isize) {
@@ -1402,7 +1402,7 @@ out:
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
-	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, lock);
 
 	if (error)
 		return -error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e87719c5bebe..5d7f105a1c82 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -52,7 +52,7 @@ xfs_ialloc_cluster_alignment(
52{ 52{
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) && 53 if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >= 54 args->mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) 55 XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
56 return args->mp->m_sb.sb_inoalignmt; 56 return args->mp->m_sb.sb_inoalignmt;
57 return 1; 57 return 1;
58} 58}
@@ -170,27 +170,20 @@ xfs_ialloc_inode_init(
170{ 170{
171 struct xfs_buf *fbuf; 171 struct xfs_buf *fbuf;
172 struct xfs_dinode *free; 172 struct xfs_dinode *free;
173 int blks_per_cluster, nbufs, ninodes; 173 int nbufs, blks_per_cluster, inodes_per_cluster;
174 int version; 174 int version;
175 int i, j; 175 int i, j;
176 xfs_daddr_t d; 176 xfs_daddr_t d;
177 xfs_ino_t ino = 0; 177 xfs_ino_t ino = 0;
178 178
179 /* 179 /*
180 * Loop over the new block(s), filling in the inodes. 180 * Loop over the new block(s), filling in the inodes. For small block
181 * For small block sizes, manipulate the inodes in buffers 181 * sizes, manipulate the inodes in buffers which are multiples of the
182 * which are multiples of the blocks size. 182 * blocks size.
183 */ 183 */
184 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 184 blks_per_cluster = xfs_icluster_size_fsb(mp);
185 blks_per_cluster = 1; 185 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
186 nbufs = length; 186 nbufs = length / blks_per_cluster;
187 ninodes = mp->m_sb.sb_inopblock;
188 } else {
189 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
190 mp->m_sb.sb_blocksize;
191 nbufs = length / blks_per_cluster;
192 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
193 }
194 187
195 /* 188 /*
196 * Figure out what version number to use in the inodes we create. If 189 * Figure out what version number to use in the inodes we create. If
@@ -225,7 +218,7 @@ xfs_ialloc_inode_init(
225 * they track in the AIL as if they were physically logged. 218 * they track in the AIL as if they were physically logged.
226 */ 219 */
227 if (tp) 220 if (tp)
228 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), 221 xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
229 mp->m_sb.sb_inodesize, length, gen); 222 mp->m_sb.sb_inodesize, length, gen);
230 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 223 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
231 version = 2; 224 version = 2;
@@ -246,7 +239,7 @@ xfs_ialloc_inode_init(
246 /* Initialize the inode buffers and log them appropriately. */ 239 /* Initialize the inode buffers and log them appropriately. */
247 fbuf->b_ops = &xfs_inode_buf_ops; 240 fbuf->b_ops = &xfs_inode_buf_ops;
248 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 241 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
249 for (i = 0; i < ninodes; i++) { 242 for (i = 0; i < inodes_per_cluster; i++) {
250 int ioffset = i << mp->m_sb.sb_inodelog; 243 int ioffset = i << mp->m_sb.sb_inodelog;
251 uint isize = xfs_dinode_size(version); 244 uint isize = xfs_dinode_size(version);
252 245
@@ -329,11 +322,11 @@ xfs_ialloc_ag_alloc(
329 * Locking will ensure that we don't have two callers in here 322 * Locking will ensure that we don't have two callers in here
330 * at one time. 323 * at one time.
331 */ 324 */
332 newlen = XFS_IALLOC_INODES(args.mp); 325 newlen = args.mp->m_ialloc_inos;
333 if (args.mp->m_maxicount && 326 if (args.mp->m_maxicount &&
334 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 327 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
335 return XFS_ERROR(ENOSPC); 328 return XFS_ERROR(ENOSPC);
336 args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); 329 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
337 /* 330 /*
338 * First try to allocate inodes contiguous with the last-allocated 331 * First try to allocate inodes contiguous with the last-allocated
339 * chunk of inodes. If the filesystem is striped, this will fill 332 * chunk of inodes. If the filesystem is striped, this will fill
@@ -343,7 +336,7 @@ xfs_ialloc_ag_alloc(
343 newino = be32_to_cpu(agi->agi_newino); 336 newino = be32_to_cpu(agi->agi_newino);
344 agno = be32_to_cpu(agi->agi_seqno); 337 agno = be32_to_cpu(agi->agi_seqno);
345 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 338 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
346 XFS_IALLOC_BLOCKS(args.mp); 339 args.mp->m_ialloc_blks;
347 if (likely(newino != NULLAGINO && 340 if (likely(newino != NULLAGINO &&
348 (args.agbno < be32_to_cpu(agi->agi_length)))) { 341 (args.agbno < be32_to_cpu(agi->agi_length)))) {
349 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 342 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -585,7 +578,7 @@ xfs_ialloc_ag_select(
585 * Is there enough free space for the file plus a block of 578 * Is there enough free space for the file plus a block of
586 * inodes (if we need to allocate some)? 579 * inodes (if we need to allocate some)?
587 */ 580 */
588 ineed = XFS_IALLOC_BLOCKS(mp); 581 ineed = mp->m_ialloc_blks;
589 longest = pag->pagf_longest; 582 longest = pag->pagf_longest;
590 if (!longest) 583 if (!longest)
591 longest = pag->pagf_flcount > 0; 584 longest = pag->pagf_flcount > 0;
@@ -999,7 +992,7 @@ xfs_dialloc(
999 * inode. 992 * inode.
1000 */ 993 */
1001 if (mp->m_maxicount && 994 if (mp->m_maxicount &&
1002 mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { 995 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
1003 noroom = 1; 996 noroom = 1;
1004 okalloc = 0; 997 okalloc = 0;
1005 } 998 }
@@ -1202,7 +1195,7 @@ xfs_difree(
1202 * When an inode cluster is free, it becomes eligible for removal 1195 * When an inode cluster is free, it becomes eligible for removal
1203 */ 1196 */
1204 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1197 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1205 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { 1198 (rec.ir_freecount == mp->m_ialloc_inos)) {
1206 1199
1207 *delete = 1; 1200 *delete = 1;
1208 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1201 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
@@ -1212,7 +1205,7 @@ xfs_difree(
1212 * AGI and Superblock inode counts, and mark the disk space 1205 * AGI and Superblock inode counts, and mark the disk space
1213 * to be freed when the transaction is committed. 1206 * to be freed when the transaction is committed.
1214 */ 1207 */
1215 ilen = XFS_IALLOC_INODES(mp); 1208 ilen = mp->m_ialloc_inos;
1216 be32_add_cpu(&agi->agi_count, -ilen); 1209 be32_add_cpu(&agi->agi_count, -ilen);
1217 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1210 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1218 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1211 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1228,9 +1221,9 @@ xfs_difree(
1228 goto error0; 1221 goto error0;
1229 } 1222 }
1230 1223
1231 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, 1224 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
1232 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), 1225 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
1233 XFS_IALLOC_BLOCKS(mp), flist, mp); 1226 mp->m_ialloc_blks, flist, mp);
1234 } else { 1227 } else {
1235 *delete = 0; 1228 *delete = 0;
1236 1229
@@ -1311,7 +1304,7 @@ xfs_imap_lookup(
1311 1304
1312 /* check that the returned record contains the required inode */ 1305 /* check that the returned record contains the required inode */
1313 if (rec.ir_startino > agino || 1306 if (rec.ir_startino > agino ||
1314 rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) 1307 rec.ir_startino + mp->m_ialloc_inos <= agino)
1315 return EINVAL; 1308 return EINVAL;
1316 1309
1317 /* for untrusted inodes check it is allocated first */ 1310 /* for untrusted inodes check it is allocated first */
@@ -1384,7 +1377,7 @@ xfs_imap(
1384 return XFS_ERROR(EINVAL); 1377 return XFS_ERROR(EINVAL);
1385 } 1378 }
1386 1379
1387 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1380 blks_per_cluster = xfs_icluster_size_fsb(mp);
1388 1381
1389 /* 1382 /*
1390 * For bulkstat and handle lookups, we have an untrusted inode number 1383 * For bulkstat and handle lookups, we have an untrusted inode number
@@ -1405,7 +1398,7 @@ xfs_imap(
1405 * If the inode cluster size is the same as the blocksize or 1398 * If the inode cluster size is the same as the blocksize or
1406 * smaller we get to the buffer by simple arithmetic. 1399 * smaller we get to the buffer by simple arithmetic.
1407 */ 1400 */
1408 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { 1401 if (blks_per_cluster == 1) {
1409 offset = XFS_INO_TO_OFFSET(mp, ino); 1402 offset = XFS_INO_TO_OFFSET(mp, ino);
1410 ASSERT(offset < mp->m_sb.sb_inopblock); 1403 ASSERT(offset < mp->m_sb.sb_inopblock);
1411 1404
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index a8f76a5ff418..812365d17e67 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -25,17 +25,18 @@ struct xfs_mount;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27 27
28/* 28/* Move inodes in clusters of this size */
29 * Allocation parameters for inode allocation.
30 */
31#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
32#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
33
34/*
35 * Move inodes in clusters of this size.
36 */
37#define XFS_INODE_BIG_CLUSTER_SIZE 8192 29#define XFS_INODE_BIG_CLUSTER_SIZE 8192
38#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size 30
31/* Calculate and return the number of filesystem blocks per inode cluster */
32static inline int
33xfs_icluster_size_fsb(
34 struct xfs_mount *mp)
35{
36 if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
37 return 1;
38 return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
39}
39 40
40/* 41/*
41 * Make an inode pointer out of the buffer/offset. 42 * Make an inode pointer out of the buffer/offset.
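
The new xfs_icluster_size_fsb() helper replaces the open-coded cluster-size
checks elsewhere in this series. As a minimal standalone sketch of the same
arithmetic (the 4096-byte block size and 8192-byte cluster size are
illustrative assumptions, not values taken from this patch):

#include <stdio.h>

int main(void)
{
	int sb_blocksize = 4096;		/* assumed fs block size */
	int sb_blocklog = 12;			/* log2(sb_blocksize) */
	int m_inode_cluster_size = 8192;	/* XFS_INODE_BIG_CLUSTER_SIZE */
	int blks_per_cluster;

	/* same decision as xfs_icluster_size_fsb() */
	if (sb_blocksize >= m_inode_cluster_size)
		blks_per_cluster = 1;
	else
		blks_per_cluster = m_inode_cluster_size >> sb_blocklog;

	printf("blocks per inode cluster: %d\n", blks_per_cluster);	/* 2 */
	return 0;
}
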
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d2eaccfa73f4..7e4549233251 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -28,6 +28,7 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30#include "xfs_icreate_item.h" 30#include "xfs_icreate_item.h"
31#include "xfs_log.h"
31 32
32kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ 33kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
33 34
@@ -58,13 +59,14 @@ xfs_icreate_item_size(
58STATIC void 59STATIC void
59xfs_icreate_item_format( 60xfs_icreate_item_format(
60 struct xfs_log_item *lip, 61 struct xfs_log_item *lip,
61 struct xfs_log_iovec *log_vector) 62 struct xfs_log_vec *lv)
62{ 63{
63 struct xfs_icreate_item *icp = ICR_ITEM(lip); 64 struct xfs_icreate_item *icp = ICR_ITEM(lip);
65 struct xfs_log_iovec *vecp = NULL;
64 66
65 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; 67 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
66 log_vector->i_len = sizeof(struct xfs_icreate_log); 68 &icp->ic_format,
67 log_vector->i_type = XLOG_REG_TYPE_ICREATE; 69 sizeof(struct xfs_icreate_log));
68} 70}
69 71
70 72
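
With ->iop_format() now receiving a struct xfs_log_vec, a single-region item
reduces to one xlog_copy_iovec() call, as the icreate item above shows. A
hedged sketch of the general shape, where my_item, MY_ITEM and my_format are
hypothetical names used only for illustration:

/*
 * Sketch of the new single-region ->iop_format() pattern, using the
 * xlog_copy_iovec() helper added to xfs_log.h later in this series.
 */
STATIC void
my_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct my_item		*mip = MY_ITEM(lip);	/* hypothetical item */
	struct xfs_log_iovec	*vecp = NULL;	/* cursor, advanced by helper */

	/* copies the format structure into lv's buffer and records its type */
	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
			&mip->my_format, sizeof(mip->my_format));
}
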
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 001aa893ed59..3a137e9f9a7d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -77,48 +77,44 @@ xfs_get_extsz_hint(
77} 77}
78 78
79/* 79/*
80 * This is a wrapper routine around the xfs_ilock() routine used to centralize 80 * These two are wrapper routines around the xfs_ilock() routine used to
81 * some grungy code. It is used in places that wish to lock the inode solely 81 * centralize some grungy code. They are used in places that wish to lock the
82 * for reading the extents. The reason these places can't just call 82 * inode solely for reading the extents. The reason these places can't just
83 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the 83 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
84 * extents from disk for a file in b-tree format. If the inode is in b-tree 84 * bringing in of the extents from disk for a file in b-tree format. If the
85 * format, then we need to lock the inode exclusively until the extents are read 85 * inode is in b-tree format, then we need to lock the inode exclusively until
86 * in. Locking it exclusively all the time would limit our parallelism 86 * the extents are read in. Locking it exclusively all the time would limit
87 * unnecessarily, though. What we do instead is check to see if the extents 87 * our parallelism unnecessarily, though. What we do instead is check to see
88 * have been read in yet, and only lock the inode exclusively if they have not. 88 * if the extents have been read in yet, and only lock the inode exclusively
89 * if they have not.
89 * 90 *
90 * The function returns a value which should be given to the corresponding 91 * The functions return a value which should be given to the corresponding
91 * xfs_iunlock_map_shared(). This value is the mode in which the lock was 92 * xfs_iunlock() call.
92 * actually taken.
93 */ 93 */
94uint 94uint
95xfs_ilock_map_shared( 95xfs_ilock_data_map_shared(
96 xfs_inode_t *ip) 96 struct xfs_inode *ip)
97{ 97{
98 uint lock_mode; 98 uint lock_mode = XFS_ILOCK_SHARED;
99 99
100 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && 100 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
101 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { 101 (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
102 lock_mode = XFS_ILOCK_EXCL; 102 lock_mode = XFS_ILOCK_EXCL;
103 } else {
104 lock_mode = XFS_ILOCK_SHARED;
105 }
106
107 xfs_ilock(ip, lock_mode); 103 xfs_ilock(ip, lock_mode);
108
109 return lock_mode; 104 return lock_mode;
110} 105}
111 106
112/* 107uint
113 * This is simply the unlock routine to go with xfs_ilock_map_shared(). 108xfs_ilock_attr_map_shared(
114 * All it does is call xfs_iunlock() with the given lock_mode. 109 struct xfs_inode *ip)
115 */
116void
117xfs_iunlock_map_shared(
118 xfs_inode_t *ip,
119 unsigned int lock_mode)
120{ 110{
121 xfs_iunlock(ip, lock_mode); 111 uint lock_mode = XFS_ILOCK_SHARED;
112
113 if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
114 (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
115 lock_mode = XFS_ILOCK_EXCL;
116 xfs_ilock(ip, lock_mode);
117 return lock_mode;
122} 118}
123 119
124/* 120/*
@@ -588,9 +584,9 @@ xfs_lookup(
588 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 584 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
589 return XFS_ERROR(EIO); 585 return XFS_ERROR(EIO);
590 586
591 lock_mode = xfs_ilock_map_shared(dp); 587 lock_mode = xfs_ilock_data_map_shared(dp);
592 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 588 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
593 xfs_iunlock_map_shared(dp, lock_mode); 589 xfs_iunlock(dp, lock_mode);
594 590
595 if (error) 591 if (error)
596 goto out; 592 goto out;
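
The same pairing applies to the new attribute-fork helper, which has no
caller visible in this hunk. A minimal sketch, assuming ip is a struct
xfs_inode in scope:

	uint	lock_mode;

	lock_mode = xfs_ilock_attr_map_shared(ip);	/* EXCL only if attr
							 * extents are unread */
	/* ... read attribute fork extents ... */
	xfs_iunlock(ip, lock_mode);	/* plain unlock replaces the old
					 * xfs_iunlock_map_shared() */
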
@@ -2141,8 +2137,8 @@ xfs_ifree_cluster(
2141{ 2137{
2142 xfs_mount_t *mp = free_ip->i_mount; 2138 xfs_mount_t *mp = free_ip->i_mount;
2143 int blks_per_cluster; 2139 int blks_per_cluster;
2140 int inodes_per_cluster;
2144 int nbufs; 2141 int nbufs;
2145 int ninodes;
2146 int i, j; 2142 int i, j;
2147 xfs_daddr_t blkno; 2143 xfs_daddr_t blkno;
2148 xfs_buf_t *bp; 2144 xfs_buf_t *bp;
@@ -2152,18 +2148,11 @@ xfs_ifree_cluster(
2152 struct xfs_perag *pag; 2148 struct xfs_perag *pag;
2153 2149
2154 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 2150 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2155 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 2151 blks_per_cluster = xfs_icluster_size_fsb(mp);
2156 blks_per_cluster = 1; 2152 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2157 ninodes = mp->m_sb.sb_inopblock; 2153 nbufs = mp->m_ialloc_blks / blks_per_cluster;
2158 nbufs = XFS_IALLOC_BLOCKS(mp);
2159 } else {
2160 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2161 mp->m_sb.sb_blocksize;
2162 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2163 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2164 }
2165 2154
2166 for (j = 0; j < nbufs; j++, inum += ninodes) { 2155 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2167 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2156 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2168 XFS_INO_TO_AGBNO(mp, inum)); 2157 XFS_INO_TO_AGBNO(mp, inum));
2169 2158
@@ -2225,7 +2214,7 @@ xfs_ifree_cluster(
2225 * transaction stale above, which means there is no point in 2214 * transaction stale above, which means there is no point in
2226 * even trying to lock them. 2215 * even trying to lock them.
2227 */ 2216 */
2228 for (i = 0; i < ninodes; i++) { 2217 for (i = 0; i < inodes_per_cluster; i++) {
2229retry: 2218retry:
2230 rcu_read_lock(); 2219 rcu_read_lock();
2231 ip = radix_tree_lookup(&pag->pag_ici_root, 2220 ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2906,13 +2895,13 @@ xfs_iflush_cluster(
2906 2895
2907 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2896 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2908 2897
2909 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2898 inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
2910 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2899 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2911 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2900 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2912 if (!ilist) 2901 if (!ilist)
2913 goto out_put; 2902 goto out_put;
2914 2903
2915 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2904 mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
2916 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2905 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2917 rcu_read_lock(); 2906 rcu_read_lock();
2918 /* really need a gang lookup range call here */ 2907 /* really need a gang lookup range call here */
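
To see what the cluster arithmetic in xfs_ifree_cluster() and
xfs_iflush_cluster() works out to, a standalone sketch (all values assumed
for illustration: an 8192-byte cluster and 256-byte inodes, so
sb_inodelog = 8):

#include <stdio.h>

int main(void)
{
	unsigned int m_inode_cluster_size = 8192;	/* assumed */
	unsigned int sb_inodelog = 8;			/* assumed: 256B inodes */
	unsigned int inodes_per_cluster, mask;

	inodes_per_cluster = m_inode_cluster_size >> sb_inodelog;	/* 32 */
	mask = ~(inodes_per_cluster - 1);	/* rounds an agino down to the
						 * start of its cluster */

	printf("inodes per cluster: %u, mask: 0x%x\n",
	       inodes_per_cluster, mask);	/* 32, 0xffffffe0 */
	return 0;
}
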
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9e6efccbae04..65e2350f449c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -337,8 +337,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
337void xfs_iunlock(xfs_inode_t *, uint); 337void xfs_iunlock(xfs_inode_t *, uint);
338void xfs_ilock_demote(xfs_inode_t *, uint); 338void xfs_ilock_demote(xfs_inode_t *, uint);
339int xfs_isilocked(xfs_inode_t *, uint); 339int xfs_isilocked(xfs_inode_t *, uint);
340uint xfs_ilock_map_shared(xfs_inode_t *); 340uint xfs_ilock_data_map_shared(struct xfs_inode *);
341void xfs_iunlock_map_shared(xfs_inode_t *, uint); 341uint xfs_ilock_attr_map_shared(struct xfs_inode *);
342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
343 xfs_nlink_t, xfs_dev_t, prid_t, int, 343 xfs_nlink_t, xfs_dev_t, prid_t, int,
344 struct xfs_buf **, xfs_inode_t **); 344 struct xfs_buf **, xfs_inode_t **);
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index cfee14a83cfe..73514c0486b7 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -431,6 +431,8 @@ xfs_iread_extents(
431 xfs_ifork_t *ifp; 431 xfs_ifork_t *ifp;
432 xfs_extnum_t nextents; 432 xfs_extnum_t nextents;
433 433
434 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
435
434 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 436 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
435 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 437 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
436 ip->i_mount); 438 ip->i_mount);
@@ -721,15 +723,16 @@ xfs_idestroy_fork(
721} 723}
722 724
723/* 725/*
724 * xfs_iextents_copy() 726 * Convert in-core extents to on-disk form
725 * 727 *
726 * This is called to copy the REAL extents (as opposed to the delayed 728 * For either the data or attr fork in extent format, we need to endian convert
727 * allocation extents) from the inode into the given buffer. It 729 * the in-core extents as we place them into the on-disk inode.
728 * returns the number of bytes copied into the buffer.
729 * 730 *
730 * If there are no delayed allocation extents, then we can just 731 * In the case of the data fork, the in-core and on-disk fork sizes can be
731 * memcpy() the extents into the buffer. Otherwise, we need to 732 * different due to delayed allocation extents. We only copy on-disk extents
732 * examine each extent in turn and skip those which are delayed. 733 * here, so callers must always use the physical fork size to determine the
734 * size of the buffer passed to this routine. We will return the size actually
735 * used.
733 */ 736 */
734int 737int
735xfs_iextents_copy( 738xfs_iextents_copy(
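
A sketch of the caller contract the new comment describes, assuming ip and
whichfork are in scope: allocate by the physical fork size, then trust only
the byte count the routine returns.

	struct xfs_bmbt_rec	*buf;
	int			copied;

	buf = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
	copied = xfs_iextents_copy(ip, buf, whichfork);
	/* only "copied" bytes of buf hold valid on-disk (big-endian) extents */
	kmem_free(buf);
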
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c0d391f9a6e..686889b4a1e5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -30,6 +30,7 @@
30#include "xfs_trace.h" 30#include "xfs_trace.h"
31#include "xfs_trans_priv.h" 31#include "xfs_trans_priv.h"
32#include "xfs_dinode.h" 32#include "xfs_dinode.h"
33#include "xfs_log.h"
33 34
34 35
35kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 36kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
39 return container_of(lip, struct xfs_inode_log_item, ili_item); 40 return container_of(lip, struct xfs_inode_log_item, ili_item);
40} 41}
41 42
42
43/*
44 * This returns the number of iovecs needed to log the given inode item.
45 *
46 * We need one iovec for the inode log format structure, one for the
47 * inode core, and possibly one for the inode data/extents/b-tree root
48 * and one for the inode attribute data/extents/b-tree root.
49 */
50STATIC void 43STATIC void
51xfs_inode_item_size( 44xfs_inode_item_data_fork_size(
52 struct xfs_log_item *lip, 45 struct xfs_inode_log_item *iip,
53 int *nvecs, 46 int *nvecs,
54 int *nbytes) 47 int *nbytes)
55{ 48{
56 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
57 struct xfs_inode *ip = iip->ili_inode; 49 struct xfs_inode *ip = iip->ili_inode;
58 50
59 *nvecs += 2;
60 *nbytes += sizeof(struct xfs_inode_log_format) +
61 xfs_icdinode_size(ip->i_d.di_version);
62
63 switch (ip->i_d.di_format) { 51 switch (ip->i_d.di_format) {
64 case XFS_DINODE_FMT_EXTENTS: 52 case XFS_DINODE_FMT_EXTENTS:
65 if ((iip->ili_fields & XFS_ILOG_DEXT) && 53 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
@@ -70,7 +58,6 @@ xfs_inode_item_size(
70 *nvecs += 1; 58 *nvecs += 1;
71 } 59 }
72 break; 60 break;
73
74 case XFS_DINODE_FMT_BTREE: 61 case XFS_DINODE_FMT_BTREE:
75 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 62 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
76 ip->i_df.if_broot_bytes > 0) { 63 ip->i_df.if_broot_bytes > 0) {
@@ -78,7 +65,6 @@ xfs_inode_item_size(
78 *nvecs += 1; 65 *nvecs += 1;
79 } 66 }
80 break; 67 break;
81
82 case XFS_DINODE_FMT_LOCAL: 68 case XFS_DINODE_FMT_LOCAL:
83 if ((iip->ili_fields & XFS_ILOG_DDATA) && 69 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
84 ip->i_df.if_bytes > 0) { 70 ip->i_df.if_bytes > 0) {
@@ -90,19 +76,20 @@ xfs_inode_item_size(
90 case XFS_DINODE_FMT_DEV: 76 case XFS_DINODE_FMT_DEV:
91 case XFS_DINODE_FMT_UUID: 77 case XFS_DINODE_FMT_UUID:
92 break; 78 break;
93
94 default: 79 default:
95 ASSERT(0); 80 ASSERT(0);
96 break; 81 break;
97 } 82 }
83}
98 84
99 if (!XFS_IFORK_Q(ip)) 85STATIC void
100 return; 86xfs_inode_item_attr_fork_size(
101 87 struct xfs_inode_log_item *iip,
88 int *nvecs,
89 int *nbytes)
90{
91 struct xfs_inode *ip = iip->ili_inode;
102 92
103 /*
104 * Log any necessary attribute data.
105 */
106 switch (ip->i_d.di_aformat) { 93 switch (ip->i_d.di_aformat) {
107 case XFS_DINODE_FMT_EXTENTS: 94 case XFS_DINODE_FMT_EXTENTS:
108 if ((iip->ili_fields & XFS_ILOG_AEXT) && 95 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
@@ -113,7 +100,6 @@ xfs_inode_item_size(
113 *nvecs += 1; 100 *nvecs += 1;
114 } 101 }
115 break; 102 break;
116
117 case XFS_DINODE_FMT_BTREE: 103 case XFS_DINODE_FMT_BTREE:
118 if ((iip->ili_fields & XFS_ILOG_ABROOT) && 104 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
119 ip->i_afp->if_broot_bytes > 0) { 105 ip->i_afp->if_broot_bytes > 0) {
@@ -121,7 +107,6 @@ xfs_inode_item_size(
121 *nvecs += 1; 107 *nvecs += 1;
122 } 108 }
123 break; 109 break;
124
125 case XFS_DINODE_FMT_LOCAL: 110 case XFS_DINODE_FMT_LOCAL:
126 if ((iip->ili_fields & XFS_ILOG_ADATA) && 111 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
127 ip->i_afp->if_bytes > 0) { 112 ip->i_afp->if_bytes > 0) {
@@ -129,7 +114,6 @@ xfs_inode_item_size(
129 *nvecs += 1; 114 *nvecs += 1;
130 } 115 }
131 break; 116 break;
132
133 default: 117 default:
134 ASSERT(0); 118 ASSERT(0);
135 break; 119 break;
@@ -137,98 +121,67 @@ xfs_inode_item_size(
137} 121}
138 122
139/* 123/*
140 * xfs_inode_item_format_extents - convert in-core extents to on-disk form 124 * This returns the number of iovecs needed to log the given inode item.
141 *
142 * For either the data or attr fork in extent format, we need to endian convert
143 * the in-core extent as we place them into the on-disk inode. In this case, we
144 * need to do this conversion before we write the extents into the log. Because
145 * we don't have the disk inode to write into here, we allocate a buffer and
146 * format the extents into it via xfs_iextents_copy(). We free the buffer in
147 * the unlock routine after the copy for the log has been made.
148 * 125 *
149 * In the case of the data fork, the in-core and on-disk fork sizes can be 126 * We need one iovec for the inode log format structure, one for the
150 * different due to delayed allocation extents. We only log on-disk extents 127 * inode core, and possibly one for the inode data/extents/b-tree root
151 * here, so always use the physical fork size to determine the size of the 128 * and one for the inode attribute data/extents/b-tree root.
152 * buffer we need to allocate.
153 */ 129 */
154STATIC void 130STATIC void
155xfs_inode_item_format_extents( 131xfs_inode_item_size(
156 struct xfs_inode *ip, 132 struct xfs_log_item *lip,
157 struct xfs_log_iovec *vecp, 133 int *nvecs,
158 int whichfork, 134 int *nbytes)
159 int type)
160{ 135{
161 xfs_bmbt_rec_t *ext_buffer; 136 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
137 struct xfs_inode *ip = iip->ili_inode;
162 138
163 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); 139 *nvecs += 2;
164 if (whichfork == XFS_DATA_FORK) 140 *nbytes += sizeof(struct xfs_inode_log_format) +
165 ip->i_itemp->ili_extents_buf = ext_buffer; 141 xfs_icdinode_size(ip->i_d.di_version);
166 else
167 ip->i_itemp->ili_aextents_buf = ext_buffer;
168 142
169 vecp->i_addr = ext_buffer; 143 xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
170 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); 144 if (XFS_IFORK_Q(ip))
171 vecp->i_type = type; 145 xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
172} 146}
173 147
174/* 148/*
175 * This is called to fill in the vector of log iovecs for the 149 * If this is a v1 format inode, then we need to log it as such. This means
176 * given inode log item. It fills the first item with an inode 150 * that we have to copy the link count from the new field to the old. We
177 * log format structure, the second with the on-disk inode structure, 151 * don't have to worry about the new fields, because nothing trusts them as
178 * and a possible third and/or fourth with the inode data/extents/b-tree 152 * long as the old inode version number is there.
179 * root and inode attributes data/extents/b-tree root.
180 */ 153 */
181STATIC void 154STATIC void
182xfs_inode_item_format( 155xfs_inode_item_format_v1_inode(
183 struct xfs_log_item *lip, 156 struct xfs_inode *ip)
184 struct xfs_log_iovec *vecp) 157{
158 if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) {
159 /*
160 * Convert it back.
161 */
162 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
163 ip->i_d.di_onlink = ip->i_d.di_nlink;
164 } else {
165 /*
166 * The superblock version has already been bumped,
167 * so just make the conversion to the new inode
168 * format permanent.
169 */
170 ip->i_d.di_version = 2;
171 ip->i_d.di_onlink = 0;
172 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
173 }
174}
175
176STATIC void
177xfs_inode_item_format_data_fork(
178 struct xfs_inode_log_item *iip,
179 struct xfs_inode_log_format *ilf,
180 struct xfs_log_vec *lv,
181 struct xfs_log_iovec **vecp)
185{ 182{
186 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
187 struct xfs_inode *ip = iip->ili_inode; 183 struct xfs_inode *ip = iip->ili_inode;
188 uint nvecs;
189 size_t data_bytes; 184 size_t data_bytes;
190 xfs_mount_t *mp;
191
192 vecp->i_addr = &iip->ili_format;
193 vecp->i_len = sizeof(xfs_inode_log_format_t);
194 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
195 vecp++;
196 nvecs = 1;
197
198 vecp->i_addr = &ip->i_d;
199 vecp->i_len = xfs_icdinode_size(ip->i_d.di_version);
200 vecp->i_type = XLOG_REG_TYPE_ICORE;
201 vecp++;
202 nvecs++;
203
204 /*
205 * If this is really an old format inode, then we need to
206 * log it as such. This means that we have to copy the link
207 * count from the new field to the old. We don't have to worry
208 * about the new fields, because nothing trusts them as long as
209 * the old inode version number is there. If the superblock already
210 * has a new version number, then we don't bother converting back.
211 */
212 mp = ip->i_mount;
213 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
214 if (ip->i_d.di_version == 1) {
215 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
216 /*
217 * Convert it back.
218 */
219 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
220 ip->i_d.di_onlink = ip->i_d.di_nlink;
221 } else {
222 /*
223 * The superblock version has already been bumped,
224 * so just make the conversion to the new inode
225 * format permanent.
226 */
227 ip->i_d.di_version = 2;
228 ip->i_d.di_onlink = 0;
229 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
230 }
231 }
232 185
233 switch (ip->i_d.di_format) { 186 switch (ip->i_d.di_format) {
234 case XFS_DINODE_FMT_EXTENTS: 187 case XFS_DINODE_FMT_EXTENTS:
@@ -239,36 +192,23 @@ xfs_inode_item_format(
239 if ((iip->ili_fields & XFS_ILOG_DEXT) && 192 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
240 ip->i_d.di_nextents > 0 && 193 ip->i_d.di_nextents > 0 &&
241 ip->i_df.if_bytes > 0) { 194 ip->i_df.if_bytes > 0) {
195 struct xfs_bmbt_rec *p;
196
242 ASSERT(ip->i_df.if_u1.if_extents != NULL); 197 ASSERT(ip->i_df.if_u1.if_extents != NULL);
243 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); 198 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
244 ASSERT(iip->ili_extents_buf == NULL); 199
245 200 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
246#ifdef XFS_NATIVE_HOST 201 data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
247 if (ip->i_d.di_nextents == ip->i_df.if_bytes / 202 xlog_finish_iovec(lv, *vecp, data_bytes);
248 (uint)sizeof(xfs_bmbt_rec_t)) { 203
249 /* 204 ASSERT(data_bytes <= ip->i_df.if_bytes);
250 * There are no delayed allocation 205
251 * extents, so just point to the 206 ilf->ilf_dsize = data_bytes;
252 * real extents array. 207 ilf->ilf_size++;
253 */
254 vecp->i_addr = ip->i_df.if_u1.if_extents;
255 vecp->i_len = ip->i_df.if_bytes;
256 vecp->i_type = XLOG_REG_TYPE_IEXT;
257 } else
258#endif
259 {
260 xfs_inode_item_format_extents(ip, vecp,
261 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
262 }
263 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
264 iip->ili_format.ilf_dsize = vecp->i_len;
265 vecp++;
266 nvecs++;
267 } else { 208 } else {
268 iip->ili_fields &= ~XFS_ILOG_DEXT; 209 iip->ili_fields &= ~XFS_ILOG_DEXT;
269 } 210 }
270 break; 211 break;
271
272 case XFS_DINODE_FMT_BTREE: 212 case XFS_DINODE_FMT_BTREE:
273 iip->ili_fields &= 213 iip->ili_fields &=
274 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 214 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
@@ -277,80 +217,70 @@ xfs_inode_item_format(
277 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 217 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
278 ip->i_df.if_broot_bytes > 0) { 218 ip->i_df.if_broot_bytes > 0) {
279 ASSERT(ip->i_df.if_broot != NULL); 219 ASSERT(ip->i_df.if_broot != NULL);
280 vecp->i_addr = ip->i_df.if_broot; 220 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
281 vecp->i_len = ip->i_df.if_broot_bytes; 221 ip->i_df.if_broot,
282 vecp->i_type = XLOG_REG_TYPE_IBROOT; 222 ip->i_df.if_broot_bytes);
283 vecp++; 223 ilf->ilf_dsize = ip->i_df.if_broot_bytes;
284 nvecs++; 224 ilf->ilf_size++;
285 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
286 } else { 225 } else {
287 ASSERT(!(iip->ili_fields & 226 ASSERT(!(iip->ili_fields &
288 XFS_ILOG_DBROOT)); 227 XFS_ILOG_DBROOT));
289 iip->ili_fields &= ~XFS_ILOG_DBROOT; 228 iip->ili_fields &= ~XFS_ILOG_DBROOT;
290 } 229 }
291 break; 230 break;
292
293 case XFS_DINODE_FMT_LOCAL: 231 case XFS_DINODE_FMT_LOCAL:
294 iip->ili_fields &= 232 iip->ili_fields &=
295 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | 233 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
296 XFS_ILOG_DEV | XFS_ILOG_UUID); 234 XFS_ILOG_DEV | XFS_ILOG_UUID);
297 if ((iip->ili_fields & XFS_ILOG_DDATA) && 235 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
298 ip->i_df.if_bytes > 0) { 236 ip->i_df.if_bytes > 0) {
299 ASSERT(ip->i_df.if_u1.if_data != NULL);
300 ASSERT(ip->i_d.di_size > 0);
301
302 vecp->i_addr = ip->i_df.if_u1.if_data;
303 /* 237 /*
304 * Round i_bytes up to a word boundary. 238 * Round i_bytes up to a word boundary.
305 * The underlying memory is guaranteed to 239 * The underlying memory is guaranteed to
306 * be there by xfs_idata_realloc(). 240 * be there by xfs_idata_realloc().
307 */ 241 */
308 data_bytes = roundup(ip->i_df.if_bytes, 4); 242 data_bytes = roundup(ip->i_df.if_bytes, 4);
309 ASSERT((ip->i_df.if_real_bytes == 0) || 243 ASSERT(ip->i_df.if_real_bytes == 0 ||
310 (ip->i_df.if_real_bytes == data_bytes)); 244 ip->i_df.if_real_bytes == data_bytes);
311 vecp->i_len = (int)data_bytes; 245 ASSERT(ip->i_df.if_u1.if_data != NULL);
312 vecp->i_type = XLOG_REG_TYPE_ILOCAL; 246 ASSERT(ip->i_d.di_size > 0);
313 vecp++; 247 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
314 nvecs++; 248 ip->i_df.if_u1.if_data, data_bytes);
315 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 249 ilf->ilf_dsize = (unsigned)data_bytes;
250 ilf->ilf_size++;
316 } else { 251 } else {
317 iip->ili_fields &= ~XFS_ILOG_DDATA; 252 iip->ili_fields &= ~XFS_ILOG_DDATA;
318 } 253 }
319 break; 254 break;
320
321 case XFS_DINODE_FMT_DEV: 255 case XFS_DINODE_FMT_DEV:
322 iip->ili_fields &= 256 iip->ili_fields &=
323 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 257 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
324 XFS_ILOG_DEXT | XFS_ILOG_UUID); 258 XFS_ILOG_DEXT | XFS_ILOG_UUID);
325 if (iip->ili_fields & XFS_ILOG_DEV) { 259 if (iip->ili_fields & XFS_ILOG_DEV)
326 iip->ili_format.ilf_u.ilfu_rdev = 260 ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
327 ip->i_df.if_u2.if_rdev;
328 }
329 break; 261 break;
330
331 case XFS_DINODE_FMT_UUID: 262 case XFS_DINODE_FMT_UUID:
332 iip->ili_fields &= 263 iip->ili_fields &=
333 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 264 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
334 XFS_ILOG_DEXT | XFS_ILOG_DEV); 265 XFS_ILOG_DEXT | XFS_ILOG_DEV);
335 if (iip->ili_fields & XFS_ILOG_UUID) { 266 if (iip->ili_fields & XFS_ILOG_UUID)
336 iip->ili_format.ilf_u.ilfu_uuid = 267 ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
337 ip->i_df.if_u2.if_uuid;
338 }
339 break; 268 break;
340
341 default: 269 default:
342 ASSERT(0); 270 ASSERT(0);
343 break; 271 break;
344 } 272 }
273}
345 274
346 /* 275STATIC void
347 * If there are no attributes associated with the file, then we're done. 276xfs_inode_item_format_attr_fork(
348 */ 277 struct xfs_inode_log_item *iip,
349 if (!XFS_IFORK_Q(ip)) { 278 struct xfs_inode_log_format *ilf,
350 iip->ili_fields &= 279 struct xfs_log_vec *lv,
351 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); 280 struct xfs_log_iovec **vecp)
352 goto out; 281{
353 } 282 struct xfs_inode *ip = iip->ili_inode;
283 size_t data_bytes;
354 284
355 switch (ip->i_d.di_aformat) { 285 switch (ip->i_d.di_aformat) {
356 case XFS_DINODE_FMT_EXTENTS: 286 case XFS_DINODE_FMT_EXTENTS:
@@ -360,30 +290,22 @@ xfs_inode_item_format(
360 if ((iip->ili_fields & XFS_ILOG_AEXT) && 290 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
361 ip->i_d.di_anextents > 0 && 291 ip->i_d.di_anextents > 0 &&
362 ip->i_afp->if_bytes > 0) { 292 ip->i_afp->if_bytes > 0) {
293 struct xfs_bmbt_rec *p;
294
363 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == 295 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
364 ip->i_d.di_anextents); 296 ip->i_d.di_anextents);
365 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 297 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
366#ifdef XFS_NATIVE_HOST 298
367 /* 299 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
368 * There are not delayed allocation extents 300 data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
369 * for attributes, so just point at the array. 301 xlog_finish_iovec(lv, *vecp, data_bytes);
370 */ 302
371 vecp->i_addr = ip->i_afp->if_u1.if_extents; 303 ilf->ilf_asize = data_bytes;
372 vecp->i_len = ip->i_afp->if_bytes; 304 ilf->ilf_size++;
373 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
374#else
375 ASSERT(iip->ili_aextents_buf == NULL);
376 xfs_inode_item_format_extents(ip, vecp,
377 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
378#endif
379 iip->ili_format.ilf_asize = vecp->i_len;
380 vecp++;
381 nvecs++;
382 } else { 305 } else {
383 iip->ili_fields &= ~XFS_ILOG_AEXT; 306 iip->ili_fields &= ~XFS_ILOG_AEXT;
384 } 307 }
385 break; 308 break;
386
387 case XFS_DINODE_FMT_BTREE: 309 case XFS_DINODE_FMT_BTREE:
388 iip->ili_fields &= 310 iip->ili_fields &=
389 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); 311 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
@@ -392,61 +314,89 @@ xfs_inode_item_format(
392 ip->i_afp->if_broot_bytes > 0) { 314 ip->i_afp->if_broot_bytes > 0) {
393 ASSERT(ip->i_afp->if_broot != NULL); 315 ASSERT(ip->i_afp->if_broot != NULL);
394 316
395 vecp->i_addr = ip->i_afp->if_broot; 317 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
396 vecp->i_len = ip->i_afp->if_broot_bytes; 318 ip->i_afp->if_broot,
397 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 319 ip->i_afp->if_broot_bytes);
398 vecp++; 320 ilf->ilf_asize = ip->i_afp->if_broot_bytes;
399 nvecs++; 321 ilf->ilf_size++;
400 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
401 } else { 322 } else {
402 iip->ili_fields &= ~XFS_ILOG_ABROOT; 323 iip->ili_fields &= ~XFS_ILOG_ABROOT;
403 } 324 }
404 break; 325 break;
405
406 case XFS_DINODE_FMT_LOCAL: 326 case XFS_DINODE_FMT_LOCAL:
407 iip->ili_fields &= 327 iip->ili_fields &=
408 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); 328 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
409 329
410 if ((iip->ili_fields & XFS_ILOG_ADATA) && 330 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
411 ip->i_afp->if_bytes > 0) { 331 ip->i_afp->if_bytes > 0) {
412 ASSERT(ip->i_afp->if_u1.if_data != NULL);
413
414 vecp->i_addr = ip->i_afp->if_u1.if_data;
415 /* 332 /*
416 * Round i_bytes up to a word boundary. 333 * Round i_bytes up to a word boundary.
417 * The underlying memory is guaranteed to 334 * The underlying memory is guaranteed to
418 * be there by xfs_idata_realloc(). 335 * be there by xfs_idata_realloc().
419 */ 336 */
420 data_bytes = roundup(ip->i_afp->if_bytes, 4); 337 data_bytes = roundup(ip->i_afp->if_bytes, 4);
421 ASSERT((ip->i_afp->if_real_bytes == 0) || 338 ASSERT(ip->i_afp->if_real_bytes == 0 ||
422 (ip->i_afp->if_real_bytes == data_bytes)); 339 ip->i_afp->if_real_bytes == data_bytes);
423 vecp->i_len = (int)data_bytes; 340 ASSERT(ip->i_afp->if_u1.if_data != NULL);
424 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; 341 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
425 vecp++; 342 ip->i_afp->if_u1.if_data,
426 nvecs++; 343 data_bytes);
427 iip->ili_format.ilf_asize = (unsigned)data_bytes; 344 ilf->ilf_asize = (unsigned)data_bytes;
345 ilf->ilf_size++;
428 } else { 346 } else {
429 iip->ili_fields &= ~XFS_ILOG_ADATA; 347 iip->ili_fields &= ~XFS_ILOG_ADATA;
430 } 348 }
431 break; 349 break;
432
433 default: 350 default:
434 ASSERT(0); 351 ASSERT(0);
435 break; 352 break;
436 } 353 }
437
438out:
439 /*
440 * Now update the log format that goes out to disk from the in-core
441 * values. We always write the inode core to make the arithmetic
442 * games in recovery easier, which isn't a big deal as just about any
443 * transaction would dirty it anyway.
444 */
445 iip->ili_format.ilf_fields = XFS_ILOG_CORE |
446 (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
447 iip->ili_format.ilf_size = nvecs;
448} 354}
449 355
356/*
357 * This is called to fill in the vector of log iovecs for the given inode
358 * log item. It fills the first item with an inode log format structure,
359 * the second with the on-disk inode structure, and a possible third and/or
360 * fourth with the inode data/extents/b-tree root and inode attributes
361 * data/extents/b-tree root.
362 */
363STATIC void
364xfs_inode_item_format(
365 struct xfs_log_item *lip,
366 struct xfs_log_vec *lv)
367{
368 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
369 struct xfs_inode *ip = iip->ili_inode;
370 struct xfs_inode_log_format *ilf;
371 struct xfs_log_iovec *vecp = NULL;
372
373 ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
374 ilf->ilf_type = XFS_LI_INODE;
375 ilf->ilf_ino = ip->i_ino;
376 ilf->ilf_blkno = ip->i_imap.im_blkno;
377 ilf->ilf_len = ip->i_imap.im_len;
378 ilf->ilf_boffset = ip->i_imap.im_boffset;
379 ilf->ilf_fields = XFS_ILOG_CORE;
380 ilf->ilf_size = 2; /* format + core */
381 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
382
383 if (ip->i_d.di_version == 1)
384 xfs_inode_item_format_v1_inode(ip);
385 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
386 &ip->i_d,
387 xfs_icdinode_size(ip->i_d.di_version));
388
389 xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
390 if (XFS_IFORK_Q(ip)) {
391 xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
392 } else {
393 iip->ili_fields &=
394 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
395 }
396
397 /* update the format with the exact fields we actually logged */
398 ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
399}
450 400
451/* 401/*
452 * This is called to pin the inode associated with the inode log 402 * This is called to pin the inode associated with the inode log
@@ -563,27 +513,6 @@ xfs_inode_item_unlock(
563 ASSERT(ip->i_itemp != NULL); 513 ASSERT(ip->i_itemp != NULL);
564 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 514 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
565 515
566 /*
567 * If the inode needed a separate buffer with which to log
568 * its extents, then free it now.
569 */
570 if (iip->ili_extents_buf != NULL) {
571 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
572 ASSERT(ip->i_d.di_nextents > 0);
573 ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
574 ASSERT(ip->i_df.if_bytes > 0);
575 kmem_free(iip->ili_extents_buf);
576 iip->ili_extents_buf = NULL;
577 }
578 if (iip->ili_aextents_buf != NULL) {
579 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
580 ASSERT(ip->i_d.di_anextents > 0);
581 ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
582 ASSERT(ip->i_afp->if_bytes > 0);
583 kmem_free(iip->ili_aextents_buf);
584 iip->ili_aextents_buf = NULL;
585 }
586
587 lock_flags = iip->ili_lock_flags; 516 lock_flags = iip->ili_lock_flags;
588 iip->ili_lock_flags = 0; 517 iip->ili_lock_flags = 0;
589 if (lock_flags) 518 if (lock_flags)
@@ -670,11 +599,6 @@ xfs_inode_item_init(
670 iip->ili_inode = ip; 599 iip->ili_inode = ip;
671 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 600 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
672 &xfs_inode_item_ops); 601 &xfs_inode_item_ops);
673 iip->ili_format.ilf_type = XFS_LI_INODE;
674 iip->ili_format.ilf_ino = ip->i_ino;
675 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
676 iip->ili_format.ilf_len = ip->i_imap.im_len;
677 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
678} 602}
679 603
680/* 604/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index dce4d656768c..488d81254e28 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item {
34 unsigned short ili_logged; /* flushed logged data */ 34 unsigned short ili_logged; /* flushed logged data */
35 unsigned int ili_last_fields; /* fields when flushed */ 35 unsigned int ili_last_fields; /* fields when flushed */
36 unsigned int ili_fields; /* fields to be logged */ 36 unsigned int ili_fields; /* fields to be logged */
37 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
38 data exts */
39 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
40 attr exts */
41 xfs_inode_log_format_t ili_format; /* logged structure */
42} xfs_inode_log_item_t; 37} xfs_inode_log_item_t;
43 38
44static inline int xfs_inode_clean(xfs_inode_t *ip) 39static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 33ad9a77791f..518aa56b8f2e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -112,15 +112,11 @@ xfs_find_handle(
112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); 112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
113 hsize = sizeof(xfs_fsid_t); 113 hsize = sizeof(xfs_fsid_t);
114 } else { 114 } else {
115 int lock_mode;
116
117 lock_mode = xfs_ilock_map_shared(ip);
118 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 115 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
119 sizeof(handle.ha_fid.fid_len); 116 sizeof(handle.ha_fid.fid_len);
120 handle.ha_fid.fid_pad = 0; 117 handle.ha_fid.fid_pad = 0;
121 handle.ha_fid.fid_gen = ip->i_d.di_gen; 118 handle.ha_fid.fid_gen = ip->i_d.di_gen;
122 handle.ha_fid.fid_ino = ip->i_ino; 119 handle.ha_fid.fid_ino = ip->i_ino;
123 xfs_iunlock_map_shared(ip, lock_mode);
124 120
125 hsize = XFS_HSIZE(handle); 121 hsize = XFS_HSIZE(handle);
126 } 122 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 27e0e544e963..0ce1d759156e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -459,14 +459,12 @@ xfs_vn_getattr(
459 459
460static void 460static void
461xfs_setattr_mode( 461xfs_setattr_mode(
462 struct xfs_trans *tp,
463 struct xfs_inode *ip, 462 struct xfs_inode *ip,
464 struct iattr *iattr) 463 struct iattr *iattr)
465{ 464{
466 struct inode *inode = VFS_I(ip); 465 struct inode *inode = VFS_I(ip);
467 umode_t mode = iattr->ia_mode; 466 umode_t mode = iattr->ia_mode;
468 467
469 ASSERT(tp);
470 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
471 469
472 ip->i_d.di_mode &= S_IFMT; 470 ip->i_d.di_mode &= S_IFMT;
@@ -476,6 +474,32 @@ xfs_setattr_mode(
476 inode->i_mode |= mode & ~S_IFMT; 474 inode->i_mode |= mode & ~S_IFMT;
477} 475}
478 476
477static void
478xfs_setattr_time(
479 struct xfs_inode *ip,
480 struct iattr *iattr)
481{
482 struct inode *inode = VFS_I(ip);
483
484 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
485
486 if (iattr->ia_valid & ATTR_ATIME) {
487 inode->i_atime = iattr->ia_atime;
488 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
489 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
490 }
491 if (iattr->ia_valid & ATTR_CTIME) {
492 inode->i_ctime = iattr->ia_ctime;
493 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
494 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
495 }
496 if (iattr->ia_valid & ATTR_MTIME) {
497 inode->i_mtime = iattr->ia_mtime;
498 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
499 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
500 }
501}
502
479int 503int
480xfs_setattr_nonsize( 504xfs_setattr_nonsize(
481 struct xfs_inode *ip, 505 struct xfs_inode *ip,
@@ -618,7 +642,8 @@ xfs_setattr_nonsize(
618 } 642 }
619 if (!gid_eq(igid, gid)) { 643 if (!gid_eq(igid, gid)) {
620 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { 644 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
621 ASSERT(!XFS_IS_PQUOTA_ON(mp)); 645 ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
646 !XFS_IS_PQUOTA_ON(mp));
622 ASSERT(mask & ATTR_GID); 647 ASSERT(mask & ATTR_GID);
623 ASSERT(gdqp); 648 ASSERT(gdqp);
624 olddquot2 = xfs_qm_vop_chown(tp, ip, 649 olddquot2 = xfs_qm_vop_chown(tp, ip,
@@ -629,30 +654,10 @@ xfs_setattr_nonsize(
629 } 654 }
630 } 655 }
631 656
632 /*
633 * Change file access modes.
634 */
635 if (mask & ATTR_MODE) 657 if (mask & ATTR_MODE)
636 xfs_setattr_mode(tp, ip, iattr); 658 xfs_setattr_mode(ip, iattr);
637 659 if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
638 /* 660 xfs_setattr_time(ip, iattr);
639 * Change file access or modified times.
640 */
641 if (mask & ATTR_ATIME) {
642 inode->i_atime = iattr->ia_atime;
643 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
644 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
645 }
646 if (mask & ATTR_CTIME) {
647 inode->i_ctime = iattr->ia_ctime;
648 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
649 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
650 }
651 if (mask & ATTR_MTIME) {
652 inode->i_mtime = iattr->ia_mtime;
653 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
654 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
655 }
656 661
657 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 662 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
658 663
@@ -867,22 +872,10 @@ xfs_setattr_size(
867 xfs_inode_clear_eofblocks_tag(ip); 872 xfs_inode_clear_eofblocks_tag(ip);
868 } 873 }
869 874
870 /*
871 * Change file access modes.
872 */
873 if (mask & ATTR_MODE) 875 if (mask & ATTR_MODE)
874 xfs_setattr_mode(tp, ip, iattr); 876 xfs_setattr_mode(ip, iattr);
875 877 if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
876 if (mask & ATTR_CTIME) { 878 xfs_setattr_time(ip, iattr);
877 inode->i_ctime = iattr->ia_ctime;
878 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
879 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
880 }
881 if (mask & ATTR_MTIME) {
882 inode->i_mtime = iattr->ia_mtime;
883 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
884 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
885 }
886 879
887 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 880 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
888 881
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c237ad15d500..f46338285152 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -209,9 +209,8 @@ xfs_bulkstat(
209 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ 209 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
210 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ 210 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
211 xfs_ino_t lastino; /* last inode number returned */ 211 xfs_ino_t lastino; /* last inode number returned */
212 int nbcluster; /* # of blocks in a cluster */ 212 int blks_per_cluster; /* # of blocks per cluster */
213 int nicluster; /* # of inodes in a cluster */ 213 int inodes_per_cluster;/* # of inodes per cluster */
214 int nimask; /* mask for inode clusters */
215 int nirbuf; /* size of irbuf */ 214 int nirbuf; /* size of irbuf */
216 int rval; /* return value error code */ 215 int rval; /* return value error code */
217 int tmp; /* result value from btree calls */ 216 int tmp; /* result value from btree calls */
@@ -243,11 +242,8 @@ xfs_bulkstat(
243 *done = 0; 242 *done = 0;
244 fmterror = 0; 243 fmterror = 0;
245 ubufp = ubuffer; 244 ubufp = ubuffer;
246 nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? 245 blks_per_cluster = xfs_icluster_size_fsb(mp);
247 mp->m_sb.sb_inopblock : 246 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
248 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
249 nimask = ~(nicluster - 1);
250 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
251 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); 247 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
252 if (!irbuf) 248 if (!irbuf)
253 return ENOMEM; 249 return ENOMEM;
@@ -390,12 +386,12 @@ xfs_bulkstat(
390 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); 386 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
391 for (chunkidx = 0; 387 for (chunkidx = 0;
392 chunkidx < XFS_INODES_PER_CHUNK; 388 chunkidx < XFS_INODES_PER_CHUNK;
393 chunkidx += nicluster, 389 chunkidx += inodes_per_cluster,
394 agbno += nbcluster) { 390 agbno += blks_per_cluster) {
395 if (xfs_inobt_maskn(chunkidx, nicluster) 391 if (xfs_inobt_maskn(chunkidx,
396 & ~r.ir_free) 392 inodes_per_cluster) & ~r.ir_free)
397 xfs_btree_reada_bufs(mp, agno, 393 xfs_btree_reada_bufs(mp, agno,
398 agbno, nbcluster, 394 agbno, blks_per_cluster,
399 &xfs_inode_buf_ops); 395 &xfs_inode_buf_ops);
400 } 396 }
401 blk_finish_plug(&plug); 397 blk_finish_plug(&plug);
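
To make the new loop stride concrete: the readahead loop now issues one
xfs_btree_reada_bufs() call per inode cluster. A standalone sketch with
assumed values (64 inodes per chunk, 32 inodes and 2 blocks per cluster, as
in the examples above):

#include <stdio.h>

int main(void)
{
	int chunk = 64;			/* XFS_INODES_PER_CHUNK, assumed */
	int inodes_per_cluster = 32;	/* assumed */
	int blks_per_cluster = 2;	/* assumed */
	int chunkidx, agbno = 0, nreads = 0;

	/* mirrors the stride of the readahead loop above */
	for (chunkidx = 0; chunkidx < chunk;
	     chunkidx += inodes_per_cluster, agbno += blks_per_cluster)
		nreads++;

	printf("readaheads per chunk: %d of %d blocks each\n",
	       nreads, blks_per_cluster);	/* 2 of 2 blocks each */
	return 0;
}
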
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index e148719e0a5d..b0f4ef77fa70 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -30,6 +30,52 @@ struct xfs_log_vec {
30 30
31#define XFS_LOG_VEC_ORDERED (-1) 31#define XFS_LOG_VEC_ORDERED (-1)
32 32
33static inline void *
34xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
35 uint type)
36{
37 struct xfs_log_iovec *vec = *vecp;
38
39 if (vec) {
40 ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
41 vec++;
42 } else {
43 vec = &lv->lv_iovecp[0];
44 }
45
46 vec->i_type = type;
47 vec->i_addr = lv->lv_buf + lv->lv_buf_len;
48
49 ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
50
51 *vecp = vec;
52 return vec->i_addr;
53}
54
55static inline void
56xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
57{
58 /*
59 * We need to make sure the next buffer is naturally aligned for the
60 * biggest basic data type we put into it. We already accounted for
61 * this when sizing the buffer.
62 */
63 lv->lv_buf_len += round_up(len, sizeof(uint64_t));
64 vec->i_len = len;
65}
66
67static inline void *
68xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
69 uint type, void *data, int len)
70{
71 void *buf;
72
73 buf = xlog_prepare_iovec(lv, vecp, type);
74 memcpy(buf, data, len);
75 xlog_finish_iovec(lv, *vecp, len);
76 return buf;
77}
78
33/* 79/*
34 * Structure used to pass callback function and the function's argument 80 * Structure used to pass callback function and the function's argument
35 * to the log manager. 81 * to the log manager.
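
A hedged usage sketch of the two-step helpers for a region whose length is
only known after formatting, mirroring the XLOG_REG_TYPE_IEXT case in
xfs_inode_item.c above; lv and ip are assumed to be in scope:

	struct xfs_log_iovec	*vecp = NULL;
	struct xfs_bmbt_rec	*p;
	int			len;

	p = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IEXT);	/* reserve */
	len = xfs_iextents_copy(ip, p, XFS_DATA_FORK);	/* format in place */
	xlog_finish_iovec(lv, vecp, len);	/* record len; the buffer
						 * advances by len rounded up
						 * to a uint64_t boundary */
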
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5eb51fc5eb84..cdebd832c3db 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -82,36 +82,6 @@ xlog_cil_init_post_recovery(
82 log->l_curr_block); 82 log->l_curr_block);
83} 83}
84 84
85STATIC int
86xlog_cil_lv_item_format(
87 struct xfs_log_item *lip,
88 struct xfs_log_vec *lv)
89{
90 int index;
91 char *ptr;
92
93 /* format new vectors into array */
94 lip->li_ops->iop_format(lip, lv->lv_iovecp);
95
96 /* copy data into existing array */
97 ptr = lv->lv_buf;
98 for (index = 0; index < lv->lv_niovecs; index++) {
99 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
100
101 memcpy(ptr, vec->i_addr, vec->i_len);
102 vec->i_addr = ptr;
103 ptr += vec->i_len;
104 }
105
106 /*
107 * some size calculations for log vectors over-estimate, so the caller
108 * doesn't know the amount of space actually used by the item. Return
109 * the byte count to the caller so they can check and store it
110 * appropriately.
111 */
112 return ptr - lv->lv_buf;
113}
114
115/* 85/*
116 * Prepare the log item for insertion into the CIL. Calculate the difference in 86 * Prepare the log item for insertion into the CIL. Calculate the difference in
117 * log space and vectors it will consume, and if it is a new item pin it as 87 * log space and vectors it will consume, and if it is a new item pin it as
@@ -232,6 +202,13 @@ xlog_cil_insert_format_items(
232 nbytes = 0; 202 nbytes = 0;
233 } 203 }
234 204
205 /*
206 * We 64-bit align the length of each iovec so that the start
207 * of the next one is naturally aligned. We'll need to
208 * account for that slack space here.
209 */
210 nbytes += niovecs * sizeof(uint64_t);
211
235 /* grab the old item if it exists for reservation accounting */ 212 /* grab the old item if it exists for reservation accounting */
236 old_lv = lip->li_lv; 213 old_lv = lip->li_lv;
237 214
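
The reserved slack is easy to quantify. A standalone sketch with assumed
values (three iovecs, 100 payload bytes):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int niovecs = 3;	/* assumed iovec count for one log item */
	int nbytes = 100;	/* assumed unaligned payload size */

	/* one uint64_t of slack per iovec, as reserved above */
	nbytes += niovecs * (int)sizeof(uint64_t);

	printf("reservation including alignment slack: %d bytes\n",
	       nbytes);		/* 124 */
	return 0;
}
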
@@ -254,34 +231,27 @@ xlog_cil_insert_format_items(
254 */ 231 */
255 *diff_iovecs -= lv->lv_niovecs; 232 *diff_iovecs -= lv->lv_niovecs;
256 *diff_len -= lv->lv_buf_len; 233 *diff_len -= lv->lv_buf_len;
257 234 } else {
258 /* Ensure the lv is set up according to ->iop_size */ 235 /* allocate new data chunk */
259 lv->lv_niovecs = niovecs; 236 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
260 lv->lv_buf = (char *)lv + buf_size - nbytes; 237 lv->lv_item = lip;
261 238 lv->lv_size = buf_size;
262 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); 239 if (ordered) {
263 goto insert; 240 /* track as an ordered logvec */
241 ASSERT(lip->li_lv == NULL);
242 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
243 goto insert;
244 }
245 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
264 } 246 }
265 247
266 /* allocate new data chunk */ 248 /* Ensure the lv is set up according to ->iop_size */
267 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
268 lv->lv_item = lip;
269 lv->lv_size = buf_size;
270 lv->lv_niovecs = niovecs; 249 lv->lv_niovecs = niovecs;
271 if (ordered) {
272 /* track as an ordered logvec */
273 ASSERT(lip->li_lv == NULL);
274 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
275 goto insert;
276 }
277
278 /* The allocated iovec region lies beyond the log vector. */
279 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
280 250
281 /* The allocated data region lies beyond the iovec region */ 251 /* The allocated data region lies beyond the iovec region */
252 lv->lv_buf_len = 0;
282 lv->lv_buf = (char *)lv + buf_size - nbytes; 253 lv->lv_buf = (char *)lv + buf_size - nbytes;
283 254 lip->li_ops->iop_format(lip, lv);
284 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
285insert: 255insert:
286 ASSERT(lv->lv_buf_len <= nbytes); 256 ASSERT(lv->lv_buf_len <= nbytes);
287 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); 257 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b6b669df40f3..bce53ac81096 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,7 +193,10 @@ xlog_bread_noalign(
193 bp->b_io_length = nbblks; 193 bp->b_io_length = nbblks;
194 bp->b_error = 0; 194 bp->b_error = 0;
195 195
196 xfsbdstrat(log->l_mp, bp); 196 if (XFS_FORCED_SHUTDOWN(log->l_mp))
197 return XFS_ERROR(EIO);
198
199 xfs_buf_iorequest(bp);
197 error = xfs_buf_iowait(bp); 200 error = xfs_buf_iowait(bp);
198 if (error) 201 if (error)
199 xfs_buf_ioerror_alert(bp, __func__); 202 xfs_buf_ioerror_alert(bp, __func__);
@@ -1651,6 +1654,7 @@ xlog_recover_reorder_trans(
1651 int pass) 1654 int pass)
1652{ 1655{
1653 xlog_recover_item_t *item, *n; 1656 xlog_recover_item_t *item, *n;
1657 int error = 0;
1654 LIST_HEAD(sort_list); 1658 LIST_HEAD(sort_list);
1655 LIST_HEAD(cancel_list); 1659 LIST_HEAD(cancel_list);
1656 LIST_HEAD(buffer_list); 1660 LIST_HEAD(buffer_list);
@@ -1692,9 +1696,17 @@ xlog_recover_reorder_trans(
1692 "%s: unrecognized type of log operation", 1696 "%s: unrecognized type of log operation",
1693 __func__); 1697 __func__);
1694 ASSERT(0); 1698 ASSERT(0);
1695 return XFS_ERROR(EIO); 1699 /*
1700 * return the remaining items back to the transaction
1701 * item list so they can be freed in caller.
1702 */
1703 if (!list_empty(&sort_list))
1704 list_splice_init(&sort_list, &trans->r_itemq);
1705 error = XFS_ERROR(EIO);
1706 goto out;
1696 } 1707 }
1697 } 1708 }
1709out:
1698 ASSERT(list_empty(&sort_list)); 1710 ASSERT(list_empty(&sort_list));
1699 if (!list_empty(&buffer_list)) 1711 if (!list_empty(&buffer_list))
1700 list_splice(&buffer_list, &trans->r_itemq); 1712 list_splice(&buffer_list, &trans->r_itemq);
@@ -1704,7 +1716,7 @@ xlog_recover_reorder_trans(
1704 list_splice_tail(&inode_buffer_list, &trans->r_itemq); 1716 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1705 if (!list_empty(&cancel_list)) 1717 if (!list_empty(&cancel_list))
1706 list_splice_tail(&cancel_list, &trans->r_itemq); 1718 list_splice_tail(&cancel_list, &trans->r_itemq);
1707 return 0; 1719 return error;
1708} 1720}
1709 1721
1710/* 1722/*
@@ -2514,19 +2526,19 @@ xlog_recover_buffer_pass2(
 	 *
 	 * Also make sure that only inode buffers with good sizes stay in
 	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
-	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
+	 * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
 	 * buffers in the log can be a different size if the log was generated
 	 * by an older kernel using unclustered inode buffers or a newer kernel
 	 * running with a different inode cluster size.  Regardless, if the
-	 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
-	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+	 * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+	 * for *our* value of mp->m_inode_cluster_size, then we need to keep
 	 * the buffer out of the buffer cache so that the buffer won't
 	 * overlap with future reads of those inodes.
 	 */
 	if (XFS_DINODE_MAGIC ==
 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
 	    (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
-			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+			(__uint32_t)log->l_mp->m_inode_cluster_size))) {
 		xfs_buf_stale(bp);
 		error = xfs_bwrite(bp);
 	} else {
@@ -3199,10 +3211,10 @@ xlog_recover_do_icreate_pass2(
 	}
 
 	/* existing allocation is fixed value */
-	ASSERT(count == XFS_IALLOC_INODES(mp));
-	ASSERT(length == XFS_IALLOC_BLOCKS(mp));
-	if (count != XFS_IALLOC_INODES(mp) ||
-	     length != XFS_IALLOC_BLOCKS(mp)) {
+	ASSERT(count == mp->m_ialloc_inos);
+	ASSERT(length == mp->m_ialloc_blks);
+	if (count != mp->m_ialloc_inos ||
+	     length != mp->m_ialloc_blks) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
 		return EINVAL;
 	}
@@ -3608,8 +3620,10 @@ xlog_recover_process_data(
 				error = XFS_ERROR(EIO);
 				break;
 			}
-			if (error)
+			if (error) {
+				xlog_recover_free_trans(trans);
 				return error;
+			}
 		}
 		dp += be32_to_cpu(ohead->oh_len);
 		num_logops--;
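Freeing the transaction on this error path closes a leak: once processing fails mid-stream, nobody else holds a reference to the half-built recovery transaction. The ownership rule in miniature (types and the free helper are stand-ins):

	/* Shape of the leak fix: the error path releases the transaction it
	 * owns before bailing out.  Types here are illustrative only. */
	#include <errno.h>
	#include <stdlib.h>

	struct rtrans { void *items; };

	static void free_trans(struct rtrans *t) { free(t); }

	static int process_ops(struct rtrans *trans, int bad)
	{
		int error = bad ? -EIO : 0;

		if (error) {
			free_trans(trans);	/* was leaked before the fix */
			return error;
		}
		return 0;			/* caller keeps ownership */
	}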
@@ -4397,7 +4411,13 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	bp->b_ops = &xfs_sb_buf_ops;
-	xfsbdstrat(log->l_mp, bp);
+
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		xfs_buf_relse(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	xfs_buf_iorequest(bp);
 	error = xfs_buf_iowait(bp);
 	if (error) {
 		xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 14a4996cfec6..348e4d2ed6e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -134,8 +134,6 @@ xfs_qm_dqpurge(
 {
 	struct xfs_mount	*mp = dqp->q_mount;
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
 
 	xfs_dqlock(dqp);
 	if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -143,21 +141,6 @@ xfs_qm_dqpurge(
 		xfs_dqunlock(dqp);
 		return EAGAIN;
 	}
 
-	/*
-	 * If this quota has a hint attached, prepare for releasing it now.
-	 */
-	gdqp = dqp->q_gdquot;
-	if (gdqp) {
-		xfs_dqlock(gdqp);
-		dqp->q_gdquot = NULL;
-	}
-
-	pdqp = dqp->q_pdquot;
-	if (pdqp) {
-		xfs_dqlock(pdqp);
-		dqp->q_pdquot = NULL;
-	}
-
 	dqp->dq_flags |= XFS_DQ_FREEING;
 
 	xfs_dqflock(dqp);
@@ -206,11 +189,47 @@ xfs_qm_dqpurge(
 	XFS_STATS_DEC(xs_qm_dquot_unused);
 
 	xfs_qm_dqdestroy(dqp);
+	return 0;
+}
+
+/*
+ * Release the group or project dquot pointers the user dquots may be
+ * carrying around as a hint, and proceed to purge the user dquot cache
+ * if requested.
+ */
+STATIC int
+xfs_qm_dqpurge_hints(
+	struct xfs_dquot	*dqp,
+	void			*data)
+{
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	uint			flags = *((uint *)data);
+
+	xfs_dqlock(dqp);
+	if (dqp->dq_flags & XFS_DQ_FREEING) {
+		xfs_dqunlock(dqp);
+		return EAGAIN;
+	}
 
+	/* If this quota has a hint attached, prepare for releasing it now */
+	gdqp = dqp->q_gdquot;
 	if (gdqp)
-		xfs_qm_dqput(gdqp);
+		dqp->q_gdquot = NULL;
+
+	pdqp = dqp->q_pdquot;
 	if (pdqp)
-		xfs_qm_dqput(pdqp);
+		dqp->q_pdquot = NULL;
+
+	xfs_dqunlock(dqp);
+
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+	if (pdqp)
+		xfs_qm_dqrele(pdqp);
+
+	if (flags & XFS_QMOPT_UQUOTA)
+		return xfs_qm_dqpurge(dqp, NULL);
+
 	return 0;
 }
 
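Note the locking discipline in the new helper: the hint pointers are detached while the dquot lock is held, but the references are only dropped after unlocking, since releasing a dquot may block. A userspace model of that detach-under-lock, release-after-unlock pattern (mutex, refcount and names are stand-ins for xfs_dqlock()/xfs_qm_dqrele()):

	/* Model of "detach under lock, release after unlock". */
	#include <pthread.h>

	struct dquot {
		pthread_mutex_t	lock;
		int		refs;
		struct dquot	*group_hint;	/* counted reference if set */
	};

	static void dqrele(struct dquot *dq)
	{
		/* may block in the real code; must not run under dq->lock */
		__sync_fetch_and_sub(&dq->refs, 1);
	}

	static void purge_hint(struct dquot *dq)
	{
		struct dquot *gdq;

		pthread_mutex_lock(&dq->lock);
		gdq = dq->group_hint;		/* detach the hint... */
		dq->group_hint = NULL;
		pthread_mutex_unlock(&dq->lock);

		if (gdq)
			dqrele(gdq);		/* ...and release it unlocked */
	}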
@@ -222,8 +241,18 @@ xfs_qm_dqpurge_all(
 	struct xfs_mount	*mp,
 	uint			flags)
 {
-	if (flags & XFS_QMOPT_UQUOTA)
-		xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+	/*
+	 * We have to release the group/project dquot hint(s) from the user
+	 * dquots first if they are there, otherwise we would run into an
+	 * infinite loop while walking through the radix tree to purge other
+	 * types of dquots, since their refcount is not zero if the user
+	 * dquot refers to them as a hint.
+	 *
+	 * Calling the special xfs_qm_dqpurge_hints() will end up going
+	 * through the general xfs_qm_dqpurge() against the user dquot cache
+	 * if requested.
+	 */
+	xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags);
+
 	if (flags & XFS_QMOPT_GQUOTA)
 		xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
 	if (flags & XFS_QMOPT_PQUOTA)
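The comment captures the ordering constraint: as long as a user dquot holds a group or project hint, the hinted dquot's refcount stays above zero and a purge walk over it can never complete. In miniature (plain counters, purely illustrative):

	/* Why the user walk must run first: a user dquot's hint keeps the
	 * group dquot's refcount at 1, so purging the group cache first
	 * could never drive it to zero. */
	#include <stdio.h>

	int main(void)
	{
		int grp_refs = 1;	/* held by a user dquot's hint */

		printf("group purge before hint release: refs=%d (stuck)\n",
		       grp_refs);

		grp_refs--;		/* user walk drops the hint first */
		printf("after hint release: refs=%d (purgeable)\n", grp_refs);
		return 0;
	}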
@@ -1193,16 +1222,18 @@ xfs_qm_dqiterate(
 	lblkno = 0;
 	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 	do {
+		uint		lock_mode;
+
 		nmaps = XFS_DQITER_MAP_SIZE;
 		/*
 		 * We aren't changing the inode itself. Just changing
 		 * some of its data. No new blocks are added here, and
 		 * the inode is never added to the transaction.
 		 */
-		xfs_ilock(qip, XFS_ILOCK_SHARED);
+		lock_mode = xfs_ilock_data_map_shared(qip);
 		error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
 				       map, &nmaps, 0);
-		xfs_iunlock(qip, XFS_ILOCK_SHARED);
+		xfs_iunlock(qip, lock_mode);
 		if (error)
 			break;
 
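Here the lock helper picks the mode for the caller, so the caller must remember the returned mode and pass it back to xfs_iunlock() rather than hard-coding XFS_ILOCK_SHARED. The pattern in miniature (rwlock, names and the "extents loaded" trigger are stand-ins for what xfs_ilock_data_map_shared() actually decides on):

	/* Sketch of "lock helper picks the mode": exclusive if extra setup
	 * work is needed, shared otherwise; unlock with whatever was given. */
	#include <pthread.h>

	#define LOCK_SHARED	1
	#define LOCK_EXCL	2

	struct inode_s {
		pthread_rwlock_t	lock;
		int			extents_loaded;
	};

	static int ilock_data_map_shared(struct inode_s *ip)
	{
		/* pulling the extent list in requires exclusive access */
		if (!ip->extents_loaded) {
			pthread_rwlock_wrlock(&ip->lock);
			return LOCK_EXCL;
		}
		pthread_rwlock_rdlock(&ip->lock);
		return LOCK_SHARED;
	}

	static void iunlock(struct inode_s *ip, int mode)
	{
		(void)mode;		/* both modes unlock the same way here */
		pthread_rwlock_unlock(&ip->lock);
	}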
@@ -2082,24 +2113,21 @@ xfs_qm_vop_create_dqattach(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-	if (udqp) {
+	if (udqp && XFS_IS_UQUOTA_ON(mp)) {
 		ASSERT(ip->i_udquot == NULL);
-		ASSERT(XFS_IS_UQUOTA_ON(mp));
 		ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
 
 		ip->i_udquot = xfs_qm_dqhold(udqp);
 		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
-	if (gdqp) {
+	if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
 		ASSERT(ip->i_gdquot == NULL);
-		ASSERT(XFS_IS_GQUOTA_ON(mp));
 		ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
 		ip->i_gdquot = xfs_qm_dqhold(gdqp);
 		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
-	if (pdqp) {
+	if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
 		ASSERT(ip->i_pdquot == NULL);
-		ASSERT(XFS_IS_PQUOTA_ON(mp));
 		ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
 
 		ip->i_pdquot = xfs_qm_dqhold(pdqp);
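Folding the XFS_IS_*QUOTA_ON() checks into the branch conditions turns a debug-only assumption into a runtime guard: a dquot handed in while that quota type is off is now skipped rather than attached. The difference in miniature (stand-in types):

	/* Before: attach unconditionally and ASSERT the type is enabled
	 * (a no-op in non-debug builds).  After: the check is part of the
	 * branch itself. */
	#include <assert.h>

	struct dq { int id; };

	static void attach_old(struct dq *udq, int uquota_on, struct dq **slot)
	{
		if (udq) {
			assert(uquota_on);	/* compiled out with NDEBUG */
			*slot = udq;
		}
	}

	static void attach_new(struct dq *udq, int uquota_on, struct dq **slot)
	{
		if (udq && uquota_on)		/* enforced in all builds */
			*slot = udq;
	}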
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index a788b66a5cb1..797fd4636273 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,13 +20,29 @@
 
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
 
 struct xfs_inode;
 
 extern struct kmem_zone	*xfs_qm_dqtrxzone;
 
 /*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE	10
+
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+	!dqp->q_core.d_blk_hardlimit && \
+	!dqp->q_core.d_blk_softlimit && \
+	!dqp->q_core.d_rtb_hardlimit && \
+	!dqp->q_core.d_rtb_softlimit && \
+	!dqp->q_core.d_ino_hardlimit && \
+	!dqp->q_core.d_ino_softlimit && \
+	!dqp->q_core.d_bcount && \
+	!dqp->q_core.d_rtbcount && \
+	!dqp->q_core.d_icount)
+
+/*
  * This defines the unit of allocation of dquots.
  * Currently, it is just one file system block, and a 4K blk contains 30
  * (136 * 30 = 4080) dquots. It's probably not worth trying to make
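These two definitions move here from xfs_quota_priv.h, which is deleted below. XFS_IS_DQUOT_UNINITIALIZED() simply tests that no limit or usage field has ever been set; a standalone equivalent (the struct is a stand-in for the on-disk xfs_disk_dquot fields the macro actually examines):

	/* Standalone check mirroring XFS_IS_DQUOT_UNINITIALIZED(): a dquot
	 * with no limits and no usage recorded is "uninitialized". */
	#include <stdio.h>
	#include <string.h>

	struct disk_dquot {
		unsigned long long d_blk_hardlimit, d_blk_softlimit;
		unsigned long long d_rtb_hardlimit, d_rtb_softlimit;
		unsigned long long d_ino_hardlimit, d_ino_softlimit;
		unsigned long long d_bcount, d_rtbcount, d_icount;
	};

	struct dquot { struct disk_dquot q_core; };

	#define DQUOT_UNINITIALIZED(dqp) ( \
		!(dqp)->q_core.d_blk_hardlimit && \
		!(dqp)->q_core.d_blk_softlimit && \
		!(dqp)->q_core.d_rtb_hardlimit && \
		!(dqp)->q_core.d_rtb_softlimit && \
		!(dqp)->q_core.d_ino_hardlimit && \
		!(dqp)->q_core.d_ino_softlimit && \
		!(dqp)->q_core.d_bcount && \
		!(dqp)->q_core.d_rtbcount && \
		!(dqp)->q_core.d_icount)

	int main(void)
	{
		struct dquot dq;

		memset(&dq, 0, sizeof(dq));
		printf("fresh dquot uninitialized: %d\n",
		       DQUOT_UNINITIALIZED(&dq));
		dq.q_core.d_bcount = 8;
		printf("after accounting a block: %d\n",
		       DQUOT_UNINITIALIZED(&dq));
		return 0;
	}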
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 437c9198031a..3daf5ea1eb8d 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -278,7 +278,7 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error = 0, error2 = 0;
+	int		error;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
@@ -286,14 +286,20 @@ xfs_qm_scall_trunc_qfiles(
 		return XFS_ERROR(EINVAL);
 	}
 
-	if (flags & XFS_DQ_USER)
+	if (flags & XFS_DQ_USER) {
 		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-	if (flags & XFS_DQ_GROUP)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
+	if (flags & XFS_DQ_GROUP) {
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
 	if (flags & XFS_DQ_PROJ)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
 
-	return error ? error : error2;
+	return error;
 }
 
 /*
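The rewrite drops the shared error2 accumulator, which could silently lose a group-truncation failure when the project truncation ran afterwards and succeeded; the new code returns at the first failure instead. Both behaviors side by side (flag values and error codes are illustrative):

	/* Old vs new error handling for sequential operations. */
	#include <stdio.h>

	#define DQ_USER  1
	#define DQ_GROUP 2
	#define DQ_PROJ  4

	static int trunc(int which) { return which == DQ_GROUP ? -5 : 0; }

	static int old_style(int flags)
	{
		int error = 0, error2 = 0;

		if (flags & DQ_USER)  error  = trunc(DQ_USER);
		if (flags & DQ_GROUP) error2 = trunc(DQ_GROUP);
		if (flags & DQ_PROJ)  error2 = trunc(DQ_PROJ);	/* clobbers -5 */
		return error ? error : error2;
	}

	static int new_style(int flags)
	{
		int error;

		if (flags & DQ_USER)  { error = trunc(DQ_USER);  if (error) return error; }
		if (flags & DQ_GROUP) { error = trunc(DQ_GROUP); if (error) return error; }
		if (flags & DQ_PROJ)  return trunc(DQ_PROJ);
		return 0;
	}

	int main(void)
	{
		int flags = DQ_USER | DQ_GROUP | DQ_PROJ;

		printf("old: %d, new: %d\n", old_style(flags), new_style(flags));
		return 0;	/* old masks the failure, new reports it */
	}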
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
deleted file mode 100644
index 6d86219d93da..000000000000
--- a/fs/xfs/xfs_quota_priv.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE	10
-
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-	!dqp->q_core.d_blk_hardlimit && \
-	!dqp->q_core.d_blk_softlimit && \
-	!dqp->q_core.d_rtb_hardlimit && \
-	!dqp->q_core.d_rtb_softlimit && \
-	!dqp->q_core.d_ino_hardlimit && \
-	!dqp->q_core.d_ino_softlimit && \
-	!dqp->q_core.d_bcount && \
-	!dqp->q_core.d_rtbcount && \
-	!dqp->q_core.d_icount)
-
-#define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
-				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
-				 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-
-#endif	/* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9b96d35e483d..b5bc1ab3c4da 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -64,7 +64,7 @@ typedef struct xfs_log_item {
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);
-	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
 	void (*iop_pin)(xfs_log_item_t *);
 	void (*iop_unpin)(xfs_log_item_t *, int remove);
 	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
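With this signature change an item's iop_format method receives the whole log vector and formats straight into its buffer, advancing lv_buf_len itself, rather than filling a caller-provided iovec array that is copied afterwards (which is what the xfs_log_cil.c hunk above removes). A hedged sketch of a format method under the new contract (types and the copy helper are simplified stand-ins, not the kernel's):

	/* Sketch: the item writes its regions straight into the vector's
	 * buffer and bumps lv_buf_len, so no second copy is needed. */
	#include <string.h>

	struct log_iovec { void *i_addr; int i_len; int i_type; };
	struct log_vec {
		struct log_iovec	*lv_iovecp;
		int			lv_niovecs;
		char			*lv_buf;
		int			lv_buf_len;
	};

	static void *copy_iovec(struct log_vec *lv, int idx, int type,
				const void *data, int len)
	{
		struct log_iovec *vec = &lv->lv_iovecp[idx];

		vec->i_addr = lv->lv_buf + lv->lv_buf_len;
		vec->i_len = len;
		vec->i_type = type;
		memcpy(vec->i_addr, data, len);
		lv->lv_buf_len += len;	/* direct formatting, no extra copy */
		return vec->i_addr;
	}

	struct my_item { int hdr; char payload[16]; };

	static void my_item_format(struct my_item *ip, struct log_vec *lv)
	{
		copy_iovec(lv, 0, /* type */ 1, &ip->hdr, sizeof(ip->hdr));
		copy_iovec(lv, 1, /* type */ 2, ip->payload,
			   sizeof(ip->payload));
	}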
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c035d11b7734..647b6f1d8923 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -314,7 +314,18 @@ xfs_trans_read_buf_map(
 		ASSERT(bp->b_iodone == NULL);
 		XFS_BUF_READ(bp);
 		bp->b_ops = ops;
-		xfsbdstrat(tp->t_mountp, bp);
+
+		/*
+		 * XXX(hch): clean up the error handling here to be less
+		 * of a mess..
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			trace_xfs_bdstrat_shut(bp, _RET_IP_);
+			xfs_bioerror_relse(bp);
+		} else {
+			xfs_buf_iorequest(bp);
+		}
+
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index cd2a10e15d3a..41172861e857 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -295,8 +295,8 @@ xfs_trans_mod_dquot(
 /*
  * Given an array of dqtrx structures, lock all the dquots associated and join
  * them to the transaction, provided they have been modified.  We know that the
- * highest number of dquots of one type - usr, grp OR prj - involved in a
- * transaction is 2 so we don't need to make this very generic.
+ * highest number of dquots of one type - usr, grp and prj - involved in a
+ * transaction is 3 so we don't need to make this very generic.
  */
 STATIC void
 xfs_trans_dqlockedjoin(
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2fd59c0dae66..2ffd3e331b49 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -174,7 +174,7 @@ xfs_calc_itruncate_reservation(
 		xfs_calc_buf_res(5, 0) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1)) +
-		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0)));
 }
 
@@ -282,7 +282,7 @@ xfs_calc_create_resv_modify(
  * For create we can allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode blocks allocated: mp->m_ialloc_blks * blocksize
  *    the inode btree: max depth * blocksize
  *    the allocation btrees: 2 trees * (max depth - 1) * block size
  */
@@ -292,7 +292,7 @@ xfs_calc_create_resv_alloc(
 {
 	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		mp->m_sb.sb_sectsize +
-		xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
@@ -385,9 +385,9 @@ xfs_calc_ifree_reservation(
 		xfs_calc_inode_res(mp, 1) +
 		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) +
+		max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) +
 		xfs_calc_buf_res(1, 0) +
-		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
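These reservation formulas are straight sums of xfs_calc_buf_res() terms over the filesystem geometry. A worked example of the create-allocation term, with wholly assumed numbers (the geometry, the per-buffer log overhead, and the omitted alloc-btree term are all illustrative, not the real helper's values):

	/* Worked example of xfs_calc_create_resv_alloc()-style arithmetic. */
	#include <stdio.h>

	#define BUF_LOG_OVERHEAD 128	/* assumed per-buffer logging overhead */

	static unsigned int calc_buf_res(unsigned int nbufs, unsigned int blen)
	{
		return nbufs * (blen + BUF_LOG_OVERHEAD);
	}

	int main(void)
	{
		unsigned int sectsize = 512, blocksize = 4096;
		unsigned int ialloc_blks = 16;	/* mp->m_ialloc_blks (assumed) */
		unsigned int in_maxlevels = 3;	/* inode btree depth (assumed) */

		unsigned int res = calc_buf_res(2, sectsize) +	/* agi + agf */
				   sectsize +			/* superblock */
				   calc_buf_res(ialloc_blks, blocksize) +
				   calc_buf_res(in_maxlevels, blocksize);
				   /* real formula adds an alloc-btree term */

		printf("create reservation (alloc side): %u bytes\n", res);
		return 0;
	}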
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7d2c920dfb9c..af5dbe06cb65 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
 #define	XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define	XFS_IALLOC_SPACE_RES(mp)	\
-	(XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
+	((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 3e8e797c6d11..e8a77383c0d5 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -35,15 +35,6 @@ struct attrlist_cursor_kern;
 	{ IO_INVIS,	"INVIS"}
 
 /*
- * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
- */
-#define FI_NONE			0	/* none */
-#define FI_REMAPF		1	/* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED	2	/* Do a remapf prior to the operation.
-					   Prevent VM access to the pages until
-					   the operation completes. */
-
-/*
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)