aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-05-26 13:55:15 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-05-26 13:55:15 -0400
commita74b81b0aff4a01e0816df5915c854fb52c5e87f (patch)
tree98364cec6a9e0c0fd510e5fe9af46f1ddb28956b /fs/ocfs2
parentf8d613e2a665bf1be9628a3c3f9bafe7599b32c0 (diff)
parentece928df16494becd43f999aff9bd530182e7e81 (diff)
Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2
* 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (28 commits) Ocfs2: Teach local-mounted ocfs2 to handle unwritten_extents correctly. ocfs2/dlm: Do not migrate resource to a node that is leaving the domain ocfs2/dlm: Add new dlm message DLM_BEGIN_EXIT_DOMAIN_MSG Ocfs2/move_extents: Set several trivial constraints for threshold. Ocfs2/move_extents: Let defrag handle partial extent moving. Ocfs2/move_extents: move/defrag extents within a certain range. Ocfs2/move_extents: helper to calculate the defraging length in one run. Ocfs2/move_extents: move entire/partial extent. Ocfs2/move_extents: helpers to update the group descriptor and global bitmap inode. Ocfs2/move_extents: helper to probe a proper region to move in an alloc group. Ocfs2/move_extents: helper to validate and adjust moving goal. Ocfs2/move_extents: find the victim alloc group, where the given #blk fits. Ocfs2/move_extents: defrag a range of extent. Ocfs2/move_extents: move a range of extent. Ocfs2/move_extents: lock allocators and reserve metadata blocks and data clusters for extents moving. Ocfs2/move_extents: Add basic framework and source files for extent moving. Ocfs2/move_extents: Adding new ioctl code 'OCFS2_IOC_MOVE_EXT' to ocfs2. Ocfs2/refcounttree: Publicize couple of funcs from refcounttree.c Ocfs2: Add a new code 'OCFS2_INFO_FREEFRAG' for o2info ioctl. Ocfs2: Add a new code 'OCFS2_INFO_FREEINODE' for o2info ioctl. ...
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/alloc.c166
-rw-r--r--fs/ocfs2/alloc.h1
-rw-r--r--fs/ocfs2/cluster/sys.c9
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h14
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c94
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c255
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/file.c1
-rw-r--r--fs/ocfs2/ioctl.c492
-rw-r--r--fs/ocfs2/move_extents.c1153
-rw-r--r--fs/ocfs2/move_extents.h22
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h68
-rw-r--r--fs/ocfs2/ocfs2_trace.h25
-rw-r--r--fs/ocfs2/refcounttree.c58
-rw-r--r--fs/ocfs2/refcounttree.h11
-rw-r--r--fs/ocfs2/super.c2
19 files changed, 2135 insertions, 246 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6..f17e58b32989 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \ 32 reservations.o \
33 move_extents.o \
33 resize.o \ 34 resize.o \
34 slot_map.o \ 35 slot_map.o \
35 suballoc.o \ 36 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7401c7..ed553c60de82 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h> 31#include <linux/quotaops.h>
32#include <linux/blkdev.h>
32 33
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
34 35
@@ -7184,3 +7185,168 @@ out_commit:
7184out: 7185out:
7185 return ret; 7186 return ret;
7186} 7187}
7188
7189static int ocfs2_trim_extent(struct super_block *sb,
7190 struct ocfs2_group_desc *gd,
7191 u32 start, u32 count)
7192{
7193 u64 discard, bcount;
7194
7195 bcount = ocfs2_clusters_to_blocks(sb, count);
7196 discard = le64_to_cpu(gd->bg_blkno) +
7197 ocfs2_clusters_to_blocks(sb, start);
7198
7199 trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7200
7201 return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7202}
7203
7204static int ocfs2_trim_group(struct super_block *sb,
7205 struct ocfs2_group_desc *gd,
7206 u32 start, u32 max, u32 minbits)
7207{
7208 int ret = 0, count = 0, next;
7209 void *bitmap = gd->bg_bitmap;
7210
7211 if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7212 return 0;
7213
7214 trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7215 start, max, minbits);
7216
7217 while (start < max) {
7218 start = ocfs2_find_next_zero_bit(bitmap, max, start);
7219 if (start >= max)
7220 break;
7221 next = ocfs2_find_next_bit(bitmap, max, start);
7222
7223 if ((next - start) >= minbits) {
7224 ret = ocfs2_trim_extent(sb, gd,
7225 start, next - start);
7226 if (ret < 0) {
7227 mlog_errno(ret);
7228 break;
7229 }
7230 count += next - start;
7231 }
7232 start = next + 1;
7233
7234 if (fatal_signal_pending(current)) {
7235 count = -ERESTARTSYS;
7236 break;
7237 }
7238
7239 if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7240 break;
7241 }
7242
7243 if (ret < 0)
7244 count = ret;
7245
7246 return count;
7247}
7248
7249int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7250{
7251 struct ocfs2_super *osb = OCFS2_SB(sb);
7252 u64 start, len, trimmed, first_group, last_group, group;
7253 int ret, cnt;
7254 u32 first_bit, last_bit, minlen;
7255 struct buffer_head *main_bm_bh = NULL;
7256 struct inode *main_bm_inode = NULL;
7257 struct buffer_head *gd_bh = NULL;
7258 struct ocfs2_dinode *main_bm;
7259 struct ocfs2_group_desc *gd = NULL;
7260
7261 start = range->start >> osb->s_clustersize_bits;
7262 len = range->len >> osb->s_clustersize_bits;
7263 minlen = range->minlen >> osb->s_clustersize_bits;
7264 trimmed = 0;
7265
7266 if (!len) {
7267 range->len = 0;
7268 return 0;
7269 }
7270
7271 if (minlen >= osb->bitmap_cpg)
7272 return -EINVAL;
7273
7274 main_bm_inode = ocfs2_get_system_file_inode(osb,
7275 GLOBAL_BITMAP_SYSTEM_INODE,
7276 OCFS2_INVALID_SLOT);
7277 if (!main_bm_inode) {
7278 ret = -EIO;
7279 mlog_errno(ret);
7280 goto out;
7281 }
7282
7283 mutex_lock(&main_bm_inode->i_mutex);
7284
7285 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7286 if (ret < 0) {
7287 mlog_errno(ret);
7288 goto out_mutex;
7289 }
7290 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7291
7292 if (start >= le32_to_cpu(main_bm->i_clusters)) {
7293 ret = -EINVAL;
7294 goto out_unlock;
7295 }
7296
7297 if (start + len > le32_to_cpu(main_bm->i_clusters))
7298 len = le32_to_cpu(main_bm->i_clusters) - start;
7299
7300 trace_ocfs2_trim_fs(start, len, minlen);
7301
7302 /* Determine first and last group to examine based on start and len */
7303 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7304 if (first_group == osb->first_cluster_group_blkno)
7305 first_bit = start;
7306 else
7307 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7308 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7309 last_bit = osb->bitmap_cpg;
7310
7311 for (group = first_group; group <= last_group;) {
7312 if (first_bit + len >= osb->bitmap_cpg)
7313 last_bit = osb->bitmap_cpg;
7314 else
7315 last_bit = first_bit + len;
7316
7317 ret = ocfs2_read_group_descriptor(main_bm_inode,
7318 main_bm, group,
7319 &gd_bh);
7320 if (ret < 0) {
7321 mlog_errno(ret);
7322 break;
7323 }
7324
7325 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7326 cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
7327 brelse(gd_bh);
7328 gd_bh = NULL;
7329 if (cnt < 0) {
7330 ret = cnt;
7331 mlog_errno(ret);
7332 break;
7333 }
7334
7335 trimmed += cnt;
7336 len -= osb->bitmap_cpg - first_bit;
7337 first_bit = 0;
7338 if (group == osb->first_cluster_group_blkno)
7339 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7340 else
7341 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7342 }
7343 range->len = trimmed * sb->s_blocksize;
7344out_unlock:
7345 ocfs2_inode_unlock(main_bm_inode, 0);
7346 brelse(main_bm_bh);
7347out_mutex:
7348 mutex_unlock(&main_bm_inode->i_mutex);
7349 iput(main_bm_inode);
7350out:
7351 return ret;
7352}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a03251c..ca381c584127 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
239 struct buffer_head **leaf_bh); 239 struct buffer_head **leaf_bh);
240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
241 241
242int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
242/* 243/*
243 * Helper function to look at the # of clusters in an extent record. 244 * Helper function to look at the # of clusters in an extent record.
244 */ 245 */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1f..a4b07730b2e1 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
61 kset_unregister(o2cb_kset); 60 kset_unregister(o2cb_kset);
62} 61}
63 62
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
69 if (!o2cb_kset) 68 if (!o2cb_kset)
70 return -ENOMEM; 69 return -ENOMEM;
71 70
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
81 if (ret) 72 if (ret)
82 goto error; 73 goto error;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4bdf7baee344..d602abb51b61 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -144,6 +144,7 @@ struct dlm_ctxt
144 wait_queue_head_t dlm_join_events; 144 wait_queue_head_t dlm_join_events;
145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 148 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
148 struct dlm_recovery_ctxt reco; 149 struct dlm_recovery_ctxt reco;
149 spinlock_t master_lock; 150 spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
401 return 1; 402 return 1;
402} 403}
403 404
405static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
406{
407 if (idx == DLM_GRANTED_LIST)
408 return "granted";
409 else if (idx == DLM_CONVERTING_LIST)
410 return "converting";
411 else if (idx == DLM_BLOCKED_LIST)
412 return "blocked";
413 else
414 return "unknown";
415}
416
404static inline struct list_head * 417static inline struct list_head *
405dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) 418dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
406{ 419{
@@ -448,6 +461,7 @@ enum {
448 DLM_FINALIZE_RECO_MSG = 518, 461 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION = 519, 462 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO = 520, 463 DLM_QUERY_NODEINFO = 520,
464 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
451}; 465};
452 466
453struct dlm_reco_node_data 467struct dlm_reco_node_data
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 04a32be0aeb9..56f82cb912e3 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
756 buf + out, len - out); 756 buf + out, len - out);
757 out += snprintf(buf + out, len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
758 758
759 /* Exit Domain Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Exit Domain Map: ");
761 out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
762 buf + out, len - out);
763 out += snprintf(buf + out, len - out, "\n");
764
759 /* Live Map: xx xx xx */ 765 /* Live Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Live Map: "); 766 out += snprintf(buf + out, len - out, "Live Map: ");
761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 767 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3b179d6cbde0..6ed6b95dcf93 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * New in version 1.1: 132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat 133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes 134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
135 * New in version 1.2:
136 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 */ 137 */
136static const struct dlm_protocol_version dlm_protocol = { 138static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 139 .pv_major = 1,
138 .pv_minor = 1, 140 .pv_minor = 2,
139}; 141};
140 142
141#define DLM_DOMAIN_BACKOFF_MS 200 143#define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
449 dropped = dlm_empty_lockres(dlm, res); 451 dropped = dlm_empty_lockres(dlm, res);
450 452
451 spin_lock(&res->spinlock); 453 spin_lock(&res->spinlock);
452 __dlm_lockres_calc_usage(dlm, res); 454 if (dropped)
453 iter = res->hash_node.next; 455 __dlm_lockres_calc_usage(dlm, res);
456 else
457 iter = res->hash_node.next;
454 spin_unlock(&res->spinlock); 458 spin_unlock(&res->spinlock);
455 459
456 dlm_lockres_put(res); 460 dlm_lockres_put(res);
457 461
458 if (dropped) 462 if (dropped) {
463 cond_resched_lock(&dlm->spinlock);
459 goto redo_bucket; 464 goto redo_bucket;
465 }
460 } 466 }
461 cond_resched_lock(&dlm->spinlock); 467 cond_resched_lock(&dlm->spinlock);
462 num += n; 468 num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
486 return ret; 492 return ret;
487} 493}
488 494
495static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
496 void *data, void **ret_data)
497{
498 struct dlm_ctxt *dlm = data;
499 unsigned int node;
500 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
501
502 if (!dlm_grab(dlm))
503 return 0;
504
505 node = exit_msg->node_idx;
506 mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
507
508 spin_lock(&dlm->spinlock);
509 set_bit(node, dlm->exit_domain_map);
510 spin_unlock(&dlm->spinlock);
511
512 dlm_put(dlm);
513
514 return 0;
515}
516
489static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) 517static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
490{ 518{
491 /* Yikes, a double spinlock! I need domain_lock for the dlm 519 /* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
542 570
543 spin_lock(&dlm->spinlock); 571 spin_lock(&dlm->spinlock);
544 clear_bit(node, dlm->domain_map); 572 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map);
545 __dlm_print_nodes(dlm); 574 __dlm_print_nodes(dlm);
546 575
547 /* notify anything attached to the heartbeat events */ 576 /* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
554 return 0; 583 return 0;
555} 584}
556 585
557static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, 586static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
558 unsigned int node) 587 unsigned int node)
559{ 588{
560 int status; 589 int status;
561 struct dlm_exit_domain leave_msg; 590 struct dlm_exit_domain leave_msg;
562 591
563 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", 592 mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
564 node, dlm->name, dlm->node_num); 593 msg_type, node);
565 594
566 memset(&leave_msg, 0, sizeof(leave_msg)); 595 memset(&leave_msg, 0, sizeof(leave_msg));
567 leave_msg.node_idx = dlm->node_num; 596 leave_msg.node_idx = dlm->node_num;
568 597
569 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 598 status = o2net_send_message(msg_type, dlm->key, &leave_msg,
570 &leave_msg, sizeof(leave_msg), node, 599 sizeof(leave_msg), node, NULL);
571 NULL);
572 if (status < 0) 600 if (status < 0)
573 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 601 mlog(ML_ERROR, "Error %d sending domain exit message %u "
574 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); 602 "to node %u on domain %s\n", status, msg_type, node,
575 mlog(0, "status return %d from o2net_send_message\n", status); 603 dlm->name);
576 604
577 return status; 605 return status;
578} 606}
579 607
608static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
609{
610 int node = -1;
611
612 /* Support for begin exit domain was added in 1.2 */
613 if (dlm->dlm_locking_proto.pv_major == 1 &&
614 dlm->dlm_locking_proto.pv_minor < 2)
615 return;
616
617 /*
618 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
619 * informational. Meaning if a node does not receive the message,
620 * so be it.
621 */
622 spin_lock(&dlm->spinlock);
623 while (1) {
624 node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
625 if (node >= O2NM_MAX_NODES)
626 break;
627 if (node == dlm->node_num)
628 continue;
629
630 spin_unlock(&dlm->spinlock);
631 dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
632 spin_lock(&dlm->spinlock);
633 }
634 spin_unlock(&dlm->spinlock);
635}
580 636
581static void dlm_leave_domain(struct dlm_ctxt *dlm) 637static void dlm_leave_domain(struct dlm_ctxt *dlm)
582{ 638{
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
602 658
603 clear_node = 1; 659 clear_node = 1;
604 660
605 status = dlm_send_one_domain_exit(dlm, node); 661 status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
662 node);
606 if (status < 0 && 663 if (status < 0 &&
607 status != -ENOPROTOOPT && 664 status != -ENOPROTOOPT &&
608 status != -ENOTCONN) { 665 status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
677 734
678 if (leave) { 735 if (leave) {
679 mlog(0, "shutting down domain %s\n", dlm->name); 736 mlog(0, "shutting down domain %s\n", dlm->name);
737 dlm_begin_exit_domain(dlm);
680 738
681 /* We changed dlm state, notify the thread */ 739 /* We changed dlm state, notify the thread */
682 dlm_kick_thread(dlm, NULL); 740 dlm_kick_thread(dlm, NULL);
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
909 * leftover join state. */ 967 * leftover join state. */
910 BUG_ON(dlm->joining_node != assert->node_idx); 968 BUG_ON(dlm->joining_node != assert->node_idx);
911 set_bit(assert->node_idx, dlm->domain_map); 969 set_bit(assert->node_idx, dlm->domain_map);
970 clear_bit(assert->node_idx, dlm->exit_domain_map);
912 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
913 972
914 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1793 if (status) 1852 if (status)
1794 goto bail; 1853 goto bail;
1795 1854
1855 status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
1856 sizeof(struct dlm_exit_domain),
1857 dlm_begin_exit_domain_handler,
1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1861
1796bail: 1862bail:
1797 if (status) 1863 if (status)
1798 dlm_unregister_domain_handlers(dlm); 1864 dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84d166328cf7..11eefb8c12e9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2339 dlm_lockres_put(res); 2339 dlm_lockres_put(res);
2340} 2340}
2341 2341
2342/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 2342/*
2343 * if not. If 0, numlocks is set to the number of locks in the lockres. 2343 * A migrateable resource is one that is :
2344 * 1. locally mastered, and,
2345 * 2. zero local locks, and,
2346 * 3. one or more non-local locks, or, one or more references
2347 * Returns 1 if yes, 0 if not.
2344 */ 2348 */
2345static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2349static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2346 struct dlm_lock_resource *res, 2350 struct dlm_lock_resource *res)
2347 int *numlocks,
2348 int *hasrefs)
2349{ 2351{
2350 int ret; 2352 enum dlm_lockres_list idx;
2351 int i; 2353 int nonlocal = 0, node_ref;
2352 int count = 0;
2353 struct list_head *queue; 2354 struct list_head *queue;
2354 struct dlm_lock *lock; 2355 struct dlm_lock *lock;
2356 u64 cookie;
2355 2357
2356 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2357 2359
2358 *numlocks = 0; 2360 if (res->owner != dlm->node_num)
2359 *hasrefs = 0; 2361 return 0;
2360
2361 ret = -EINVAL;
2362 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2363 mlog(0, "cannot migrate lockres with unknown owner!\n");
2364 goto leave;
2365 }
2366
2367 if (res->owner != dlm->node_num) {
2368 mlog(0, "cannot migrate lockres this node doesn't own!\n");
2369 goto leave;
2370 }
2371 2362
2372 ret = 0; 2363 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2373 queue = &res->granted; 2364 queue = dlm_list_idx_to_ptr(res, idx);
2374 for (i = 0; i < 3; i++) {
2375 list_for_each_entry(lock, queue, list) { 2365 list_for_each_entry(lock, queue, list) {
2376 ++count; 2366 if (lock->ml.node != dlm->node_num) {
2377 if (lock->ml.node == dlm->node_num) { 2367 nonlocal++;
2378 mlog(0, "found a lock owned by this node still " 2368 continue;
2379 "on the %s queue! will not migrate this "
2380 "lockres\n", (i == 0 ? "granted" :
2381 (i == 1 ? "converting" :
2382 "blocked")));
2383 ret = -ENOTEMPTY;
2384 goto leave;
2385 } 2369 }
2370 cookie = be64_to_cpu(lock->ml.cookie);
2371 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
2372 "%s list\n", dlm->name, res->lockname.len,
2373 res->lockname.name,
2374 dlm_get_lock_cookie_node(cookie),
2375 dlm_get_lock_cookie_seq(cookie),
2376 dlm_list_in_text(idx));
2377 return 0;
2386 } 2378 }
2387 queue++;
2388 } 2379 }
2389 2380
2390 *numlocks = count; 2381 if (!nonlocal) {
2391 2382 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2392 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 2383 if (node_ref >= O2NM_MAX_NODES)
2393 if (count < O2NM_MAX_NODES) 2384 return 0;
2394 *hasrefs = 1; 2385 }
2395 2386
2396 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, 2387 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
2397 res->lockname.len, res->lockname.name, *numlocks, *hasrefs); 2388 res->lockname.name);
2398 2389
2399leave: 2390 return 1;
2400 return ret;
2401} 2391}
2402 2392
2403/* 2393/*
@@ -2406,8 +2396,7 @@ leave:
2406 2396
2407 2397
2408static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2398static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2409 struct dlm_lock_resource *res, 2399 struct dlm_lock_resource *res, u8 target)
2410 u8 target)
2411{ 2400{
2412 struct dlm_master_list_entry *mle = NULL; 2401 struct dlm_master_list_entry *mle = NULL;
2413 struct dlm_master_list_entry *oldmle = NULL; 2402 struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2416 const char *name; 2405 const char *name;
2417 unsigned int namelen; 2406 unsigned int namelen;
2418 int mle_added = 0; 2407 int mle_added = 0;
2419 int numlocks, hasrefs;
2420 int wake = 0; 2408 int wake = 0;
2421 2409
2422 if (!dlm_grab(dlm)) 2410 if (!dlm_grab(dlm))
2423 return -EINVAL; 2411 return -EINVAL;
2424 2412
2413 BUG_ON(target == O2NM_MAX_NODES);
2414
2425 name = res->lockname.name; 2415 name = res->lockname.name;
2426 namelen = res->lockname.len; 2416 namelen = res->lockname.len;
2427 2417
2428 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); 2418 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2429 2419 target);
2430 /*
2431 * ensure this lockres is a proper candidate for migration
2432 */
2433 spin_lock(&res->spinlock);
2434 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2435 if (ret < 0) {
2436 spin_unlock(&res->spinlock);
2437 goto leave;
2438 }
2439 spin_unlock(&res->spinlock);
2440
2441 /* no work to do */
2442 if (numlocks == 0 && !hasrefs)
2443 goto leave;
2444
2445 /*
2446 * preallocate up front
2447 * if this fails, abort
2448 */
2449 2420
2421 /* preallocate up front. if this fails, abort */
2450 ret = -ENOMEM; 2422 ret = -ENOMEM;
2451 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 2423 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2452 if (!mres) { 2424 if (!mres) {
@@ -2462,35 +2434,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2462 ret = 0; 2434 ret = 0;
2463 2435
2464 /* 2436 /*
2465 * find a node to migrate the lockres to
2466 */
2467
2468 spin_lock(&dlm->spinlock);
2469 /* pick a new node */
2470 if (!test_bit(target, dlm->domain_map) ||
2471 target >= O2NM_MAX_NODES) {
2472 target = dlm_pick_migration_target(dlm, res);
2473 }
2474 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2475 namelen, name, target);
2476
2477 if (target >= O2NM_MAX_NODES ||
2478 !test_bit(target, dlm->domain_map)) {
2479 /* target chosen is not alive */
2480 ret = -EINVAL;
2481 }
2482
2483 if (ret) {
2484 spin_unlock(&dlm->spinlock);
2485 goto fail;
2486 }
2487
2488 mlog(0, "continuing with target = %u\n", target);
2489
2490 /*
2491 * clear any existing master requests and 2437 * clear any existing master requests and
2492 * add the migration mle to the list 2438 * add the migration mle to the list
2493 */ 2439 */
2440 spin_lock(&dlm->spinlock);
2494 spin_lock(&dlm->master_lock); 2441 spin_lock(&dlm->master_lock);
2495 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2442 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2496 namelen, target, dlm->node_num); 2443 namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
2531 dlm_put_mle(mle); 2478 dlm_put_mle(mle);
2532 } else if (mle) { 2479 } else if (mle) {
2533 kmem_cache_free(dlm_mle_cache, mle); 2480 kmem_cache_free(dlm_mle_cache, mle);
2481 mle = NULL;
2534 } 2482 }
2535 goto leave; 2483 goto leave;
2536 } 2484 }
@@ -2652,69 +2600,52 @@ leave:
2652 if (wake) 2600 if (wake)
2653 wake_up(&res->wq); 2601 wake_up(&res->wq);
2654 2602
2655 /* TODO: cleanup */
2656 if (mres) 2603 if (mres)
2657 free_page((unsigned long)mres); 2604 free_page((unsigned long)mres);
2658 2605
2659 dlm_put(dlm); 2606 dlm_put(dlm);
2660 2607
2661 mlog(0, "returning %d\n", ret); 2608 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2609 name, target, ret);
2662 return ret; 2610 return ret;
2663} 2611}
2664 2612
2665#define DLM_MIGRATION_RETRY_MS 100 2613#define DLM_MIGRATION_RETRY_MS 100
2666 2614
2667/* Should be called only after beginning the domain leave process. 2615/*
2616 * Should be called only after beginning the domain leave process.
2668 * There should not be any remaining locks on nonlocal lock resources, 2617 * There should not be any remaining locks on nonlocal lock resources,
2669 * and there should be no local locks left on locally mastered resources. 2618 * and there should be no local locks left on locally mastered resources.
2670 * 2619 *
2671 * Called with the dlm spinlock held, may drop it to do migration, but 2620 * Called with the dlm spinlock held, may drop it to do migration, but
2672 * will re-acquire before exit. 2621 * will re-acquire before exit.
2673 * 2622 *
2674 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ 2623 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2624 */
2675int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2625int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2676{ 2626{
2677 int ret; 2627 int ret;
2678 int lock_dropped = 0; 2628 int lock_dropped = 0;
2679 int numlocks, hasrefs; 2629 u8 target = O2NM_MAX_NODES;
2630
2631 assert_spin_locked(&dlm->spinlock);
2680 2632
2681 spin_lock(&res->spinlock); 2633 spin_lock(&res->spinlock);
2682 if (res->owner != dlm->node_num) { 2634 if (dlm_is_lockres_migrateable(dlm, res))
2683 if (!__dlm_lockres_unused(res)) { 2635 target = dlm_pick_migration_target(dlm, res);
2684 mlog(ML_ERROR, "%s:%.*s: this node is not master, " 2636 spin_unlock(&res->spinlock);
2685 "trying to free this but locks remain\n",
2686 dlm->name, res->lockname.len, res->lockname.name);
2687 }
2688 spin_unlock(&res->spinlock);
2689 goto leave;
2690 }
2691 2637
2692 /* No need to migrate a lockres having no locks */ 2638 if (target == O2NM_MAX_NODES)
2693 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2694 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2695 spin_unlock(&res->spinlock);
2696 goto leave; 2639 goto leave;
2697 }
2698 spin_unlock(&res->spinlock);
2699 2640
2700 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 2641 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2701 spin_unlock(&dlm->spinlock); 2642 spin_unlock(&dlm->spinlock);
2702 lock_dropped = 1; 2643 lock_dropped = 1;
2703 while (1) { 2644 ret = dlm_migrate_lockres(dlm, res, target);
2704 ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); 2645 if (ret)
2705 if (ret >= 0) 2646 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2706 break; 2647 dlm->name, res->lockname.len, res->lockname.name,
2707 if (ret == -ENOTEMPTY) { 2648 target, ret);
2708 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
2709 res->lockname.len, res->lockname.name);
2710 BUG();
2711 }
2712
2713 mlog(0, "lockres %.*s: migrate failed, "
2714 "retrying\n", res->lockname.len,
2715 res->lockname.name);
2716 msleep(DLM_MIGRATION_RETRY_MS);
2717 }
2718 spin_lock(&dlm->spinlock); 2649 spin_lock(&dlm->spinlock);
2719leave: 2650leave:
2720 return lock_dropped; 2651 return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2898 } 2829 }
2899} 2830}
2900 2831
2901/* for now this is not too intelligent. we will 2832/*
2902 * need stats to make this do the right thing. 2833 * Pick a node to migrate the lock resource to. This function selects a
2903 * this just finds the first lock on one of the 2834 * potential target based first on the locks and then on refmap. It skips
2904 * queues and uses that node as the target. */ 2835 * nodes that are in the process of exiting the domain.
2836 */
2905static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 2837static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2906 struct dlm_lock_resource *res) 2838 struct dlm_lock_resource *res)
2907{ 2839{
2908 int i; 2840 enum dlm_lockres_list idx;
2909 struct list_head *queue = &res->granted; 2841 struct list_head *queue = &res->granted;
2910 struct dlm_lock *lock; 2842 struct dlm_lock *lock;
2911 int nodenum; 2843 int noderef;
2844 u8 nodenum = O2NM_MAX_NODES;
2912 2845
2913 assert_spin_locked(&dlm->spinlock); 2846 assert_spin_locked(&dlm->spinlock);
2847 assert_spin_locked(&res->spinlock);
2914 2848
2915 spin_lock(&res->spinlock); 2849 /* Go through all the locks */
2916 for (i=0; i<3; i++) { 2850 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2851 queue = dlm_list_idx_to_ptr(res, idx);
2917 list_for_each_entry(lock, queue, list) { 2852 list_for_each_entry(lock, queue, list) {
2918 /* up to the caller to make sure this node 2853 if (lock->ml.node == dlm->node_num)
2919 * is alive */ 2854 continue;
2920 if (lock->ml.node != dlm->node_num) { 2855 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2921 spin_unlock(&res->spinlock); 2856 continue;
2922 return lock->ml.node; 2857 nodenum = lock->ml.node;
2923 } 2858 goto bail;
2924 } 2859 }
2925 queue++;
2926 }
2927
2928 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2929 if (nodenum < O2NM_MAX_NODES) {
2930 spin_unlock(&res->spinlock);
2931 return nodenum;
2932 } 2860 }
2933 spin_unlock(&res->spinlock);
2934 mlog(0, "have not found a suitable target yet! checking domain map\n");
2935 2861
2936 /* ok now we're getting desperate. pick anyone alive. */ 2862 /* Go thru the refmap */
2937 nodenum = -1; 2863 noderef = -1;
2938 while (1) { 2864 while (1) {
2939 nodenum = find_next_bit(dlm->domain_map, 2865 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
2940 O2NM_MAX_NODES, nodenum+1); 2866 noderef + 1);
2941 mlog(0, "found %d in domain map\n", nodenum); 2867 if (noderef >= O2NM_MAX_NODES)
2942 if (nodenum >= O2NM_MAX_NODES)
2943 break; 2868 break;
2944 if (nodenum != dlm->node_num) { 2869 if (noderef == dlm->node_num)
2945 mlog(0, "picking %d\n", nodenum); 2870 continue;
2946 return nodenum; 2871 if (test_bit(noderef, dlm->exit_domain_map))
2947 } 2872 continue;
2873 nodenum = noderef;
2874 goto bail;
2948 } 2875 }
2949 2876
2950 mlog(0, "giving up. no master to migrate to\n"); 2877bail:
2951 return DLM_LOCK_RES_OWNER_UNKNOWN; 2878 return nodenum;
2952} 2879}
2953 2880
2954
2955
2956/* this is called by the new master once all lockres 2881/* this is called by the new master once all lockres
2957 * data has been received */ 2882 * data has been received */
2958static int dlm_do_migrate_request(struct dlm_ctxt *dlm, 2883static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f1beb6fc254d..7efab6d28a21 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2393 2393
2394 mlog(0, "node %u being removed from domain map!\n", idx); 2394 mlog(0, "node %u being removed from domain map!\n", idx);
2395 clear_bit(idx, dlm->domain_map); 2395 clear_bit(idx, dlm->domain_map);
2396 clear_bit(idx, dlm->exit_domain_map);
2396 /* wake up migration waiters if a node goes down. 2397 /* wake up migration waiters if a node goes down.
2397 * perhaps later we can genericize this for other waiters. */ 2398 * perhaps later we can genericize this for other waiters. */
2398 wake_up(&dlm->migration_wq); 2399 wake_up(&dlm->migration_wq);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8c5c0eddc365..b42076797049 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
88 * signifies a bast fired on the lock. 88 * signifies a bast fired on the lock.
89 */ 89 */
90#define DLMFS_CAPABILITIES "bast stackglue" 90#define DLMFS_CAPABILITIES "bast stackglue"
91extern int param_set_dlmfs_capabilities(const char *val, 91static int param_set_dlmfs_capabilities(const char *val,
92 struct kernel_param *kp) 92 struct kernel_param *kp)
93{ 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name); 94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6dc206..b1e35a392ca5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2670 .flock = ocfs2_flock, 2670 .flock = ocfs2_flock,
2671 .splice_read = ocfs2_file_splice_read, 2671 .splice_read = ocfs2_file_splice_read,
2672 .splice_write = ocfs2_file_splice_write, 2672 .splice_write = ocfs2_file_splice_write,
2673 .fallocate = ocfs2_fallocate,
2673}; 2674};
2674 2675
2675const struct file_operations ocfs2_dops_no_plocks = { 2676const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c5989eae..bc91072b7219 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h" 23#include "resize.h"
24#include "refcounttree.h" 24#include "refcounttree.h"
25#include "sysfile.h"
26#include "dir.h"
27#include "buffer_head_io.h"
28#include "suballoc.h"
29#include "move_extents.h"
25 30
26#include <linux/ext2_fs.h> 31#include <linux/ext2_fs.h>
27 32
@@ -35,31 +40,27 @@
35 * be -EFAULT. The error will be returned from the ioctl(2) call. It's 40 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
36 * just a best-effort to tell userspace that this request caused the error. 41 * just a best-effort to tell userspace that this request caused the error.
37 */ 42 */
38static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, 43static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
39 struct ocfs2_info_request __user *req) 44 struct ocfs2_info_request __user *req)
40{ 45{
41 kreq->ir_flags |= OCFS2_INFO_FL_ERROR; 46 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
42 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); 47 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
43} 48}
44 49
45#define o2info_set_request_error(a, b) \ 50static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
46 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
47
48static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
49{ 51{
50 req->ir_flags |= OCFS2_INFO_FL_FILLED; 52 req->ir_flags |= OCFS2_INFO_FL_FILLED;
51} 53}
52 54
53#define o2info_set_request_filled(a) \ 55static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
54 __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
55
56static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
57{ 56{
58 req->ir_flags &= ~OCFS2_INFO_FL_FILLED; 57 req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
59} 58}
60 59
61#define o2info_clear_request_filled(a) \ 60static inline int o2info_coherent(struct ocfs2_info_request *req)
62 __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) 61{
62 return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
63}
63 64
64static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 65static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
65{ 66{
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
153 154
154 oib.ib_blocksize = inode->i_sb->s_blocksize; 155 oib.ib_blocksize = inode->i_sb->s_blocksize;
155 156
156 o2info_set_request_filled(oib); 157 o2info_set_request_filled(&oib.ib_req);
157 158
158 if (o2info_to_user(oib, req)) 159 if (o2info_to_user(oib, req))
159 goto bail; 160 goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
161 status = 0; 162 status = 0;
162bail: 163bail:
163 if (status) 164 if (status)
164 o2info_set_request_error(oib, req); 165 o2info_set_request_error(&oib.ib_req, req);
165 166
166 return status; 167 return status;
167} 168}
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
178 179
179 oic.ic_clustersize = osb->s_clustersize; 180 oic.ic_clustersize = osb->s_clustersize;
180 181
181 o2info_set_request_filled(oic); 182 o2info_set_request_filled(&oic.ic_req);
182 183
183 if (o2info_to_user(oic, req)) 184 if (o2info_to_user(oic, req))
184 goto bail; 185 goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
186 status = 0; 187 status = 0;
187bail: 188bail:
188 if (status) 189 if (status)
189 o2info_set_request_error(oic, req); 190 o2info_set_request_error(&oic.ic_req, req);
190 191
191 return status; 192 return status;
192} 193}
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
203 204
204 oim.im_max_slots = osb->max_slots; 205 oim.im_max_slots = osb->max_slots;
205 206
206 o2info_set_request_filled(oim); 207 o2info_set_request_filled(&oim.im_req);
207 208
208 if (o2info_to_user(oim, req)) 209 if (o2info_to_user(oim, req))
209 goto bail; 210 goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
211 status = 0; 212 status = 0;
212bail: 213bail:
213 if (status) 214 if (status)
214 o2info_set_request_error(oim, req); 215 o2info_set_request_error(&oim.im_req, req);
215 216
216 return status; 217 return status;
217} 218}
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
228 229
229 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 230 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
230 231
231 o2info_set_request_filled(oil); 232 o2info_set_request_filled(&oil.il_req);
232 233
233 if (o2info_to_user(oil, req)) 234 if (o2info_to_user(oil, req))
234 goto bail; 235 goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
236 status = 0; 237 status = 0;
237bail: 238bail:
238 if (status) 239 if (status)
239 o2info_set_request_error(oil, req); 240 o2info_set_request_error(&oil.il_req, req);
240 241
241 return status; 242 return status;
242} 243}
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
253 254
254 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 255 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
255 256
256 o2info_set_request_filled(oiu); 257 o2info_set_request_filled(&oiu.iu_req);
257 258
258 if (o2info_to_user(oiu, req)) 259 if (o2info_to_user(oiu, req))
259 goto bail; 260 goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
261 status = 0; 262 status = 0;
262bail: 263bail:
263 if (status) 264 if (status)
264 o2info_set_request_error(oiu, req); 265 o2info_set_request_error(&oiu.iu_req, req);
265 266
266 return status; 267 return status;
267} 268}
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
280 oif.if_incompat_features = osb->s_feature_incompat; 281 oif.if_incompat_features = osb->s_feature_incompat;
281 oif.if_ro_compat_features = osb->s_feature_ro_compat; 282 oif.if_ro_compat_features = osb->s_feature_ro_compat;
282 283
283 o2info_set_request_filled(oif); 284 o2info_set_request_filled(&oif.if_req);
284 285
285 if (o2info_to_user(oif, req)) 286 if (o2info_to_user(oif, req))
286 goto bail; 287 goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
288 status = 0; 289 status = 0;
289bail: 290bail:
290 if (status) 291 if (status)
291 o2info_set_request_error(oif, req); 292 o2info_set_request_error(&oif.if_req, req);
292 293
293 return status; 294 return status;
294} 295}
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
305 306
306 oij.ij_journal_size = osb->journal->j_inode->i_size; 307 oij.ij_journal_size = osb->journal->j_inode->i_size;
307 308
308 o2info_set_request_filled(oij); 309 o2info_set_request_filled(&oij.ij_req);
309 310
310 if (o2info_to_user(oij, req)) 311 if (o2info_to_user(oij, req))
311 goto bail; 312 goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
313 status = 0; 314 status = 0;
314bail: 315bail:
315 if (status) 316 if (status)
316 o2info_set_request_error(oij, req); 317 o2info_set_request_error(&oij.ij_req, req);
318
319 return status;
320}
321
322int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
323 struct inode *inode_alloc, u64 blkno,
324 struct ocfs2_info_freeinode *fi, u32 slot)
325{
326 int status = 0, unlock = 0;
327
328 struct buffer_head *bh = NULL;
329 struct ocfs2_dinode *dinode_alloc = NULL;
330
331 if (inode_alloc)
332 mutex_lock(&inode_alloc->i_mutex);
333
334 if (o2info_coherent(&fi->ifi_req)) {
335 status = ocfs2_inode_lock(inode_alloc, &bh, 0);
336 if (status < 0) {
337 mlog_errno(status);
338 goto bail;
339 }
340 unlock = 1;
341 } else {
342 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
343 if (status < 0) {
344 mlog_errno(status);
345 goto bail;
346 }
347 }
348
349 dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
350
351 fi->ifi_stat[slot].lfi_total =
352 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
353 fi->ifi_stat[slot].lfi_free =
354 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
355 le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
356
357bail:
358 if (unlock)
359 ocfs2_inode_unlock(inode_alloc, 0);
360
361 if (inode_alloc)
362 mutex_unlock(&inode_alloc->i_mutex);
363
364 brelse(bh);
365
366 return status;
367}
368
369int ocfs2_info_handle_freeinode(struct inode *inode,
370 struct ocfs2_info_request __user *req)
371{
372 u32 i;
373 u64 blkno = -1;
374 char namebuf[40];
375 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
376 struct ocfs2_info_freeinode *oifi = NULL;
377 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
378 struct inode *inode_alloc = NULL;
379
380 oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
381 if (!oifi) {
382 status = -ENOMEM;
383 mlog_errno(status);
384 goto bail;
385 }
386
387 if (o2info_from_user(*oifi, req))
388 goto bail;
389
390 oifi->ifi_slotnum = osb->max_slots;
391
392 for (i = 0; i < oifi->ifi_slotnum; i++) {
393 if (o2info_coherent(&oifi->ifi_req)) {
394 inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
395 if (!inode_alloc) {
396 mlog(ML_ERROR, "unable to get alloc inode in "
397 "slot %u\n", i);
398 status = -EIO;
399 goto bail;
400 }
401 } else {
402 ocfs2_sprintf_system_inode_name(namebuf,
403 sizeof(namebuf),
404 type, i);
405 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
406 namebuf,
407 strlen(namebuf),
408 &blkno);
409 if (status < 0) {
410 status = -ENOENT;
411 goto bail;
412 }
413 }
414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418
419 iput(inode_alloc);
420 inode_alloc = NULL;
421 }
422
423 o2info_set_request_filled(&oifi->ifi_req);
424
425 if (o2info_to_user(*oifi, req))
426 goto bail;
427
428 status = 0;
429bail:
430 if (status)
431 o2info_set_request_error(&oifi->ifi_req, req);
432
433 kfree(oifi);
434
435 return status;
436}
437
438static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
439 unsigned int chunksize)
440{
441 int index;
442
443 index = __ilog2_u32(chunksize);
444 if (index >= OCFS2_INFO_MAX_HIST)
445 index = OCFS2_INFO_MAX_HIST - 1;
446
447 hist->fc_chunks[index]++;
448 hist->fc_clusters[index] += chunksize;
449}
450
451static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
452 unsigned int chunksize)
453{
454 if (chunksize > stats->ffs_max)
455 stats->ffs_max = chunksize;
456
457 if (chunksize < stats->ffs_min)
458 stats->ffs_min = chunksize;
459
460 stats->ffs_avg += chunksize;
461 stats->ffs_free_chunks_real++;
462}
463
464void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
465 unsigned int chunksize)
466{
467 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
468 o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
469}
470
471int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
472 struct inode *gb_inode,
473 struct ocfs2_dinode *gb_dinode,
474 struct ocfs2_chain_rec *rec,
475 struct ocfs2_info_freefrag *ffg,
476 u32 chunks_in_group)
477{
478 int status = 0, used;
479 u64 blkno;
480
481 struct buffer_head *bh = NULL;
482 struct ocfs2_group_desc *bg = NULL;
483
484 unsigned int max_bits, num_clusters;
485 unsigned int offset = 0, cluster, chunk;
486 unsigned int chunk_free, last_chunksize = 0;
487
488 if (!le32_to_cpu(rec->c_free))
489 goto bail;
490
491 do {
492 if (!bg)
493 blkno = le64_to_cpu(rec->c_blkno);
494 else
495 blkno = le64_to_cpu(bg->bg_next_group);
496
497 if (bh) {
498 brelse(bh);
499 bh = NULL;
500 }
501
502 if (o2info_coherent(&ffg->iff_req))
503 status = ocfs2_read_group_descriptor(gb_inode,
504 gb_dinode,
505 blkno, &bh);
506 else
507 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
508
509 if (status < 0) {
510 mlog(ML_ERROR, "Can't read the group descriptor # "
511 "%llu from device.", (unsigned long long)blkno);
512 status = -EIO;
513 goto bail;
514 }
515
516 bg = (struct ocfs2_group_desc *)bh->b_data;
517
518 if (!le16_to_cpu(bg->bg_free_bits_count))
519 continue;
520
521 max_bits = le16_to_cpu(bg->bg_bits);
522 offset = 0;
523
524 for (chunk = 0; chunk < chunks_in_group; chunk++) {
525 /*
526 * last chunk may be not an entire one.
527 */
528 if ((offset + ffg->iff_chunksize) > max_bits)
529 num_clusters = max_bits - offset;
530 else
531 num_clusters = ffg->iff_chunksize;
532
533 chunk_free = 0;
534 for (cluster = 0; cluster < num_clusters; cluster++) {
535 used = ocfs2_test_bit(offset,
536 (unsigned long *)bg->bg_bitmap);
537 /*
538 * - chunk_free counts free clusters in #N chunk.
539 * - last_chunksize records the size(in) clusters
540 * for the last real free chunk being counted.
541 */
542 if (!used) {
543 last_chunksize++;
544 chunk_free++;
545 }
546
547 if (used && last_chunksize) {
548 ocfs2_info_update_ffg(ffg,
549 last_chunksize);
550 last_chunksize = 0;
551 }
552
553 offset++;
554 }
555
556 if (chunk_free == ffg->iff_chunksize)
557 ffg->iff_ffs.ffs_free_chunks++;
558 }
559
560 /*
561 * need to update the info for last free chunk.
562 */
563 if (last_chunksize)
564 ocfs2_info_update_ffg(ffg, last_chunksize);
565
566 } while (le64_to_cpu(bg->bg_next_group));
567
568bail:
569 brelse(bh);
570
571 return status;
572}
573
574int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
575 struct inode *gb_inode, u64 blkno,
576 struct ocfs2_info_freefrag *ffg)
577{
578 u32 chunks_in_group;
579 int status = 0, unlock = 0, i;
580
581 struct buffer_head *bh = NULL;
582 struct ocfs2_chain_list *cl = NULL;
583 struct ocfs2_chain_rec *rec = NULL;
584 struct ocfs2_dinode *gb_dinode = NULL;
585
586 if (gb_inode)
587 mutex_lock(&gb_inode->i_mutex);
588
589 if (o2info_coherent(&ffg->iff_req)) {
590 status = ocfs2_inode_lock(gb_inode, &bh, 0);
591 if (status < 0) {
592 mlog_errno(status);
593 goto bail;
594 }
595 unlock = 1;
596 } else {
597 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
598 if (status < 0) {
599 mlog_errno(status);
600 goto bail;
601 }
602 }
603
604 gb_dinode = (struct ocfs2_dinode *)bh->b_data;
605 cl = &(gb_dinode->id2.i_chain);
606
607 /*
608 * Chunksize(in) clusters from userspace should be
609 * less than clusters in a group.
610 */
611 if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
612 status = -EINVAL;
613 goto bail;
614 }
615
616 memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
617
618 ffg->iff_ffs.ffs_min = ~0U;
619 ffg->iff_ffs.ffs_clusters =
620 le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
621 ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
622 le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
623
624 chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
625
626 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
627 rec = &(cl->cl_recs[i]);
628 status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
629 gb_dinode,
630 rec, ffg,
631 chunks_in_group);
632 if (status)
633 goto bail;
634 }
635
636 if (ffg->iff_ffs.ffs_free_chunks_real)
637 ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
638 ffg->iff_ffs.ffs_free_chunks_real);
639bail:
640 if (unlock)
641 ocfs2_inode_unlock(gb_inode, 0);
642
643 if (gb_inode)
644 mutex_unlock(&gb_inode->i_mutex);
645
646 if (gb_inode)
647 iput(gb_inode);
648
649 brelse(bh);
650
651 return status;
652}
653
654int ocfs2_info_handle_freefrag(struct inode *inode,
655 struct ocfs2_info_request __user *req)
656{
657 u64 blkno = -1;
658 char namebuf[40];
659 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
660
661 struct ocfs2_info_freefrag *oiff;
662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
663 struct inode *gb_inode = NULL;
664
665 oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
666 if (!oiff) {
667 status = -ENOMEM;
668 mlog_errno(status);
669 goto bail;
670 }
671
672 if (o2info_from_user(*oiff, req))
673 goto bail;
674 /*
675 * chunksize from userspace should be power of 2.
676 */
677 if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
678 (!oiff->iff_chunksize)) {
679 status = -EINVAL;
680 goto bail;
681 }
682
683 if (o2info_coherent(&oiff->iff_req)) {
684 gb_inode = ocfs2_get_system_file_inode(osb, type,
685 OCFS2_INVALID_SLOT);
686 if (!gb_inode) {
687 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
688 status = -EIO;
689 goto bail;
690 }
691 } else {
692 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
693 OCFS2_INVALID_SLOT);
694 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
695 namebuf,
696 strlen(namebuf),
697 &blkno);
698 if (status < 0) {
699 status = -ENOENT;
700 goto bail;
701 }
702 }
703
704 status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
705 if (status < 0)
706 goto bail;
707
708 o2info_set_request_filled(&oiff->iff_req);
709
710 if (o2info_to_user(*oiff, req))
711 goto bail;
712
713 status = 0;
714bail:
715 if (status)
716 o2info_set_request_error(&oiff->iff_req, req);
717
718 kfree(oiff);
317 719
318 return status; 720 return status;
319} 721}
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
327 if (o2info_from_user(oir, req)) 729 if (o2info_from_user(oir, req))
328 goto bail; 730 goto bail;
329 731
330 o2info_clear_request_filled(oir); 732 o2info_clear_request_filled(&oir);
331 733
332 if (o2info_to_user(oir, req)) 734 if (o2info_to_user(oir, req))
333 goto bail; 735 goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
335 status = 0; 737 status = 0;
336bail: 738bail:
337 if (status) 739 if (status)
338 o2info_set_request_error(oir, req); 740 o2info_set_request_error(&oir, req);
339 741
340 return status; 742 return status;
341} 743}
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
389 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) 791 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
390 status = ocfs2_info_handle_journal_size(inode, req); 792 status = ocfs2_info_handle_journal_size(inode, req);
391 break; 793 break;
794 case OCFS2_INFO_FREEINODE:
795 if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
796 status = ocfs2_info_handle_freeinode(inode, req);
797 break;
798 case OCFS2_INFO_FREEFRAG:
799 if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
800 status = ocfs2_info_handle_freefrag(inode, req);
801 break;
392 default: 802 default:
393 status = ocfs2_info_handle_unknown(inode, req); 803 status = ocfs2_info_handle_unknown(inode, req);
394 break; 804 break;
@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
542 return -EFAULT; 952 return -EFAULT;
543 953
544 return ocfs2_info_handle(inode, &info, 0); 954 return ocfs2_info_handle(inode, &info, 0);
955 case FITRIM:
956 {
957 struct super_block *sb = inode->i_sb;
958 struct fstrim_range range;
959 int ret = 0;
960
961 if (!capable(CAP_SYS_ADMIN))
962 return -EPERM;
963
964 if (copy_from_user(&range, (struct fstrim_range *)arg,
965 sizeof(range)))
966 return -EFAULT;
967
968 ret = ocfs2_trim_fs(sb, &range);
969 if (ret < 0)
970 return ret;
971
972 if (copy_to_user((struct fstrim_range *)arg, &range,
973 sizeof(range)))
974 return -EFAULT;
975
976 return 0;
977 }
978 case OCFS2_IOC_MOVE_EXT:
979 return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
545 default: 980 default:
546 return -ENOTTY; 981 return -ENOTTY;
547 } 982 }
@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
569 case OCFS2_IOC_GROUP_EXTEND: 1004 case OCFS2_IOC_GROUP_EXTEND:
570 case OCFS2_IOC_GROUP_ADD: 1005 case OCFS2_IOC_GROUP_ADD:
571 case OCFS2_IOC_GROUP_ADD64: 1006 case OCFS2_IOC_GROUP_ADD64:
1007 case FITRIM:
572 break; 1008 break;
573 case OCFS2_IOC_REFLINK: 1009 case OCFS2_IOC_REFLINK:
574 if (copy_from_user(&args, (struct reflink_arguments *)arg, 1010 if (copy_from_user(&args, (struct reflink_arguments *)arg,
@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
584 return -EFAULT; 1020 return -EFAULT;
585 1021
586 return ocfs2_info_handle(inode, &info, 1); 1022 return ocfs2_info_handle(inode, &info, 1);
1023 case OCFS2_IOC_MOVE_EXT:
1024 break;
587 default: 1025 default:
588 return -ENOIOCTLCMD; 1026 return -ENOIOCTLCMD;
589 } 1027 }
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000000..4c5488468c14
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1153 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.c
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#include <linux/fs.h>
18#include <linux/types.h>
19#include <linux/mount.h>
20#include <linux/swap.h>
21
22#include <cluster/masklog.h>
23
24#include "ocfs2.h"
25#include "ocfs2_ioctl.h"
26
27#include "alloc.h"
28#include "aops.h"
29#include "dlmglue.h"
30#include "extent_map.h"
31#include "inode.h"
32#include "journal.h"
33#include "suballoc.h"
34#include "uptodate.h"
35#include "super.h"
36#include "dir.h"
37#include "buffer_head_io.h"
38#include "sysfile.h"
39#include "suballoc.h"
40#include "refcounttree.h"
41#include "move_extents.h"
42
43struct ocfs2_move_extents_context {
44 struct inode *inode;
45 struct file *file;
46 int auto_defrag;
47 int partial;
48 int credits;
49 u32 new_phys_cpos;
50 u32 clusters_moved;
51 u64 refcount_loc;
52 struct ocfs2_move_extents *range;
53 struct ocfs2_extent_tree et;
54 struct ocfs2_alloc_context *meta_ac;
55 struct ocfs2_alloc_context *data_ac;
56 struct ocfs2_cached_dealloc_ctxt dealloc;
57};
58
59static int __ocfs2_move_extent(handle_t *handle,
60 struct ocfs2_move_extents_context *context,
61 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62 int ext_flags)
63{
64 int ret = 0, index;
65 struct inode *inode = context->inode;
66 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67 struct ocfs2_extent_rec *rec, replace_rec;
68 struct ocfs2_path *path = NULL;
69 struct ocfs2_extent_list *el;
70 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74 p_cpos, new_p_cpos, len);
75 if (ret) {
76 mlog_errno(ret);
77 goto out;
78 }
79
80 memset(&replace_rec, 0, sizeof(replace_rec));
81 replace_rec.e_cpos = cpu_to_le32(cpos);
82 replace_rec.e_leaf_clusters = cpu_to_le16(len);
83 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84 new_p_cpos));
85
86 path = ocfs2_new_path_from_et(&context->et);
87 if (!path) {
88 ret = -ENOMEM;
89 mlog_errno(ret);
90 goto out;
91 }
92
93 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94 if (ret) {
95 mlog_errno(ret);
96 goto out;
97 }
98
99 el = path_leaf_el(path);
100
101 index = ocfs2_search_extent_list(el, cpos);
102 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103 ocfs2_error(inode->i_sb,
104 "Inode %llu has an extent at cpos %u which can no "
105 "longer be found.\n",
106 (unsigned long long)ino, cpos);
107 ret = -EROFS;
108 goto out;
109 }
110
111 rec = &el->l_recs[index];
112
113 BUG_ON(ext_flags != rec->e_flags);
114 /*
115 * after moving/defraging to new location, the extent is not going
116 * to be refcounted anymore.
117 */
118 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121 context->et.et_root_bh,
122 OCFS2_JOURNAL_ACCESS_WRITE);
123 if (ret) {
124 mlog_errno(ret);
125 goto out;
126 }
127
128 ret = ocfs2_split_extent(handle, &context->et, path, index,
129 &replace_rec, context->meta_ac,
130 &context->dealloc);
131 if (ret) {
132 mlog_errno(ret);
133 goto out;
134 }
135
136 ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138 context->new_phys_cpos = new_p_cpos;
139
140 /*
141 * need I to append truncate log for old clusters?
142 */
143 if (old_blkno) {
144 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145 ret = ocfs2_decrease_refcount(inode, handle,
146 ocfs2_blocks_to_clusters(osb->sb,
147 old_blkno),
148 len, context->meta_ac,
149 &context->dealloc, 1);
150 else
151 ret = ocfs2_truncate_log_append(osb, handle,
152 old_blkno, len);
153 }
154
155out:
156 return ret;
157}
158
159/*
160 * lock allocators, and reserving appropriate number of bits for
161 * meta blocks and data clusters.
162 *
163 * in some cases, we don't need to reserve clusters, just let data_ac
164 * be NULL.
165 */
166static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167 struct ocfs2_extent_tree *et,
168 u32 clusters_to_move,
169 u32 extents_to_split,
170 struct ocfs2_alloc_context **meta_ac,
171 struct ocfs2_alloc_context **data_ac,
172 int extra_blocks,
173 int *credits)
174{
175 int ret, num_free_extents;
176 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178
179 num_free_extents = ocfs2_num_free_extents(osb, et);
180 if (num_free_extents < 0) {
181 ret = num_free_extents;
182 mlog_errno(ret);
183 goto out;
184 }
185
186 if (!num_free_extents ||
187 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189
190 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191 if (ret) {
192 mlog_errno(ret);
193 goto out;
194 }
195
196 if (data_ac) {
197 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198 if (ret) {
199 mlog_errno(ret);
200 goto out;
201 }
202 }
203
204 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
205 clusters_to_move + 2);
206
207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
208 extra_blocks, clusters_to_move, *credits);
209out:
210 if (ret) {
211 if (*meta_ac) {
212 ocfs2_free_alloc_context(*meta_ac);
213 *meta_ac = NULL;
214 }
215 }
216
217 return ret;
218}
219
220/*
221 * Using one journal handle to guarantee the data consistency in case
222 * crash happens anywhere.
223 *
224 * XXX: defrag can end up with finishing partial extent as requested,
225 * due to not enough contiguous clusters can be found in allocator.
226 */
227static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
229{
230 int ret, credits = 0, extra_blocks = 0, partial = context->partial;
231 handle_t *handle;
232 struct inode *inode = context->inode;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234 struct inode *tl_inode = osb->osb_tl_inode;
235 struct ocfs2_refcount_tree *ref_tree = NULL;
236 u32 new_phys_cpos, new_len;
237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
238
239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
240
241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
242 OCFS2_HAS_REFCOUNT_FL));
243
244 BUG_ON(!context->refcount_loc);
245
246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
247 &ref_tree, NULL);
248 if (ret) {
249 mlog_errno(ret);
250 return ret;
251 }
252
253 ret = ocfs2_prepare_refcount_change_for_del(inode,
254 context->refcount_loc,
255 phys_blkno,
256 *len,
257 &credits,
258 &extra_blocks);
259 if (ret) {
260 mlog_errno(ret);
261 goto out;
262 }
263 }
264
265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
266 &context->meta_ac,
267 &context->data_ac,
268 extra_blocks, &credits);
269 if (ret) {
270 mlog_errno(ret);
271 goto out;
272 }
273
274 /*
275 * should be using allocation reservation strategy there?
276 *
277 * if (context->data_ac)
278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
279 */
280
281 mutex_lock(&tl_inode->i_mutex);
282
283 if (ocfs2_truncate_log_needs_flush(osb)) {
284 ret = __ocfs2_flush_truncate_log(osb);
285 if (ret < 0) {
286 mlog_errno(ret);
287 goto out_unlock_mutex;
288 }
289 }
290
291 handle = ocfs2_start_trans(osb, credits);
292 if (IS_ERR(handle)) {
293 ret = PTR_ERR(handle);
294 mlog_errno(ret);
295 goto out_unlock_mutex;
296 }
297
298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
299 &new_phys_cpos, &new_len);
300 if (ret) {
301 mlog_errno(ret);
302 goto out_commit;
303 }
304
305 /*
306 * allowing partial extent moving is kind of 'pros and cons', it makes
307 * whole defragmentation less likely to fail, on the contrary, the bad
308 * thing is it may make the fs even more fragmented after moving, let
309 * userspace make a good decision here.
310 */
311 if (new_len != *len) {
312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
313 if (!partial) {
314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
315 ret = -ENOSPC;
316 goto out_commit;
317 }
318 }
319
320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
321 phys_cpos, new_phys_cpos);
322
323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
324 new_phys_cpos, ext_flags);
325 if (ret)
326 mlog_errno(ret);
327
328 if (partial && (new_len != *len))
329 *len = new_len;
330
331 /*
332 * Here we should write the new page out first if we are
333 * in write-back mode.
334 */
335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
336 if (ret)
337 mlog_errno(ret);
338
339out_commit:
340 ocfs2_commit_trans(osb, handle);
341
342out_unlock_mutex:
343 mutex_unlock(&tl_inode->i_mutex);
344
345 if (context->data_ac) {
346 ocfs2_free_alloc_context(context->data_ac);
347 context->data_ac = NULL;
348 }
349
350 if (context->meta_ac) {
351 ocfs2_free_alloc_context(context->meta_ac);
352 context->meta_ac = NULL;
353 }
354
355out:
356 if (ref_tree)
357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
358
359 return ret;
360}
361
362/*
363 * find the victim alloc group, where #blkno fits.
364 */
365static int ocfs2_find_victim_alloc_group(struct inode *inode,
366 u64 vict_blkno,
367 int type, int slot,
368 int *vict_bit,
369 struct buffer_head **ret_bh)
370{
371 int ret, i, blocks_per_unit = 1;
372 u64 blkno;
373 char namebuf[40];
374
375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
377 struct ocfs2_chain_list *cl;
378 struct ocfs2_chain_rec *rec;
379 struct ocfs2_dinode *ac_dinode;
380 struct ocfs2_group_desc *bg;
381
382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
384 strlen(namebuf), &blkno);
385 if (ret) {
386 ret = -ENOENT;
387 goto out;
388 }
389
390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
391 if (ret) {
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
397 cl = &(ac_dinode->id2.i_chain);
398 rec = &(cl->cl_recs[0]);
399
400 if (type == GLOBAL_BITMAP_SYSTEM_INODE)
401 blocks_per_unit <<= (osb->s_clustersize_bits -
402 inode->i_sb->s_blocksize_bits);
403 /*
404 * 'vict_blkno' was out of the valid range.
405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
408 blocks_per_unit))) {
409 ret = -EINVAL;
410 goto out;
411 }
412
413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
414
415 rec = &(cl->cl_recs[i]);
416 if (!rec)
417 continue;
418
419 bg = NULL;
420
421 do {
422 if (!bg)
423 blkno = le64_to_cpu(rec->c_blkno);
424 else
425 blkno = le64_to_cpu(bg->bg_next_group);
426
427 if (gd_bh) {
428 brelse(gd_bh);
429 gd_bh = NULL;
430 }
431
432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
433 if (ret) {
434 mlog_errno(ret);
435 goto out;
436 }
437
438 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
439
440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
441 le16_to_cpu(bg->bg_bits))) {
442
443 *ret_bh = gd_bh;
444 *vict_bit = (vict_blkno - blkno) /
445 blocks_per_unit;
446 mlog(0, "find the victim group: #%llu, "
447 "total_bits: %u, vict_bit: %u\n",
448 blkno, le16_to_cpu(bg->bg_bits),
449 *vict_bit);
450 goto out;
451 }
452
453 } while (le64_to_cpu(bg->bg_next_group));
454 }
455
456 ret = -EINVAL;
457out:
458 brelse(ac_bh);
459
460 /*
461 * caller has to release the gd_bh properly.
462 */
463 return ret;
464}
465
466/*
467 * XXX: helper to validate and adjust moving goal.
468 */
469static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
470 struct ocfs2_move_extents *range)
471{
472 int ret, goal_bit = 0;
473
474 struct buffer_head *gd_bh = NULL;
475 struct ocfs2_group_desc *bg;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 int c_to_b = 1 << (osb->s_clustersize_bits -
478 inode->i_sb->s_blocksize_bits);
479
480 /*
481 * validate goal sits within global_bitmap, and return the victim
482 * group desc
483 */
484 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
485 GLOBAL_BITMAP_SYSTEM_INODE,
486 OCFS2_INVALID_SLOT,
487 &goal_bit, &gd_bh);
488 if (ret)
489 goto out;
490
491 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
492
493 /*
494 * make goal become cluster aligned.
495 */
496 if (range->me_goal % c_to_b)
497 range->me_goal = range->me_goal / c_to_b * c_to_b;
498
499 /*
500 * moving goal is not allowd to start with a group desc blok(#0 blk)
501 * let's compromise to the latter cluster.
502 */
503 if (range->me_goal == le64_to_cpu(bg->bg_blkno))
504 range->me_goal += c_to_b;
505
506 /*
507 * movement is not gonna cross two groups.
508 */
509 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
510 range->me_len) {
511 ret = -EINVAL;
512 goto out;
513 }
514 /*
515 * more exact validations/adjustments will be performed later during
516 * moving operation for each extent range.
517 */
518 mlog(0, "extents get ready to be moved to #%llu block\n",
519 range->me_goal);
520
521out:
522 brelse(gd_bh);
523
524 return ret;
525}
526
527static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
528 int *goal_bit, u32 move_len, u32 max_hop,
529 u32 *phys_cpos)
530{
531 int i, used, last_free_bits = 0, base_bit = *goal_bit;
532 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
533 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
534 le64_to_cpu(gd->bg_blkno));
535
536 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
537
538 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
539 if (used) {
540 /*
541 * we even tried searching the free chunk by jumping
542 * a 'max_hop' distance, but still failed.
543 */
544 if ((i - base_bit) > max_hop) {
545 *phys_cpos = 0;
546 break;
547 }
548
549 if (last_free_bits)
550 last_free_bits = 0;
551
552 continue;
553 } else
554 last_free_bits++;
555
556 if (last_free_bits == move_len) {
557 *goal_bit = i;
558 *phys_cpos = base_cpos + i;
559 break;
560 }
561 }
562
563 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
564}
565
566static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
567 handle_t *handle,
568 struct buffer_head *di_bh,
569 u32 num_bits,
570 u16 chain)
571{
572 int ret;
573 u32 tmp_used;
574 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
575 struct ocfs2_chain_list *cl =
576 (struct ocfs2_chain_list *) &di->id2.i_chain;
577
578 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
579 OCFS2_JOURNAL_ACCESS_WRITE);
580 if (ret < 0) {
581 mlog_errno(ret);
582 goto out;
583 }
584
585 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
586 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
587 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
588 ocfs2_journal_dirty(handle, di_bh);
589
590out:
591 return ret;
592}
593
594static inline int ocfs2_block_group_set_bits(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_group_desc *bg,
597 struct buffer_head *group_bh,
598 unsigned int bit_off,
599 unsigned int num_bits)
600{
601 int status;
602 void *bitmap = bg->bg_bitmap;
603 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
604
605 /* All callers get the descriptor via
606 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
607 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
608 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
609
610 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
611 num_bits);
612
613 if (ocfs2_is_cluster_bitmap(alloc_inode))
614 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
615
616 status = ocfs2_journal_access_gd(handle,
617 INODE_CACHE(alloc_inode),
618 group_bh,
619 journal_type);
620 if (status < 0) {
621 mlog_errno(status);
622 goto bail;
623 }
624
625 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
626 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
627 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
628 " count %u but claims %u are freed. num_bits %d",
629 (unsigned long long)le64_to_cpu(bg->bg_blkno),
630 le16_to_cpu(bg->bg_bits),
631 le16_to_cpu(bg->bg_free_bits_count), num_bits);
632 return -EROFS;
633 }
634 while (num_bits--)
635 ocfs2_set_bit(bit_off++, bitmap);
636
637 ocfs2_journal_dirty(handle, group_bh);
638
639bail:
640 return status;
641}
642
643static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
644 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
645 u32 len, int ext_flags)
646{
647 int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
648 handle_t *handle;
649 struct inode *inode = context->inode;
650 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
651 struct inode *tl_inode = osb->osb_tl_inode;
652 struct inode *gb_inode = NULL;
653 struct buffer_head *gb_bh = NULL;
654 struct buffer_head *gd_bh = NULL;
655 struct ocfs2_group_desc *gd;
656 struct ocfs2_refcount_tree *ref_tree = NULL;
657 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
658 context->range->me_threshold);
659 u64 phys_blkno, new_phys_blkno;
660
661 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
662
663 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
664
665 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
666 OCFS2_HAS_REFCOUNT_FL));
667
668 BUG_ON(!context->refcount_loc);
669
670 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
671 &ref_tree, NULL);
672 if (ret) {
673 mlog_errno(ret);
674 return ret;
675 }
676
677 ret = ocfs2_prepare_refcount_change_for_del(inode,
678 context->refcount_loc,
679 phys_blkno,
680 len,
681 &credits,
682 &extra_blocks);
683 if (ret) {
684 mlog_errno(ret);
685 goto out;
686 }
687 }
688
689 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
690 &context->meta_ac,
691 NULL, extra_blocks, &credits);
692 if (ret) {
693 mlog_errno(ret);
694 goto out;
695 }
696
697 /*
698 * need to count 2 extra credits for global_bitmap inode and
699 * group descriptor.
700 */
701 credits += OCFS2_INODE_UPDATE_CREDITS + 1;
702
703 /*
704 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
705 * logic, while we still need to lock the global_bitmap.
706 */
707 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
708 OCFS2_INVALID_SLOT);
709 if (!gb_inode) {
710 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
711 ret = -EIO;
712 goto out;
713 }
714
715 mutex_lock(&gb_inode->i_mutex);
716
717 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
718 if (ret) {
719 mlog_errno(ret);
720 goto out_unlock_gb_mutex;
721 }
722
723 mutex_lock(&tl_inode->i_mutex);
724
725 handle = ocfs2_start_trans(osb, credits);
726 if (IS_ERR(handle)) {
727 ret = PTR_ERR(handle);
728 mlog_errno(ret);
729 goto out_unlock_tl_inode;
730 }
731
732 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
733 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
734 GLOBAL_BITMAP_SYSTEM_INODE,
735 OCFS2_INVALID_SLOT,
736 &goal_bit, &gd_bh);
737 if (ret) {
738 mlog_errno(ret);
739 goto out_commit;
740 }
741
742 /*
743 * probe the victim cluster group to find a proper
744 * region to fit wanted movement, it even will perfrom
745 * a best-effort attempt by compromising to a threshold
746 * around the goal.
747 */
748 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
749 new_phys_cpos);
750 if (!new_phys_cpos) {
751 ret = -ENOSPC;
752 goto out_commit;
753 }
754
755 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
756 *new_phys_cpos, ext_flags);
757 if (ret) {
758 mlog_errno(ret);
759 goto out_commit;
760 }
761
762 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
763 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
764 le16_to_cpu(gd->bg_chain));
765 if (ret) {
766 mlog_errno(ret);
767 goto out_commit;
768 }
769
770 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
771 goal_bit, len);
772 if (ret)
773 mlog_errno(ret);
774
775 /*
776 * Here we should write the new page out first if we are
777 * in write-back mode.
778 */
779 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
780 if (ret)
781 mlog_errno(ret);
782
783out_commit:
784 ocfs2_commit_trans(osb, handle);
785 brelse(gd_bh);
786
787out_unlock_tl_inode:
788 mutex_unlock(&tl_inode->i_mutex);
789
790 ocfs2_inode_unlock(gb_inode, 1);
791out_unlock_gb_mutex:
792 mutex_unlock(&gb_inode->i_mutex);
793 brelse(gb_bh);
794 iput(gb_inode);
795
796out:
797 if (context->meta_ac) {
798 ocfs2_free_alloc_context(context->meta_ac);
799 context->meta_ac = NULL;
800 }
801
802 if (ref_tree)
803 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
804
805 return ret;
806}
807
/*
 * Helper to work out how much of an extent to defragment in one run,
 * according to the threshold.
 *
 * @alloc_size:   in: size of the current extent; out: shrunk to what is
 *                still missing when the extent would overshoot @threshold
 * @len_defraged: running total gathered so far in the current cycle
 * @threshold:    size of one defragmentation cycle
 * @skip:         set to 1 when the extent should be left alone; untouched
 *                otherwise
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 gathered = *len_defraged;
	u32 combined = *alloc_size + gathered;

	/* still below the threshold: keep accumulating */
	if (combined < threshold) {
		*len_defraged = combined;
		return;
	}

	/* XXX: a large extent with nothing gathered before it is skipped */
	if (gathered == 0) {
		*skip = 1;
		return;
	}

	/*
	 * Split this extent so it coalesces with the former pieces up to
	 * exactly 'threshold'.  That completes one cycle; resetting the
	 * running total forces a new defragmentation cycle.
	 */
	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
837
838static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
839 struct ocfs2_move_extents_context *context)
840{
841 int ret = 0, flags, do_defrag, skip = 0;
842 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
843 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
844
845 struct inode *inode = context->inode;
846 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
847 struct ocfs2_move_extents *range = context->range;
848 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
849
850 if ((inode->i_size == 0) || (range->me_len == 0))
851 return 0;
852
853 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
854 return 0;
855
856 context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
857
858 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
859 ocfs2_init_dealloc_ctxt(&context->dealloc);
860
861 /*
862 * TO-DO XXX:
863 *
864 * - xattr extents.
865 */
866
867 do_defrag = context->auto_defrag;
868
869 /*
870 * extents moving happens in unit of clusters, for the sake
871 * of simplicity, we may ignore two clusters where 'byte_start'
872 * and 'byte_start + len' were within.
873 */
874 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
875 len_to_move = (range->me_start + range->me_len) >>
876 osb->s_clustersize_bits;
877 if (len_to_move >= move_start)
878 len_to_move -= move_start;
879 else
880 len_to_move = 0;
881
882 if (do_defrag) {
883 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
884 if (defrag_thresh <= 1)
885 goto done;
886 } else
887 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
888 range->me_goal);
889
890 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
891 "thresh: %u\n",
892 (unsigned long long)OCFS2_I(inode)->ip_blkno,
893 (unsigned long long)range->me_start,
894 (unsigned long long)range->me_len,
895 move_start, len_to_move, defrag_thresh);
896
897 cpos = move_start;
898 while (len_to_move) {
899 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
900 &flags);
901 if (ret) {
902 mlog_errno(ret);
903 goto out;
904 }
905
906 if (alloc_size > len_to_move)
907 alloc_size = len_to_move;
908
909 /*
910 * XXX: how to deal with a hole:
911 *
912 * - skip the hole of course
913 * - force a new defragmentation
914 */
915 if (!phys_cpos) {
916 if (do_defrag)
917 len_defraged = 0;
918
919 goto next;
920 }
921
922 if (do_defrag) {
923 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
924 defrag_thresh, &skip);
925 /*
926 * skip large extents
927 */
928 if (skip) {
929 skip = 0;
930 goto next;
931 }
932
933 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
934 "alloc_size: %u, len_defraged: %u\n",
935 cpos, phys_cpos, alloc_size, len_defraged);
936
937 ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
938 &alloc_size, flags);
939 } else {
940 ret = ocfs2_move_extent(context, cpos, phys_cpos,
941 &new_phys_cpos, alloc_size,
942 flags);
943
944 new_phys_cpos += alloc_size;
945 }
946
947 if (ret < 0) {
948 mlog_errno(ret);
949 goto out;
950 }
951
952 context->clusters_moved += alloc_size;
953next:
954 cpos += alloc_size;
955 len_to_move -= alloc_size;
956 }
957
958done:
959 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
960
961out:
962 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
963 context->clusters_moved);
964 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
965 context->new_phys_cpos);
966
967 ocfs2_schedule_truncate_log_flush(osb, 1);
968 ocfs2_run_deallocs(osb, &context->dealloc);
969
970 return ret;
971}
972
973static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
974{
975 int status;
976 handle_t *handle;
977 struct inode *inode = context->inode;
978 struct ocfs2_dinode *di;
979 struct buffer_head *di_bh = NULL;
980 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
981
982 if (!inode)
983 return -ENOENT;
984
985 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
986 return -EROFS;
987
988 mutex_lock(&inode->i_mutex);
989
990 /*
991 * This prevents concurrent writes from other nodes
992 */
993 status = ocfs2_rw_lock(inode, 1);
994 if (status) {
995 mlog_errno(status);
996 goto out;
997 }
998
999 status = ocfs2_inode_lock(inode, &di_bh, 1);
1000 if (status) {
1001 mlog_errno(status);
1002 goto out_rw_unlock;
1003 }
1004
1005 /*
1006 * rememer ip_xattr_sem also needs to be held if necessary
1007 */
1008 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1009
1010 status = __ocfs2_move_extents_range(di_bh, context);
1011
1012 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1013 if (status) {
1014 mlog_errno(status);
1015 goto out_inode_unlock;
1016 }
1017
1018 /*
1019 * We update ctime for these changes
1020 */
1021 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1022 if (IS_ERR(handle)) {
1023 status = PTR_ERR(handle);
1024 mlog_errno(status);
1025 goto out_inode_unlock;
1026 }
1027
1028 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1029 OCFS2_JOURNAL_ACCESS_WRITE);
1030 if (status) {
1031 mlog_errno(status);
1032 goto out_commit;
1033 }
1034
1035 di = (struct ocfs2_dinode *)di_bh->b_data;
1036 inode->i_ctime = CURRENT_TIME;
1037 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1038 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1039
1040 ocfs2_journal_dirty(handle, di_bh);
1041
1042out_commit:
1043 ocfs2_commit_trans(osb, handle);
1044
1045out_inode_unlock:
1046 brelse(di_bh);
1047 ocfs2_inode_unlock(inode, 1);
1048out_rw_unlock:
1049 ocfs2_rw_unlock(inode, 1);
1050out:
1051 mutex_unlock(&inode->i_mutex);
1052
1053 return status;
1054}
1055
1056int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1057{
1058 int status;
1059
1060 struct inode *inode = filp->f_path.dentry->d_inode;
1061 struct ocfs2_move_extents range;
1062 struct ocfs2_move_extents_context *context = NULL;
1063
1064 status = mnt_want_write(filp->f_path.mnt);
1065 if (status)
1066 return status;
1067
1068 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1069 goto out;
1070
1071 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1072 status = -EPERM;
1073 goto out;
1074 }
1075
1076 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1077 if (!context) {
1078 status = -ENOMEM;
1079 mlog_errno(status);
1080 goto out;
1081 }
1082
1083 context->inode = inode;
1084 context->file = filp;
1085
1086 if (argp) {
1087 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1088 sizeof(range))) {
1089 status = -EFAULT;
1090 goto out;
1091 }
1092 } else {
1093 status = -EINVAL;
1094 goto out;
1095 }
1096
1097 if (range.me_start > i_size_read(inode))
1098 goto out;
1099
1100 if (range.me_start + range.me_len > i_size_read(inode))
1101 range.me_len = i_size_read(inode) - range.me_start;
1102
1103 context->range = &range;
1104
1105 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1106 context->auto_defrag = 1;
1107 /*
1108 * ok, the default theshold for the defragmentation
1109 * is 1M, since our maximum clustersize was 1M also.
1110 * any thought?
1111 */
1112 if (!range.me_threshold)
1113 range.me_threshold = 1024 * 1024;
1114
1115 if (range.me_threshold > i_size_read(inode))
1116 range.me_threshold = i_size_read(inode);
1117
1118 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1119 context->partial = 1;
1120 } else {
1121 /*
1122 * first best-effort attempt to validate and adjust the goal
1123 * (physical address in block), while it can't guarantee later
1124 * operation can succeed all the time since global_bitmap may
1125 * change a bit over time.
1126 */
1127
1128 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1129 if (status)
1130 goto out;
1131 }
1132
1133 status = ocfs2_move_extents(context);
1134 if (status)
1135 mlog_errno(status);
1136out:
1137 /*
1138 * movement/defragmentation may end up being partially completed,
1139 * that's the reason why we need to return userspace the finished
1140 * length and new_offset even if failure happens somewhere.
1141 */
1142 if (argp) {
1143 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1144 sizeof(range)))
1145 status = -EFAULT;
1146 }
1147
1148 kfree(context);
1149
1150 mnt_drop_write(filp->f_path.mnt);
1151
1152 return status;
1153}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000000..4e143e811441
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.h
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_MOVE_EXTENTS_H
18#define OCFS2_MOVE_EXTENTS_H
19
20int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
21
22#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39bf7438..5b27ff1fa577 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
142 __u64 ij_journal_size; 142 __u64 ij_journal_size;
143}; 143};
144 144
/*
 * o2info request payload for OCFS2_INFO_FREEINODE: a request header followed
 * by one total/free counter pair per slot (OCFS2_MAX_SLOTS entries).
 * NOTE(review): presumably lfi_total/lfi_free are inode counts of each
 * slot's inode allocator - confirm against the ioctl handler.
 */
145struct ocfs2_info_freeinode {
146 struct ocfs2_info_request ifi_req;
147 struct ocfs2_info_local_freeinode {
148 __u64 lfi_total;
149 __u64 lfi_free;
150 } ifi_stat[OCFS2_MAX_SLOTS];
151 __u32 ifi_slotnum; /* out */
152 __u32 ifi_pad;
153};
154
155#define OCFS2_INFO_MAX_HIST (32)
156
/*
 * o2info request payload for OCFS2_INFO_FREEFRAG: a request header, the
 * free-fragmentation statistics returned to the caller, and the chunk size
 * (in clusters) supplied by the caller.  The histogram holds
 * OCFS2_INFO_MAX_HIST buckets for chunks and for clusters.
 * NOTE(review): the exact meaning of each bucket and of ffs_max/ffs_avg is
 * not visible here - presumably they mirror ffs_min (in clusters); confirm
 * against the ioctl handler.
 */
157struct ocfs2_info_freefrag {
158 struct ocfs2_info_request iff_req;
159 struct ocfs2_info_freefrag_stats { /* (out) */
160 struct ocfs2_info_free_chunk_list {
161 __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
162 __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
163 } ffs_fc_hist;
164 __u32 ffs_clusters;
165 __u32 ffs_free_clusters;
166 __u32 ffs_free_chunks;
167 __u32 ffs_free_chunks_real;
168 __u32 ffs_min; /* Minimum free chunksize in clusters */
169 __u32 ffs_max;
170 __u32 ffs_avg;
171 __u32 ffs_pad;
172 } iff_ffs;
173 __u32 iff_chunksize; /* chunksize in clusters (in) */
174 __u32 iff_pad;
175};
176
145/* Codes for ocfs2_info_request */ 177/* Codes for ocfs2_info_request */
146enum ocfs2_info_type { 178enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1, 179 OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
151 OCFS2_INFO_UUID, 183 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES, 184 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE, 185 OCFS2_INFO_JOURNAL_SIZE,
186 OCFS2_INFO_FREEINODE,
187 OCFS2_INFO_FREEFRAG,
154 OCFS2_INFO_NUM_TYPES 188 OCFS2_INFO_NUM_TYPES
155}; 189};
156 190
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
171 205
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) 206#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173 207
/*
 * Argument of OCFS2_IOC_MOVE_EXT: describes the byte range of a file whose
 * extents should be moved or defragmented, and carries the result back.
 */
208struct ocfs2_move_extents {
209/* Values are in bytes unless noted otherwise (me_goal is in blocks) */
210 /* in */
211 __u64 me_start; /* Virtual start in the file to move */
212 __u64 me_len; /* Length of the extents to be moved */
213 __u64 me_goal; /* Physical offset of the goal,
 in block units */
215 __u64 me_threshold; /* Maximum distance from goal, or threshold
 for auto defragmentation */
217 __u64 me_flags; /* Flags for the operation
 * (OCFS2_MOVE_EXT_FL_*):
 * - auto defragmentation.
 * - refcount, xattr cases.
 */
221 /* out */
222 __u64 me_moved_len; /* Moved/defragmented length */
223 __u64 me_new_offset; /* Resulting physical location */
224 __u32 me_reserved[2]; /* Reserved for future use */
225};
226
/* Bits for ocfs2_move_extents.me_flags */
227#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
 claim new clusters
 as the goal place
 for extents moving */
231#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
 moving; makes the
 movement less likely
 to fail, but may make
 the fs even more
 fragmented */
236#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation
 completely got done.
 */
239
240#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
241
174#endif /* OCFS2_IOCTL_H */ 242#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5bb54ac..3b481f490633 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
688 __entry->blkno, __entry->bit) 688 __entry->blkno, __entry->bit)
689); 689);
690 690
/*
 * Trace event ocfs2_trim_extent: records the device major/minor taken from
 * the super block together with a block number and a count, printed as
 * "major minor blk count".
 * NOTE(review): presumably emitted once per extent handed to discard by the
 * trim code - confirm at the call site.
 */
691TRACE_EVENT(ocfs2_trim_extent,
692 TP_PROTO(struct super_block *sb, unsigned long long blk,
693 unsigned long long count),
694 TP_ARGS(sb, blk, count),
695 TP_STRUCT__entry(
696 __field(int, dev_major)
697 __field(int, dev_minor)
698 __field(unsigned long long, blk)
699 __field(__u64, count)
700 ),
701 TP_fast_assign(
702 __entry->dev_major = MAJOR(sb->s_dev);
703 __entry->dev_minor = MINOR(sb->s_dev);
704 __entry->blk = blk;
705 __entry->count = count;
706 ),
707 TP_printk("%d %d %llu %llu",
708 __entry->dev_major, __entry->dev_minor,
709 __entry->blk, __entry->count)
710);
711
712DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
713
714DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
715
691/* End of trace events for fs/ocfs2/alloc.c. */ 716/* End of trace events for fs/ocfs2/alloc.c. */
692 717
693/* Trace events for fs/ocfs2/localalloc.c. */ 718/* Trace events for fs/ocfs2/localalloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3c7606cff1ab..ebfd3825f12a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 66 u32 *num_clusters,
67 unsigned int *extent_flags); 67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context, 69 struct file *file,
70 u32 cpos, u32 old_cluster, 70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 71 u32 new_cluster, u32 new_len);
72}; 72};
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2921 return 0; 2921 return 0;
2922} 2922}
2923 2923
2924static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2924int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct ocfs2_cow_context *context, 2925 struct file *file,
2926 u32 cpos, u32 old_cluster, 2926 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2927 u32 new_cluster, u32 new_len)
2928{ 2928{
2929 int ret = 0, partial; 2929 int ret = 0, partial;
2930 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2930 struct inode *inode = file->f_path.dentry->d_inode;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2931 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to, readahead_pages; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = inode->i_mapping;
2938 2939
2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, 2940 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2940 new_cluster, new_len); 2941 new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2948 * We only duplicate pages until we reach the page contains i_size - 1. 2949 * We only duplicate pages until we reach the page contains i_size - 1.
2949 * So trim 'end' to i_size. 2950 * So trim 'end' to i_size.
2950 */ 2951 */
2951 if (end > i_size_read(context->inode)) 2952 if (end > i_size_read(inode))
2952 end = i_size_read(context->inode); 2953 end = i_size_read(inode);
2953 2954
2954 while (offset < end) { 2955 while (offset < end) {
2955 page_index = offset >> PAGE_CACHE_SHIFT; 2956 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2972 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2973 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2974 2975
2975 if (PageReadahead(page) && context->file) { 2976 if (PageReadahead(page)) {
2976 page_cache_async_readahead(mapping, 2977 page_cache_async_readahead(mapping,
2977 &context->file->f_ra, 2978 &file->f_ra, file,
2978 context->file,
2979 page, page_index, 2979 page, page_index,
2980 readahead_pages); 2980 readahead_pages);
2981 } 2981 }
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2999 }
3000 } 3000 }
3001 3001
3002 ocfs2_map_and_dirty_page(context->inode, 3002 ocfs2_map_and_dirty_page(inode, handle, from, to,
3003 handle, from, to,
3004 page, 0, &new_block); 3003 page, 0, &new_block);
3005 mark_page_accessed(page); 3004 mark_page_accessed(page);
3006unlock: 3005unlock:
@@ -3015,14 +3014,15 @@ unlock:
3015 return ret; 3014 return ret;
3016} 3015}
3017 3016
3018static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3019 struct ocfs2_cow_context *context, 3018 struct file *file,
3020 u32 cpos, u32 old_cluster, 3019 u32 cpos, u32 old_cluster,
3021 u32 new_cluster, u32 new_len) 3020 u32 new_cluster, u32 new_len)
3022{ 3021{
3023 int ret = 0; 3022 int ret = 0;
3024 struct super_block *sb = context->inode->i_sb; 3023 struct inode *inode = file->f_path.dentry->d_inode;
3025 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3024 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3145
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context, cpos, 3148 ret = context->cow_duplicate_clusters(handle, context->file,
3149 old, new, len); 3149 cpos, old, new, len);
3150 if (ret) { 3150 if (ret) {
3151 mlog_errno(ret); 3151 mlog_errno(ret);
3152 goto out; 3152 goto out;
@@ -3162,22 +3162,22 @@ out:
3162 return ret; 3162 return ret;
3163} 3163}
3164 3164
3165static int ocfs2_cow_sync_writeback(struct super_block *sb, 3165int ocfs2_cow_sync_writeback(struct super_block *sb,
3166 struct ocfs2_cow_context *context, 3166 struct inode *inode,
3167 u32 cpos, u32 num_clusters) 3167 u32 cpos, u32 num_clusters)
3168{ 3168{
3169 int ret = 0; 3169 int ret = 0;
3170 loff_t offset, end, map_end; 3170 loff_t offset, end, map_end;
3171 pgoff_t page_index; 3171 pgoff_t page_index;
3172 struct page *page; 3172 struct page *page;
3173 3173
3174 if (ocfs2_should_order_data(context->inode)) 3174 if (ocfs2_should_order_data(inode))
3175 return 0; 3175 return 0;
3176 3176
3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179 3179
3180 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3180 ret = filemap_fdatawrite_range(inode->i_mapping,
3181 offset, end - 1); 3181 offset, end - 1);
3182 if (ret < 0) { 3182 if (ret < 0) {
3183 mlog_errno(ret); 3183 mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3190 if (map_end > end) 3190 if (map_end > end)
3191 map_end = end; 3191 map_end = end;
3192 3192
3193 page = find_or_create_page(context->inode->i_mapping, 3193 page = find_or_create_page(inode->i_mapping,
3194 page_index, GFP_NOFS); 3194 page_index, GFP_NOFS);
3195 BUG_ON(!page); 3195 BUG_ON(!page);
3196 3196
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3349 * in write-back mode. 3349 * in write-back mode.
3350 */ 3350 */
3351 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos, 3352 ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353 orig_num_clusters); 3353 orig_num_clusters);
3354 if (ret) 3354 if (ret)
3355 mlog_errno(ret); 3355 mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7d8e3..7754608c83a4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
84 struct buffer_head *ref_root_bh, 84 struct buffer_head *ref_root_bh,
85 u32 cpos, u32 write_len, 85 u32 cpos, u32 write_len,
86 struct ocfs2_post_refcount *post); 86 struct ocfs2_post_refcount *post);
87int ocfs2_duplicate_clusters_by_page(handle_t *handle,
88 struct file *file,
89 u32 cpos, u32 old_cluster,
90 u32 new_cluster, u32 new_len);
91int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
92 struct file *file,
93 u32 cpos, u32 old_cluster,
94 u32 new_cluster, u32 new_len);
95int ocfs2_cow_sync_writeback(struct super_block *sb,
96 struct inode *inode,
97 u32 cpos, u32 num_clusters);
87int ocfs2_add_refcount_flag(struct inode *inode, 98int ocfs2_add_refcount_flag(struct inode *inode,
88 struct ocfs2_extent_tree *data_et, 99 struct ocfs2_extent_tree *data_et,
89 struct ocfs2_caching_info *ref_ci, 100 struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4129fb671d71..cdbaf5e97308 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1567,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1567 if (osb->preferred_slot != OCFS2_INVALID_SLOT) 1567 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
1568 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); 1568 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
1569 1569
1570 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 1570 if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
1571 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 1571 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
1572 1572
1573 if (osb->osb_commit_interval) 1573 if (osb->osb_commit_interval)