aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/removed/o2cb (renamed from Documentation/ABI/obsolete/o2cb)9
-rw-r--r--Documentation/feature-removal-schedule.txt10
-rw-r--r--Documentation/filesystems/ocfs2.txt8
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/alloc.c166
-rw-r--r--fs/ocfs2/alloc.h1
-rw-r--r--fs/ocfs2/cluster/sys.c9
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h14
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c94
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c255
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c1
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/file.c1
-rw-r--r--fs/ocfs2/ioctl.c492
-rw-r--r--fs/ocfs2/move_extents.c1153
-rw-r--r--fs/ocfs2/move_extents.h22
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h68
-rw-r--r--fs/ocfs2/ocfs2_trace.h25
-rw-r--r--fs/ocfs2/refcounttree.c58
-rw-r--r--fs/ocfs2/refcounttree.h11
-rw-r--r--fs/ocfs2/super.c2
22 files changed, 2146 insertions, 262 deletions
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/removed/o2cb
index 9c49d8e6c0cc..7f5daa465093 100644
--- a/Documentation/ABI/obsolete/o2cb
+++ b/Documentation/ABI/removed/o2cb
@@ -1,11 +1,10 @@
1What: /sys/o2cb symlink 1What: /sys/o2cb symlink
2Date: Dec 2005 2Date: May 2011
3KernelVersion: 2.6.16 3KernelVersion: 2.6.40
4Contact: ocfs2-devel@oss.oracle.com 4Contact: ocfs2-devel@oss.oracle.com
5Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will 5Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
6 be removed when new versions of ocfs2-tools which know to look 6 removed when new versions of ocfs2-tools which know to look
7 in /sys/fs/o2cb are sufficiently prevalent. Don't code new 7 in /sys/fs/o2cb are sufficiently prevalent. Don't code new
8 software to look here, it should try /sys/fs/o2cb instead. 8 software to look here, it should try /sys/fs/o2cb instead.
9 See Documentation/ABI/stable/o2cb for more information on usage.
10Users: ocfs2-tools. It's sufficient to mail proposed changes to 9Users: ocfs2-tools. It's sufficient to mail proposed changes to
11 ocfs2-devel@oss.oracle.com. 10 ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 95788ad2506c..ff31b1cc50aa 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -262,16 +262,6 @@ Who: Michael Buesch <mb@bu3sch.de>
262 262
263--------------------------- 263---------------------------
264 264
265What: /sys/o2cb symlink
266When: January 2010
267Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb
268 exists as a symlink for backwards compatibility for old versions of
269 ocfs2-tools. 2 years should be sufficient time to phase in new versions
270 which know to look in /sys/fs/o2cb.
271Who: ocfs2-devel@oss.oracle.com
272
273---------------------------
274
275What: Ability for non root users to shm_get hugetlb pages based on mlock 265What: Ability for non root users to shm_get hugetlb pages based on mlock
276 resource limits 266 resource limits
277When: 2.6.31 267When: 2.6.31
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index 9ed920a8cd79..7618a287aa41 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -46,9 +46,15 @@ errors=panic Panic and halt the machine if an error occurs.
46intr (*) Allow signals to interrupt cluster operations. 46intr (*) Allow signals to interrupt cluster operations.
47nointr Do not allow signals to interrupt cluster 47nointr Do not allow signals to interrupt cluster
48 operations. 48 operations.
49noatime Do not update access time.
50relatime(*) Update atime if the previous atime is older than
51 mtime or ctime
52strictatime Always update atime, but the minimum update interval
53 is specified by atime_quantum.
49atime_quantum=60(*) OCFS2 will not update atime unless this number 54atime_quantum=60(*) OCFS2 will not update atime unless this number
50 of seconds has passed since the last update. 55 of seconds has passed since the last update.
51 Set to zero to always update atime. 56 Set to zero to always update atime. This option need
57 work with strictatime.
52data=ordered (*) All data are forced directly out to the main file 58data=ordered (*) All data are forced directly out to the main file
53 system prior to its metadata being committed to the 59 system prior to its metadata being committed to the
54 journal. 60 journal.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6..f17e58b32989 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \ 32 reservations.o \
33 move_extents.o \
33 resize.o \ 34 resize.o \
34 slot_map.o \ 35 slot_map.o \
35 suballoc.o \ 36 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7401c7..ed553c60de82 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h> 31#include <linux/quotaops.h>
32#include <linux/blkdev.h>
32 33
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
34 35
@@ -7184,3 +7185,168 @@ out_commit:
7184out: 7185out:
7185 return ret; 7186 return ret;
7186} 7187}
7188
7189static int ocfs2_trim_extent(struct super_block *sb,
7190 struct ocfs2_group_desc *gd,
7191 u32 start, u32 count)
7192{
7193 u64 discard, bcount;
7194
7195 bcount = ocfs2_clusters_to_blocks(sb, count);
7196 discard = le64_to_cpu(gd->bg_blkno) +
7197 ocfs2_clusters_to_blocks(sb, start);
7198
7199 trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7200
7201 return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7202}
7203
7204static int ocfs2_trim_group(struct super_block *sb,
7205 struct ocfs2_group_desc *gd,
7206 u32 start, u32 max, u32 minbits)
7207{
7208 int ret = 0, count = 0, next;
7209 void *bitmap = gd->bg_bitmap;
7210
7211 if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7212 return 0;
7213
7214 trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7215 start, max, minbits);
7216
7217 while (start < max) {
7218 start = ocfs2_find_next_zero_bit(bitmap, max, start);
7219 if (start >= max)
7220 break;
7221 next = ocfs2_find_next_bit(bitmap, max, start);
7222
7223 if ((next - start) >= minbits) {
7224 ret = ocfs2_trim_extent(sb, gd,
7225 start, next - start);
7226 if (ret < 0) {
7227 mlog_errno(ret);
7228 break;
7229 }
7230 count += next - start;
7231 }
7232 start = next + 1;
7233
7234 if (fatal_signal_pending(current)) {
7235 count = -ERESTARTSYS;
7236 break;
7237 }
7238
7239 if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7240 break;
7241 }
7242
7243 if (ret < 0)
7244 count = ret;
7245
7246 return count;
7247}
7248
7249int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7250{
7251 struct ocfs2_super *osb = OCFS2_SB(sb);
7252 u64 start, len, trimmed, first_group, last_group, group;
7253 int ret, cnt;
7254 u32 first_bit, last_bit, minlen;
7255 struct buffer_head *main_bm_bh = NULL;
7256 struct inode *main_bm_inode = NULL;
7257 struct buffer_head *gd_bh = NULL;
7258 struct ocfs2_dinode *main_bm;
7259 struct ocfs2_group_desc *gd = NULL;
7260
7261 start = range->start >> osb->s_clustersize_bits;
7262 len = range->len >> osb->s_clustersize_bits;
7263 minlen = range->minlen >> osb->s_clustersize_bits;
7264 trimmed = 0;
7265
7266 if (!len) {
7267 range->len = 0;
7268 return 0;
7269 }
7270
7271 if (minlen >= osb->bitmap_cpg)
7272 return -EINVAL;
7273
7274 main_bm_inode = ocfs2_get_system_file_inode(osb,
7275 GLOBAL_BITMAP_SYSTEM_INODE,
7276 OCFS2_INVALID_SLOT);
7277 if (!main_bm_inode) {
7278 ret = -EIO;
7279 mlog_errno(ret);
7280 goto out;
7281 }
7282
7283 mutex_lock(&main_bm_inode->i_mutex);
7284
7285 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7286 if (ret < 0) {
7287 mlog_errno(ret);
7288 goto out_mutex;
7289 }
7290 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7291
7292 if (start >= le32_to_cpu(main_bm->i_clusters)) {
7293 ret = -EINVAL;
7294 goto out_unlock;
7295 }
7296
7297 if (start + len > le32_to_cpu(main_bm->i_clusters))
7298 len = le32_to_cpu(main_bm->i_clusters) - start;
7299
7300 trace_ocfs2_trim_fs(start, len, minlen);
7301
7302 /* Determine first and last group to examine based on start and len */
7303 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7304 if (first_group == osb->first_cluster_group_blkno)
7305 first_bit = start;
7306 else
7307 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7308 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7309 last_bit = osb->bitmap_cpg;
7310
7311 for (group = first_group; group <= last_group;) {
7312 if (first_bit + len >= osb->bitmap_cpg)
7313 last_bit = osb->bitmap_cpg;
7314 else
7315 last_bit = first_bit + len;
7316
7317 ret = ocfs2_read_group_descriptor(main_bm_inode,
7318 main_bm, group,
7319 &gd_bh);
7320 if (ret < 0) {
7321 mlog_errno(ret);
7322 break;
7323 }
7324
7325 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7326 cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
7327 brelse(gd_bh);
7328 gd_bh = NULL;
7329 if (cnt < 0) {
7330 ret = cnt;
7331 mlog_errno(ret);
7332 break;
7333 }
7334
7335 trimmed += cnt;
7336 len -= osb->bitmap_cpg - first_bit;
7337 first_bit = 0;
7338 if (group == osb->first_cluster_group_blkno)
7339 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7340 else
7341 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7342 }
7343 range->len = trimmed * sb->s_blocksize;
7344out_unlock:
7345 ocfs2_inode_unlock(main_bm_inode, 0);
7346 brelse(main_bm_bh);
7347out_mutex:
7348 mutex_unlock(&main_bm_inode->i_mutex);
7349 iput(main_bm_inode);
7350out:
7351 return ret;
7352}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a03251c..ca381c584127 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
239 struct buffer_head **leaf_bh); 239 struct buffer_head **leaf_bh);
240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
241 241
242int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
242/* 243/*
243 * Helper function to look at the # of clusters in an extent record. 244 * Helper function to look at the # of clusters in an extent record.
244 */ 245 */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1f..a4b07730b2e1 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
61 kset_unregister(o2cb_kset); 60 kset_unregister(o2cb_kset);
62} 61}
63 62
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
69 if (!o2cb_kset) 68 if (!o2cb_kset)
70 return -ENOMEM; 69 return -ENOMEM;
71 70
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
81 if (ret) 72 if (ret)
82 goto error; 73 goto error;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4bdf7baee344..d602abb51b61 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -144,6 +144,7 @@ struct dlm_ctxt
144 wait_queue_head_t dlm_join_events; 144 wait_queue_head_t dlm_join_events;
145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 148 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
148 struct dlm_recovery_ctxt reco; 149 struct dlm_recovery_ctxt reco;
149 spinlock_t master_lock; 150 spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
401 return 1; 402 return 1;
402} 403}
403 404
405static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
406{
407 if (idx == DLM_GRANTED_LIST)
408 return "granted";
409 else if (idx == DLM_CONVERTING_LIST)
410 return "converting";
411 else if (idx == DLM_BLOCKED_LIST)
412 return "blocked";
413 else
414 return "unknown";
415}
416
404static inline struct list_head * 417static inline struct list_head *
405dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) 418dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
406{ 419{
@@ -448,6 +461,7 @@ enum {
448 DLM_FINALIZE_RECO_MSG = 518, 461 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION = 519, 462 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO = 520, 463 DLM_QUERY_NODEINFO = 520,
464 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
451}; 465};
452 466
453struct dlm_reco_node_data 467struct dlm_reco_node_data
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 04a32be0aeb9..56f82cb912e3 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
756 buf + out, len - out); 756 buf + out, len - out);
757 out += snprintf(buf + out, len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
758 758
759 /* Exit Domain Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Exit Domain Map: ");
761 out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
762 buf + out, len - out);
763 out += snprintf(buf + out, len - out, "\n");
764
759 /* Live Map: xx xx xx */ 765 /* Live Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Live Map: "); 766 out += snprintf(buf + out, len - out, "Live Map: ");
761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 767 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3b179d6cbde0..6ed6b95dcf93 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * New in version 1.1: 132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat 133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes 134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
135 * New in version 1.2:
136 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 */ 137 */
136static const struct dlm_protocol_version dlm_protocol = { 138static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 139 .pv_major = 1,
138 .pv_minor = 1, 140 .pv_minor = 2,
139}; 141};
140 142
141#define DLM_DOMAIN_BACKOFF_MS 200 143#define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
449 dropped = dlm_empty_lockres(dlm, res); 451 dropped = dlm_empty_lockres(dlm, res);
450 452
451 spin_lock(&res->spinlock); 453 spin_lock(&res->spinlock);
452 __dlm_lockres_calc_usage(dlm, res); 454 if (dropped)
453 iter = res->hash_node.next; 455 __dlm_lockres_calc_usage(dlm, res);
456 else
457 iter = res->hash_node.next;
454 spin_unlock(&res->spinlock); 458 spin_unlock(&res->spinlock);
455 459
456 dlm_lockres_put(res); 460 dlm_lockres_put(res);
457 461
458 if (dropped) 462 if (dropped) {
463 cond_resched_lock(&dlm->spinlock);
459 goto redo_bucket; 464 goto redo_bucket;
465 }
460 } 466 }
461 cond_resched_lock(&dlm->spinlock); 467 cond_resched_lock(&dlm->spinlock);
462 num += n; 468 num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
486 return ret; 492 return ret;
487} 493}
488 494
495static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
496 void *data, void **ret_data)
497{
498 struct dlm_ctxt *dlm = data;
499 unsigned int node;
500 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
501
502 if (!dlm_grab(dlm))
503 return 0;
504
505 node = exit_msg->node_idx;
506 mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
507
508 spin_lock(&dlm->spinlock);
509 set_bit(node, dlm->exit_domain_map);
510 spin_unlock(&dlm->spinlock);
511
512 dlm_put(dlm);
513
514 return 0;
515}
516
489static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) 517static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
490{ 518{
491 /* Yikes, a double spinlock! I need domain_lock for the dlm 519 /* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
542 570
543 spin_lock(&dlm->spinlock); 571 spin_lock(&dlm->spinlock);
544 clear_bit(node, dlm->domain_map); 572 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map);
545 __dlm_print_nodes(dlm); 574 __dlm_print_nodes(dlm);
546 575
547 /* notify anything attached to the heartbeat events */ 576 /* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
554 return 0; 583 return 0;
555} 584}
556 585
557static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, 586static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
558 unsigned int node) 587 unsigned int node)
559{ 588{
560 int status; 589 int status;
561 struct dlm_exit_domain leave_msg; 590 struct dlm_exit_domain leave_msg;
562 591
563 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", 592 mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
564 node, dlm->name, dlm->node_num); 593 msg_type, node);
565 594
566 memset(&leave_msg, 0, sizeof(leave_msg)); 595 memset(&leave_msg, 0, sizeof(leave_msg));
567 leave_msg.node_idx = dlm->node_num; 596 leave_msg.node_idx = dlm->node_num;
568 597
569 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 598 status = o2net_send_message(msg_type, dlm->key, &leave_msg,
570 &leave_msg, sizeof(leave_msg), node, 599 sizeof(leave_msg), node, NULL);
571 NULL);
572 if (status < 0) 600 if (status < 0)
573 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 601 mlog(ML_ERROR, "Error %d sending domain exit message %u "
574 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); 602 "to node %u on domain %s\n", status, msg_type, node,
575 mlog(0, "status return %d from o2net_send_message\n", status); 603 dlm->name);
576 604
577 return status; 605 return status;
578} 606}
579 607
608static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
609{
610 int node = -1;
611
612 /* Support for begin exit domain was added in 1.2 */
613 if (dlm->dlm_locking_proto.pv_major == 1 &&
614 dlm->dlm_locking_proto.pv_minor < 2)
615 return;
616
617 /*
618 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
619 * informational. Meaning if a node does not receive the message,
620 * so be it.
621 */
622 spin_lock(&dlm->spinlock);
623 while (1) {
624 node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
625 if (node >= O2NM_MAX_NODES)
626 break;
627 if (node == dlm->node_num)
628 continue;
629
630 spin_unlock(&dlm->spinlock);
631 dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
632 spin_lock(&dlm->spinlock);
633 }
634 spin_unlock(&dlm->spinlock);
635}
580 636
581static void dlm_leave_domain(struct dlm_ctxt *dlm) 637static void dlm_leave_domain(struct dlm_ctxt *dlm)
582{ 638{
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
602 658
603 clear_node = 1; 659 clear_node = 1;
604 660
605 status = dlm_send_one_domain_exit(dlm, node); 661 status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
662 node);
606 if (status < 0 && 663 if (status < 0 &&
607 status != -ENOPROTOOPT && 664 status != -ENOPROTOOPT &&
608 status != -ENOTCONN) { 665 status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
677 734
678 if (leave) { 735 if (leave) {
679 mlog(0, "shutting down domain %s\n", dlm->name); 736 mlog(0, "shutting down domain %s\n", dlm->name);
737 dlm_begin_exit_domain(dlm);
680 738
681 /* We changed dlm state, notify the thread */ 739 /* We changed dlm state, notify the thread */
682 dlm_kick_thread(dlm, NULL); 740 dlm_kick_thread(dlm, NULL);
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
909 * leftover join state. */ 967 * leftover join state. */
910 BUG_ON(dlm->joining_node != assert->node_idx); 968 BUG_ON(dlm->joining_node != assert->node_idx);
911 set_bit(assert->node_idx, dlm->domain_map); 969 set_bit(assert->node_idx, dlm->domain_map);
970 clear_bit(assert->node_idx, dlm->exit_domain_map);
912 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
913 972
914 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1793 if (status) 1852 if (status)
1794 goto bail; 1853 goto bail;
1795 1854
1855 status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
1856 sizeof(struct dlm_exit_domain),
1857 dlm_begin_exit_domain_handler,
1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1861
1796bail: 1862bail:
1797 if (status) 1863 if (status)
1798 dlm_unregister_domain_handlers(dlm); 1864 dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84d166328cf7..11eefb8c12e9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2339 dlm_lockres_put(res); 2339 dlm_lockres_put(res);
2340} 2340}
2341 2341
2342/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 2342/*
2343 * if not. If 0, numlocks is set to the number of locks in the lockres. 2343 * A migrateable resource is one that is :
2344 * 1. locally mastered, and,
2345 * 2. zero local locks, and,
2346 * 3. one or more non-local locks, or, one or more references
2347 * Returns 1 if yes, 0 if not.
2344 */ 2348 */
2345static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2349static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2346 struct dlm_lock_resource *res, 2350 struct dlm_lock_resource *res)
2347 int *numlocks,
2348 int *hasrefs)
2349{ 2351{
2350 int ret; 2352 enum dlm_lockres_list idx;
2351 int i; 2353 int nonlocal = 0, node_ref;
2352 int count = 0;
2353 struct list_head *queue; 2354 struct list_head *queue;
2354 struct dlm_lock *lock; 2355 struct dlm_lock *lock;
2356 u64 cookie;
2355 2357
2356 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2357 2359
2358 *numlocks = 0; 2360 if (res->owner != dlm->node_num)
2359 *hasrefs = 0; 2361 return 0;
2360
2361 ret = -EINVAL;
2362 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2363 mlog(0, "cannot migrate lockres with unknown owner!\n");
2364 goto leave;
2365 }
2366
2367 if (res->owner != dlm->node_num) {
2368 mlog(0, "cannot migrate lockres this node doesn't own!\n");
2369 goto leave;
2370 }
2371 2362
2372 ret = 0; 2363 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2373 queue = &res->granted; 2364 queue = dlm_list_idx_to_ptr(res, idx);
2374 for (i = 0; i < 3; i++) {
2375 list_for_each_entry(lock, queue, list) { 2365 list_for_each_entry(lock, queue, list) {
2376 ++count; 2366 if (lock->ml.node != dlm->node_num) {
2377 if (lock->ml.node == dlm->node_num) { 2367 nonlocal++;
2378 mlog(0, "found a lock owned by this node still " 2368 continue;
2379 "on the %s queue! will not migrate this "
2380 "lockres\n", (i == 0 ? "granted" :
2381 (i == 1 ? "converting" :
2382 "blocked")));
2383 ret = -ENOTEMPTY;
2384 goto leave;
2385 } 2369 }
2370 cookie = be64_to_cpu(lock->ml.cookie);
2371 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
2372 "%s list\n", dlm->name, res->lockname.len,
2373 res->lockname.name,
2374 dlm_get_lock_cookie_node(cookie),
2375 dlm_get_lock_cookie_seq(cookie),
2376 dlm_list_in_text(idx));
2377 return 0;
2386 } 2378 }
2387 queue++;
2388 } 2379 }
2389 2380
2390 *numlocks = count; 2381 if (!nonlocal) {
2391 2382 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2392 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 2383 if (node_ref >= O2NM_MAX_NODES)
2393 if (count < O2NM_MAX_NODES) 2384 return 0;
2394 *hasrefs = 1; 2385 }
2395 2386
2396 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, 2387 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
2397 res->lockname.len, res->lockname.name, *numlocks, *hasrefs); 2388 res->lockname.name);
2398 2389
2399leave: 2390 return 1;
2400 return ret;
2401} 2391}
2402 2392
2403/* 2393/*
@@ -2406,8 +2396,7 @@ leave:
2406 2396
2407 2397
2408static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2398static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2409 struct dlm_lock_resource *res, 2399 struct dlm_lock_resource *res, u8 target)
2410 u8 target)
2411{ 2400{
2412 struct dlm_master_list_entry *mle = NULL; 2401 struct dlm_master_list_entry *mle = NULL;
2413 struct dlm_master_list_entry *oldmle = NULL; 2402 struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2416 const char *name; 2405 const char *name;
2417 unsigned int namelen; 2406 unsigned int namelen;
2418 int mle_added = 0; 2407 int mle_added = 0;
2419 int numlocks, hasrefs;
2420 int wake = 0; 2408 int wake = 0;
2421 2409
2422 if (!dlm_grab(dlm)) 2410 if (!dlm_grab(dlm))
2423 return -EINVAL; 2411 return -EINVAL;
2424 2412
2413 BUG_ON(target == O2NM_MAX_NODES);
2414
2425 name = res->lockname.name; 2415 name = res->lockname.name;
2426 namelen = res->lockname.len; 2416 namelen = res->lockname.len;
2427 2417
2428 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); 2418 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2429 2419 target);
2430 /*
2431 * ensure this lockres is a proper candidate for migration
2432 */
2433 spin_lock(&res->spinlock);
2434 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2435 if (ret < 0) {
2436 spin_unlock(&res->spinlock);
2437 goto leave;
2438 }
2439 spin_unlock(&res->spinlock);
2440
2441 /* no work to do */
2442 if (numlocks == 0 && !hasrefs)
2443 goto leave;
2444
2445 /*
2446 * preallocate up front
2447 * if this fails, abort
2448 */
2449 2420
2421 /* preallocate up front. if this fails, abort */
2450 ret = -ENOMEM; 2422 ret = -ENOMEM;
2451 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 2423 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2452 if (!mres) { 2424 if (!mres) {
@@ -2462,35 +2434,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2462 ret = 0; 2434 ret = 0;
2463 2435
2464 /* 2436 /*
2465 * find a node to migrate the lockres to
2466 */
2467
2468 spin_lock(&dlm->spinlock);
2469 /* pick a new node */
2470 if (!test_bit(target, dlm->domain_map) ||
2471 target >= O2NM_MAX_NODES) {
2472 target = dlm_pick_migration_target(dlm, res);
2473 }
2474 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2475 namelen, name, target);
2476
2477 if (target >= O2NM_MAX_NODES ||
2478 !test_bit(target, dlm->domain_map)) {
2479 /* target chosen is not alive */
2480 ret = -EINVAL;
2481 }
2482
2483 if (ret) {
2484 spin_unlock(&dlm->spinlock);
2485 goto fail;
2486 }
2487
2488 mlog(0, "continuing with target = %u\n", target);
2489
2490 /*
2491 * clear any existing master requests and 2437 * clear any existing master requests and
2492 * add the migration mle to the list 2438 * add the migration mle to the list
2493 */ 2439 */
2440 spin_lock(&dlm->spinlock);
2494 spin_lock(&dlm->master_lock); 2441 spin_lock(&dlm->master_lock);
2495 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2442 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2496 namelen, target, dlm->node_num); 2443 namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
2531 dlm_put_mle(mle); 2478 dlm_put_mle(mle);
2532 } else if (mle) { 2479 } else if (mle) {
2533 kmem_cache_free(dlm_mle_cache, mle); 2480 kmem_cache_free(dlm_mle_cache, mle);
2481 mle = NULL;
2534 } 2482 }
2535 goto leave; 2483 goto leave;
2536 } 2484 }
@@ -2652,69 +2600,52 @@ leave:
2652 if (wake) 2600 if (wake)
2653 wake_up(&res->wq); 2601 wake_up(&res->wq);
2654 2602
2655 /* TODO: cleanup */
2656 if (mres) 2603 if (mres)
2657 free_page((unsigned long)mres); 2604 free_page((unsigned long)mres);
2658 2605
2659 dlm_put(dlm); 2606 dlm_put(dlm);
2660 2607
2661 mlog(0, "returning %d\n", ret); 2608 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2609 name, target, ret);
2662 return ret; 2610 return ret;
2663} 2611}
2664 2612
2665#define DLM_MIGRATION_RETRY_MS 100 2613#define DLM_MIGRATION_RETRY_MS 100
2666 2614
2667/* Should be called only after beginning the domain leave process. 2615/*
2616 * Should be called only after beginning the domain leave process.
2668 * There should not be any remaining locks on nonlocal lock resources, 2617 * There should not be any remaining locks on nonlocal lock resources,
2669 * and there should be no local locks left on locally mastered resources. 2618 * and there should be no local locks left on locally mastered resources.
2670 * 2619 *
2671 * Called with the dlm spinlock held, may drop it to do migration, but 2620 * Called with the dlm spinlock held, may drop it to do migration, but
2672 * will re-acquire before exit. 2621 * will re-acquire before exit.
2673 * 2622 *
2674 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ 2623 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2624 */
2675int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2625int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2676{ 2626{
2677 int ret; 2627 int ret;
2678 int lock_dropped = 0; 2628 int lock_dropped = 0;
2679 int numlocks, hasrefs; 2629 u8 target = O2NM_MAX_NODES;
2630
2631 assert_spin_locked(&dlm->spinlock);
2680 2632
2681 spin_lock(&res->spinlock); 2633 spin_lock(&res->spinlock);
2682 if (res->owner != dlm->node_num) { 2634 if (dlm_is_lockres_migrateable(dlm, res))
2683 if (!__dlm_lockres_unused(res)) { 2635 target = dlm_pick_migration_target(dlm, res);
2684 mlog(ML_ERROR, "%s:%.*s: this node is not master, " 2636 spin_unlock(&res->spinlock);
2685 "trying to free this but locks remain\n",
2686 dlm->name, res->lockname.len, res->lockname.name);
2687 }
2688 spin_unlock(&res->spinlock);
2689 goto leave;
2690 }
2691 2637
2692 /* No need to migrate a lockres having no locks */ 2638 if (target == O2NM_MAX_NODES)
2693 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2694 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2695 spin_unlock(&res->spinlock);
2696 goto leave; 2639 goto leave;
2697 }
2698 spin_unlock(&res->spinlock);
2699 2640
2700 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 2641 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2701 spin_unlock(&dlm->spinlock); 2642 spin_unlock(&dlm->spinlock);
2702 lock_dropped = 1; 2643 lock_dropped = 1;
2703 while (1) { 2644 ret = dlm_migrate_lockres(dlm, res, target);
2704 ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); 2645 if (ret)
2705 if (ret >= 0) 2646 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2706 break; 2647 dlm->name, res->lockname.len, res->lockname.name,
2707 if (ret == -ENOTEMPTY) { 2648 target, ret);
2708 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
2709 res->lockname.len, res->lockname.name);
2710 BUG();
2711 }
2712
2713 mlog(0, "lockres %.*s: migrate failed, "
2714 "retrying\n", res->lockname.len,
2715 res->lockname.name);
2716 msleep(DLM_MIGRATION_RETRY_MS);
2717 }
2718 spin_lock(&dlm->spinlock); 2649 spin_lock(&dlm->spinlock);
2719leave: 2650leave:
2720 return lock_dropped; 2651 return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2898 } 2829 }
2899} 2830}
2900 2831
2901/* for now this is not too intelligent. we will 2832/*
2902 * need stats to make this do the right thing. 2833 * Pick a node to migrate the lock resource to. This function selects a
2903 * this just finds the first lock on one of the 2834 * potential target based first on the locks and then on refmap. It skips
2904 * queues and uses that node as the target. */ 2835 * nodes that are in the process of exiting the domain.
2836 */
2905static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 2837static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2906 struct dlm_lock_resource *res) 2838 struct dlm_lock_resource *res)
2907{ 2839{
2908 int i; 2840 enum dlm_lockres_list idx;
2909 struct list_head *queue = &res->granted; 2841 struct list_head *queue = &res->granted;
2910 struct dlm_lock *lock; 2842 struct dlm_lock *lock;
2911 int nodenum; 2843 int noderef;
2844 u8 nodenum = O2NM_MAX_NODES;
2912 2845
2913 assert_spin_locked(&dlm->spinlock); 2846 assert_spin_locked(&dlm->spinlock);
2847 assert_spin_locked(&res->spinlock);
2914 2848
2915 spin_lock(&res->spinlock); 2849 /* Go through all the locks */
2916 for (i=0; i<3; i++) { 2850 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2851 queue = dlm_list_idx_to_ptr(res, idx);
2917 list_for_each_entry(lock, queue, list) { 2852 list_for_each_entry(lock, queue, list) {
2918 /* up to the caller to make sure this node 2853 if (lock->ml.node == dlm->node_num)
2919 * is alive */ 2854 continue;
2920 if (lock->ml.node != dlm->node_num) { 2855 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2921 spin_unlock(&res->spinlock); 2856 continue;
2922 return lock->ml.node; 2857 nodenum = lock->ml.node;
2923 } 2858 goto bail;
2924 } 2859 }
2925 queue++;
2926 }
2927
2928 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2929 if (nodenum < O2NM_MAX_NODES) {
2930 spin_unlock(&res->spinlock);
2931 return nodenum;
2932 } 2860 }
2933 spin_unlock(&res->spinlock);
2934 mlog(0, "have not found a suitable target yet! checking domain map\n");
2935 2861
2936 /* ok now we're getting desperate. pick anyone alive. */ 2862 /* Go thru the refmap */
2937 nodenum = -1; 2863 noderef = -1;
2938 while (1) { 2864 while (1) {
2939 nodenum = find_next_bit(dlm->domain_map, 2865 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
2940 O2NM_MAX_NODES, nodenum+1); 2866 noderef + 1);
2941 mlog(0, "found %d in domain map\n", nodenum); 2867 if (noderef >= O2NM_MAX_NODES)
2942 if (nodenum >= O2NM_MAX_NODES)
2943 break; 2868 break;
2944 if (nodenum != dlm->node_num) { 2869 if (noderef == dlm->node_num)
2945 mlog(0, "picking %d\n", nodenum); 2870 continue;
2946 return nodenum; 2871 if (test_bit(noderef, dlm->exit_domain_map))
2947 } 2872 continue;
2873 nodenum = noderef;
2874 goto bail;
2948 } 2875 }
2949 2876
2950 mlog(0, "giving up. no master to migrate to\n"); 2877bail:
2951 return DLM_LOCK_RES_OWNER_UNKNOWN; 2878 return nodenum;
2952} 2879}
2953 2880
2954
2955
2956/* this is called by the new master once all lockres 2881/* this is called by the new master once all lockres
2957 * data has been received */ 2882 * data has been received */
2958static int dlm_do_migrate_request(struct dlm_ctxt *dlm, 2883static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f1beb6fc254d..7efab6d28a21 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2393 2393
2394 mlog(0, "node %u being removed from domain map!\n", idx); 2394 mlog(0, "node %u being removed from domain map!\n", idx);
2395 clear_bit(idx, dlm->domain_map); 2395 clear_bit(idx, dlm->domain_map);
2396 clear_bit(idx, dlm->exit_domain_map);
2396 /* wake up migration waiters if a node goes down. 2397 /* wake up migration waiters if a node goes down.
2397 * perhaps later we can genericize this for other waiters. */ 2398 * perhaps later we can genericize this for other waiters. */
2398 wake_up(&dlm->migration_wq); 2399 wake_up(&dlm->migration_wq);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8c5c0eddc365..b42076797049 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
88 * signifies a bast fired on the lock. 88 * signifies a bast fired on the lock.
89 */ 89 */
90#define DLMFS_CAPABILITIES "bast stackglue" 90#define DLMFS_CAPABILITIES "bast stackglue"
91extern int param_set_dlmfs_capabilities(const char *val, 91static int param_set_dlmfs_capabilities(const char *val,
92 struct kernel_param *kp) 92 struct kernel_param *kp)
93{ 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name); 94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6dc206..b1e35a392ca5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2670 .flock = ocfs2_flock, 2670 .flock = ocfs2_flock,
2671 .splice_read = ocfs2_file_splice_read, 2671 .splice_read = ocfs2_file_splice_read,
2672 .splice_write = ocfs2_file_splice_write, 2672 .splice_write = ocfs2_file_splice_write,
2673 .fallocate = ocfs2_fallocate,
2673}; 2674};
2674 2675
2675const struct file_operations ocfs2_dops_no_plocks = { 2676const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c5989eae..bc91072b7219 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h" 23#include "resize.h"
24#include "refcounttree.h" 24#include "refcounttree.h"
25#include "sysfile.h"
26#include "dir.h"
27#include "buffer_head_io.h"
28#include "suballoc.h"
29#include "move_extents.h"
25 30
26#include <linux/ext2_fs.h> 31#include <linux/ext2_fs.h>
27 32
@@ -35,31 +40,27 @@
35 * be -EFAULT. The error will be returned from the ioctl(2) call. It's 40 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
36 * just a best-effort to tell userspace that this request caused the error. 41 * just a best-effort to tell userspace that this request caused the error.
37 */ 42 */
38static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, 43static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
39 struct ocfs2_info_request __user *req) 44 struct ocfs2_info_request __user *req)
40{ 45{
41 kreq->ir_flags |= OCFS2_INFO_FL_ERROR; 46 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
42 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); 47 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
43} 48}
44 49
45#define o2info_set_request_error(a, b) \ 50static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
46 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
47
48static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
49{ 51{
50 req->ir_flags |= OCFS2_INFO_FL_FILLED; 52 req->ir_flags |= OCFS2_INFO_FL_FILLED;
51} 53}
52 54
53#define o2info_set_request_filled(a) \ 55static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
54 __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
55
56static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
57{ 56{
58 req->ir_flags &= ~OCFS2_INFO_FL_FILLED; 57 req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
59} 58}
60 59
61#define o2info_clear_request_filled(a) \ 60static inline int o2info_coherent(struct ocfs2_info_request *req)
62 __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) 61{
62 return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
63}
63 64
64static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 65static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
65{ 66{
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
153 154
154 oib.ib_blocksize = inode->i_sb->s_blocksize; 155 oib.ib_blocksize = inode->i_sb->s_blocksize;
155 156
156 o2info_set_request_filled(oib); 157 o2info_set_request_filled(&oib.ib_req);
157 158
158 if (o2info_to_user(oib, req)) 159 if (o2info_to_user(oib, req))
159 goto bail; 160 goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
161 status = 0; 162 status = 0;
162bail: 163bail:
163 if (status) 164 if (status)
164 o2info_set_request_error(oib, req); 165 o2info_set_request_error(&oib.ib_req, req);
165 166
166 return status; 167 return status;
167} 168}
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
178 179
179 oic.ic_clustersize = osb->s_clustersize; 180 oic.ic_clustersize = osb->s_clustersize;
180 181
181 o2info_set_request_filled(oic); 182 o2info_set_request_filled(&oic.ic_req);
182 183
183 if (o2info_to_user(oic, req)) 184 if (o2info_to_user(oic, req))
184 goto bail; 185 goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
186 status = 0; 187 status = 0;
187bail: 188bail:
188 if (status) 189 if (status)
189 o2info_set_request_error(oic, req); 190 o2info_set_request_error(&oic.ic_req, req);
190 191
191 return status; 192 return status;
192} 193}
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
203 204
204 oim.im_max_slots = osb->max_slots; 205 oim.im_max_slots = osb->max_slots;
205 206
206 o2info_set_request_filled(oim); 207 o2info_set_request_filled(&oim.im_req);
207 208
208 if (o2info_to_user(oim, req)) 209 if (o2info_to_user(oim, req))
209 goto bail; 210 goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
211 status = 0; 212 status = 0;
212bail: 213bail:
213 if (status) 214 if (status)
214 o2info_set_request_error(oim, req); 215 o2info_set_request_error(&oim.im_req, req);
215 216
216 return status; 217 return status;
217} 218}
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
228 229
229 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 230 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
230 231
231 o2info_set_request_filled(oil); 232 o2info_set_request_filled(&oil.il_req);
232 233
233 if (o2info_to_user(oil, req)) 234 if (o2info_to_user(oil, req))
234 goto bail; 235 goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
236 status = 0; 237 status = 0;
237bail: 238bail:
238 if (status) 239 if (status)
239 o2info_set_request_error(oil, req); 240 o2info_set_request_error(&oil.il_req, req);
240 241
241 return status; 242 return status;
242} 243}
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
253 254
254 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 255 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
255 256
256 o2info_set_request_filled(oiu); 257 o2info_set_request_filled(&oiu.iu_req);
257 258
258 if (o2info_to_user(oiu, req)) 259 if (o2info_to_user(oiu, req))
259 goto bail; 260 goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
261 status = 0; 262 status = 0;
262bail: 263bail:
263 if (status) 264 if (status)
264 o2info_set_request_error(oiu, req); 265 o2info_set_request_error(&oiu.iu_req, req);
265 266
266 return status; 267 return status;
267} 268}
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
280 oif.if_incompat_features = osb->s_feature_incompat; 281 oif.if_incompat_features = osb->s_feature_incompat;
281 oif.if_ro_compat_features = osb->s_feature_ro_compat; 282 oif.if_ro_compat_features = osb->s_feature_ro_compat;
282 283
283 o2info_set_request_filled(oif); 284 o2info_set_request_filled(&oif.if_req);
284 285
285 if (o2info_to_user(oif, req)) 286 if (o2info_to_user(oif, req))
286 goto bail; 287 goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
288 status = 0; 289 status = 0;
289bail: 290bail:
290 if (status) 291 if (status)
291 o2info_set_request_error(oif, req); 292 o2info_set_request_error(&oif.if_req, req);
292 293
293 return status; 294 return status;
294} 295}
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
305 306
306 oij.ij_journal_size = osb->journal->j_inode->i_size; 307 oij.ij_journal_size = osb->journal->j_inode->i_size;
307 308
308 o2info_set_request_filled(oij); 309 o2info_set_request_filled(&oij.ij_req);
309 310
310 if (o2info_to_user(oij, req)) 311 if (o2info_to_user(oij, req))
311 goto bail; 312 goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
313 status = 0; 314 status = 0;
314bail: 315bail:
315 if (status) 316 if (status)
316 o2info_set_request_error(oij, req); 317 o2info_set_request_error(&oij.ij_req, req);
318
319 return status;
320}
321
322int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
323 struct inode *inode_alloc, u64 blkno,
324 struct ocfs2_info_freeinode *fi, u32 slot)
325{
326 int status = 0, unlock = 0;
327
328 struct buffer_head *bh = NULL;
329 struct ocfs2_dinode *dinode_alloc = NULL;
330
331 if (inode_alloc)
332 mutex_lock(&inode_alloc->i_mutex);
333
334 if (o2info_coherent(&fi->ifi_req)) {
335 status = ocfs2_inode_lock(inode_alloc, &bh, 0);
336 if (status < 0) {
337 mlog_errno(status);
338 goto bail;
339 }
340 unlock = 1;
341 } else {
342 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
343 if (status < 0) {
344 mlog_errno(status);
345 goto bail;
346 }
347 }
348
349 dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
350
351 fi->ifi_stat[slot].lfi_total =
352 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
353 fi->ifi_stat[slot].lfi_free =
354 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
355 le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
356
357bail:
358 if (unlock)
359 ocfs2_inode_unlock(inode_alloc, 0);
360
361 if (inode_alloc)
362 mutex_unlock(&inode_alloc->i_mutex);
363
364 brelse(bh);
365
366 return status;
367}
368
369int ocfs2_info_handle_freeinode(struct inode *inode,
370 struct ocfs2_info_request __user *req)
371{
372 u32 i;
373 u64 blkno = -1;
374 char namebuf[40];
375 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
376 struct ocfs2_info_freeinode *oifi = NULL;
377 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
378 struct inode *inode_alloc = NULL;
379
380 oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
381 if (!oifi) {
382 status = -ENOMEM;
383 mlog_errno(status);
384 goto bail;
385 }
386
387 if (o2info_from_user(*oifi, req))
388 goto bail;
389
390 oifi->ifi_slotnum = osb->max_slots;
391
392 for (i = 0; i < oifi->ifi_slotnum; i++) {
393 if (o2info_coherent(&oifi->ifi_req)) {
394 inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
395 if (!inode_alloc) {
396 mlog(ML_ERROR, "unable to get alloc inode in "
397 "slot %u\n", i);
398 status = -EIO;
399 goto bail;
400 }
401 } else {
402 ocfs2_sprintf_system_inode_name(namebuf,
403 sizeof(namebuf),
404 type, i);
405 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
406 namebuf,
407 strlen(namebuf),
408 &blkno);
409 if (status < 0) {
410 status = -ENOENT;
411 goto bail;
412 }
413 }
414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418
419 iput(inode_alloc);
420 inode_alloc = NULL;
421 }
422
423 o2info_set_request_filled(&oifi->ifi_req);
424
425 if (o2info_to_user(*oifi, req))
426 goto bail;
427
428 status = 0;
429bail:
430 if (status)
431 o2info_set_request_error(&oifi->ifi_req, req);
432
433 kfree(oifi);
434
435 return status;
436}
437
438static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
439 unsigned int chunksize)
440{
441 int index;
442
443 index = __ilog2_u32(chunksize);
444 if (index >= OCFS2_INFO_MAX_HIST)
445 index = OCFS2_INFO_MAX_HIST - 1;
446
447 hist->fc_chunks[index]++;
448 hist->fc_clusters[index] += chunksize;
449}
450
451static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
452 unsigned int chunksize)
453{
454 if (chunksize > stats->ffs_max)
455 stats->ffs_max = chunksize;
456
457 if (chunksize < stats->ffs_min)
458 stats->ffs_min = chunksize;
459
460 stats->ffs_avg += chunksize;
461 stats->ffs_free_chunks_real++;
462}
463
464void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
465 unsigned int chunksize)
466{
467 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
468 o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
469}
470
471int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
472 struct inode *gb_inode,
473 struct ocfs2_dinode *gb_dinode,
474 struct ocfs2_chain_rec *rec,
475 struct ocfs2_info_freefrag *ffg,
476 u32 chunks_in_group)
477{
478 int status = 0, used;
479 u64 blkno;
480
481 struct buffer_head *bh = NULL;
482 struct ocfs2_group_desc *bg = NULL;
483
484 unsigned int max_bits, num_clusters;
485 unsigned int offset = 0, cluster, chunk;
486 unsigned int chunk_free, last_chunksize = 0;
487
488 if (!le32_to_cpu(rec->c_free))
489 goto bail;
490
491 do {
492 if (!bg)
493 blkno = le64_to_cpu(rec->c_blkno);
494 else
495 blkno = le64_to_cpu(bg->bg_next_group);
496
497 if (bh) {
498 brelse(bh);
499 bh = NULL;
500 }
501
502 if (o2info_coherent(&ffg->iff_req))
503 status = ocfs2_read_group_descriptor(gb_inode,
504 gb_dinode,
505 blkno, &bh);
506 else
507 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
508
509 if (status < 0) {
510 mlog(ML_ERROR, "Can't read the group descriptor # "
511 "%llu from device.", (unsigned long long)blkno);
512 status = -EIO;
513 goto bail;
514 }
515
516 bg = (struct ocfs2_group_desc *)bh->b_data;
517
518 if (!le16_to_cpu(bg->bg_free_bits_count))
519 continue;
520
521 max_bits = le16_to_cpu(bg->bg_bits);
522 offset = 0;
523
524 for (chunk = 0; chunk < chunks_in_group; chunk++) {
525 /*
526 * last chunk may be not an entire one.
527 */
528 if ((offset + ffg->iff_chunksize) > max_bits)
529 num_clusters = max_bits - offset;
530 else
531 num_clusters = ffg->iff_chunksize;
532
533 chunk_free = 0;
534 for (cluster = 0; cluster < num_clusters; cluster++) {
535 used = ocfs2_test_bit(offset,
536 (unsigned long *)bg->bg_bitmap);
537 /*
538 * - chunk_free counts free clusters in #N chunk.
539 * - last_chunksize records the size(in) clusters
540 * for the last real free chunk being counted.
541 */
542 if (!used) {
543 last_chunksize++;
544 chunk_free++;
545 }
546
547 if (used && last_chunksize) {
548 ocfs2_info_update_ffg(ffg,
549 last_chunksize);
550 last_chunksize = 0;
551 }
552
553 offset++;
554 }
555
556 if (chunk_free == ffg->iff_chunksize)
557 ffg->iff_ffs.ffs_free_chunks++;
558 }
559
560 /*
561 * need to update the info for last free chunk.
562 */
563 if (last_chunksize)
564 ocfs2_info_update_ffg(ffg, last_chunksize);
565
566 } while (le64_to_cpu(bg->bg_next_group));
567
568bail:
569 brelse(bh);
570
571 return status;
572}
573
574int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
575 struct inode *gb_inode, u64 blkno,
576 struct ocfs2_info_freefrag *ffg)
577{
578 u32 chunks_in_group;
579 int status = 0, unlock = 0, i;
580
581 struct buffer_head *bh = NULL;
582 struct ocfs2_chain_list *cl = NULL;
583 struct ocfs2_chain_rec *rec = NULL;
584 struct ocfs2_dinode *gb_dinode = NULL;
585
586 if (gb_inode)
587 mutex_lock(&gb_inode->i_mutex);
588
589 if (o2info_coherent(&ffg->iff_req)) {
590 status = ocfs2_inode_lock(gb_inode, &bh, 0);
591 if (status < 0) {
592 mlog_errno(status);
593 goto bail;
594 }
595 unlock = 1;
596 } else {
597 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
598 if (status < 0) {
599 mlog_errno(status);
600 goto bail;
601 }
602 }
603
604 gb_dinode = (struct ocfs2_dinode *)bh->b_data;
605 cl = &(gb_dinode->id2.i_chain);
606
607 /*
608 * Chunksize(in) clusters from userspace should be
609 * less than clusters in a group.
610 */
611 if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
612 status = -EINVAL;
613 goto bail;
614 }
615
616 memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
617
618 ffg->iff_ffs.ffs_min = ~0U;
619 ffg->iff_ffs.ffs_clusters =
620 le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
621 ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
622 le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
623
624 chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
625
626 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
627 rec = &(cl->cl_recs[i]);
628 status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
629 gb_dinode,
630 rec, ffg,
631 chunks_in_group);
632 if (status)
633 goto bail;
634 }
635
636 if (ffg->iff_ffs.ffs_free_chunks_real)
637 ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
638 ffg->iff_ffs.ffs_free_chunks_real);
639bail:
640 if (unlock)
641 ocfs2_inode_unlock(gb_inode, 0);
642
643 if (gb_inode)
644 mutex_unlock(&gb_inode->i_mutex);
645
646 if (gb_inode)
647 iput(gb_inode);
648
649 brelse(bh);
650
651 return status;
652}
653
654int ocfs2_info_handle_freefrag(struct inode *inode,
655 struct ocfs2_info_request __user *req)
656{
657 u64 blkno = -1;
658 char namebuf[40];
659 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
660
661 struct ocfs2_info_freefrag *oiff;
662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
663 struct inode *gb_inode = NULL;
664
665 oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
666 if (!oiff) {
667 status = -ENOMEM;
668 mlog_errno(status);
669 goto bail;
670 }
671
672 if (o2info_from_user(*oiff, req))
673 goto bail;
674 /*
675 * chunksize from userspace should be power of 2.
676 */
677 if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
678 (!oiff->iff_chunksize)) {
679 status = -EINVAL;
680 goto bail;
681 }
682
683 if (o2info_coherent(&oiff->iff_req)) {
684 gb_inode = ocfs2_get_system_file_inode(osb, type,
685 OCFS2_INVALID_SLOT);
686 if (!gb_inode) {
687 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
688 status = -EIO;
689 goto bail;
690 }
691 } else {
692 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
693 OCFS2_INVALID_SLOT);
694 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
695 namebuf,
696 strlen(namebuf),
697 &blkno);
698 if (status < 0) {
699 status = -ENOENT;
700 goto bail;
701 }
702 }
703
704 status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
705 if (status < 0)
706 goto bail;
707
708 o2info_set_request_filled(&oiff->iff_req);
709
710 if (o2info_to_user(*oiff, req))
711 goto bail;
712
713 status = 0;
714bail:
715 if (status)
716 o2info_set_request_error(&oiff->iff_req, req);
717
718 kfree(oiff);
317 719
318 return status; 720 return status;
319} 721}
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
327 if (o2info_from_user(oir, req)) 729 if (o2info_from_user(oir, req))
328 goto bail; 730 goto bail;
329 731
330 o2info_clear_request_filled(oir); 732 o2info_clear_request_filled(&oir);
331 733
332 if (o2info_to_user(oir, req)) 734 if (o2info_to_user(oir, req))
333 goto bail; 735 goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
335 status = 0; 737 status = 0;
336bail: 738bail:
337 if (status) 739 if (status)
338 o2info_set_request_error(oir, req); 740 o2info_set_request_error(&oir, req);
339 741
340 return status; 742 return status;
341} 743}
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
389 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) 791 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
390 status = ocfs2_info_handle_journal_size(inode, req); 792 status = ocfs2_info_handle_journal_size(inode, req);
391 break; 793 break;
794 case OCFS2_INFO_FREEINODE:
795 if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
796 status = ocfs2_info_handle_freeinode(inode, req);
797 break;
798 case OCFS2_INFO_FREEFRAG:
799 if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
800 status = ocfs2_info_handle_freefrag(inode, req);
801 break;
392 default: 802 default:
393 status = ocfs2_info_handle_unknown(inode, req); 803 status = ocfs2_info_handle_unknown(inode, req);
394 break; 804 break;
@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
542 return -EFAULT; 952 return -EFAULT;
543 953
544 return ocfs2_info_handle(inode, &info, 0); 954 return ocfs2_info_handle(inode, &info, 0);
955 case FITRIM:
956 {
957 struct super_block *sb = inode->i_sb;
958 struct fstrim_range range;
959 int ret = 0;
960
961 if (!capable(CAP_SYS_ADMIN))
962 return -EPERM;
963
964 if (copy_from_user(&range, (struct fstrim_range *)arg,
965 sizeof(range)))
966 return -EFAULT;
967
968 ret = ocfs2_trim_fs(sb, &range);
969 if (ret < 0)
970 return ret;
971
972 if (copy_to_user((struct fstrim_range *)arg, &range,
973 sizeof(range)))
974 return -EFAULT;
975
976 return 0;
977 }
978 case OCFS2_IOC_MOVE_EXT:
979 return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
545 default: 980 default:
546 return -ENOTTY; 981 return -ENOTTY;
547 } 982 }
@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
569 case OCFS2_IOC_GROUP_EXTEND: 1004 case OCFS2_IOC_GROUP_EXTEND:
570 case OCFS2_IOC_GROUP_ADD: 1005 case OCFS2_IOC_GROUP_ADD:
571 case OCFS2_IOC_GROUP_ADD64: 1006 case OCFS2_IOC_GROUP_ADD64:
1007 case FITRIM:
572 break; 1008 break;
573 case OCFS2_IOC_REFLINK: 1009 case OCFS2_IOC_REFLINK:
574 if (copy_from_user(&args, (struct reflink_arguments *)arg, 1010 if (copy_from_user(&args, (struct reflink_arguments *)arg,
@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
584 return -EFAULT; 1020 return -EFAULT;
585 1021
586 return ocfs2_info_handle(inode, &info, 1); 1022 return ocfs2_info_handle(inode, &info, 1);
1023 case OCFS2_IOC_MOVE_EXT:
1024 break;
587 default: 1025 default:
588 return -ENOIOCTLCMD; 1026 return -ENOIOCTLCMD;
589 } 1027 }
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000000..4c5488468c14
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1153 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.c
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#include <linux/fs.h>
18#include <linux/types.h>
19#include <linux/mount.h>
20#include <linux/swap.h>
21
22#include <cluster/masklog.h>
23
24#include "ocfs2.h"
25#include "ocfs2_ioctl.h"
26
27#include "alloc.h"
28#include "aops.h"
29#include "dlmglue.h"
30#include "extent_map.h"
31#include "inode.h"
32#include "journal.h"
33#include "suballoc.h"
34#include "uptodate.h"
35#include "super.h"
36#include "dir.h"
37#include "buffer_head_io.h"
38#include "sysfile.h"
39#include "suballoc.h"
40#include "refcounttree.h"
41#include "move_extents.h"
42
43struct ocfs2_move_extents_context {
44 struct inode *inode;
45 struct file *file;
46 int auto_defrag;
47 int partial;
48 int credits;
49 u32 new_phys_cpos;
50 u32 clusters_moved;
51 u64 refcount_loc;
52 struct ocfs2_move_extents *range;
53 struct ocfs2_extent_tree et;
54 struct ocfs2_alloc_context *meta_ac;
55 struct ocfs2_alloc_context *data_ac;
56 struct ocfs2_cached_dealloc_ctxt dealloc;
57};
58
59static int __ocfs2_move_extent(handle_t *handle,
60 struct ocfs2_move_extents_context *context,
61 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62 int ext_flags)
63{
64 int ret = 0, index;
65 struct inode *inode = context->inode;
66 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67 struct ocfs2_extent_rec *rec, replace_rec;
68 struct ocfs2_path *path = NULL;
69 struct ocfs2_extent_list *el;
70 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74 p_cpos, new_p_cpos, len);
75 if (ret) {
76 mlog_errno(ret);
77 goto out;
78 }
79
80 memset(&replace_rec, 0, sizeof(replace_rec));
81 replace_rec.e_cpos = cpu_to_le32(cpos);
82 replace_rec.e_leaf_clusters = cpu_to_le16(len);
83 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84 new_p_cpos));
85
86 path = ocfs2_new_path_from_et(&context->et);
87 if (!path) {
88 ret = -ENOMEM;
89 mlog_errno(ret);
90 goto out;
91 }
92
93 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94 if (ret) {
95 mlog_errno(ret);
96 goto out;
97 }
98
99 el = path_leaf_el(path);
100
101 index = ocfs2_search_extent_list(el, cpos);
102 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103 ocfs2_error(inode->i_sb,
104 "Inode %llu has an extent at cpos %u which can no "
105 "longer be found.\n",
106 (unsigned long long)ino, cpos);
107 ret = -EROFS;
108 goto out;
109 }
110
111 rec = &el->l_recs[index];
112
113 BUG_ON(ext_flags != rec->e_flags);
114 /*
115 * after moving/defraging to new location, the extent is not going
116 * to be refcounted anymore.
117 */
118 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121 context->et.et_root_bh,
122 OCFS2_JOURNAL_ACCESS_WRITE);
123 if (ret) {
124 mlog_errno(ret);
125 goto out;
126 }
127
128 ret = ocfs2_split_extent(handle, &context->et, path, index,
129 &replace_rec, context->meta_ac,
130 &context->dealloc);
131 if (ret) {
132 mlog_errno(ret);
133 goto out;
134 }
135
136 ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138 context->new_phys_cpos = new_p_cpos;
139
140 /*
141 * need I to append truncate log for old clusters?
142 */
143 if (old_blkno) {
144 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145 ret = ocfs2_decrease_refcount(inode, handle,
146 ocfs2_blocks_to_clusters(osb->sb,
147 old_blkno),
148 len, context->meta_ac,
149 &context->dealloc, 1);
150 else
151 ret = ocfs2_truncate_log_append(osb, handle,
152 old_blkno, len);
153 }
154
155out:
156 return ret;
157}
158
159/*
160 * lock allocators, and reserving appropriate number of bits for
161 * meta blocks and data clusters.
162 *
163 * in some cases, we don't need to reserve clusters, just let data_ac
164 * be NULL.
165 */
166static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167 struct ocfs2_extent_tree *et,
168 u32 clusters_to_move,
169 u32 extents_to_split,
170 struct ocfs2_alloc_context **meta_ac,
171 struct ocfs2_alloc_context **data_ac,
172 int extra_blocks,
173 int *credits)
174{
175 int ret, num_free_extents;
176 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178
179 num_free_extents = ocfs2_num_free_extents(osb, et);
180 if (num_free_extents < 0) {
181 ret = num_free_extents;
182 mlog_errno(ret);
183 goto out;
184 }
185
186 if (!num_free_extents ||
187 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189
190 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191 if (ret) {
192 mlog_errno(ret);
193 goto out;
194 }
195
196 if (data_ac) {
197 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198 if (ret) {
199 mlog_errno(ret);
200 goto out;
201 }
202 }
203
204 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
205 clusters_to_move + 2);
206
207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
208 extra_blocks, clusters_to_move, *credits);
209out:
210 if (ret) {
211 if (*meta_ac) {
212 ocfs2_free_alloc_context(*meta_ac);
213 *meta_ac = NULL;
214 }
215 }
216
217 return ret;
218}
219
220/*
221 * Using one journal handle to guarantee the data consistency in case
222 * crash happens anywhere.
223 *
224 * XXX: defrag can end up with finishing partial extent as requested,
225 * due to not enough contiguous clusters can be found in allocator.
226 */
227static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
229{
230 int ret, credits = 0, extra_blocks = 0, partial = context->partial;
231 handle_t *handle;
232 struct inode *inode = context->inode;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234 struct inode *tl_inode = osb->osb_tl_inode;
235 struct ocfs2_refcount_tree *ref_tree = NULL;
236 u32 new_phys_cpos, new_len;
237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
238
239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
240
241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
242 OCFS2_HAS_REFCOUNT_FL));
243
244 BUG_ON(!context->refcount_loc);
245
246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
247 &ref_tree, NULL);
248 if (ret) {
249 mlog_errno(ret);
250 return ret;
251 }
252
253 ret = ocfs2_prepare_refcount_change_for_del(inode,
254 context->refcount_loc,
255 phys_blkno,
256 *len,
257 &credits,
258 &extra_blocks);
259 if (ret) {
260 mlog_errno(ret);
261 goto out;
262 }
263 }
264
265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
266 &context->meta_ac,
267 &context->data_ac,
268 extra_blocks, &credits);
269 if (ret) {
270 mlog_errno(ret);
271 goto out;
272 }
273
274 /*
275 * should be using allocation reservation strategy there?
276 *
277 * if (context->data_ac)
278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
279 */
280
281 mutex_lock(&tl_inode->i_mutex);
282
283 if (ocfs2_truncate_log_needs_flush(osb)) {
284 ret = __ocfs2_flush_truncate_log(osb);
285 if (ret < 0) {
286 mlog_errno(ret);
287 goto out_unlock_mutex;
288 }
289 }
290
291 handle = ocfs2_start_trans(osb, credits);
292 if (IS_ERR(handle)) {
293 ret = PTR_ERR(handle);
294 mlog_errno(ret);
295 goto out_unlock_mutex;
296 }
297
298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
299 &new_phys_cpos, &new_len);
300 if (ret) {
301 mlog_errno(ret);
302 goto out_commit;
303 }
304
305 /*
306 * allowing partial extent moving is kind of 'pros and cons', it makes
307 * whole defragmentation less likely to fail, on the contrary, the bad
308 * thing is it may make the fs even more fragmented after moving, let
309 * userspace make a good decision here.
310 */
311 if (new_len != *len) {
312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
313 if (!partial) {
314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
315 ret = -ENOSPC;
316 goto out_commit;
317 }
318 }
319
320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
321 phys_cpos, new_phys_cpos);
322
323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
324 new_phys_cpos, ext_flags);
325 if (ret)
326 mlog_errno(ret);
327
328 if (partial && (new_len != *len))
329 *len = new_len;
330
331 /*
332 * Here we should write the new page out first if we are
333 * in write-back mode.
334 */
335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
336 if (ret)
337 mlog_errno(ret);
338
339out_commit:
340 ocfs2_commit_trans(osb, handle);
341
342out_unlock_mutex:
343 mutex_unlock(&tl_inode->i_mutex);
344
345 if (context->data_ac) {
346 ocfs2_free_alloc_context(context->data_ac);
347 context->data_ac = NULL;
348 }
349
350 if (context->meta_ac) {
351 ocfs2_free_alloc_context(context->meta_ac);
352 context->meta_ac = NULL;
353 }
354
355out:
356 if (ref_tree)
357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
358
359 return ret;
360}
361
362/*
363 * find the victim alloc group, where #blkno fits.
364 */
365static int ocfs2_find_victim_alloc_group(struct inode *inode,
366 u64 vict_blkno,
367 int type, int slot,
368 int *vict_bit,
369 struct buffer_head **ret_bh)
370{
371 int ret, i, blocks_per_unit = 1;
372 u64 blkno;
373 char namebuf[40];
374
375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
377 struct ocfs2_chain_list *cl;
378 struct ocfs2_chain_rec *rec;
379 struct ocfs2_dinode *ac_dinode;
380 struct ocfs2_group_desc *bg;
381
382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
384 strlen(namebuf), &blkno);
385 if (ret) {
386 ret = -ENOENT;
387 goto out;
388 }
389
390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
391 if (ret) {
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
397 cl = &(ac_dinode->id2.i_chain);
398 rec = &(cl->cl_recs[0]);
399
400 if (type == GLOBAL_BITMAP_SYSTEM_INODE)
401 blocks_per_unit <<= (osb->s_clustersize_bits -
402 inode->i_sb->s_blocksize_bits);
403 /*
404 * 'vict_blkno' was out of the valid range.
405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
408 blocks_per_unit))) {
409 ret = -EINVAL;
410 goto out;
411 }
412
413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
414
415 rec = &(cl->cl_recs[i]);
416 if (!rec)
417 continue;
418
419 bg = NULL;
420
421 do {
422 if (!bg)
423 blkno = le64_to_cpu(rec->c_blkno);
424 else
425 blkno = le64_to_cpu(bg->bg_next_group);
426
427 if (gd_bh) {
428 brelse(gd_bh);
429 gd_bh = NULL;
430 }
431
432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
433 if (ret) {
434 mlog_errno(ret);
435 goto out;
436 }
437
438 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
439
440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
441 le16_to_cpu(bg->bg_bits))) {
442
443 *ret_bh = gd_bh;
444 *vict_bit = (vict_blkno - blkno) /
445 blocks_per_unit;
446 mlog(0, "find the victim group: #%llu, "
447 "total_bits: %u, vict_bit: %u\n",
448 blkno, le16_to_cpu(bg->bg_bits),
449 *vict_bit);
450 goto out;
451 }
452
453 } while (le64_to_cpu(bg->bg_next_group));
454 }
455
456 ret = -EINVAL;
457out:
458 brelse(ac_bh);
459
460 /*
461 * caller has to release the gd_bh properly.
462 */
463 return ret;
464}
465
466/*
467 * XXX: helper to validate and adjust moving goal.
468 */
469static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
470 struct ocfs2_move_extents *range)
471{
472 int ret, goal_bit = 0;
473
474 struct buffer_head *gd_bh = NULL;
475 struct ocfs2_group_desc *bg;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 int c_to_b = 1 << (osb->s_clustersize_bits -
478 inode->i_sb->s_blocksize_bits);
479
480 /*
481 * validate goal sits within global_bitmap, and return the victim
482 * group desc
483 */
484 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
485 GLOBAL_BITMAP_SYSTEM_INODE,
486 OCFS2_INVALID_SLOT,
487 &goal_bit, &gd_bh);
488 if (ret)
489 goto out;
490
491 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
492
493 /*
494 * make goal become cluster aligned.
495 */
496 if (range->me_goal % c_to_b)
497 range->me_goal = range->me_goal / c_to_b * c_to_b;
498
499 /*
500 * moving goal is not allowd to start with a group desc blok(#0 blk)
501 * let's compromise to the latter cluster.
502 */
503 if (range->me_goal == le64_to_cpu(bg->bg_blkno))
504 range->me_goal += c_to_b;
505
506 /*
507 * movement is not gonna cross two groups.
508 */
509 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
510 range->me_len) {
511 ret = -EINVAL;
512 goto out;
513 }
514 /*
515 * more exact validations/adjustments will be performed later during
516 * moving operation for each extent range.
517 */
518 mlog(0, "extents get ready to be moved to #%llu block\n",
519 range->me_goal);
520
521out:
522 brelse(gd_bh);
523
524 return ret;
525}
526
527static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
528 int *goal_bit, u32 move_len, u32 max_hop,
529 u32 *phys_cpos)
530{
531 int i, used, last_free_bits = 0, base_bit = *goal_bit;
532 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
533 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
534 le64_to_cpu(gd->bg_blkno));
535
536 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
537
538 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
539 if (used) {
540 /*
541 * we even tried searching the free chunk by jumping
542 * a 'max_hop' distance, but still failed.
543 */
544 if ((i - base_bit) > max_hop) {
545 *phys_cpos = 0;
546 break;
547 }
548
549 if (last_free_bits)
550 last_free_bits = 0;
551
552 continue;
553 } else
554 last_free_bits++;
555
556 if (last_free_bits == move_len) {
557 *goal_bit = i;
558 *phys_cpos = base_cpos + i;
559 break;
560 }
561 }
562
563 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
564}
565
566static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
567 handle_t *handle,
568 struct buffer_head *di_bh,
569 u32 num_bits,
570 u16 chain)
571{
572 int ret;
573 u32 tmp_used;
574 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
575 struct ocfs2_chain_list *cl =
576 (struct ocfs2_chain_list *) &di->id2.i_chain;
577
578 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
579 OCFS2_JOURNAL_ACCESS_WRITE);
580 if (ret < 0) {
581 mlog_errno(ret);
582 goto out;
583 }
584
585 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
586 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
587 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
588 ocfs2_journal_dirty(handle, di_bh);
589
590out:
591 return ret;
592}
593
594static inline int ocfs2_block_group_set_bits(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_group_desc *bg,
597 struct buffer_head *group_bh,
598 unsigned int bit_off,
599 unsigned int num_bits)
600{
601 int status;
602 void *bitmap = bg->bg_bitmap;
603 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
604
605 /* All callers get the descriptor via
606 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
607 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
608 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
609
610 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
611 num_bits);
612
613 if (ocfs2_is_cluster_bitmap(alloc_inode))
614 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
615
616 status = ocfs2_journal_access_gd(handle,
617 INODE_CACHE(alloc_inode),
618 group_bh,
619 journal_type);
620 if (status < 0) {
621 mlog_errno(status);
622 goto bail;
623 }
624
625 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
626 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
627 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
628 " count %u but claims %u are freed. num_bits %d",
629 (unsigned long long)le64_to_cpu(bg->bg_blkno),
630 le16_to_cpu(bg->bg_bits),
631 le16_to_cpu(bg->bg_free_bits_count), num_bits);
632 return -EROFS;
633 }
634 while (num_bits--)
635 ocfs2_set_bit(bit_off++, bitmap);
636
637 ocfs2_journal_dirty(handle, group_bh);
638
639bail:
640 return status;
641}
642
643static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
644 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
645 u32 len, int ext_flags)
646{
647 int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
648 handle_t *handle;
649 struct inode *inode = context->inode;
650 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
651 struct inode *tl_inode = osb->osb_tl_inode;
652 struct inode *gb_inode = NULL;
653 struct buffer_head *gb_bh = NULL;
654 struct buffer_head *gd_bh = NULL;
655 struct ocfs2_group_desc *gd;
656 struct ocfs2_refcount_tree *ref_tree = NULL;
657 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
658 context->range->me_threshold);
659 u64 phys_blkno, new_phys_blkno;
660
661 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
662
663 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
664
665 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
666 OCFS2_HAS_REFCOUNT_FL));
667
668 BUG_ON(!context->refcount_loc);
669
670 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
671 &ref_tree, NULL);
672 if (ret) {
673 mlog_errno(ret);
674 return ret;
675 }
676
677 ret = ocfs2_prepare_refcount_change_for_del(inode,
678 context->refcount_loc,
679 phys_blkno,
680 len,
681 &credits,
682 &extra_blocks);
683 if (ret) {
684 mlog_errno(ret);
685 goto out;
686 }
687 }
688
689 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
690 &context->meta_ac,
691 NULL, extra_blocks, &credits);
692 if (ret) {
693 mlog_errno(ret);
694 goto out;
695 }
696
697 /*
698 * need to count 2 extra credits for global_bitmap inode and
699 * group descriptor.
700 */
701 credits += OCFS2_INODE_UPDATE_CREDITS + 1;
702
703 /*
704 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
705 * logic, while we still need to lock the global_bitmap.
706 */
707 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
708 OCFS2_INVALID_SLOT);
709 if (!gb_inode) {
710 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
711 ret = -EIO;
712 goto out;
713 }
714
715 mutex_lock(&gb_inode->i_mutex);
716
717 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
718 if (ret) {
719 mlog_errno(ret);
720 goto out_unlock_gb_mutex;
721 }
722
723 mutex_lock(&tl_inode->i_mutex);
724
725 handle = ocfs2_start_trans(osb, credits);
726 if (IS_ERR(handle)) {
727 ret = PTR_ERR(handle);
728 mlog_errno(ret);
729 goto out_unlock_tl_inode;
730 }
731
732 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
733 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
734 GLOBAL_BITMAP_SYSTEM_INODE,
735 OCFS2_INVALID_SLOT,
736 &goal_bit, &gd_bh);
737 if (ret) {
738 mlog_errno(ret);
739 goto out_commit;
740 }
741
742 /*
743 * probe the victim cluster group to find a proper
744 * region to fit wanted movement, it even will perfrom
745 * a best-effort attempt by compromising to a threshold
746 * around the goal.
747 */
748 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
749 new_phys_cpos);
750 if (!new_phys_cpos) {
751 ret = -ENOSPC;
752 goto out_commit;
753 }
754
755 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
756 *new_phys_cpos, ext_flags);
757 if (ret) {
758 mlog_errno(ret);
759 goto out_commit;
760 }
761
762 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
763 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
764 le16_to_cpu(gd->bg_chain));
765 if (ret) {
766 mlog_errno(ret);
767 goto out_commit;
768 }
769
770 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
771 goal_bit, len);
772 if (ret)
773 mlog_errno(ret);
774
775 /*
776 * Here we should write the new page out first if we are
777 * in write-back mode.
778 */
779 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
780 if (ret)
781 mlog_errno(ret);
782
783out_commit:
784 ocfs2_commit_trans(osb, handle);
785 brelse(gd_bh);
786
787out_unlock_tl_inode:
788 mutex_unlock(&tl_inode->i_mutex);
789
790 ocfs2_inode_unlock(gb_inode, 1);
791out_unlock_gb_mutex:
792 mutex_unlock(&gb_inode->i_mutex);
793 brelse(gb_bh);
794 iput(gb_inode);
795
796out:
797 if (context->meta_ac) {
798 ocfs2_free_alloc_context(context->meta_ac);
799 context->meta_ac = NULL;
800 }
801
802 if (ref_tree)
803 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
804
805 return ret;
806}
807
808/*
809 * Helper to calculate the defraging length in one run according to threshold.
810 */
811static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
812 u32 threshold, int *skip)
813{
814 if ((*alloc_size + *len_defraged) < threshold) {
815 /*
816 * proceed defragmentation until we meet the thresh
817 */
818 *len_defraged += *alloc_size;
819 } else if (*len_defraged == 0) {
820 /*
821 * XXX: skip a large extent.
822 */
823 *skip = 1;
824 } else {
825 /*
826 * split this extent to coalesce with former pieces as
827 * to reach the threshold.
828 *
829 * we're done here with one cycle of defragmentation
830 * in a size of 'thresh', resetting 'len_defraged'
831 * forces a new defragmentation.
832 */
833 *alloc_size = threshold - *len_defraged;
834 *len_defraged = 0;
835 }
836}
837
838static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
839 struct ocfs2_move_extents_context *context)
840{
841 int ret = 0, flags, do_defrag, skip = 0;
842 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
843 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
844
845 struct inode *inode = context->inode;
846 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
847 struct ocfs2_move_extents *range = context->range;
848 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
849
850 if ((inode->i_size == 0) || (range->me_len == 0))
851 return 0;
852
853 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
854 return 0;
855
856 context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
857
858 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
859 ocfs2_init_dealloc_ctxt(&context->dealloc);
860
861 /*
862 * TO-DO XXX:
863 *
864 * - xattr extents.
865 */
866
867 do_defrag = context->auto_defrag;
868
869 /*
870 * extents moving happens in unit of clusters, for the sake
871 * of simplicity, we may ignore two clusters where 'byte_start'
872 * and 'byte_start + len' were within.
873 */
874 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
875 len_to_move = (range->me_start + range->me_len) >>
876 osb->s_clustersize_bits;
877 if (len_to_move >= move_start)
878 len_to_move -= move_start;
879 else
880 len_to_move = 0;
881
882 if (do_defrag) {
883 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
884 if (defrag_thresh <= 1)
885 goto done;
886 } else
887 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
888 range->me_goal);
889
890 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
891 "thresh: %u\n",
892 (unsigned long long)OCFS2_I(inode)->ip_blkno,
893 (unsigned long long)range->me_start,
894 (unsigned long long)range->me_len,
895 move_start, len_to_move, defrag_thresh);
896
897 cpos = move_start;
898 while (len_to_move) {
899 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
900 &flags);
901 if (ret) {
902 mlog_errno(ret);
903 goto out;
904 }
905
906 if (alloc_size > len_to_move)
907 alloc_size = len_to_move;
908
909 /*
910 * XXX: how to deal with a hole:
911 *
912 * - skip the hole of course
913 * - force a new defragmentation
914 */
915 if (!phys_cpos) {
916 if (do_defrag)
917 len_defraged = 0;
918
919 goto next;
920 }
921
922 if (do_defrag) {
923 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
924 defrag_thresh, &skip);
925 /*
926 * skip large extents
927 */
928 if (skip) {
929 skip = 0;
930 goto next;
931 }
932
933 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
934 "alloc_size: %u, len_defraged: %u\n",
935 cpos, phys_cpos, alloc_size, len_defraged);
936
937 ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
938 &alloc_size, flags);
939 } else {
940 ret = ocfs2_move_extent(context, cpos, phys_cpos,
941 &new_phys_cpos, alloc_size,
942 flags);
943
944 new_phys_cpos += alloc_size;
945 }
946
947 if (ret < 0) {
948 mlog_errno(ret);
949 goto out;
950 }
951
952 context->clusters_moved += alloc_size;
953next:
954 cpos += alloc_size;
955 len_to_move -= alloc_size;
956 }
957
958done:
959 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
960
961out:
962 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
963 context->clusters_moved);
964 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
965 context->new_phys_cpos);
966
967 ocfs2_schedule_truncate_log_flush(osb, 1);
968 ocfs2_run_deallocs(osb, &context->dealloc);
969
970 return ret;
971}
972
973static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
974{
975 int status;
976 handle_t *handle;
977 struct inode *inode = context->inode;
978 struct ocfs2_dinode *di;
979 struct buffer_head *di_bh = NULL;
980 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
981
982 if (!inode)
983 return -ENOENT;
984
985 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
986 return -EROFS;
987
988 mutex_lock(&inode->i_mutex);
989
990 /*
991 * This prevents concurrent writes from other nodes
992 */
993 status = ocfs2_rw_lock(inode, 1);
994 if (status) {
995 mlog_errno(status);
996 goto out;
997 }
998
999 status = ocfs2_inode_lock(inode, &di_bh, 1);
1000 if (status) {
1001 mlog_errno(status);
1002 goto out_rw_unlock;
1003 }
1004
1005 /*
1006 * rememer ip_xattr_sem also needs to be held if necessary
1007 */
1008 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1009
1010 status = __ocfs2_move_extents_range(di_bh, context);
1011
1012 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1013 if (status) {
1014 mlog_errno(status);
1015 goto out_inode_unlock;
1016 }
1017
1018 /*
1019 * We update ctime for these changes
1020 */
1021 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1022 if (IS_ERR(handle)) {
1023 status = PTR_ERR(handle);
1024 mlog_errno(status);
1025 goto out_inode_unlock;
1026 }
1027
1028 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1029 OCFS2_JOURNAL_ACCESS_WRITE);
1030 if (status) {
1031 mlog_errno(status);
1032 goto out_commit;
1033 }
1034
1035 di = (struct ocfs2_dinode *)di_bh->b_data;
1036 inode->i_ctime = CURRENT_TIME;
1037 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1038 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1039
1040 ocfs2_journal_dirty(handle, di_bh);
1041
1042out_commit:
1043 ocfs2_commit_trans(osb, handle);
1044
1045out_inode_unlock:
1046 brelse(di_bh);
1047 ocfs2_inode_unlock(inode, 1);
1048out_rw_unlock:
1049 ocfs2_rw_unlock(inode, 1);
1050out:
1051 mutex_unlock(&inode->i_mutex);
1052
1053 return status;
1054}
1055
1056int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1057{
1058 int status;
1059
1060 struct inode *inode = filp->f_path.dentry->d_inode;
1061 struct ocfs2_move_extents range;
1062 struct ocfs2_move_extents_context *context = NULL;
1063
1064 status = mnt_want_write(filp->f_path.mnt);
1065 if (status)
1066 return status;
1067
1068 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1069 goto out;
1070
1071 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1072 status = -EPERM;
1073 goto out;
1074 }
1075
1076 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1077 if (!context) {
1078 status = -ENOMEM;
1079 mlog_errno(status);
1080 goto out;
1081 }
1082
1083 context->inode = inode;
1084 context->file = filp;
1085
1086 if (argp) {
1087 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1088 sizeof(range))) {
1089 status = -EFAULT;
1090 goto out;
1091 }
1092 } else {
1093 status = -EINVAL;
1094 goto out;
1095 }
1096
1097 if (range.me_start > i_size_read(inode))
1098 goto out;
1099
1100 if (range.me_start + range.me_len > i_size_read(inode))
1101 range.me_len = i_size_read(inode) - range.me_start;
1102
1103 context->range = &range;
1104
1105 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1106 context->auto_defrag = 1;
1107 /*
1108 * ok, the default theshold for the defragmentation
1109 * is 1M, since our maximum clustersize was 1M also.
1110 * any thought?
1111 */
1112 if (!range.me_threshold)
1113 range.me_threshold = 1024 * 1024;
1114
1115 if (range.me_threshold > i_size_read(inode))
1116 range.me_threshold = i_size_read(inode);
1117
1118 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1119 context->partial = 1;
1120 } else {
1121 /*
1122 * first best-effort attempt to validate and adjust the goal
1123 * (physical address in block), while it can't guarantee later
1124 * operation can succeed all the time since global_bitmap may
1125 * change a bit over time.
1126 */
1127
1128 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1129 if (status)
1130 goto out;
1131 }
1132
1133 status = ocfs2_move_extents(context);
1134 if (status)
1135 mlog_errno(status);
1136out:
1137 /*
1138 * movement/defragmentation may end up being partially completed,
1139 * that's the reason why we need to return userspace the finished
1140 * length and new_offset even if failure happens somewhere.
1141 */
1142 if (argp) {
1143 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1144 sizeof(range)))
1145 status = -EFAULT;
1146 }
1147
1148 kfree(context);
1149
1150 mnt_drop_write(filp->f_path.mnt);
1151
1152 return status;
1153}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000000..4e143e811441
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.h
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_MOVE_EXTENTS_H
18#define OCFS2_MOVE_EXTENTS_H
19
20int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
21
22#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39bf7438..5b27ff1fa577 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
142 __u64 ij_journal_size; 142 __u64 ij_journal_size;
143}; 143};
144 144
145struct ocfs2_info_freeinode {
146 struct ocfs2_info_request ifi_req;
147 struct ocfs2_info_local_freeinode {
148 __u64 lfi_total;
149 __u64 lfi_free;
150 } ifi_stat[OCFS2_MAX_SLOTS];
151 __u32 ifi_slotnum; /* out */
152 __u32 ifi_pad;
153};
154
155#define OCFS2_INFO_MAX_HIST (32)
156
157struct ocfs2_info_freefrag {
158 struct ocfs2_info_request iff_req;
159 struct ocfs2_info_freefrag_stats { /* (out) */
160 struct ocfs2_info_free_chunk_list {
161 __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
162 __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
163 } ffs_fc_hist;
164 __u32 ffs_clusters;
165 __u32 ffs_free_clusters;
166 __u32 ffs_free_chunks;
167 __u32 ffs_free_chunks_real;
168 __u32 ffs_min; /* Minimum free chunksize in clusters */
169 __u32 ffs_max;
170 __u32 ffs_avg;
171 __u32 ffs_pad;
172 } iff_ffs;
173 __u32 iff_chunksize; /* chunksize in clusters(in) */
174 __u32 iff_pad;
175};
176
145/* Codes for ocfs2_info_request */ 177/* Codes for ocfs2_info_request */
146enum ocfs2_info_type { 178enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1, 179 OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
151 OCFS2_INFO_UUID, 183 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES, 184 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE, 185 OCFS2_INFO_JOURNAL_SIZE,
186 OCFS2_INFO_FREEINODE,
187 OCFS2_INFO_FREEFRAG,
154 OCFS2_INFO_NUM_TYPES 188 OCFS2_INFO_NUM_TYPES
155}; 189};
156 190
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
171 205
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) 206#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173 207
208struct ocfs2_move_extents {
209/* All values are in bytes */
210 /* in */
211 __u64 me_start; /* Virtual start in the file to move */
212 __u64 me_len; /* Length of the extents to be moved */
213 __u64 me_goal; /* Physical offset of the goal,
214 it's in block unit */
215 __u64 me_threshold; /* Maximum distance from goal or threshold
216 for auto defragmentation */
217 __u64 me_flags; /* Flags for the operation:
218 * - auto defragmentation.
219 * - refcount,xattr cases.
220 */
221 /* out */
222 __u64 me_moved_len; /* Moved/defraged length */
223 __u64 me_new_offset; /* Resulting physical location */
224 __u32 me_reserved[2]; /* Reserved for futhure */
225};
226
227#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
228 claim new clusters
229 as the goal place
230 for extents moving */
231#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
232 moving, is to make
233 movement less likely
234 to fail, may make fs
235 even more fragmented */
236#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
237 completely gets done.
238 */
239
240#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
241
174#endif /* OCFS2_IOCTL_H */ 242#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5bb54ac..3b481f490633 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
688 __entry->blkno, __entry->bit) 688 __entry->blkno, __entry->bit)
689); 689);
690 690
691TRACE_EVENT(ocfs2_trim_extent,
692 TP_PROTO(struct super_block *sb, unsigned long long blk,
693 unsigned long long count),
694 TP_ARGS(sb, blk, count),
695 TP_STRUCT__entry(
696 __field(int, dev_major)
697 __field(int, dev_minor)
698 __field(unsigned long long, blk)
699 __field(__u64, count)
700 ),
701 TP_fast_assign(
702 __entry->dev_major = MAJOR(sb->s_dev);
703 __entry->dev_minor = MINOR(sb->s_dev);
704 __entry->blk = blk;
705 __entry->count = count;
706 ),
707 TP_printk("%d %d %llu %llu",
708 __entry->dev_major, __entry->dev_minor,
709 __entry->blk, __entry->count)
710);
711
712DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
713
714DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
715
691/* End of trace events for fs/ocfs2/alloc.c. */ 716/* End of trace events for fs/ocfs2/alloc.c. */
692 717
693/* Trace events for fs/ocfs2/localalloc.c. */ 718/* Trace events for fs/ocfs2/localalloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3c7606cff1ab..ebfd3825f12a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 66 u32 *num_clusters,
67 unsigned int *extent_flags); 67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context, 69 struct file *file,
70 u32 cpos, u32 old_cluster, 70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 71 u32 new_cluster, u32 new_len);
72}; 72};
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2921 return 0; 2921 return 0;
2922} 2922}
2923 2923
2924static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2924int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct ocfs2_cow_context *context, 2925 struct file *file,
2926 u32 cpos, u32 old_cluster, 2926 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2927 u32 new_cluster, u32 new_len)
2928{ 2928{
2929 int ret = 0, partial; 2929 int ret = 0, partial;
2930 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2930 struct inode *inode = file->f_path.dentry->d_inode;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2931 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to, readahead_pages; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = inode->i_mapping;
2938 2939
2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, 2940 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2940 new_cluster, new_len); 2941 new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2948 * We only duplicate pages until we reach the page contains i_size - 1. 2949 * We only duplicate pages until we reach the page contains i_size - 1.
2949 * So trim 'end' to i_size. 2950 * So trim 'end' to i_size.
2950 */ 2951 */
2951 if (end > i_size_read(context->inode)) 2952 if (end > i_size_read(inode))
2952 end = i_size_read(context->inode); 2953 end = i_size_read(inode);
2953 2954
2954 while (offset < end) { 2955 while (offset < end) {
2955 page_index = offset >> PAGE_CACHE_SHIFT; 2956 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2972 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2973 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2974 2975
2975 if (PageReadahead(page) && context->file) { 2976 if (PageReadahead(page)) {
2976 page_cache_async_readahead(mapping, 2977 page_cache_async_readahead(mapping,
2977 &context->file->f_ra, 2978 &file->f_ra, file,
2978 context->file,
2979 page, page_index, 2979 page, page_index,
2980 readahead_pages); 2980 readahead_pages);
2981 } 2981 }
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2999 }
3000 } 3000 }
3001 3001
3002 ocfs2_map_and_dirty_page(context->inode, 3002 ocfs2_map_and_dirty_page(inode, handle, from, to,
3003 handle, from, to,
3004 page, 0, &new_block); 3003 page, 0, &new_block);
3005 mark_page_accessed(page); 3004 mark_page_accessed(page);
3006unlock: 3005unlock:
@@ -3015,14 +3014,15 @@ unlock:
3015 return ret; 3014 return ret;
3016} 3015}
3017 3016
3018static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3019 struct ocfs2_cow_context *context, 3018 struct file *file,
3020 u32 cpos, u32 old_cluster, 3019 u32 cpos, u32 old_cluster,
3021 u32 new_cluster, u32 new_len) 3020 u32 new_cluster, u32 new_len)
3022{ 3021{
3023 int ret = 0; 3022 int ret = 0;
3024 struct super_block *sb = context->inode->i_sb; 3023 struct inode *inode = file->f_path.dentry->d_inode;
3025 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3024 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3145
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context, cpos, 3148 ret = context->cow_duplicate_clusters(handle, context->file,
3149 old, new, len); 3149 cpos, old, new, len);
3150 if (ret) { 3150 if (ret) {
3151 mlog_errno(ret); 3151 mlog_errno(ret);
3152 goto out; 3152 goto out;
@@ -3162,22 +3162,22 @@ out:
3162 return ret; 3162 return ret;
3163} 3163}
3164 3164
3165static int ocfs2_cow_sync_writeback(struct super_block *sb, 3165int ocfs2_cow_sync_writeback(struct super_block *sb,
3166 struct ocfs2_cow_context *context, 3166 struct inode *inode,
3167 u32 cpos, u32 num_clusters) 3167 u32 cpos, u32 num_clusters)
3168{ 3168{
3169 int ret = 0; 3169 int ret = 0;
3170 loff_t offset, end, map_end; 3170 loff_t offset, end, map_end;
3171 pgoff_t page_index; 3171 pgoff_t page_index;
3172 struct page *page; 3172 struct page *page;
3173 3173
3174 if (ocfs2_should_order_data(context->inode)) 3174 if (ocfs2_should_order_data(inode))
3175 return 0; 3175 return 0;
3176 3176
3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179 3179
3180 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3180 ret = filemap_fdatawrite_range(inode->i_mapping,
3181 offset, end - 1); 3181 offset, end - 1);
3182 if (ret < 0) { 3182 if (ret < 0) {
3183 mlog_errno(ret); 3183 mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3190 if (map_end > end) 3190 if (map_end > end)
3191 map_end = end; 3191 map_end = end;
3192 3192
3193 page = find_or_create_page(context->inode->i_mapping, 3193 page = find_or_create_page(inode->i_mapping,
3194 page_index, GFP_NOFS); 3194 page_index, GFP_NOFS);
3195 BUG_ON(!page); 3195 BUG_ON(!page);
3196 3196
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3349 * in write-back mode. 3349 * in write-back mode.
3350 */ 3350 */
3351 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos, 3352 ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353 orig_num_clusters); 3353 orig_num_clusters);
3354 if (ret) 3354 if (ret)
3355 mlog_errno(ret); 3355 mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7d8e3..7754608c83a4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
84 struct buffer_head *ref_root_bh, 84 struct buffer_head *ref_root_bh,
85 u32 cpos, u32 write_len, 85 u32 cpos, u32 write_len,
86 struct ocfs2_post_refcount *post); 86 struct ocfs2_post_refcount *post);
87int ocfs2_duplicate_clusters_by_page(handle_t *handle,
88 struct file *file,
89 u32 cpos, u32 old_cluster,
90 u32 new_cluster, u32 new_len);
91int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
92 struct file *file,
93 u32 cpos, u32 old_cluster,
94 u32 new_cluster, u32 new_len);
95int ocfs2_cow_sync_writeback(struct super_block *sb,
96 struct inode *inode,
97 u32 cpos, u32 num_clusters);
87int ocfs2_add_refcount_flag(struct inode *inode, 98int ocfs2_add_refcount_flag(struct inode *inode,
88 struct ocfs2_extent_tree *data_et, 99 struct ocfs2_extent_tree *data_et,
89 struct ocfs2_caching_info *ref_ci, 100 struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4129fb671d71..cdbaf5e97308 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1567,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1567 if (osb->preferred_slot != OCFS2_INVALID_SLOT) 1567 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
1568 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); 1568 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
1569 1569
1570 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 1570 if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
1571 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 1571 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
1572 1572
1573 if (osb->osb_commit_interval) 1573 if (osb->osb_commit_interval)