aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/dlm/dlm_internal.h1
-rw-r--r--fs/dlm/lock.c142
-rw-r--r--fs/dlm/lock.h3
-rw-r--r--fs/dlm/lockspace.c1
-rw-r--r--fs/dlm/lowcomms.c23
-rw-r--r--fs/dlm/member.c41
-rw-r--r--fs/dlm/midcomms.c17
-rw-r--r--fs/dlm/rcom.c36
-rw-r--r--fs/dlm/rcom.h5
-rw-r--r--fs/dlm/recoverd.c11
-rw-r--r--fs/dlm/requestqueue.c58
-rw-r--r--fs/dlm/requestqueue.h4
-rw-r--r--fs/gfs2/bmap.c35
-rw-r--r--fs/gfs2/daemon.c24
-rw-r--r--fs/gfs2/daemon.h1
-rw-r--r--fs/gfs2/dir.c3
-rw-r--r--fs/gfs2/eaops.c8
-rw-r--r--fs/gfs2/eaops.h4
-rw-r--r--fs/gfs2/glock.c293
-rw-r--r--fs/gfs2/glock.h5
-rw-r--r--fs/gfs2/glops.c24
-rw-r--r--fs/gfs2/incore.h31
-rw-r--r--fs/gfs2/inode.c78
-rw-r--r--fs/gfs2/inode.h3
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h1
-rw-r--r--fs/gfs2/locking/dlm/plock.c11
-rw-r--r--fs/gfs2/locking/dlm/thread.c20
-rw-r--r--fs/gfs2/locking/nolock/main.c1
-rw-r--r--fs/gfs2/log.c230
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c470
-rw-r--r--fs/gfs2/main.c3
-rw-r--r--fs/gfs2/meta_io.c136
-rw-r--r--fs/gfs2/meta_io.h6
-rw-r--r--fs/gfs2/mount.c5
-rw-r--r--fs/gfs2/ops_address.c146
-rw-r--r--fs/gfs2/ops_export.c2
-rw-r--r--fs/gfs2/ops_file.c13
-rw-r--r--fs/gfs2/ops_fstype.c40
-rw-r--r--fs/gfs2/ops_inode.c38
-rw-r--r--fs/gfs2/ops_super.c14
-rw-r--r--fs/gfs2/quota.c13
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/rgrp.c39
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/gfs2/trans.c22
-rw-r--r--fs/gfs2/trans.h2
-rw-r--r--fs/ntfs/ChangeLog12
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/aops.c22
-rw-r--r--fs/ntfs/attrib.c8
-rw-r--r--fs/ntfs/file.c36
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ntfs/logfile.c143
-rw-r--r--fs/ntfs/runlist.c4
-rw-r--r--fs/ocfs2/alloc.c482
-rw-r--r--fs/ocfs2/alloc.h7
-rw-r--r--fs/ocfs2/aops.c309
-rw-r--r--fs/ocfs2/aops.h6
-rw-r--r--fs/ocfs2/dir.c1423
-rw-r--r--fs/ocfs2/dir.h48
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/dlmglue.h4
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/extent_map.c6
-rw-r--r--fs/ocfs2/file.c298
-rw-r--r--fs/ocfs2/file.h2
-rw-r--r--fs/ocfs2/inode.c7
-rw-r--r--fs/ocfs2/inode.h1
-rw-r--r--fs/ocfs2/journal.c120
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/namei.c552
-rw-r--r--fs/ocfs2/namei.h19
-rw-r--r--fs/ocfs2/ocfs2.h7
-rw-r--r--fs/ocfs2/ocfs2_fs.h64
-rw-r--r--fs/ocfs2/super.c62
-rw-r--r--fs/ocfs2/sysfile.c10
78 files changed, 3674 insertions, 2066 deletions
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 74901e981e10..d2fc2384c3be 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -491,6 +491,7 @@ struct dlm_ls {
491 uint64_t ls_recover_seq; 491 uint64_t ls_recover_seq;
492 struct dlm_recover *ls_recover_args; 492 struct dlm_recover *ls_recover_args;
493 struct rw_semaphore ls_in_recovery; /* block local requests */ 493 struct rw_semaphore ls_in_recovery; /* block local requests */
494 struct rw_semaphore ls_recv_active; /* block dlm_recv */
494 struct list_head ls_requestqueue;/* queue remote requests */ 495 struct list_head ls_requestqueue;/* queue remote requests */
495 struct mutex ls_requestqueue_mutex; 496 struct mutex ls_requestqueue_mutex;
496 char *ls_recover_buf; 497 char *ls_recover_buf;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 2082daf083d8..3915b8e14146 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3638,55 +3638,8 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
3638 dlm_put_lkb(lkb); 3638 dlm_put_lkb(lkb);
3639} 3639}
3640 3640
3641int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery) 3641static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
3642{ 3642{
3643 struct dlm_message *ms = (struct dlm_message *) hd;
3644 struct dlm_ls *ls;
3645 int error = 0;
3646
3647 if (!recovery)
3648 dlm_message_in(ms);
3649
3650 ls = dlm_find_lockspace_global(hd->h_lockspace);
3651 if (!ls) {
3652 log_print("drop message %d from %d for unknown lockspace %d",
3653 ms->m_type, nodeid, hd->h_lockspace);
3654 return -EINVAL;
3655 }
3656
3657 /* recovery may have just ended leaving a bunch of backed-up requests
3658 in the requestqueue; wait while dlm_recoverd clears them */
3659
3660 if (!recovery)
3661 dlm_wait_requestqueue(ls);
3662
3663 /* recovery may have just started while there were a bunch of
3664 in-flight requests -- save them in requestqueue to be processed
3665 after recovery. we can't let dlm_recvd block on the recovery
3666 lock. if dlm_recoverd is calling this function to clear the
3667 requestqueue, it needs to be interrupted (-EINTR) if another
3668 recovery operation is starting. */
3669
3670 while (1) {
3671 if (dlm_locking_stopped(ls)) {
3672 if (recovery) {
3673 error = -EINTR;
3674 goto out;
3675 }
3676 error = dlm_add_requestqueue(ls, nodeid, hd);
3677 if (error == -EAGAIN)
3678 continue;
3679 else {
3680 error = -EINTR;
3681 goto out;
3682 }
3683 }
3684
3685 if (dlm_lock_recovery_try(ls))
3686 break;
3687 schedule();
3688 }
3689
3690 switch (ms->m_type) { 3643 switch (ms->m_type) {
3691 3644
3692 /* messages sent to a master node */ 3645 /* messages sent to a master node */
@@ -3761,17 +3714,90 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3761 log_error(ls, "unknown message type %d", ms->m_type); 3714 log_error(ls, "unknown message type %d", ms->m_type);
3762 } 3715 }
3763 3716
3764 dlm_unlock_recovery(ls);
3765 out:
3766 dlm_put_lockspace(ls);
3767 dlm_astd_wake(); 3717 dlm_astd_wake();
3768 return error;
3769} 3718}
3770 3719
3720/* If the lockspace is in recovery mode (locking stopped), then normal
3721 messages are saved on the requestqueue for processing after recovery is
3722 done. When not in recovery mode, we wait for dlm_recoverd to drain saved
3723 messages off the requestqueue before we process new ones. This occurs right
3724 after recovery completes when we transition from saving all messages on
3725 requestqueue, to processing all the saved messages, to processing new
3726 messages as they arrive. */
3771 3727
3772/* 3728static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
3773 * Recovery related 3729 int nodeid)
3774 */ 3730{
3731 if (dlm_locking_stopped(ls)) {
3732 dlm_add_requestqueue(ls, nodeid, (struct dlm_header *) ms);
3733 } else {
3734 dlm_wait_requestqueue(ls);
3735 _receive_message(ls, ms);
3736 }
3737}
3738
3739/* This is called by dlm_recoverd to process messages that were saved on
3740 the requestqueue. */
3741
3742void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms)
3743{
3744 _receive_message(ls, ms);
3745}
3746
3747/* This is called by the midcomms layer when something is received for
3748 the lockspace. It could be either a MSG (normal message sent as part of
3749 standard locking activity) or an RCOM (recovery message sent as part of
3750 lockspace recovery). */
3751
3752void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
3753{
3754 struct dlm_message *ms = (struct dlm_message *) hd;
3755 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
3756 struct dlm_ls *ls;
3757 int type = 0;
3758
3759 switch (hd->h_cmd) {
3760 case DLM_MSG:
3761 dlm_message_in(ms);
3762 type = ms->m_type;
3763 break;
3764 case DLM_RCOM:
3765 dlm_rcom_in(rc);
3766 type = rc->rc_type;
3767 break;
3768 default:
3769 log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
3770 return;
3771 }
3772
3773 if (hd->h_nodeid != nodeid) {
3774 log_print("invalid h_nodeid %d from %d lockspace %x",
3775 hd->h_nodeid, nodeid, hd->h_lockspace);
3776 return;
3777 }
3778
3779 ls = dlm_find_lockspace_global(hd->h_lockspace);
3780 if (!ls) {
3781 log_print("invalid h_lockspace %x from %d cmd %d type %d",
3782 hd->h_lockspace, nodeid, hd->h_cmd, type);
3783
3784 if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
3785 dlm_send_ls_not_ready(nodeid, rc);
3786 return;
3787 }
3788
3789 /* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
3790 be inactive (in this ls) before transitioning to recovery mode */
3791
3792 down_read(&ls->ls_recv_active);
3793 if (hd->h_cmd == DLM_MSG)
3794 dlm_receive_message(ls, ms, nodeid);
3795 else
3796 dlm_receive_rcom(ls, rc, nodeid);
3797 up_read(&ls->ls_recv_active);
3798
3799 dlm_put_lockspace(ls);
3800}
3775 3801
3776static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) 3802static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3777{ 3803{
@@ -4429,7 +4455,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4429 4455
4430 if (lvb_in && ua->lksb.sb_lvbptr) 4456 if (lvb_in && ua->lksb.sb_lvbptr)
4431 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN); 4457 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
4432 ua->castparam = ua_tmp->castparam; 4458 if (ua_tmp->castparam)
4459 ua->castparam = ua_tmp->castparam;
4433 ua->user_lksb = ua_tmp->user_lksb; 4460 ua->user_lksb = ua_tmp->user_lksb;
4434 4461
4435 error = set_unlock_args(flags, ua, &args); 4462 error = set_unlock_args(flags, ua, &args);
@@ -4474,7 +4501,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4474 goto out; 4501 goto out;
4475 4502
4476 ua = (struct dlm_user_args *)lkb->lkb_astparam; 4503 ua = (struct dlm_user_args *)lkb->lkb_astparam;
4477 ua->castparam = ua_tmp->castparam; 4504 if (ua_tmp->castparam)
4505 ua->castparam = ua_tmp->castparam;
4478 ua->user_lksb = ua_tmp->user_lksb; 4506 ua->user_lksb = ua_tmp->user_lksb;
4479 4507
4480 error = set_unlock_args(flags, ua, &args); 4508 error = set_unlock_args(flags, ua, &args);
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 1720313c22df..ada04680a1e5 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -16,7 +16,8 @@
16void dlm_print_rsb(struct dlm_rsb *r); 16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb(struct dlm_rsb *r); 17void dlm_dump_rsb(struct dlm_rsb *r);
18void dlm_print_lkb(struct dlm_lkb *lkb); 18void dlm_print_lkb(struct dlm_lkb *lkb);
19int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery); 19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
20void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
20int dlm_modes_compat(int mode1, int mode2); 21int dlm_modes_compat(int mode1, int mode2);
21int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, 22int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
22 unsigned int flags, struct dlm_rsb **r_ret); 23 unsigned int flags, struct dlm_rsb **r_ret);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f88f88fdedf1..6353a8384520 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -519,6 +519,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
519 ls->ls_recover_seq = 0; 519 ls->ls_recover_seq = 0;
520 ls->ls_recover_args = NULL; 520 ls->ls_recover_args = NULL;
521 init_rwsem(&ls->ls_in_recovery); 521 init_rwsem(&ls->ls_in_recovery);
522 init_rwsem(&ls->ls_recv_active);
522 INIT_LIST_HEAD(&ls->ls_requestqueue); 523 INIT_LIST_HEAD(&ls->ls_requestqueue);
523 mutex_init(&ls->ls_requestqueue_mutex); 524 mutex_init(&ls->ls_requestqueue_mutex);
524 mutex_init(&ls->ls_clear_proc_locks); 525 mutex_init(&ls->ls_clear_proc_locks);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9e9d2e82f40f..58bf3f5cdbe2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -334,18 +334,8 @@ static void close_connection(struct connection *con, bool and_other)
334 con->rx_page = NULL; 334 con->rx_page = NULL;
335 } 335 }
336 336
337 /* If we are an 'othercon' then NULL the pointer to us 337 con->retries = 0;
338 from the parent and tidy ourself up */ 338 mutex_unlock(&con->sock_mutex);
339 if (test_bit(CF_IS_OTHERCON, &con->flags)) {
340 struct connection *parent = __nodeid2con(con->nodeid, 0);
341 parent->othercon = NULL;
342 kmem_cache_free(con_cache, con);
343 }
344 else {
345 /* Parent connections get reused */
346 con->retries = 0;
347 mutex_unlock(&con->sock_mutex);
348 }
349} 339}
350 340
351/* We only send shutdown messages to nodes that are not part of the cluster */ 341/* We only send shutdown messages to nodes that are not part of the cluster */
@@ -731,6 +721,8 @@ static int tcp_accept_from_sock(struct connection *con)
731 INIT_WORK(&othercon->swork, process_send_sockets); 721 INIT_WORK(&othercon->swork, process_send_sockets);
732 INIT_WORK(&othercon->rwork, process_recv_sockets); 722 INIT_WORK(&othercon->rwork, process_recv_sockets);
733 set_bit(CF_IS_OTHERCON, &othercon->flags); 723 set_bit(CF_IS_OTHERCON, &othercon->flags);
724 }
725 if (!othercon->sock) {
734 newcon->othercon = othercon; 726 newcon->othercon = othercon;
735 othercon->sock = newsock; 727 othercon->sock = newsock;
736 newsock->sk->sk_user_data = othercon; 728 newsock->sk->sk_user_data = othercon;
@@ -1272,14 +1264,15 @@ static void send_to_sock(struct connection *con)
1272 if (len) { 1264 if (len) {
1273 ret = sendpage(con->sock, e->page, offset, len, 1265 ret = sendpage(con->sock, e->page, offset, len,
1274 msg_flags); 1266 msg_flags);
1275 if (ret == -EAGAIN || ret == 0) 1267 if (ret == -EAGAIN || ret == 0) {
1268 cond_resched();
1276 goto out; 1269 goto out;
1270 }
1277 if (ret <= 0) 1271 if (ret <= 0)
1278 goto send_error; 1272 goto send_error;
1279 } else { 1273 }
1280 /* Don't starve people filling buffers */ 1274 /* Don't starve people filling buffers */
1281 cond_resched(); 1275 cond_resched();
1282 }
1283 1276
1284 spin_lock(&con->writequeue_lock); 1277 spin_lock(&con->writequeue_lock);
1285 e->offset += ret; 1278 e->offset += ret;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index d09977528f69..e9cdcab306e2 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -18,10 +18,6 @@
18#include "rcom.h" 18#include "rcom.h"
19#include "config.h" 19#include "config.h"
20 20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 21static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{ 22{
27 struct dlm_member *memb = NULL; 23 struct dlm_member *memb = NULL;
@@ -250,18 +246,30 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
250 return error; 246 return error;
251} 247}
252 248
253/* 249/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before
254 * Following called from lockspace.c 250 dlm_ls_start() is called on any of them to start the new recovery. */
255 */
256 251
257int dlm_ls_stop(struct dlm_ls *ls) 252int dlm_ls_stop(struct dlm_ls *ls)
258{ 253{
259 int new; 254 int new;
260 255
261 /* 256 /*
262 * A stop cancels any recovery that's in progress (see RECOVERY_STOP, 257 * Prevent dlm_recv from being in the middle of something when we do
263 * dlm_recovery_stopped()) and prevents any new locks from being 258 * the stop. This includes ensuring dlm_recv isn't processing a
264 * processed (see RUNNING, dlm_locking_stopped()). 259 * recovery message (rcom), while dlm_recoverd is aborting and
260 * resetting things from an in-progress recovery. i.e. we want
261 * dlm_recoverd to abort its recovery without worrying about dlm_recv
262 * processing an rcom at the same time. Stopping dlm_recv also makes
263 * it easy for dlm_receive_message() to check locking stopped and add a
264 * message to the requestqueue without races.
265 */
266
267 down_write(&ls->ls_recv_active);
268
269 /*
270 * Abort any recovery that's in progress (see RECOVERY_STOP,
271 * dlm_recovery_stopped()) and tell any other threads running in the
272 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
265 */ 273 */
266 274
267 spin_lock(&ls->ls_recover_lock); 275 spin_lock(&ls->ls_recover_lock);
@@ -271,8 +279,14 @@ int dlm_ls_stop(struct dlm_ls *ls)
271 spin_unlock(&ls->ls_recover_lock); 279 spin_unlock(&ls->ls_recover_lock);
272 280
273 /* 281 /*
282 * Let dlm_recv run again, now any normal messages will be saved on the
283 * requestqueue for later.
284 */
285
286 up_write(&ls->ls_recv_active);
287
288 /*
274 * This in_recovery lock does two things: 289 * This in_recovery lock does two things:
275 *
276 * 1) Keeps this function from returning until all threads are out 290 * 1) Keeps this function from returning until all threads are out
277 * of locking routines and locking is truely stopped. 291 * of locking routines and locking is truely stopped.
278 * 2) Keeps any new requests from being processed until it's unlocked 292 * 2) Keeps any new requests from being processed until it's unlocked
@@ -284,9 +298,8 @@ int dlm_ls_stop(struct dlm_ls *ls)
284 298
285 /* 299 /*
286 * The recoverd suspend/resume makes sure that dlm_recoverd (if 300 * The recoverd suspend/resume makes sure that dlm_recoverd (if
287 * running) has noticed the clearing of RUNNING above and quit 301 * running) has noticed RECOVERY_STOP above and quit processing the
288 * processing the previous recovery. This will be true for all nodes 302 * previous recovery.
289 * before any nodes start the new recovery.
290 */ 303 */
291 304
292 dlm_recoverd_suspend(ls); 305 dlm_recoverd_suspend(ls);
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index a5126e0c68a6..f8c69dda16a0 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,6 @@
27#include "dlm_internal.h" 27#include "dlm_internal.h"
28#include "lowcomms.h" 28#include "lowcomms.h"
29#include "config.h" 29#include "config.h"
30#include "rcom.h"
31#include "lock.h" 30#include "lock.h"
32#include "midcomms.h" 31#include "midcomms.h"
33 32
@@ -117,19 +116,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
117 offset &= (limit - 1); 116 offset &= (limit - 1);
118 len -= msglen; 117 len -= msglen;
119 118
120 switch (msg->h_cmd) { 119 dlm_receive_buffer(msg, nodeid);
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 } 120 }
134 121
135 if (msg != (struct dlm_header *) __tmp) 122 if (msg != (struct dlm_header *) __tmp)
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 188b91c027e4..ae2fd97fa4ad 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -386,7 +386,10 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
386 dlm_recover_process_copy(ls, rc_in); 386 dlm_recover_process_copy(ls, rc_in);
387} 387}
388 388
389static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) 389/* If the lockspace doesn't exist then still send a status message
390 back; it's possible that it just doesn't have its global_id yet. */
391
392int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
390{ 393{
391 struct dlm_rcom *rc; 394 struct dlm_rcom *rc;
392 struct rcom_config *rf; 395 struct rcom_config *rf;
@@ -446,28 +449,11 @@ static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
446 return rv; 449 return rv;
447} 450}
448 451
449/* Called by dlm_recvd; corresponds to dlm_receive_message() but special 452/* Called by dlm_recv; corresponds to dlm_receive_message() but special
450 recovery-only comms are sent through here. */ 453 recovery-only comms are sent through here. */
451 454
452void dlm_receive_rcom(struct dlm_header *hd, int nodeid) 455void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
453{ 456{
454 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
455 struct dlm_ls *ls;
456
457 dlm_rcom_in(rc);
458
459 /* If the lockspace doesn't exist then still send a status message
460 back; it's possible that it just doesn't have its global_id yet. */
461
462 ls = dlm_find_lockspace_global(hd->h_lockspace);
463 if (!ls) {
464 log_print("lockspace %x from %d type %x not found",
465 hd->h_lockspace, nodeid, rc->rc_type);
466 if (rc->rc_type == DLM_RCOM_STATUS)
467 send_ls_not_ready(nodeid, rc);
468 return;
469 }
470
471 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { 457 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
472 log_debug(ls, "ignoring recovery message %x from %d", 458 log_debug(ls, "ignoring recovery message %x from %d",
473 rc->rc_type, nodeid); 459 rc->rc_type, nodeid);
@@ -477,12 +463,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
477 if (is_old_reply(ls, rc)) 463 if (is_old_reply(ls, rc))
478 goto out; 464 goto out;
479 465
480 if (nodeid != rc->rc_header.h_nodeid) {
481 log_error(ls, "bad rcom nodeid %d from %d",
482 rc->rc_header.h_nodeid, nodeid);
483 goto out;
484 }
485
486 switch (rc->rc_type) { 466 switch (rc->rc_type) {
487 case DLM_RCOM_STATUS: 467 case DLM_RCOM_STATUS:
488 receive_rcom_status(ls, rc); 468 receive_rcom_status(ls, rc);
@@ -520,6 +500,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
520 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); 500 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
521 } 501 }
522 out: 502 out:
523 dlm_put_lockspace(ls); 503 return;
524} 504}
525 505
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index d7984321ff41..b09abd29ba38 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -18,7 +18,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); 18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); 19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid); 21void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid);
22int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
22 23
23#endif 24#endif
24 25
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 66575997861c..4b89e20eebe7 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -24,19 +24,28 @@
24 24
25 25
26/* If the start for which we're re-enabling locking (seq) has been superseded 26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */ 27 by a newer stop (ls_recover_seq), we need to leave locking disabled.
28
29 We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
30 locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
31 enables locking and clears the requestqueue between a and b. */
28 32
29static int enable_locking(struct dlm_ls *ls, uint64_t seq) 33static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{ 34{
31 int error = -EINTR; 35 int error = -EINTR;
32 36
37 down_write(&ls->ls_recv_active);
38
33 spin_lock(&ls->ls_recover_lock); 39 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) { 40 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags); 41 set_bit(LSFL_RUNNING, &ls->ls_flags);
42 /* unblocks processes waiting to enter the dlm */
36 up_write(&ls->ls_in_recovery); 43 up_write(&ls->ls_in_recovery);
37 error = 0; 44 error = 0;
38 } 45 }
39 spin_unlock(&ls->ls_recover_lock); 46 spin_unlock(&ls->ls_recover_lock);
47
48 up_write(&ls->ls_recv_active);
40 return error; 49 return error;
41} 50}
42 51
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 65008d79c96d..0de04f17ccea 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -20,7 +20,7 @@
20struct rq_entry { 20struct rq_entry {
21 struct list_head list; 21 struct list_head list;
22 int nodeid; 22 int nodeid;
23 char request[1]; 23 char request[0];
24}; 24};
25 25
26/* 26/*
@@ -30,42 +30,39 @@ struct rq_entry {
30 * lockspace is enabled on some while still suspended on others. 30 * lockspace is enabled on some while still suspended on others.
31 */ 31 */
32 32
33int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) 33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{ 34{
35 struct rq_entry *e; 35 struct rq_entry *e;
36 int length = hd->h_length; 36 int length = hd->h_length;
37 int rv = 0;
38 37
39 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); 38 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
40 if (!e) { 39 if (!e) {
41 log_print("dlm_add_requestqueue: out of memory\n"); 40 log_print("dlm_add_requestqueue: out of memory len %d", length);
42 return 0; 41 return;
43 } 42 }
44 43
45 e->nodeid = nodeid; 44 e->nodeid = nodeid;
46 memcpy(e->request, hd, length); 45 memcpy(e->request, hd, length);
47 46
48 /* We need to check dlm_locking_stopped() after taking the mutex to
49 avoid a race where dlm_recoverd enables locking and runs
50 process_requestqueue between our earlier dlm_locking_stopped check
51 and this addition to the requestqueue. */
52
53 mutex_lock(&ls->ls_requestqueue_mutex); 47 mutex_lock(&ls->ls_requestqueue_mutex);
54 if (dlm_locking_stopped(ls)) 48 list_add_tail(&e->list, &ls->ls_requestqueue);
55 list_add_tail(&e->list, &ls->ls_requestqueue);
56 else {
57 log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
58 kfree(e);
59 rv = -EAGAIN;
60 }
61 mutex_unlock(&ls->ls_requestqueue_mutex); 49 mutex_unlock(&ls->ls_requestqueue_mutex);
62 return rv;
63} 50}
64 51
52/*
53 * Called by dlm_recoverd to process normal messages saved while recovery was
54 * happening. Normal locking has been enabled before this is called. dlm_recv
55 * upon receiving a message, will wait for all saved messages to be drained
56 * here before processing the message it got. If a new dlm_ls_stop() arrives
57 * while we're processing these saved messages, it may block trying to suspend
58 * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that
59 * case, we don't abort since locking_stopped is still 0. If dlm_recv is not
60 * waiting for us, then this processing may be aborted due to locking_stopped.
61 */
62
65int dlm_process_requestqueue(struct dlm_ls *ls) 63int dlm_process_requestqueue(struct dlm_ls *ls)
66{ 64{
67 struct rq_entry *e; 65 struct rq_entry *e;
68 struct dlm_header *hd;
69 int error = 0; 66 int error = 0;
70 67
71 mutex_lock(&ls->ls_requestqueue_mutex); 68 mutex_lock(&ls->ls_requestqueue_mutex);
@@ -79,14 +76,7 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
79 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); 76 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
80 mutex_unlock(&ls->ls_requestqueue_mutex); 77 mutex_unlock(&ls->ls_requestqueue_mutex);
81 78
82 hd = (struct dlm_header *) e->request; 79 dlm_receive_message_saved(ls, (struct dlm_message *)e->request);
83 error = dlm_receive_message(hd, e->nodeid, 1);
84
85 if (error == -EINTR) {
86 /* entry is left on requestqueue */
87 log_debug(ls, "process_requestqueue abort eintr");
88 break;
89 }
90 80
91 mutex_lock(&ls->ls_requestqueue_mutex); 81 mutex_lock(&ls->ls_requestqueue_mutex);
92 list_del(&e->list); 82 list_del(&e->list);
@@ -106,10 +96,12 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
106 96
107/* 97/*
108 * After recovery is done, locking is resumed and dlm_recoverd takes all the 98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
109 * saved requests and processes them as they would have been by dlm_recvd. At 99 * saved requests and processes them as they would have been by dlm_recv. At
110 * the same time, dlm_recvd will start receiving new requests from remote 100 * the same time, dlm_recv will start receiving new requests from remote nodes.
111 * nodes. We want to delay dlm_recvd processing new requests until 101 * We want to delay dlm_recv processing new requests until dlm_recoverd has
112 * dlm_recoverd has finished processing the old saved requests. 102 * finished processing the old saved requests. We don't check for locking
103 * stopped here because dlm_ls_stop won't stop locking until it's suspended us
104 * (dlm_recv).
113 */ 105 */
114 106
115void dlm_wait_requestqueue(struct dlm_ls *ls) 107void dlm_wait_requestqueue(struct dlm_ls *ls)
@@ -118,8 +110,6 @@ void dlm_wait_requestqueue(struct dlm_ls *ls)
118 mutex_lock(&ls->ls_requestqueue_mutex); 110 mutex_lock(&ls->ls_requestqueue_mutex);
119 if (list_empty(&ls->ls_requestqueue)) 111 if (list_empty(&ls->ls_requestqueue))
120 break; 112 break;
121 if (dlm_locking_stopped(ls))
122 break;
123 mutex_unlock(&ls->ls_requestqueue_mutex); 113 mutex_unlock(&ls->ls_requestqueue_mutex);
124 schedule(); 114 schedule();
125 } 115 }
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
index 6a53ea03335d..aba34fc05ee4 100644
--- a/fs/dlm/requestqueue.h
+++ b/fs/dlm/requestqueue.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __REQUESTQUEUE_DOT_H__ 13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__ 14#define __REQUESTQUEUE_DOT_H__
15 15
16int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd); 16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls); 17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls); 18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls); 19void dlm_purge_requestqueue(struct dlm_ls *ls);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index cd805a66880d..93fa427bb5f5 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -93,9 +93,10 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
93 map_bh(bh, inode->i_sb, block); 93 map_bh(bh, inode->i_sb, block);
94 94
95 set_buffer_uptodate(bh); 95 set_buffer_uptodate(bh);
96 if (!gfs2_is_jdata(ip))
97 mark_buffer_dirty(bh);
96 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 98 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
97 gfs2_trans_add_bh(ip->i_gl, bh, 0); 99 gfs2_trans_add_bh(ip->i_gl, bh, 0);
98 mark_buffer_dirty(bh);
99 100
100 if (release) { 101 if (release) {
101 unlock_page(page); 102 unlock_page(page);
@@ -1085,6 +1086,33 @@ static int do_shrink(struct gfs2_inode *ip, u64 size)
1085 return error; 1086 return error;
1086} 1087}
1087 1088
1089static int do_touch(struct gfs2_inode *ip, u64 size)
1090{
1091 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1092 struct buffer_head *dibh;
1093 int error;
1094
1095 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1096 if (error)
1097 return error;
1098
1099 down_write(&ip->i_rw_mutex);
1100
1101 error = gfs2_meta_inode_buffer(ip, &dibh);
1102 if (error)
1103 goto do_touch_out;
1104
1105 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1106 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1107 gfs2_dinode_out(ip, dibh->b_data);
1108 brelse(dibh);
1109
1110do_touch_out:
1111 up_write(&ip->i_rw_mutex);
1112 gfs2_trans_end(sdp);
1113 return error;
1114}
1115
1088/** 1116/**
1089 * gfs2_truncatei - make a file a given size 1117 * gfs2_truncatei - make a file a given size
1090 * @ip: the inode 1118 * @ip: the inode
@@ -1105,8 +1133,11 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1105 1133
1106 if (size > ip->i_di.di_size) 1134 if (size > ip->i_di.di_size)
1107 error = do_grow(ip, size); 1135 error = do_grow(ip, size);
1108 else 1136 else if (size < ip->i_di.di_size)
1109 error = do_shrink(ip, size); 1137 error = do_shrink(ip, size);
1138 else
1139 /* update time stamps */
1140 error = do_touch(ip, size);
1110 1141
1111 return error; 1142 return error;
1112} 1143}
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3548d9f31e0d..3731ab0771d5 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -35,30 +35,6 @@
35 The kthread functions used to start these daemons block and flush signals. */ 35 The kthread functions used to start these daemons block and flush signals. */
36 36
37/** 37/**
38 * gfs2_scand - Look for cached glocks and inodes to toss from memory
39 * @sdp: Pointer to GFS2 superblock
40 *
41 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
42 * See gfs2_glockd()
43 */
44
45int gfs2_scand(void *data)
46{
47 struct gfs2_sbd *sdp = data;
48 unsigned long t;
49
50 while (!kthread_should_stop()) {
51 gfs2_scand_internal(sdp);
52 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
53 if (freezing(current))
54 refrigerator();
55 schedule_timeout_interruptible(t);
56 }
57
58 return 0;
59}
60
61/**
62 * gfs2_glockd - Reclaim unused glock structures 38 * gfs2_glockd - Reclaim unused glock structures
63 * @sdp: Pointer to GFS2 superblock 39 * @sdp: Pointer to GFS2 superblock
64 * 40 *
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 801007120fb2..0de9b3557955 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -10,7 +10,6 @@
10#ifndef __DAEMON_DOT_H__ 10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__ 11#define __DAEMON_DOT_H__
12 12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data); 13int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data); 14int gfs2_recoverd(void *data);
16int gfs2_logd(void *data); 15int gfs2_logd(void *data);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2beb2f401aa2..9949bb746a52 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1043,6 +1043,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1043 1043
1044 error = gfs2_meta_inode_buffer(dip, &dibh); 1044 error = gfs2_meta_inode_buffer(dip, &dibh);
1045 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { 1045 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1046 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1046 dip->i_di.di_blocks++; 1047 dip->i_di.di_blocks++;
1047 gfs2_set_inode_blocks(&dip->i_inode); 1048 gfs2_set_inode_blocks(&dip->i_inode);
1048 gfs2_dinode_out(dip, dibh->b_data); 1049 gfs2_dinode_out(dip, dibh->b_data);
@@ -1501,7 +1502,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1501 inode = gfs2_inode_lookup(dir->i_sb, 1502 inode = gfs2_inode_lookup(dir->i_sb,
1502 be16_to_cpu(dent->de_type), 1503 be16_to_cpu(dent->de_type),
1503 be64_to_cpu(dent->de_inum.no_addr), 1504 be64_to_cpu(dent->de_inum.no_addr),
1504 be64_to_cpu(dent->de_inum.no_formal_ino)); 1505 be64_to_cpu(dent->de_inum.no_formal_ino), 0);
1505 brelse(bh); 1506 brelse(bh);
1506 return inode; 1507 return inode;
1507 } 1508 }
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index 1ab3e9d73886..aa8dbf303f6d 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -200,28 +200,28 @@ static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
200 return gfs2_ea_remove_i(ip, er); 200 return gfs2_ea_remove_i(ip, er);
201} 201}
202 202
203static struct gfs2_eattr_operations gfs2_user_eaops = { 203static const struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get, 204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set, 205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove, 206 .eo_remove = user_eo_remove,
207 .eo_name = "user", 207 .eo_name = "user",
208}; 208};
209 209
210struct gfs2_eattr_operations gfs2_system_eaops = { 210const struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get, 211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set, 212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove, 213 .eo_remove = system_eo_remove,
214 .eo_name = "system", 214 .eo_name = "system",
215}; 215};
216 216
217static struct gfs2_eattr_operations gfs2_security_eaops = { 217static const struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get, 218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set, 219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove, 220 .eo_remove = security_eo_remove,
221 .eo_name = "security", 221 .eo_name = "security",
222}; 222};
223 223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = { 224const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL, 225 NULL,
226 &gfs2_user_eaops, 226 &gfs2_user_eaops,
227 &gfs2_system_eaops, 227 &gfs2_system_eaops,
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
index 508b4f7a2449..da2f7fbbb40d 100644
--- a/fs/gfs2/eaops.h
+++ b/fs/gfs2/eaops.h
@@ -22,9 +22,9 @@ struct gfs2_eattr_operations {
22 22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); 23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24 24
25extern struct gfs2_eattr_operations gfs2_system_eaops; 25extern const struct gfs2_eattr_operations gfs2_system_eaops;
26 26
27extern struct gfs2_eattr_operations *gfs2_ea_ops[]; 27extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
28 28
29#endif /* __EAOPS_DOT_H__ */ 29#endif /* __EAOPS_DOT_H__ */
30 30
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3f0974e1afef..a37efe4aae6f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -25,8 +25,10 @@
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <linux/seq_file.h> 26#include <linux/seq_file.h>
27#include <linux/debugfs.h> 27#include <linux/debugfs.h>
28#include <linux/module.h> 28#include <linux/kthread.h>
29#include <linux/kallsyms.h> 29#include <linux/freezer.h>
30#include <linux/workqueue.h>
31#include <linux/jiffies.h>
30 32
31#include "gfs2.h" 33#include "gfs2.h"
32#include "incore.h" 34#include "incore.h"
@@ -48,7 +50,6 @@ struct glock_iter {
48 int hash; /* hash bucket index */ 50 int hash; /* hash bucket index */
49 struct gfs2_sbd *sdp; /* incore superblock */ 51 struct gfs2_sbd *sdp; /* incore superblock */
50 struct gfs2_glock *gl; /* current glock struct */ 52 struct gfs2_glock *gl; /* current glock struct */
51 struct hlist_head *hb_list; /* current hash bucket ptr */
52 struct seq_file *seq; /* sequence file for debugfs */ 53 struct seq_file *seq; /* sequence file for debugfs */
53 char string[512]; /* scratch space */ 54 char string[512]; /* scratch space */
54}; 55};
@@ -59,8 +60,13 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
59static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); 60static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
60static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); 61static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
61static void gfs2_glock_drop_th(struct gfs2_glock *gl); 62static void gfs2_glock_drop_th(struct gfs2_glock *gl);
63static void run_queue(struct gfs2_glock *gl);
64
62static DECLARE_RWSEM(gfs2_umount_flush_sem); 65static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root; 66static struct dentry *gfs2_root;
67static struct task_struct *scand_process;
68static unsigned int scand_secs = 5;
69static struct workqueue_struct *glock_workqueue;
64 70
65#define GFS2_GL_HASH_SHIFT 15 71#define GFS2_GL_HASH_SHIFT 15
66#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 72#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -276,6 +282,18 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
276 return gl; 282 return gl;
277} 283}
278 284
285static void glock_work_func(struct work_struct *work)
286{
287 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
288
289 spin_lock(&gl->gl_spin);
290 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
291 set_bit(GLF_DEMOTE, &gl->gl_flags);
292 run_queue(gl);
293 spin_unlock(&gl->gl_spin);
294 gfs2_glock_put(gl);
295}
296
279/** 297/**
280 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist 298 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
281 * @sdp: The GFS2 superblock 299 * @sdp: The GFS2 superblock
@@ -315,6 +333,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
315 gl->gl_name = name; 333 gl->gl_name = name;
316 atomic_set(&gl->gl_ref, 1); 334 atomic_set(&gl->gl_ref, 1);
317 gl->gl_state = LM_ST_UNLOCKED; 335 gl->gl_state = LM_ST_UNLOCKED;
336 gl->gl_demote_state = LM_ST_EXCLUSIVE;
318 gl->gl_hash = hash; 337 gl->gl_hash = hash;
319 gl->gl_owner_pid = 0; 338 gl->gl_owner_pid = 0;
320 gl->gl_ip = 0; 339 gl->gl_ip = 0;
@@ -323,10 +342,12 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
323 gl->gl_req_bh = NULL; 342 gl->gl_req_bh = NULL;
324 gl->gl_vn = 0; 343 gl->gl_vn = 0;
325 gl->gl_stamp = jiffies; 344 gl->gl_stamp = jiffies;
345 gl->gl_tchange = jiffies;
326 gl->gl_object = NULL; 346 gl->gl_object = NULL;
327 gl->gl_sbd = sdp; 347 gl->gl_sbd = sdp;
328 gl->gl_aspace = NULL; 348 gl->gl_aspace = NULL;
329 lops_init_le(&gl->gl_le, &gfs2_glock_lops); 349 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
350 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
330 351
331 /* If this glock protects actual on-disk data or metadata blocks, 352 /* If this glock protects actual on-disk data or metadata blocks,
332 create a VFS inode to manage the pages/buffers holding them. */ 353 create a VFS inode to manage the pages/buffers holding them. */
@@ -440,6 +461,8 @@ static void wait_on_holder(struct gfs2_holder *gh)
440 461
441static void gfs2_demote_wake(struct gfs2_glock *gl) 462static void gfs2_demote_wake(struct gfs2_glock *gl)
442{ 463{
464 BUG_ON(!spin_is_locked(&gl->gl_spin));
465 gl->gl_demote_state = LM_ST_EXCLUSIVE;
443 clear_bit(GLF_DEMOTE, &gl->gl_flags); 466 clear_bit(GLF_DEMOTE, &gl->gl_flags);
444 smp_mb__after_clear_bit(); 467 smp_mb__after_clear_bit();
445 wake_up_bit(&gl->gl_flags, GLF_DEMOTE); 468 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
@@ -545,12 +568,14 @@ static int rq_demote(struct gfs2_glock *gl)
545 return 0; 568 return 0;
546 } 569 }
547 set_bit(GLF_LOCK, &gl->gl_flags); 570 set_bit(GLF_LOCK, &gl->gl_flags);
548 spin_unlock(&gl->gl_spin);
549 if (gl->gl_demote_state == LM_ST_UNLOCKED || 571 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
550 gl->gl_state != LM_ST_EXCLUSIVE) 572 gl->gl_state != LM_ST_EXCLUSIVE) {
573 spin_unlock(&gl->gl_spin);
551 gfs2_glock_drop_th(gl); 574 gfs2_glock_drop_th(gl);
552 else 575 } else {
576 spin_unlock(&gl->gl_spin);
553 gfs2_glock_xmote_th(gl, NULL); 577 gfs2_glock_xmote_th(gl, NULL);
578 }
554 spin_lock(&gl->gl_spin); 579 spin_lock(&gl->gl_spin);
555 580
556 return 0; 581 return 0;
@@ -679,24 +704,25 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
679 * practise: LM_ST_SHARED and LM_ST_UNLOCKED 704 * practise: LM_ST_SHARED and LM_ST_UNLOCKED
680 */ 705 */
681 706
682static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remote) 707static void handle_callback(struct gfs2_glock *gl, unsigned int state,
708 int remote, unsigned long delay)
683{ 709{
710 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
711
684 spin_lock(&gl->gl_spin); 712 spin_lock(&gl->gl_spin);
685 if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) { 713 set_bit(bit, &gl->gl_flags);
714 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
686 gl->gl_demote_state = state; 715 gl->gl_demote_state = state;
687 gl->gl_demote_time = jiffies; 716 gl->gl_demote_time = jiffies;
688 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && 717 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
689 gl->gl_object) { 718 gl->gl_object) {
690 struct inode *inode = igrab(gl->gl_object); 719 gfs2_glock_schedule_for_reclaim(gl);
691 spin_unlock(&gl->gl_spin); 720 spin_unlock(&gl->gl_spin);
692 if (inode) {
693 d_prune_aliases(inode);
694 iput(inode);
695 }
696 return; 721 return;
697 } 722 }
698 } else if (gl->gl_demote_state != LM_ST_UNLOCKED) { 723 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
699 gl->gl_demote_state = state; 724 gl->gl_demote_state != state) {
725 gl->gl_demote_state = LM_ST_UNLOCKED;
700 } 726 }
701 spin_unlock(&gl->gl_spin); 727 spin_unlock(&gl->gl_spin);
702} 728}
@@ -723,6 +749,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
723 } 749 }
724 750
725 gl->gl_state = new_state; 751 gl->gl_state = new_state;
752 gl->gl_tchange = jiffies;
726} 753}
727 754
728/** 755/**
@@ -760,10 +787,20 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
760 787
761 if (!gh) { 788 if (!gh) {
762 gl->gl_stamp = jiffies; 789 gl->gl_stamp = jiffies;
763 if (ret & LM_OUT_CANCELED) 790 if (ret & LM_OUT_CANCELED) {
764 op_done = 0; 791 op_done = 0;
765 else 792 } else {
793 spin_lock(&gl->gl_spin);
794 if (gl->gl_state != gl->gl_demote_state) {
795 gl->gl_req_bh = NULL;
796 spin_unlock(&gl->gl_spin);
797 gfs2_glock_drop_th(gl);
798 gfs2_glock_put(gl);
799 return;
800 }
766 gfs2_demote_wake(gl); 801 gfs2_demote_wake(gl);
802 spin_unlock(&gl->gl_spin);
803 }
767 } else { 804 } else {
768 spin_lock(&gl->gl_spin); 805 spin_lock(&gl->gl_spin);
769 list_del_init(&gh->gh_list); 806 list_del_init(&gh->gh_list);
@@ -799,7 +836,6 @@ out:
799 gl->gl_req_gh = NULL; 836 gl->gl_req_gh = NULL;
800 gl->gl_req_bh = NULL; 837 gl->gl_req_bh = NULL;
801 clear_bit(GLF_LOCK, &gl->gl_flags); 838 clear_bit(GLF_LOCK, &gl->gl_flags);
802 run_queue(gl);
803 spin_unlock(&gl->gl_spin); 839 spin_unlock(&gl->gl_spin);
804 } 840 }
805 841
@@ -817,7 +853,7 @@ out:
817 * 853 *
818 */ 854 */
819 855
820void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) 856static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
821{ 857{
822 struct gfs2_sbd *sdp = gl->gl_sbd; 858 struct gfs2_sbd *sdp = gl->gl_sbd;
823 int flags = gh ? gh->gh_flags : 0; 859 int flags = gh ? gh->gh_flags : 0;
@@ -871,7 +907,6 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
871 gfs2_assert_warn(sdp, !ret); 907 gfs2_assert_warn(sdp, !ret);
872 908
873 state_change(gl, LM_ST_UNLOCKED); 909 state_change(gl, LM_ST_UNLOCKED);
874 gfs2_demote_wake(gl);
875 910
876 if (glops->go_inval) 911 if (glops->go_inval)
877 glops->go_inval(gl, DIO_METADATA); 912 glops->go_inval(gl, DIO_METADATA);
@@ -884,10 +919,10 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
884 } 919 }
885 920
886 spin_lock(&gl->gl_spin); 921 spin_lock(&gl->gl_spin);
922 gfs2_demote_wake(gl);
887 gl->gl_req_gh = NULL; 923 gl->gl_req_gh = NULL;
888 gl->gl_req_bh = NULL; 924 gl->gl_req_bh = NULL;
889 clear_bit(GLF_LOCK, &gl->gl_flags); 925 clear_bit(GLF_LOCK, &gl->gl_flags);
890 run_queue(gl);
891 spin_unlock(&gl->gl_spin); 926 spin_unlock(&gl->gl_spin);
892 927
893 gfs2_glock_put(gl); 928 gfs2_glock_put(gl);
@@ -1067,24 +1102,31 @@ static void add_to_queue(struct gfs2_holder *gh)
1067 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) 1102 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
1068 BUG(); 1103 BUG();
1069 1104
1070 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid); 1105 if (!(gh->gh_flags & GL_FLOCK)) {
1071 if (existing) { 1106 existing = find_holder_by_owner(&gl->gl_holders,
1072 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1107 gh->gh_owner_pid);
1073 printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid); 1108 if (existing) {
1074 printk(KERN_INFO "lock type : %d lock state : %d\n", 1109 print_symbol(KERN_WARNING "original: %s\n",
1075 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state); 1110 existing->gh_ip);
1076 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1111 printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
1077 printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid); 1112 printk(KERN_INFO "lock type : %d lock state : %d\n",
1078 printk(KERN_INFO "lock type : %d lock state : %d\n", 1113 existing->gh_gl->gl_name.ln_type,
1079 gl->gl_name.ln_type, gl->gl_state); 1114 existing->gh_gl->gl_state);
1080 BUG(); 1115 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1081 } 1116 printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
1082 1117 printk(KERN_INFO "lock type : %d lock state : %d\n",
1083 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid); 1118 gl->gl_name.ln_type, gl->gl_state);
1084 if (existing) { 1119 BUG();
1085 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1120 }
1086 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1121
1087 BUG(); 1122 existing = find_holder_by_owner(&gl->gl_waiters3,
1123 gh->gh_owner_pid);
1124 if (existing) {
1125 print_symbol(KERN_WARNING "original: %s\n",
1126 existing->gh_ip);
1127 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1128 BUG();
1129 }
1088 } 1130 }
1089 1131
1090 if (gh->gh_flags & LM_FLAG_PRIORITY) 1132 if (gh->gh_flags & LM_FLAG_PRIORITY)
@@ -1195,9 +1237,10 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1195{ 1237{
1196 struct gfs2_glock *gl = gh->gh_gl; 1238 struct gfs2_glock *gl = gh->gh_gl;
1197 const struct gfs2_glock_operations *glops = gl->gl_ops; 1239 const struct gfs2_glock_operations *glops = gl->gl_ops;
1240 unsigned delay = 0;
1198 1241
1199 if (gh->gh_flags & GL_NOCACHE) 1242 if (gh->gh_flags & GL_NOCACHE)
1200 handle_callback(gl, LM_ST_UNLOCKED, 0); 1243 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1201 1244
1202 gfs2_glmutex_lock(gl); 1245 gfs2_glmutex_lock(gl);
1203 1246
@@ -1215,8 +1258,14 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1215 } 1258 }
1216 1259
1217 clear_bit(GLF_LOCK, &gl->gl_flags); 1260 clear_bit(GLF_LOCK, &gl->gl_flags);
1218 run_queue(gl);
1219 spin_unlock(&gl->gl_spin); 1261 spin_unlock(&gl->gl_spin);
1262
1263 gfs2_glock_hold(gl);
1264 if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1265 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1266 delay = gl->gl_ops->go_min_hold_time;
1267 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1268 gfs2_glock_put(gl);
1220} 1269}
1221 1270
1222void gfs2_glock_dq_wait(struct gfs2_holder *gh) 1271void gfs2_glock_dq_wait(struct gfs2_holder *gh)
@@ -1443,18 +1492,21 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1443 unsigned int state) 1492 unsigned int state)
1444{ 1493{
1445 struct gfs2_glock *gl; 1494 struct gfs2_glock *gl;
1495 unsigned long delay = 0;
1496 unsigned long holdtime;
1497 unsigned long now = jiffies;
1446 1498
1447 gl = gfs2_glock_find(sdp, name); 1499 gl = gfs2_glock_find(sdp, name);
1448 if (!gl) 1500 if (!gl)
1449 return; 1501 return;
1450 1502
1451 handle_callback(gl, state, 1); 1503 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1452 1504 if (time_before(now, holdtime))
1453 spin_lock(&gl->gl_spin); 1505 delay = holdtime - now;
1454 run_queue(gl);
1455 spin_unlock(&gl->gl_spin);
1456 1506
1457 gfs2_glock_put(gl); 1507 handle_callback(gl, state, 1, delay);
1508 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1509 gfs2_glock_put(gl);
1458} 1510}
1459 1511
1460/** 1512/**
@@ -1495,7 +1547,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1495 return; 1547 return;
1496 if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) 1548 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1497 gl->gl_req_bh(gl, async->lc_ret); 1549 gl->gl_req_bh(gl, async->lc_ret);
1498 gfs2_glock_put(gl); 1550 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1551 gfs2_glock_put(gl);
1499 up_read(&gfs2_umount_flush_sem); 1552 up_read(&gfs2_umount_flush_sem);
1500 return; 1553 return;
1501 } 1554 }
@@ -1588,7 +1641,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1588 if (gfs2_glmutex_trylock(gl)) { 1641 if (gfs2_glmutex_trylock(gl)) {
1589 if (list_empty(&gl->gl_holders) && 1642 if (list_empty(&gl->gl_holders) &&
1590 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1643 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1591 handle_callback(gl, LM_ST_UNLOCKED, 0); 1644 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1592 gfs2_glmutex_unlock(gl); 1645 gfs2_glmutex_unlock(gl);
1593 } 1646 }
1594 1647
@@ -1617,7 +1670,7 @@ static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1617 goto out; 1670 goto out;
1618 gl = list_entry(head->first, struct gfs2_glock, gl_list); 1671 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1619 while(1) { 1672 while(1) {
1620 if (gl->gl_sbd == sdp) { 1673 if (!sdp || gl->gl_sbd == sdp) {
1621 gfs2_glock_hold(gl); 1674 gfs2_glock_hold(gl);
1622 read_unlock(gl_lock_addr(hash)); 1675 read_unlock(gl_lock_addr(hash));
1623 if (prev) 1676 if (prev)
@@ -1635,6 +1688,7 @@ out:
1635 read_unlock(gl_lock_addr(hash)); 1688 read_unlock(gl_lock_addr(hash));
1636 if (prev) 1689 if (prev)
1637 gfs2_glock_put(prev); 1690 gfs2_glock_put(prev);
1691 cond_resched();
1638 return has_entries; 1692 return has_entries;
1639} 1693}
1640 1694
@@ -1663,20 +1717,6 @@ out_schedule:
1663} 1717}
1664 1718
1665/** 1719/**
1666 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1667 * @sdp: the filesystem
1668 *
1669 */
1670
1671void gfs2_scand_internal(struct gfs2_sbd *sdp)
1672{
1673 unsigned int x;
1674
1675 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1676 examine_bucket(scan_glock, sdp, x);
1677}
1678
1679/**
1680 * clear_glock - look at a glock and see if we can free it from glock cache 1720 * clear_glock - look at a glock and see if we can free it from glock cache
1681 * @gl: the glock to look at 1721 * @gl: the glock to look at
1682 * 1722 *
@@ -1701,7 +1741,7 @@ static void clear_glock(struct gfs2_glock *gl)
1701 if (gfs2_glmutex_trylock(gl)) { 1741 if (gfs2_glmutex_trylock(gl)) {
1702 if (list_empty(&gl->gl_holders) && 1742 if (list_empty(&gl->gl_holders) &&
1703 gl->gl_state != LM_ST_UNLOCKED) 1743 gl->gl_state != LM_ST_UNLOCKED)
1704 handle_callback(gl, LM_ST_UNLOCKED, 0); 1744 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1705 gfs2_glmutex_unlock(gl); 1745 gfs2_glmutex_unlock(gl);
1706 } 1746 }
1707} 1747}
@@ -1843,7 +1883,7 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1843 1883
1844 spin_lock(&gl->gl_spin); 1884 spin_lock(&gl->gl_spin);
1845 1885
1846 print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type, 1886 print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
1847 (unsigned long long)gl->gl_name.ln_number); 1887 (unsigned long long)gl->gl_name.ln_number);
1848 print_dbg(gi, " gl_flags ="); 1888 print_dbg(gi, " gl_flags =");
1849 for (x = 0; x < 32; x++) { 1889 for (x = 0; x < 32; x++) {
@@ -1963,6 +2003,35 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1963 return error; 2003 return error;
1964} 2004}
1965 2005
2006/**
2007 * gfs2_scand - Look for cached glocks and inodes to toss from memory
2008 * @sdp: Pointer to GFS2 superblock
2009 *
2010 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
2011 * See gfs2_glockd()
2012 */
2013
2014static int gfs2_scand(void *data)
2015{
2016 unsigned x;
2017 unsigned delay;
2018
2019 while (!kthread_should_stop()) {
2020 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
2021 examine_bucket(scan_glock, NULL, x);
2022 if (freezing(current))
2023 refrigerator();
2024 delay = scand_secs;
2025 if (delay < 1)
2026 delay = 1;
2027 schedule_timeout_interruptible(delay * HZ);
2028 }
2029
2030 return 0;
2031}
2032
2033
2034
1966int __init gfs2_glock_init(void) 2035int __init gfs2_glock_init(void)
1967{ 2036{
1968 unsigned i; 2037 unsigned i;
@@ -1974,52 +2043,69 @@ int __init gfs2_glock_init(void)
1974 rwlock_init(&gl_hash_locks[i]); 2043 rwlock_init(&gl_hash_locks[i]);
1975 } 2044 }
1976#endif 2045#endif
2046
2047 scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
2048 if (IS_ERR(scand_process))
2049 return PTR_ERR(scand_process);
2050
2051 glock_workqueue = create_workqueue("glock_workqueue");
2052 if (IS_ERR(glock_workqueue)) {
2053 kthread_stop(scand_process);
2054 return PTR_ERR(glock_workqueue);
2055 }
2056
1977 return 0; 2057 return 0;
1978} 2058}
1979 2059
2060void gfs2_glock_exit(void)
2061{
2062 destroy_workqueue(glock_workqueue);
2063 kthread_stop(scand_process);
2064}
2065
2066module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
2067MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
2068
1980static int gfs2_glock_iter_next(struct glock_iter *gi) 2069static int gfs2_glock_iter_next(struct glock_iter *gi)
1981{ 2070{
2071 struct gfs2_glock *gl;
2072
2073restart:
1982 read_lock(gl_lock_addr(gi->hash)); 2074 read_lock(gl_lock_addr(gi->hash));
1983 while (1) { 2075 gl = gi->gl;
1984 if (!gi->hb_list) { /* If we don't have a hash bucket yet */ 2076 if (gl) {
1985 gi->hb_list = &gl_hash_table[gi->hash].hb_list; 2077 gi->gl = hlist_entry(gl->gl_list.next,
1986 if (hlist_empty(gi->hb_list)) { 2078 struct gfs2_glock, gl_list);
1987 read_unlock(gl_lock_addr(gi->hash));
1988 gi->hash++;
1989 read_lock(gl_lock_addr(gi->hash));
1990 gi->hb_list = NULL;
1991 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1992 read_unlock(gl_lock_addr(gi->hash));
1993 return 1;
1994 }
1995 else
1996 continue;
1997 }
1998 if (!hlist_empty(gi->hb_list)) {
1999 gi->gl = list_entry(gi->hb_list->first,
2000 struct gfs2_glock,
2001 gl_list);
2002 }
2003 } else {
2004 if (gi->gl->gl_list.next == NULL) {
2005 read_unlock(gl_lock_addr(gi->hash));
2006 gi->hash++;
2007 read_lock(gl_lock_addr(gi->hash));
2008 gi->hb_list = NULL;
2009 continue;
2010 }
2011 gi->gl = list_entry(gi->gl->gl_list.next,
2012 struct gfs2_glock, gl_list);
2013 }
2014 if (gi->gl) 2079 if (gi->gl)
2015 break; 2080 gfs2_glock_hold(gi->gl);
2016 } 2081 }
2017 read_unlock(gl_lock_addr(gi->hash)); 2082 read_unlock(gl_lock_addr(gi->hash));
2083 if (gl)
2084 gfs2_glock_put(gl);
2085 if (gl && gi->gl == NULL)
2086 gi->hash++;
2087 while(gi->gl == NULL) {
2088 if (gi->hash >= GFS2_GL_HASH_SIZE)
2089 return 1;
2090 read_lock(gl_lock_addr(gi->hash));
2091 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
2092 struct gfs2_glock, gl_list);
2093 if (gi->gl)
2094 gfs2_glock_hold(gi->gl);
2095 read_unlock(gl_lock_addr(gi->hash));
2096 gi->hash++;
2097 }
2098
2099 if (gi->sdp != gi->gl->gl_sbd)
2100 goto restart;
2101
2018 return 0; 2102 return 0;
2019} 2103}
2020 2104
2021static void gfs2_glock_iter_free(struct glock_iter *gi) 2105static void gfs2_glock_iter_free(struct glock_iter *gi)
2022{ 2106{
2107 if (gi->gl)
2108 gfs2_glock_put(gi->gl);
2023 kfree(gi); 2109 kfree(gi);
2024} 2110}
2025 2111
@@ -2033,9 +2119,8 @@ static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
2033 2119
2034 gi->sdp = sdp; 2120 gi->sdp = sdp;
2035 gi->hash = 0; 2121 gi->hash = 0;
2036 gi->gl = NULL;
2037 gi->hb_list = NULL;
2038 gi->seq = NULL; 2122 gi->seq = NULL;
2123 gi->gl = NULL;
2039 memset(gi->string, 0, sizeof(gi->string)); 2124 memset(gi->string, 0, sizeof(gi->string));
2040 2125
2041 if (gfs2_glock_iter_next(gi)) { 2126 if (gfs2_glock_iter_next(gi)) {
@@ -2055,7 +2140,7 @@ static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
2055 if (!gi) 2140 if (!gi)
2056 return NULL; 2141 return NULL;
2057 2142
2058 while (n--) { 2143 while(n--) {
2059 if (gfs2_glock_iter_next(gi)) { 2144 if (gfs2_glock_iter_next(gi)) {
2060 gfs2_glock_iter_free(gi); 2145 gfs2_glock_iter_free(gi);
2061 return NULL; 2146 return NULL;
@@ -2082,7 +2167,9 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
2082 2167
2083static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) 2168static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
2084{ 2169{
2085 /* nothing for now */ 2170 struct glock_iter *gi = iter_ptr;
2171 if (gi)
2172 gfs2_glock_iter_free(gi);
2086} 2173}
2087 2174
2088static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) 2175static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
@@ -2095,7 +2182,7 @@ static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
2095 return 0; 2182 return 0;
2096} 2183}
2097 2184
2098static struct seq_operations gfs2_glock_seq_ops = { 2185static const struct seq_operations gfs2_glock_seq_ops = {
2099 .start = gfs2_glock_seq_start, 2186 .start = gfs2_glock_seq_start,
2100 .next = gfs2_glock_seq_next, 2187 .next = gfs2_glock_seq_next,
2101 .stop = gfs2_glock_seq_stop, 2188 .stop = gfs2_glock_seq_stop,
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 7721ca3fff9e..b16f604eea9f 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,6 +26,7 @@
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200 27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 28#define GL_NOCACHE 0x00000400
29#define GL_FLOCK 0x00000800
29#define GL_NOCANCEL 0x00001000 30#define GL_NOCANCEL 0x00001000
30 31
31#define GLR_TRYFAILED 13 32#define GLR_TRYFAILED 13
@@ -132,11 +133,11 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
132 133
133void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); 134void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
134void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 135void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
135
136void gfs2_scand_internal(struct gfs2_sbd *sdp);
137void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); 136void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
138 137
139int __init gfs2_glock_init(void); 138int __init gfs2_glock_init(void);
139void gfs2_glock_exit(void);
140
140int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); 141int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
141void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); 142void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
142int gfs2_register_debugfs(void); 143int gfs2_register_debugfs(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 777ca46010e8..4670dcb2a877 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -41,7 +41,6 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
41 struct list_head *head = &gl->gl_ail_list; 41 struct list_head *head = &gl->gl_ail_list;
42 struct gfs2_bufdata *bd; 42 struct gfs2_bufdata *bd;
43 struct buffer_head *bh; 43 struct buffer_head *bh;
44 u64 blkno;
45 int error; 44 int error;
46 45
47 blocks = atomic_read(&gl->gl_ail_count); 46 blocks = atomic_read(&gl->gl_ail_count);
@@ -57,19 +56,12 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
57 bd = list_entry(head->next, struct gfs2_bufdata, 56 bd = list_entry(head->next, struct gfs2_bufdata,
58 bd_ail_gl_list); 57 bd_ail_gl_list);
59 bh = bd->bd_bh; 58 bh = bd->bd_bh;
60 blkno = bh->b_blocknr; 59 gfs2_remove_from_ail(NULL, bd);
60 bd->bd_bh = NULL;
61 bh->b_private = NULL;
62 bd->bd_blkno = bh->b_blocknr;
61 gfs2_assert_withdraw(sdp, !buffer_busy(bh)); 63 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
62 64 gfs2_trans_add_revoke(sdp, bd);
63 bd->bd_ail = NULL;
64 list_del(&bd->bd_ail_st_list);
65 list_del(&bd->bd_ail_gl_list);
66 atomic_dec(&gl->gl_ail_count);
67 brelse(bh);
68 gfs2_log_unlock(sdp);
69
70 gfs2_trans_add_revoke(sdp, blkno);
71
72 gfs2_log_lock(sdp);
73 } 65 }
74 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 66 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
75 gfs2_log_unlock(sdp); 67 gfs2_log_unlock(sdp);
@@ -156,9 +148,11 @@ static void inode_go_sync(struct gfs2_glock *gl)
156 ip = NULL; 148 ip = NULL;
157 149
158 if (test_bit(GLF_DIRTY, &gl->gl_flags)) { 150 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
159 if (ip) 151 if (ip && !gfs2_is_jdata(ip))
160 filemap_fdatawrite(ip->i_inode.i_mapping); 152 filemap_fdatawrite(ip->i_inode.i_mapping);
161 gfs2_log_flush(gl->gl_sbd, gl); 153 gfs2_log_flush(gl->gl_sbd, gl);
154 if (ip && gfs2_is_jdata(ip))
155 filemap_fdatawrite(ip->i_inode.i_mapping);
162 gfs2_meta_sync(gl); 156 gfs2_meta_sync(gl);
163 if (ip) { 157 if (ip) {
164 struct address_space *mapping = ip->i_inode.i_mapping; 158 struct address_space *mapping = ip->i_inode.i_mapping;
@@ -452,6 +446,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
452 .go_lock = inode_go_lock, 446 .go_lock = inode_go_lock,
453 .go_unlock = inode_go_unlock, 447 .go_unlock = inode_go_unlock,
454 .go_type = LM_TYPE_INODE, 448 .go_type = LM_TYPE_INODE,
449 .go_min_hold_time = HZ / 10,
455}; 450};
456 451
457const struct gfs2_glock_operations gfs2_rgrp_glops = { 452const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -462,6 +457,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
462 .go_lock = rgrp_go_lock, 457 .go_lock = rgrp_go_lock,
463 .go_unlock = rgrp_go_unlock, 458 .go_unlock = rgrp_go_unlock,
464 .go_type = LM_TYPE_RGRP, 459 .go_type = LM_TYPE_RGRP,
460 .go_min_hold_time = HZ / 10,
465}; 461};
466 462
467const struct gfs2_glock_operations gfs2_trans_glops = { 463const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 170ba93829c0..eaddfb5a8e6f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
11#define __INCORE_DOT_H__ 11#define __INCORE_DOT_H__
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h>
14 15
15#define DIO_WAIT 0x00000010 16#define DIO_WAIT 0x00000010
16#define DIO_METADATA 0x00000020 17#define DIO_METADATA 0x00000020
@@ -113,7 +114,13 @@ struct gfs2_bufdata {
113 struct buffer_head *bd_bh; 114 struct buffer_head *bd_bh;
114 struct gfs2_glock *bd_gl; 115 struct gfs2_glock *bd_gl;
115 116
116 struct list_head bd_list_tr; 117 union {
118 struct list_head list_tr;
119 u64 blkno;
120 } u;
121#define bd_list_tr u.list_tr
122#define bd_blkno u.blkno
123
117 struct gfs2_log_element bd_le; 124 struct gfs2_log_element bd_le;
118 125
119 struct gfs2_ail *bd_ail; 126 struct gfs2_ail *bd_ail;
@@ -130,6 +137,7 @@ struct gfs2_glock_operations {
130 int (*go_lock) (struct gfs2_holder *gh); 137 int (*go_lock) (struct gfs2_holder *gh);
131 void (*go_unlock) (struct gfs2_holder *gh); 138 void (*go_unlock) (struct gfs2_holder *gh);
132 const int go_type; 139 const int go_type;
140 const unsigned long go_min_hold_time;
133}; 141};
134 142
135enum { 143enum {
@@ -161,6 +169,7 @@ enum {
161 GLF_LOCK = 1, 169 GLF_LOCK = 1,
162 GLF_STICKY = 2, 170 GLF_STICKY = 2,
163 GLF_DEMOTE = 3, 171 GLF_DEMOTE = 3,
172 GLF_PENDING_DEMOTE = 4,
164 GLF_DIRTY = 5, 173 GLF_DIRTY = 5,
165}; 174};
166 175
@@ -193,6 +202,7 @@ struct gfs2_glock {
193 202
194 u64 gl_vn; 203 u64 gl_vn;
195 unsigned long gl_stamp; 204 unsigned long gl_stamp;
205 unsigned long gl_tchange;
196 void *gl_object; 206 void *gl_object;
197 207
198 struct list_head gl_reclaim; 208 struct list_head gl_reclaim;
@@ -203,6 +213,7 @@ struct gfs2_glock {
203 struct gfs2_log_element gl_le; 213 struct gfs2_log_element gl_le;
204 struct list_head gl_ail_list; 214 struct list_head gl_ail_list;
205 atomic_t gl_ail_count; 215 atomic_t gl_ail_count;
216 struct delayed_work gl_work;
206}; 217};
207 218
208struct gfs2_alloc { 219struct gfs2_alloc {
@@ -293,11 +304,6 @@ struct gfs2_file {
293 struct gfs2_holder f_fl_gh; 304 struct gfs2_holder f_fl_gh;
294}; 305};
295 306
296struct gfs2_revoke {
297 struct gfs2_log_element rv_le;
298 u64 rv_blkno;
299};
300
301struct gfs2_revoke_replay { 307struct gfs2_revoke_replay {
302 struct list_head rr_list; 308 struct list_head rr_list;
303 u64 rr_blkno; 309 u64 rr_blkno;
@@ -335,12 +341,6 @@ struct gfs2_quota_data {
335 unsigned long qd_last_touched; 341 unsigned long qd_last_touched;
336}; 342};
337 343
338struct gfs2_log_buf {
339 struct list_head lb_list;
340 struct buffer_head *lb_bh;
341 struct buffer_head *lb_real;
342};
343
344struct gfs2_trans { 344struct gfs2_trans {
345 unsigned long tr_ip; 345 unsigned long tr_ip;
346 346
@@ -429,7 +429,6 @@ struct gfs2_tune {
429 unsigned int gt_log_flush_secs; 429 unsigned int gt_log_flush_secs;
430 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */ 430 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
431 431
432 unsigned int gt_scand_secs;
433 unsigned int gt_recoverd_secs; 432 unsigned int gt_recoverd_secs;
434 unsigned int gt_logd_secs; 433 unsigned int gt_logd_secs;
435 unsigned int gt_quotad_secs; 434 unsigned int gt_quotad_secs;
@@ -574,7 +573,6 @@ struct gfs2_sbd {
574 573
575 /* Daemon stuff */ 574 /* Daemon stuff */
576 575
577 struct task_struct *sd_scand_process;
578 struct task_struct *sd_recoverd_process; 576 struct task_struct *sd_recoverd_process;
579 struct task_struct *sd_logd_process; 577 struct task_struct *sd_logd_process;
580 struct task_struct *sd_quotad_process; 578 struct task_struct *sd_quotad_process;
@@ -609,13 +607,13 @@ struct gfs2_sbd {
609 unsigned int sd_log_num_revoke; 607 unsigned int sd_log_num_revoke;
610 unsigned int sd_log_num_rg; 608 unsigned int sd_log_num_rg;
611 unsigned int sd_log_num_databuf; 609 unsigned int sd_log_num_databuf;
612 unsigned int sd_log_num_jdata;
613 610
614 struct list_head sd_log_le_gl; 611 struct list_head sd_log_le_gl;
615 struct list_head sd_log_le_buf; 612 struct list_head sd_log_le_buf;
616 struct list_head sd_log_le_revoke; 613 struct list_head sd_log_le_revoke;
617 struct list_head sd_log_le_rg; 614 struct list_head sd_log_le_rg;
618 struct list_head sd_log_le_databuf; 615 struct list_head sd_log_le_databuf;
616 struct list_head sd_log_le_ordered;
619 617
620 unsigned int sd_log_blks_free; 618 unsigned int sd_log_blks_free;
621 struct mutex sd_log_reserve_mutex; 619 struct mutex sd_log_reserve_mutex;
@@ -627,7 +625,8 @@ struct gfs2_sbd {
627 625
628 unsigned long sd_log_flush_time; 626 unsigned long sd_log_flush_time;
629 struct rw_semaphore sd_log_flush_lock; 627 struct rw_semaphore sd_log_flush_lock;
630 struct list_head sd_log_flush_list; 628 atomic_t sd_log_in_flight;
629 wait_queue_head_t sd_log_flush_wait;
631 630
632 unsigned int sd_log_flush_head; 631 unsigned int sd_log_flush_head;
633 u64 sd_log_flush_wrapped; 632 u64 sd_log_flush_wrapped;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 34f7bcdea1e9..5f6dc32946cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -77,6 +77,49 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
77 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); 77 return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
78} 78}
79 79
80struct gfs2_skip_data {
81 u64 no_addr;
82 int skipped;
83};
84
85static int iget_skip_test(struct inode *inode, void *opaque)
86{
87 struct gfs2_inode *ip = GFS2_I(inode);
88 struct gfs2_skip_data *data = opaque;
89
90 if (ip->i_no_addr == data->no_addr && inode->i_private != NULL){
91 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
92 data->skipped = 1;
93 return 0;
94 }
95 return 1;
96 }
97 return 0;
98}
99
100static int iget_skip_set(struct inode *inode, void *opaque)
101{
102 struct gfs2_inode *ip = GFS2_I(inode);
103 struct gfs2_skip_data *data = opaque;
104
105 if (data->skipped)
106 return 1;
107 inode->i_ino = (unsigned long)(data->no_addr);
108 ip->i_no_addr = data->no_addr;
109 return 0;
110}
111
112static struct inode *gfs2_iget_skip(struct super_block *sb,
113 u64 no_addr)
114{
115 struct gfs2_skip_data data;
116 unsigned long hash = (unsigned long)no_addr;
117
118 data.no_addr = no_addr;
119 data.skipped = 0;
120 return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
121}
122
80/** 123/**
81 * GFS2 lookup code fills in vfs inode contents based on info obtained 124 * GFS2 lookup code fills in vfs inode contents based on info obtained
82 * from directory entry inside gfs2_inode_lookup(). This has caused issues 125 * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -112,6 +155,7 @@ void gfs2_set_iop(struct inode *inode)
112 * @sb: The super block 155 * @sb: The super block
113 * @no_addr: The inode number 156 * @no_addr: The inode number
114 * @type: The type of the inode 157 * @type: The type of the inode
158 * @skip_freeing: set this not return an inode if it is currently being freed.
115 * 159 *
116 * Returns: A VFS inode, or an error 160 * Returns: A VFS inode, or an error
117 */ 161 */
@@ -119,13 +163,19 @@ void gfs2_set_iop(struct inode *inode)
119struct inode *gfs2_inode_lookup(struct super_block *sb, 163struct inode *gfs2_inode_lookup(struct super_block *sb,
120 unsigned int type, 164 unsigned int type,
121 u64 no_addr, 165 u64 no_addr,
122 u64 no_formal_ino) 166 u64 no_formal_ino, int skip_freeing)
123{ 167{
124 struct inode *inode = gfs2_iget(sb, no_addr); 168 struct inode *inode;
125 struct gfs2_inode *ip = GFS2_I(inode); 169 struct gfs2_inode *ip;
126 struct gfs2_glock *io_gl; 170 struct gfs2_glock *io_gl;
127 int error; 171 int error;
128 172
173 if (skip_freeing)
174 inode = gfs2_iget_skip(sb, no_addr);
175 else
176 inode = gfs2_iget(sb, no_addr);
177 ip = GFS2_I(inode);
178
129 if (!inode) 179 if (!inode)
130 return ERR_PTR(-ENOBUFS); 180 return ERR_PTR(-ENOBUFS);
131 181
@@ -244,6 +294,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
244 return 0; 294 return 0;
245} 295}
246 296
297static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh)
298{
299 ip->i_cache[0] = bh;
300}
301
247/** 302/**
248 * gfs2_inode_refresh - Refresh the incore copy of the dinode 303 * gfs2_inode_refresh - Refresh the incore copy of the dinode
249 * @ip: The GFS2 inode 304 * @ip: The GFS2 inode
@@ -688,7 +743,7 @@ out:
688static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 743static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
689 const struct gfs2_inum_host *inum, unsigned int mode, 744 const struct gfs2_inum_host *inum, unsigned int mode,
690 unsigned int uid, unsigned int gid, 745 unsigned int uid, unsigned int gid,
691 const u64 *generation, dev_t dev) 746 const u64 *generation, dev_t dev, struct buffer_head **bhp)
692{ 747{
693 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 748 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
694 struct gfs2_dinode *di; 749 struct gfs2_dinode *di;
@@ -743,13 +798,15 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
743 di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); 798 di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
744 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); 799 di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
745 memset(&di->di_reserved, 0, sizeof(di->di_reserved)); 800 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
801
802 set_buffer_uptodate(dibh);
746 803
747 brelse(dibh); 804 *bhp = dibh;
748} 805}
749 806
750static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, 807static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
751 unsigned int mode, const struct gfs2_inum_host *inum, 808 unsigned int mode, const struct gfs2_inum_host *inum,
752 const u64 *generation, dev_t dev) 809 const u64 *generation, dev_t dev, struct buffer_head **bhp)
753{ 810{
754 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 811 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
755 unsigned int uid, gid; 812 unsigned int uid, gid;
@@ -770,7 +827,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
770 if (error) 827 if (error)
771 goto out_quota; 828 goto out_quota;
772 829
773 init_dinode(dip, gl, inum, mode, uid, gid, generation, dev); 830 init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp);
774 gfs2_quota_change(dip, +1, uid, gid); 831 gfs2_quota_change(dip, +1, uid, gid);
775 gfs2_trans_end(sdp); 832 gfs2_trans_end(sdp);
776 833
@@ -909,6 +966,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
909 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; 966 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
910 int error; 967 int error;
911 u64 generation; 968 u64 generation;
969 struct buffer_head *bh=NULL;
912 970
913 if (!name->len || name->len > GFS2_FNAMESIZE) 971 if (!name->len || name->len > GFS2_FNAMESIZE)
914 return ERR_PTR(-ENAMETOOLONG); 972 return ERR_PTR(-ENAMETOOLONG);
@@ -935,16 +993,18 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
935 if (error) 993 if (error)
936 goto fail_gunlock; 994 goto fail_gunlock;
937 995
938 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev); 996 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh);
939 if (error) 997 if (error)
940 goto fail_gunlock2; 998 goto fail_gunlock2;
941 999
942 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), 1000 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
943 inum.no_addr, 1001 inum.no_addr,
944 inum.no_formal_ino); 1002 inum.no_formal_ino, 0);
945 if (IS_ERR(inode)) 1003 if (IS_ERR(inode))
946 goto fail_gunlock2; 1004 goto fail_gunlock2;
947 1005
1006 gfs2_inode_bh(GFS2_I(inode), bh);
1007
948 error = gfs2_inode_refresh(GFS2_I(inode)); 1008 error = gfs2_inode_refresh(GFS2_I(inode));
949 if (error) 1009 if (error)
950 goto fail_gunlock2; 1010 goto fail_gunlock2;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 4517ac82c01c..351ac87ab384 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -49,7 +49,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
49void gfs2_inode_attr_in(struct gfs2_inode *ip); 49void gfs2_inode_attr_in(struct gfs2_inode *ip);
50void gfs2_set_iop(struct inode *inode); 50void gfs2_set_iop(struct inode *inode);
51struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 51struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
52 u64 no_addr, u64 no_formal_ino); 52 u64 no_addr, u64 no_formal_ino,
53 int skip_freeing);
53struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 54struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
54 55
55int gfs2_inode_refresh(struct gfs2_inode *ip); 56int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 24d70f73b651..9e8265d28377 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -13,7 +13,6 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/types.h> 16#include <linux/types.h>
18#include <linux/string.h> 17#include <linux/string.h>
19#include <linux/list.h> 18#include <linux/list.h>
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index fba1f1d87e4f..1f7b038530b4 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -346,15 +346,16 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
346 346
347static unsigned int dev_poll(struct file *file, poll_table *wait) 347static unsigned int dev_poll(struct file *file, poll_table *wait)
348{ 348{
349 unsigned int mask = 0;
350
349 poll_wait(file, &send_wq, wait); 351 poll_wait(file, &send_wq, wait);
350 352
351 spin_lock(&ops_lock); 353 spin_lock(&ops_lock);
352 if (!list_empty(&send_list)) { 354 if (!list_empty(&send_list))
353 spin_unlock(&ops_lock); 355 mask = POLLIN | POLLRDNORM;
354 return POLLIN | POLLRDNORM;
355 }
356 spin_unlock(&ops_lock); 356 spin_unlock(&ops_lock);
357 return 0; 357
358 return mask;
358} 359}
359 360
360static const struct file_operations dev_fops = { 361static const struct file_operations dev_fops = {
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 1aca51e45092..bd938f06481d 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -268,20 +268,16 @@ static inline int check_drop(struct gdlm_ls *ls)
268 return 0; 268 return 0;
269} 269}
270 270
271static int gdlm_thread(void *data) 271static int gdlm_thread(void *data, int blist)
272{ 272{
273 struct gdlm_ls *ls = (struct gdlm_ls *) data; 273 struct gdlm_ls *ls = (struct gdlm_ls *) data;
274 struct gdlm_lock *lp = NULL; 274 struct gdlm_lock *lp = NULL;
275 int blist = 0;
276 uint8_t complete, blocking, submit, drop; 275 uint8_t complete, blocking, submit, drop;
277 DECLARE_WAITQUEUE(wait, current); 276 DECLARE_WAITQUEUE(wait, current);
278 277
279 /* Only thread1 is allowed to do blocking callbacks since gfs 278 /* Only thread1 is allowed to do blocking callbacks since gfs
280 may wait for a completion callback within a blocking cb. */ 279 may wait for a completion callback within a blocking cb. */
281 280
282 if (current == ls->thread1)
283 blist = 1;
284
285 while (!kthread_should_stop()) { 281 while (!kthread_should_stop()) {
286 set_current_state(TASK_INTERRUPTIBLE); 282 set_current_state(TASK_INTERRUPTIBLE);
287 add_wait_queue(&ls->thread_wait, &wait); 283 add_wait_queue(&ls->thread_wait, &wait);
@@ -333,12 +329,22 @@ static int gdlm_thread(void *data)
333 return 0; 329 return 0;
334} 330}
335 331
332static int gdlm_thread1(void *data)
333{
334 return gdlm_thread(data, 1);
335}
336
337static int gdlm_thread2(void *data)
338{
339 return gdlm_thread(data, 0);
340}
341
336int gdlm_init_threads(struct gdlm_ls *ls) 342int gdlm_init_threads(struct gdlm_ls *ls)
337{ 343{
338 struct task_struct *p; 344 struct task_struct *p;
339 int error; 345 int error;
340 346
341 p = kthread_run(gdlm_thread, ls, "lock_dlm1"); 347 p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
342 error = IS_ERR(p); 348 error = IS_ERR(p);
343 if (error) { 349 if (error) {
344 log_error("can't start lock_dlm1 thread %d", error); 350 log_error("can't start lock_dlm1 thread %d", error);
@@ -346,7 +352,7 @@ int gdlm_init_threads(struct gdlm_ls *ls)
346 } 352 }
347 ls->thread1 = p; 353 ls->thread1 = p;
348 354
349 p = kthread_run(gdlm_thread, ls, "lock_dlm2"); 355 p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
350 error = IS_ERR(p); 356 error = IS_ERR(p);
351 if (error) { 357 if (error) {
352 log_error("can't start lock_dlm2 thread %d", error); 358 log_error("can't start lock_dlm2 thread %d", error);
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index 0d149c8c493a..d3b8ce6fbbe3 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/types.h> 13#include <linux/types.h>
15#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f49a12e24086..7df702473252 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -60,6 +60,26 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
60} 60}
61 61
62/** 62/**
63 * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters
64 * @mapping: The associated mapping (maybe NULL)
65 * @bd: The gfs2_bufdata to remove
66 *
67 * The log lock _must_ be held when calling this function
68 *
69 */
70
71void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd)
72{
73 bd->bd_ail = NULL;
74 list_del_init(&bd->bd_ail_st_list);
75 list_del_init(&bd->bd_ail_gl_list);
76 atomic_dec(&bd->bd_gl->gl_ail_count);
77 if (mapping)
78 gfs2_meta_cache_flush(GFS2_I(mapping->host));
79 brelse(bd->bd_bh);
80}
81
82/**
63 * gfs2_ail1_start_one - Start I/O on a part of the AIL 83 * gfs2_ail1_start_one - Start I/O on a part of the AIL
64 * @sdp: the filesystem 84 * @sdp: the filesystem
65 * @tr: the part of the AIL 85 * @tr: the part of the AIL
@@ -83,17 +103,9 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
83 103
84 gfs2_assert(sdp, bd->bd_ail == ai); 104 gfs2_assert(sdp, bd->bd_ail == ai);
85 105
86 if (!bh){
87 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
88 continue;
89 }
90
91 if (!buffer_busy(bh)) { 106 if (!buffer_busy(bh)) {
92 if (!buffer_uptodate(bh)) { 107 if (!buffer_uptodate(bh))
93 gfs2_log_unlock(sdp);
94 gfs2_io_error_bh(sdp, bh); 108 gfs2_io_error_bh(sdp, bh);
95 gfs2_log_lock(sdp);
96 }
97 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); 109 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
98 continue; 110 continue;
99 } 111 }
@@ -103,9 +115,16 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
103 115
104 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); 116 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
105 117
118 get_bh(bh);
106 gfs2_log_unlock(sdp); 119 gfs2_log_unlock(sdp);
107 wait_on_buffer(bh); 120 lock_buffer(bh);
108 ll_rw_block(WRITE, 1, &bh); 121 if (test_clear_buffer_dirty(bh)) {
122 bh->b_end_io = end_buffer_write_sync;
123 submit_bh(WRITE, bh);
124 } else {
125 unlock_buffer(bh);
126 brelse(bh);
127 }
109 gfs2_log_lock(sdp); 128 gfs2_log_lock(sdp);
110 129
111 retry = 1; 130 retry = 1;
@@ -130,11 +149,6 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
130 bd_ail_st_list) { 149 bd_ail_st_list) {
131 bh = bd->bd_bh; 150 bh = bd->bd_bh;
132 151
133 if (!bh){
134 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
135 continue;
136 }
137
138 gfs2_assert(sdp, bd->bd_ail == ai); 152 gfs2_assert(sdp, bd->bd_ail == ai);
139 153
140 if (buffer_busy(bh)) { 154 if (buffer_busy(bh)) {
@@ -155,13 +169,14 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
155 169
156static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 170static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
157{ 171{
158 struct list_head *head = &sdp->sd_ail1_list; 172 struct list_head *head;
159 u64 sync_gen; 173 u64 sync_gen;
160 struct list_head *first; 174 struct list_head *first;
161 struct gfs2_ail *first_ai, *ai, *tmp; 175 struct gfs2_ail *first_ai, *ai, *tmp;
162 int done = 0; 176 int done = 0;
163 177
164 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
179 head = &sdp->sd_ail1_list;
165 if (list_empty(head)) { 180 if (list_empty(head)) {
166 gfs2_log_unlock(sdp); 181 gfs2_log_unlock(sdp);
167 return; 182 return;
@@ -233,11 +248,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
233 bd = list_entry(head->prev, struct gfs2_bufdata, 248 bd = list_entry(head->prev, struct gfs2_bufdata,
234 bd_ail_st_list); 249 bd_ail_st_list);
235 gfs2_assert(sdp, bd->bd_ail == ai); 250 gfs2_assert(sdp, bd->bd_ail == ai);
236 bd->bd_ail = NULL; 251 gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd);
237 list_del(&bd->bd_ail_st_list);
238 list_del(&bd->bd_ail_gl_list);
239 atomic_dec(&bd->bd_gl->gl_ail_count);
240 brelse(bd->bd_bh);
241 } 252 }
242} 253}
243 254
@@ -439,10 +450,10 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
439 return tail; 450 return tail;
440} 451}
441 452
442static inline void log_incr_head(struct gfs2_sbd *sdp) 453void gfs2_log_incr_head(struct gfs2_sbd *sdp)
443{ 454{
444 if (sdp->sd_log_flush_head == sdp->sd_log_tail) 455 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
445 gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head); 456 BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head);
446 457
447 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { 458 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
448 sdp->sd_log_flush_head = 0; 459 sdp->sd_log_flush_head = 0;
@@ -451,6 +462,23 @@ static inline void log_incr_head(struct gfs2_sbd *sdp)
451} 462}
452 463
453/** 464/**
465 * gfs2_log_write_endio - End of I/O for a log buffer
466 * @bh: The buffer head
467 * @uptodate: I/O Status
468 *
469 */
470
471static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate)
472{
473 struct gfs2_sbd *sdp = bh->b_private;
474 bh->b_private = NULL;
475
476 end_buffer_write_sync(bh, uptodate);
477 if (atomic_dec_and_test(&sdp->sd_log_in_flight))
478 wake_up(&sdp->sd_log_flush_wait);
479}
480
481/**
454 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data 482 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
455 * @sdp: The GFS2 superblock 483 * @sdp: The GFS2 superblock
456 * 484 *
@@ -460,25 +488,43 @@ static inline void log_incr_head(struct gfs2_sbd *sdp)
460struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) 488struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
461{ 489{
462 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); 490 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
463 struct gfs2_log_buf *lb;
464 struct buffer_head *bh; 491 struct buffer_head *bh;
465 492
466 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); 493 bh = sb_getblk(sdp->sd_vfs, blkno);
467 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
468
469 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
470 lock_buffer(bh); 494 lock_buffer(bh);
471 memset(bh->b_data, 0, bh->b_size); 495 memset(bh->b_data, 0, bh->b_size);
472 set_buffer_uptodate(bh); 496 set_buffer_uptodate(bh);
473 clear_buffer_dirty(bh); 497 clear_buffer_dirty(bh);
474 unlock_buffer(bh); 498 gfs2_log_incr_head(sdp);
475 499 atomic_inc(&sdp->sd_log_in_flight);
476 log_incr_head(sdp); 500 bh->b_private = sdp;
501 bh->b_end_io = gfs2_log_write_endio;
477 502
478 return bh; 503 return bh;
479} 504}
480 505
481/** 506/**
507 * gfs2_fake_write_endio -
508 * @bh: The buffer head
509 * @uptodate: The I/O Status
510 *
511 */
512
513static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
514{
515 struct buffer_head *real_bh = bh->b_private;
516 struct gfs2_bufdata *bd = real_bh->b_private;
517 struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd;
518
519 end_buffer_write_sync(bh, uptodate);
520 free_buffer_head(bh);
521 unlock_buffer(real_bh);
522 brelse(real_bh);
523 if (atomic_dec_and_test(&sdp->sd_log_in_flight))
524 wake_up(&sdp->sd_log_flush_wait);
525}
526
527/**
482 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log 528 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
483 * @sdp: the filesystem 529 * @sdp: the filesystem
484 * @data: the data the buffer_head should point to 530 * @data: the data the buffer_head should point to
@@ -490,22 +536,20 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
490 struct buffer_head *real) 536 struct buffer_head *real)
491{ 537{
492 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); 538 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
493 struct gfs2_log_buf *lb;
494 struct buffer_head *bh; 539 struct buffer_head *bh;
495 540
496 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); 541 bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
497 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
498 lb->lb_real = real;
499
500 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
501 atomic_set(&bh->b_count, 1); 542 atomic_set(&bh->b_count, 1);
502 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate); 543 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
503 set_bh_page(bh, real->b_page, bh_offset(real)); 544 set_bh_page(bh, real->b_page, bh_offset(real));
504 bh->b_blocknr = blkno; 545 bh->b_blocknr = blkno;
505 bh->b_size = sdp->sd_sb.sb_bsize; 546 bh->b_size = sdp->sd_sb.sb_bsize;
506 bh->b_bdev = sdp->sd_vfs->s_bdev; 547 bh->b_bdev = sdp->sd_vfs->s_bdev;
548 bh->b_private = real;
549 bh->b_end_io = gfs2_fake_write_endio;
507 550
508 log_incr_head(sdp); 551 gfs2_log_incr_head(sdp);
552 atomic_inc(&sdp->sd_log_in_flight);
509 553
510 return bh; 554 return bh;
511} 555}
@@ -572,45 +616,75 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
572 gfs2_assert_withdraw(sdp, !pull); 616 gfs2_assert_withdraw(sdp, !pull);
573 617
574 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); 618 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
575 log_incr_head(sdp); 619 gfs2_log_incr_head(sdp);
576} 620}
577 621
578static void log_flush_commit(struct gfs2_sbd *sdp) 622static void log_flush_commit(struct gfs2_sbd *sdp)
579{ 623{
580 struct list_head *head = &sdp->sd_log_flush_list; 624 DEFINE_WAIT(wait);
581 struct gfs2_log_buf *lb; 625
582 struct buffer_head *bh; 626 if (atomic_read(&sdp->sd_log_in_flight)) {
583 int flushcount = 0; 627 do {
628 prepare_to_wait(&sdp->sd_log_flush_wait, &wait,
629 TASK_UNINTERRUPTIBLE);
630 if (atomic_read(&sdp->sd_log_in_flight))
631 io_schedule();
632 } while(atomic_read(&sdp->sd_log_in_flight));
633 finish_wait(&sdp->sd_log_flush_wait, &wait);
634 }
584 635
585 while (!list_empty(head)) { 636 log_write_header(sdp, 0, 0);
586 lb = list_entry(head->next, struct gfs2_log_buf, lb_list); 637}
587 list_del(&lb->lb_list);
588 bh = lb->lb_bh;
589 638
590 wait_on_buffer(bh); 639static void gfs2_ordered_write(struct gfs2_sbd *sdp)
591 if (!buffer_uptodate(bh)) 640{
592 gfs2_io_error_bh(sdp, bh); 641 struct gfs2_bufdata *bd;
593 if (lb->lb_real) { 642 struct buffer_head *bh;
594 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */ 643 LIST_HEAD(written);
595 schedule(); 644
596 free_buffer_head(bh); 645 gfs2_log_lock(sdp);
597 } else 646 while (!list_empty(&sdp->sd_log_le_ordered)) {
647 bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list);
648 list_move(&bd->bd_le.le_list, &written);
649 bh = bd->bd_bh;
650 if (!buffer_dirty(bh))
651 continue;
652 get_bh(bh);
653 gfs2_log_unlock(sdp);
654 lock_buffer(bh);
655 if (test_clear_buffer_dirty(bh)) {
656 bh->b_end_io = end_buffer_write_sync;
657 submit_bh(WRITE, bh);
658 } else {
659 unlock_buffer(bh);
598 brelse(bh); 660 brelse(bh);
599 kfree(lb); 661 }
600 flushcount++; 662 gfs2_log_lock(sdp);
601 } 663 }
664 list_splice(&written, &sdp->sd_log_le_ordered);
665 gfs2_log_unlock(sdp);
666}
602 667
603 /* If nothing was journaled, the header is unplanned and unwanted. */ 668static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
604 if (flushcount) { 669{
605 log_write_header(sdp, 0, 0); 670 struct gfs2_bufdata *bd;
606 } else { 671 struct buffer_head *bh;
607 unsigned int tail;
608 tail = current_tail(sdp);
609 672
610 gfs2_ail1_empty(sdp, 0); 673 gfs2_log_lock(sdp);
611 if (sdp->sd_log_tail != tail) 674 while (!list_empty(&sdp->sd_log_le_ordered)) {
612 log_pull_tail(sdp, tail); 675 bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list);
676 bh = bd->bd_bh;
677 if (buffer_locked(bh)) {
678 get_bh(bh);
679 gfs2_log_unlock(sdp);
680 wait_on_buffer(bh);
681 brelse(bh);
682 gfs2_log_lock(sdp);
683 continue;
684 }
685 list_del_init(&bd->bd_le.le_list);
613 } 686 }
687 gfs2_log_unlock(sdp);
614} 688}
615 689
616/** 690/**
@@ -640,10 +714,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
640 INIT_LIST_HEAD(&ai->ai_ail1_list); 714 INIT_LIST_HEAD(&ai->ai_ail1_list);
641 INIT_LIST_HEAD(&ai->ai_ail2_list); 715 INIT_LIST_HEAD(&ai->ai_ail2_list);
642 716
643 gfs2_assert_withdraw(sdp, 717 if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
644 sdp->sd_log_num_buf + sdp->sd_log_num_jdata == 718 printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
645 sdp->sd_log_commited_buf + 719 sdp->sd_log_commited_buf);
646 sdp->sd_log_commited_databuf); 720 gfs2_assert_withdraw(sdp, 0);
721 }
722 if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
723 printk(KERN_INFO "GFS2: log databuf %u %u\n",
724 sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
725 gfs2_assert_withdraw(sdp, 0);
726 }
647 gfs2_assert_withdraw(sdp, 727 gfs2_assert_withdraw(sdp,
648 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); 728 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
649 729
@@ -651,8 +731,11 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
651 sdp->sd_log_flush_wrapped = 0; 731 sdp->sd_log_flush_wrapped = 0;
652 ai->ai_first = sdp->sd_log_flush_head; 732 ai->ai_first = sdp->sd_log_flush_head;
653 733
734 gfs2_ordered_write(sdp);
654 lops_before_commit(sdp); 735 lops_before_commit(sdp);
655 if (!list_empty(&sdp->sd_log_flush_list)) 736 gfs2_ordered_wait(sdp);
737
738 if (sdp->sd_log_head != sdp->sd_log_flush_head)
656 log_flush_commit(sdp); 739 log_flush_commit(sdp);
657 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 740 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
658 gfs2_log_lock(sdp); 741 gfs2_log_lock(sdp);
@@ -744,7 +827,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
744 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 827 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
745 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl); 828 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
746 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); 829 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
747 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
748 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 830 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
749 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); 831 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
750 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); 832 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 8e7aa0f29109..dae282400627 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -52,12 +52,14 @@ int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); 54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55 56
56struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
57struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
58 struct buffer_head *real); 59 struct buffer_head *real);
59void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 60void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); 61void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
62void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd);
61 63
62void gfs2_log_shutdown(struct gfs2_sbd *sdp); 64void gfs2_log_shutdown(struct gfs2_sbd *sdp);
63void gfs2_meta_syncfs(struct gfs2_sbd *sdp); 65void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 3b395c41b2f3..6c27cea761c6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -27,7 +27,104 @@
27#include "trans.h" 27#include "trans.h"
28#include "util.h" 28#include "util.h"
29 29
30static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 30/**
31 * gfs2_pin - Pin a buffer in memory
32 * @sdp: The superblock
33 * @bh: The buffer to be pinned
34 *
35 * The log lock must be held when calling this function
36 */
37static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
38{
39 struct gfs2_bufdata *bd;
40
41 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
42
43 clear_buffer_dirty(bh);
44 if (test_set_buffer_pinned(bh))
45 gfs2_assert_withdraw(sdp, 0);
46 if (!buffer_uptodate(bh))
47 gfs2_io_error_bh(sdp, bh);
48 bd = bh->b_private;
49 /* If this buffer is in the AIL and it has already been written
50 * to in-place disk block, remove it from the AIL.
51 */
52 if (bd->bd_ail)
53 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
54 get_bh(bh);
55}
56
57/**
58 * gfs2_unpin - Unpin a buffer
59 * @sdp: the filesystem the buffer belongs to
60 * @bh: The buffer to unpin
61 * @ai:
62 *
63 */
64
65static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
66 struct gfs2_ail *ai)
67{
68 struct gfs2_bufdata *bd = bh->b_private;
69
70 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
71
72 if (!buffer_pinned(bh))
73 gfs2_assert_withdraw(sdp, 0);
74
75 lock_buffer(bh);
76 mark_buffer_dirty(bh);
77 clear_buffer_pinned(bh);
78
79 gfs2_log_lock(sdp);
80 if (bd->bd_ail) {
81 list_del(&bd->bd_ail_st_list);
82 brelse(bh);
83 } else {
84 struct gfs2_glock *gl = bd->bd_gl;
85 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
86 atomic_inc(&gl->gl_ail_count);
87 }
88 bd->bd_ail = ai;
89 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
90 gfs2_log_unlock(sdp);
91 unlock_buffer(bh);
92}
93
94
95static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh)
96{
97 return (struct gfs2_log_descriptor *)bh->b_data;
98}
99
100static inline __be64 *bh_log_ptr(struct buffer_head *bh)
101{
102 struct gfs2_log_descriptor *ld = bh_log_desc(bh);
103 return (__force __be64 *)(ld + 1);
104}
105
106static inline __be64 *bh_ptr_end(struct buffer_head *bh)
107{
108 return (__force __be64 *)(bh->b_data + bh->b_size);
109}
110
111
112static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
113{
114 struct buffer_head *bh = gfs2_log_get_buf(sdp);
115 struct gfs2_log_descriptor *ld = bh_log_desc(bh);
116 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
117 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
118 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
119 ld->ld_type = cpu_to_be32(ld_type);
120 ld->ld_length = 0;
121 ld->ld_data1 = 0;
122 ld->ld_data2 = 0;
123 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
124 return bh;
125}
126
127static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
31{ 128{
32 struct gfs2_glock *gl; 129 struct gfs2_glock *gl;
33 struct gfs2_trans *tr = current->journal_info; 130 struct gfs2_trans *tr = current->journal_info;
@@ -38,15 +135,19 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
38 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) 135 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
39 return; 136 return;
40 137
41 gfs2_log_lock(sdp); 138 if (!list_empty(&le->le_list))
42 if (!list_empty(&le->le_list)){
43 gfs2_log_unlock(sdp);
44 return; 139 return;
45 } 140
46 gfs2_glock_hold(gl); 141 gfs2_glock_hold(gl);
47 set_bit(GLF_DIRTY, &gl->gl_flags); 142 set_bit(GLF_DIRTY, &gl->gl_flags);
48 sdp->sd_log_num_gl++; 143 sdp->sd_log_num_gl++;
49 list_add(&le->le_list, &sdp->sd_log_le_gl); 144 list_add(&le->le_list, &sdp->sd_log_le_gl);
145}
146
147static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
148{
149 gfs2_log_lock(sdp);
150 __glock_lo_add(sdp, le);
50 gfs2_log_unlock(sdp); 151 gfs2_log_unlock(sdp);
51} 152}
52 153
@@ -71,30 +172,25 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
71 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); 172 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
72 struct gfs2_trans *tr; 173 struct gfs2_trans *tr;
73 174
175 lock_buffer(bd->bd_bh);
74 gfs2_log_lock(sdp); 176 gfs2_log_lock(sdp);
75 if (!list_empty(&bd->bd_list_tr)) { 177 if (!list_empty(&bd->bd_list_tr))
76 gfs2_log_unlock(sdp); 178 goto out;
77 return;
78 }
79 tr = current->journal_info; 179 tr = current->journal_info;
80 tr->tr_touched = 1; 180 tr->tr_touched = 1;
81 tr->tr_num_buf++; 181 tr->tr_num_buf++;
82 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 182 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
83 gfs2_log_unlock(sdp);
84
85 if (!list_empty(&le->le_list)) 183 if (!list_empty(&le->le_list))
86 return; 184 goto out;
87 185 __glock_lo_add(sdp, &bd->bd_gl->gl_le);
88 gfs2_trans_add_gl(bd->bd_gl);
89
90 gfs2_meta_check(sdp, bd->bd_bh); 186 gfs2_meta_check(sdp, bd->bd_bh);
91 gfs2_pin(sdp, bd->bd_bh); 187 gfs2_pin(sdp, bd->bd_bh);
92 gfs2_log_lock(sdp);
93 sdp->sd_log_num_buf++; 188 sdp->sd_log_num_buf++;
94 list_add(&le->le_list, &sdp->sd_log_le_buf); 189 list_add(&le->le_list, &sdp->sd_log_le_buf);
95 gfs2_log_unlock(sdp);
96
97 tr->tr_num_buf_new++; 190 tr->tr_num_buf_new++;
191out:
192 gfs2_log_unlock(sdp);
193 unlock_buffer(bd->bd_bh);
98} 194}
99 195
100static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 196static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -117,8 +213,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
117 struct buffer_head *bh; 213 struct buffer_head *bh;
118 struct gfs2_log_descriptor *ld; 214 struct gfs2_log_descriptor *ld;
119 struct gfs2_bufdata *bd1 = NULL, *bd2; 215 struct gfs2_bufdata *bd1 = NULL, *bd2;
120 unsigned int total = sdp->sd_log_num_buf; 216 unsigned int total;
121 unsigned int offset = BUF_OFFSET;
122 unsigned int limit; 217 unsigned int limit;
123 unsigned int num; 218 unsigned int num;
124 unsigned n; 219 unsigned n;
@@ -127,22 +222,20 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
127 limit = buf_limit(sdp); 222 limit = buf_limit(sdp);
128 /* for 4k blocks, limit = 503 */ 223 /* for 4k blocks, limit = 503 */
129 224
225 gfs2_log_lock(sdp);
226 total = sdp->sd_log_num_buf;
130 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); 227 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
131 while(total) { 228 while(total) {
132 num = total; 229 num = total;
133 if (total > limit) 230 if (total > limit)
134 num = limit; 231 num = limit;
135 bh = gfs2_log_get_buf(sdp); 232 gfs2_log_unlock(sdp);
136 ld = (struct gfs2_log_descriptor *)bh->b_data; 233 bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA);
137 ptr = (__be64 *)(bh->b_data + offset); 234 gfs2_log_lock(sdp);
138 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 235 ld = bh_log_desc(bh);
139 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); 236 ptr = bh_log_ptr(bh);
140 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
141 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
142 ld->ld_length = cpu_to_be32(num + 1); 237 ld->ld_length = cpu_to_be32(num + 1);
143 ld->ld_data1 = cpu_to_be32(num); 238 ld->ld_data1 = cpu_to_be32(num);
144 ld->ld_data2 = cpu_to_be32(0);
145 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
146 239
147 n = 0; 240 n = 0;
148 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, 241 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
@@ -152,21 +245,27 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
152 break; 245 break;
153 } 246 }
154 247
155 set_buffer_dirty(bh); 248 gfs2_log_unlock(sdp);
156 ll_rw_block(WRITE, 1, &bh); 249 submit_bh(WRITE, bh);
250 gfs2_log_lock(sdp);
157 251
158 n = 0; 252 n = 0;
159 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, 253 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
160 bd_le.le_list) { 254 bd_le.le_list) {
255 get_bh(bd2->bd_bh);
256 gfs2_log_unlock(sdp);
257 lock_buffer(bd2->bd_bh);
161 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 258 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
162 set_buffer_dirty(bh); 259 submit_bh(WRITE, bh);
163 ll_rw_block(WRITE, 1, &bh); 260 gfs2_log_lock(sdp);
164 if (++n >= num) 261 if (++n >= num)
165 break; 262 break;
166 } 263 }
167 264
265 BUG_ON(total < num);
168 total -= num; 266 total -= num;
169 } 267 }
268 gfs2_log_unlock(sdp);
170} 269}
171 270
172static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 271static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
@@ -270,11 +369,8 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
270 tr = current->journal_info; 369 tr = current->journal_info;
271 tr->tr_touched = 1; 370 tr->tr_touched = 1;
272 tr->tr_num_revoke++; 371 tr->tr_num_revoke++;
273
274 gfs2_log_lock(sdp);
275 sdp->sd_log_num_revoke++; 372 sdp->sd_log_num_revoke++;
276 list_add(&le->le_list, &sdp->sd_log_le_revoke); 373 list_add(&le->le_list, &sdp->sd_log_le_revoke);
277 gfs2_log_unlock(sdp);
278} 374}
279 375
280static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 376static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
@@ -284,32 +380,25 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
284 struct buffer_head *bh; 380 struct buffer_head *bh;
285 unsigned int offset; 381 unsigned int offset;
286 struct list_head *head = &sdp->sd_log_le_revoke; 382 struct list_head *head = &sdp->sd_log_le_revoke;
287 struct gfs2_revoke *rv; 383 struct gfs2_bufdata *bd;
288 384
289 if (!sdp->sd_log_num_revoke) 385 if (!sdp->sd_log_num_revoke)
290 return; 386 return;
291 387
292 bh = gfs2_log_get_buf(sdp); 388 bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE);
293 ld = (struct gfs2_log_descriptor *)bh->b_data; 389 ld = bh_log_desc(bh);
294 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
295 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
296 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
297 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
298 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, 390 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
299 sizeof(u64))); 391 sizeof(u64)));
300 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); 392 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
301 ld->ld_data2 = cpu_to_be32(0);
302 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
303 offset = sizeof(struct gfs2_log_descriptor); 393 offset = sizeof(struct gfs2_log_descriptor);
304 394
305 while (!list_empty(head)) { 395 while (!list_empty(head)) {
306 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list); 396 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
307 list_del_init(&rv->rv_le.le_list); 397 list_del_init(&bd->bd_le.le_list);
308 sdp->sd_log_num_revoke--; 398 sdp->sd_log_num_revoke--;
309 399
310 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 400 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
311 set_buffer_dirty(bh); 401 submit_bh(WRITE, bh);
312 ll_rw_block(WRITE, 1, &bh);
313 402
314 bh = gfs2_log_get_buf(sdp); 403 bh = gfs2_log_get_buf(sdp);
315 mh = (struct gfs2_meta_header *)bh->b_data; 404 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -319,15 +408,14 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
319 offset = sizeof(struct gfs2_meta_header); 408 offset = sizeof(struct gfs2_meta_header);
320 } 409 }
321 410
322 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno); 411 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
323 kfree(rv); 412 kmem_cache_free(gfs2_bufdata_cachep, bd);
324 413
325 offset += sizeof(u64); 414 offset += sizeof(u64);
326 } 415 }
327 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 416 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
328 417
329 set_buffer_dirty(bh); 418 submit_bh(WRITE, bh);
330 ll_rw_block(WRITE, 1, &bh);
331} 419}
332 420
333static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 421static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -466,222 +554,136 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
466 struct address_space *mapping = bd->bd_bh->b_page->mapping; 554 struct address_space *mapping = bd->bd_bh->b_page->mapping;
467 struct gfs2_inode *ip = GFS2_I(mapping->host); 555 struct gfs2_inode *ip = GFS2_I(mapping->host);
468 556
557 lock_buffer(bd->bd_bh);
469 gfs2_log_lock(sdp); 558 gfs2_log_lock(sdp);
470 if (!list_empty(&bd->bd_list_tr)) { 559 if (!list_empty(&bd->bd_list_tr))
471 gfs2_log_unlock(sdp); 560 goto out;
472 return;
473 }
474 tr->tr_touched = 1; 561 tr->tr_touched = 1;
475 if (gfs2_is_jdata(ip)) { 562 if (gfs2_is_jdata(ip)) {
476 tr->tr_num_buf++; 563 tr->tr_num_buf++;
477 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 564 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
478 } 565 }
479 gfs2_log_unlock(sdp);
480 if (!list_empty(&le->le_list)) 566 if (!list_empty(&le->le_list))
481 return; 567 goto out;
482 568
483 gfs2_trans_add_gl(bd->bd_gl); 569 __glock_lo_add(sdp, &bd->bd_gl->gl_le);
484 if (gfs2_is_jdata(ip)) { 570 if (gfs2_is_jdata(ip)) {
485 sdp->sd_log_num_jdata++;
486 gfs2_pin(sdp, bd->bd_bh); 571 gfs2_pin(sdp, bd->bd_bh);
487 tr->tr_num_databuf_new++; 572 tr->tr_num_databuf_new++;
573 sdp->sd_log_num_databuf++;
574 list_add(&le->le_list, &sdp->sd_log_le_databuf);
575 } else {
576 list_add(&le->le_list, &sdp->sd_log_le_ordered);
488 } 577 }
489 gfs2_log_lock(sdp); 578out:
490 sdp->sd_log_num_databuf++;
491 list_add(&le->le_list, &sdp->sd_log_le_databuf);
492 gfs2_log_unlock(sdp); 579 gfs2_log_unlock(sdp);
580 unlock_buffer(bd->bd_bh);
493} 581}
494 582
495static int gfs2_check_magic(struct buffer_head *bh) 583static void gfs2_check_magic(struct buffer_head *bh)
496{ 584{
497 struct page *page = bh->b_page;
498 void *kaddr; 585 void *kaddr;
499 __be32 *ptr; 586 __be32 *ptr;
500 int rv = 0;
501 587
502 kaddr = kmap_atomic(page, KM_USER0); 588 clear_buffer_escaped(bh);
589 kaddr = kmap_atomic(bh->b_page, KM_USER0);
503 ptr = kaddr + bh_offset(bh); 590 ptr = kaddr + bh_offset(bh);
504 if (*ptr == cpu_to_be32(GFS2_MAGIC)) 591 if (*ptr == cpu_to_be32(GFS2_MAGIC))
505 rv = 1; 592 set_buffer_escaped(bh);
506 kunmap_atomic(kaddr, KM_USER0); 593 kunmap_atomic(kaddr, KM_USER0);
507
508 return rv;
509} 594}
510 595
511/** 596static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
512 * databuf_lo_before_commit - Scan the data buffers, writing as we go 597 struct list_head *list, struct list_head *done,
513 * 598 unsigned int n)
514 * Here we scan through the lists of buffers and make the assumption
515 * that any buffer thats been pinned is being journaled, and that
516 * any unpinned buffer is an ordered write data buffer and therefore
517 * will be written back rather than journaled.
518 */
519static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
520{ 599{
521 LIST_HEAD(started); 600 struct buffer_head *bh1;
522 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
523 struct buffer_head *bh = NULL,*bh1 = NULL;
524 struct gfs2_log_descriptor *ld; 601 struct gfs2_log_descriptor *ld;
525 unsigned int limit; 602 struct gfs2_bufdata *bd;
526 unsigned int total_dbuf; 603 __be64 *ptr;
527 unsigned int total_jdata = sdp->sd_log_num_jdata; 604
528 unsigned int num, n; 605 if (!bh)
529 __be64 *ptr = NULL; 606 return;
530 607
531 limit = databuf_limit(sdp); 608 ld = bh_log_desc(bh);
609 ld->ld_length = cpu_to_be32(n + 1);
610 ld->ld_data1 = cpu_to_be32(n);
532 611
533 /* 612 ptr = bh_log_ptr(bh);
534 * Start writing ordered buffers, write journaled buffers 613
535 * into the log along with a header 614 get_bh(bh);
536 */ 615 submit_bh(WRITE, bh);
537 gfs2_log_lock(sdp); 616 gfs2_log_lock(sdp);
538 total_dbuf = sdp->sd_log_num_databuf; 617 while(!list_empty(list)) {
539 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, 618 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
540 bd_le.le_list); 619 list_move_tail(&bd->bd_le.le_list, done);
541 while(total_dbuf) { 620 get_bh(bd->bd_bh);
542 num = total_jdata; 621 while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) {
543 if (num > limit) 622 gfs2_log_incr_head(sdp);
544 num = limit; 623 ptr += 2;
545 n = 0;
546 list_for_each_entry_safe_continue(bd1, bdt,
547 &sdp->sd_log_le_databuf,
548 bd_le.le_list) {
549 /* store off the buffer head in a local ptr since
550 * gfs2_bufdata might change when we drop the log lock
551 */
552 bh1 = bd1->bd_bh;
553
554 /* An ordered write buffer */
555 if (bh1 && !buffer_pinned(bh1)) {
556 list_move(&bd1->bd_le.le_list, &started);
557 if (bd1 == bd2) {
558 bd2 = NULL;
559 bd2 = list_prepare_entry(bd2,
560 &sdp->sd_log_le_databuf,
561 bd_le.le_list);
562 }
563 total_dbuf--;
564 if (bh1) {
565 if (buffer_dirty(bh1)) {
566 get_bh(bh1);
567
568 gfs2_log_unlock(sdp);
569
570 ll_rw_block(SWRITE, 1, &bh1);
571 brelse(bh1);
572
573 gfs2_log_lock(sdp);
574 }
575 continue;
576 }
577 continue;
578 } else if (bh1) { /* A journaled buffer */
579 int magic;
580 gfs2_log_unlock(sdp);
581 if (!bh) {
582 bh = gfs2_log_get_buf(sdp);
583 ld = (struct gfs2_log_descriptor *)
584 bh->b_data;
585 ptr = (__be64 *)(bh->b_data +
586 DATABUF_OFFSET);
587 ld->ld_header.mh_magic =
588 cpu_to_be32(GFS2_MAGIC);
589 ld->ld_header.mh_type =
590 cpu_to_be32(GFS2_METATYPE_LD);
591 ld->ld_header.mh_format =
592 cpu_to_be32(GFS2_FORMAT_LD);
593 ld->ld_type =
594 cpu_to_be32(GFS2_LOG_DESC_JDATA);
595 ld->ld_length = cpu_to_be32(num + 1);
596 ld->ld_data1 = cpu_to_be32(num);
597 ld->ld_data2 = cpu_to_be32(0);
598 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
599 }
600 magic = gfs2_check_magic(bh1);
601 *ptr++ = cpu_to_be64(bh1->b_blocknr);
602 *ptr++ = cpu_to_be64((__u64)magic);
603 clear_buffer_escaped(bh1);
604 if (unlikely(magic != 0))
605 set_buffer_escaped(bh1);
606 gfs2_log_lock(sdp);
607 if (++n >= num)
608 break;
609 } else if (!bh1) {
610 total_dbuf--;
611 sdp->sd_log_num_databuf--;
612 list_del_init(&bd1->bd_le.le_list);
613 if (bd1 == bd2) {
614 bd2 = NULL;
615 bd2 = list_prepare_entry(bd2,
616 &sdp->sd_log_le_databuf,
617 bd_le.le_list);
618 }
619 kmem_cache_free(gfs2_bufdata_cachep, bd1);
620 }
621 } 624 }
622 gfs2_log_unlock(sdp); 625 gfs2_log_unlock(sdp);
623 if (bh) { 626 lock_buffer(bd->bd_bh);
624 set_buffer_mapped(bh); 627 if (buffer_escaped(bd->bd_bh)) {
625 set_buffer_dirty(bh); 628 void *kaddr;
626 ll_rw_block(WRITE, 1, &bh); 629 bh1 = gfs2_log_get_buf(sdp);
627 bh = NULL; 630 kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0);
631 memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh),
632 bh1->b_size);
633 kunmap_atomic(kaddr, KM_USER0);
634 *(__be32 *)bh1->b_data = 0;
635 clear_buffer_escaped(bd->bd_bh);
636 unlock_buffer(bd->bd_bh);
637 brelse(bd->bd_bh);
638 } else {
639 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
628 } 640 }
629 n = 0; 641 submit_bh(WRITE, bh1);
630 gfs2_log_lock(sdp); 642 gfs2_log_lock(sdp);
631 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, 643 ptr += 2;
632 bd_le.le_list) {
633 if (!bd2->bd_bh)
634 continue;
635 /* copy buffer if it needs escaping */
636 gfs2_log_unlock(sdp);
637 if (unlikely(buffer_escaped(bd2->bd_bh))) {
638 void *kaddr;
639 struct page *page = bd2->bd_bh->b_page;
640 bh = gfs2_log_get_buf(sdp);
641 kaddr = kmap_atomic(page, KM_USER0);
642 memcpy(bh->b_data,
643 kaddr + bh_offset(bd2->bd_bh),
644 sdp->sd_sb.sb_bsize);
645 kunmap_atomic(kaddr, KM_USER0);
646 *(__be32 *)bh->b_data = 0;
647 } else {
648 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
649 }
650 set_buffer_dirty(bh);
651 ll_rw_block(WRITE, 1, &bh);
652 gfs2_log_lock(sdp);
653 if (++n >= num)
654 break;
655 }
656 bh = NULL;
657 BUG_ON(total_dbuf < num);
658 total_dbuf -= num;
659 total_jdata -= num;
660 } 644 }
661 gfs2_log_unlock(sdp); 645 gfs2_log_unlock(sdp);
646 brelse(bh);
647}
662 648
663 /* Wait on all ordered buffers */ 649/**
664 while (!list_empty(&started)) { 650 * databuf_lo_before_commit - Scan the data buffers, writing as we go
665 gfs2_log_lock(sdp); 651 *
666 bd1 = list_entry(started.next, struct gfs2_bufdata, 652 */
667 bd_le.le_list);
668 list_del_init(&bd1->bd_le.le_list);
669 sdp->sd_log_num_databuf--;
670 bh = bd1->bd_bh;
671 if (bh) {
672 bh->b_private = NULL;
673 get_bh(bh);
674 gfs2_log_unlock(sdp);
675 wait_on_buffer(bh);
676 brelse(bh);
677 } else
678 gfs2_log_unlock(sdp);
679 653
680 kmem_cache_free(gfs2_bufdata_cachep, bd1); 654static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
681 } 655{
656 struct gfs2_bufdata *bd = NULL;
657 struct buffer_head *bh = NULL;
658 unsigned int n = 0;
659 __be64 *ptr = NULL, *end = NULL;
660 LIST_HEAD(processed);
661 LIST_HEAD(in_progress);
682 662
683 /* We've removed all the ordered write bufs here, so only jdata left */ 663 gfs2_log_lock(sdp);
684 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata); 664 while (!list_empty(&sdp->sd_log_le_databuf)) {
665 if (ptr == end) {
666 gfs2_log_unlock(sdp);
667 gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
668 n = 0;
669 bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA);
670 ptr = bh_log_ptr(bh);
671 end = bh_ptr_end(bh) - 1;
672 gfs2_log_lock(sdp);
673 continue;
674 }
675 bd = list_entry(sdp->sd_log_le_databuf.next, struct gfs2_bufdata, bd_le.le_list);
676 list_move_tail(&bd->bd_le.le_list, &in_progress);
677 gfs2_check_magic(bd->bd_bh);
678 *ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr);
679 *ptr++ = cpu_to_be64(buffer_escaped(bh) ? 1 : 0);
680 n++;
681 }
682 gfs2_log_unlock(sdp);
683 gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
684 gfs2_log_lock(sdp);
685 list_splice(&processed, &sdp->sd_log_le_databuf);
686 gfs2_log_unlock(sdp);
685} 687}
686 688
687static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 689static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -765,11 +767,9 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
765 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); 767 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
766 list_del_init(&bd->bd_le.le_list); 768 list_del_init(&bd->bd_le.le_list);
767 sdp->sd_log_num_databuf--; 769 sdp->sd_log_num_databuf--;
768 sdp->sd_log_num_jdata--;
769 gfs2_unpin(sdp, bd->bd_bh, ai); 770 gfs2_unpin(sdp, bd->bd_bh, ai);
770 } 771 }
771 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); 772 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
772 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
773} 773}
774 774
775 775
@@ -817,10 +817,10 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
817 817
818const struct gfs2_log_operations *gfs2_log_ops[] = { 818const struct gfs2_log_operations *gfs2_log_ops[] = {
819 &gfs2_glock_lops, 819 &gfs2_glock_lops,
820 &gfs2_databuf_lops,
820 &gfs2_buf_lops, 821 &gfs2_buf_lops,
821 &gfs2_revoke_lops,
822 &gfs2_rg_lops, 822 &gfs2_rg_lops,
823 &gfs2_databuf_lops, 823 &gfs2_revoke_lops,
824 NULL, 824 NULL,
825}; 825};
826 826
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index d5d4e68b8807..79c91fd8381b 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -107,6 +107,8 @@ static int __init init_gfs2_fs(void)
107fail_unregister: 107fail_unregister:
108 unregister_filesystem(&gfs2_fs_type); 108 unregister_filesystem(&gfs2_fs_type);
109fail: 109fail:
110 gfs2_glock_exit();
111
110 if (gfs2_bufdata_cachep) 112 if (gfs2_bufdata_cachep)
111 kmem_cache_destroy(gfs2_bufdata_cachep); 113 kmem_cache_destroy(gfs2_bufdata_cachep);
112 114
@@ -127,6 +129,7 @@ fail:
127 129
128static void __exit exit_gfs2_fs(void) 130static void __exit exit_gfs2_fs(void)
129{ 131{
132 gfs2_glock_exit();
130 gfs2_unregister_debugfs(); 133 gfs2_unregister_debugfs();
131 unregister_filesystem(&gfs2_fs_type); 134 unregister_filesystem(&gfs2_fs_type);
132 unregister_filesystem(&gfs2meta_fs_type); 135 unregister_filesystem(&gfs2meta_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8da343b34ae7..4da423985e4f 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -297,74 +297,35 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
297 unlock_page(bh->b_page); 297 unlock_page(bh->b_page);
298} 298}
299 299
300/** 300void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
301 * gfs2_pin - Pin a buffer in memory
302 * @sdp: the filesystem the buffer belongs to
303 * @bh: The buffer to be pinned
304 *
305 */
306
307void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
308{ 301{
302 struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host);
309 struct gfs2_bufdata *bd = bh->b_private; 303 struct gfs2_bufdata *bd = bh->b_private;
310 304 if (test_clear_buffer_pinned(bh)) {
311 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); 305 list_del_init(&bd->bd_le.le_list);
312 306 if (meta) {
313 if (test_set_buffer_pinned(bh)) 307 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
314 gfs2_assert_withdraw(sdp, 0); 308 sdp->sd_log_num_buf--;
315 309 tr->tr_num_buf_rm++;
316 wait_on_buffer(bh); 310 } else {
317 311 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
318 /* If this buffer is in the AIL and it has already been written 312 sdp->sd_log_num_databuf--;
319 to in-place disk block, remove it from the AIL. */ 313 tr->tr_num_databuf_rm++;
320 314 }
321 gfs2_log_lock(sdp); 315 tr->tr_touched = 1;
322 if (bd->bd_ail && !buffer_in_io(bh))
323 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
324 gfs2_log_unlock(sdp);
325
326 clear_buffer_dirty(bh);
327 wait_on_buffer(bh);
328
329 if (!buffer_uptodate(bh))
330 gfs2_io_error_bh(sdp, bh);
331
332 get_bh(bh);
333}
334
335/**
336 * gfs2_unpin - Unpin a buffer
337 * @sdp: the filesystem the buffer belongs to
338 * @bh: The buffer to unpin
339 * @ai:
340 *
341 */
342
343void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
344 struct gfs2_ail *ai)
345{
346 struct gfs2_bufdata *bd = bh->b_private;
347
348 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
349
350 if (!buffer_pinned(bh))
351 gfs2_assert_withdraw(sdp, 0);
352
353 mark_buffer_dirty(bh);
354 clear_buffer_pinned(bh);
355
356 gfs2_log_lock(sdp);
357 if (bd->bd_ail) {
358 list_del(&bd->bd_ail_st_list);
359 brelse(bh); 316 brelse(bh);
360 } else {
361 struct gfs2_glock *gl = bd->bd_gl;
362 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
363 atomic_inc(&gl->gl_ail_count);
364 } 317 }
365 bd->bd_ail = ai; 318 if (bd) {
366 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 319 if (bd->bd_ail) {
367 gfs2_log_unlock(sdp); 320 gfs2_remove_from_ail(NULL, bd);
321 bh->b_private = NULL;
322 bd->bd_bh = NULL;
323 bd->bd_blkno = bh->b_blocknr;
324 gfs2_trans_add_revoke(sdp, bd);
325 }
326 }
327 clear_buffer_dirty(bh);
328 clear_buffer_uptodate(bh);
368} 329}
369 330
370/** 331/**
@@ -383,44 +344,11 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
383 while (blen) { 344 while (blen) {
384 bh = getbuf(ip->i_gl, bstart, NO_CREATE); 345 bh = getbuf(ip->i_gl, bstart, NO_CREATE);
385 if (bh) { 346 if (bh) {
386 struct gfs2_bufdata *bd = bh->b_private;
387
388 if (test_clear_buffer_pinned(bh)) {
389 struct gfs2_trans *tr = current->journal_info;
390 struct gfs2_inode *bh_ip =
391 GFS2_I(bh->b_page->mapping->host);
392
393 gfs2_log_lock(sdp);
394 list_del_init(&bd->bd_le.le_list);
395 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
396 sdp->sd_log_num_buf--;
397 gfs2_log_unlock(sdp);
398 if (bh_ip->i_inode.i_private != NULL)
399 tr->tr_num_databuf_rm++;
400 else
401 tr->tr_num_buf_rm++;
402 brelse(bh);
403 }
404 if (bd) {
405 gfs2_log_lock(sdp);
406 if (bd->bd_ail) {
407 u64 blkno = bh->b_blocknr;
408 bd->bd_ail = NULL;
409 list_del(&bd->bd_ail_st_list);
410 list_del(&bd->bd_ail_gl_list);
411 atomic_dec(&bd->bd_gl->gl_ail_count);
412 brelse(bh);
413 gfs2_log_unlock(sdp);
414 gfs2_trans_add_revoke(sdp, blkno);
415 } else
416 gfs2_log_unlock(sdp);
417 }
418
419 lock_buffer(bh); 347 lock_buffer(bh);
420 clear_buffer_dirty(bh); 348 gfs2_log_lock(sdp);
421 clear_buffer_uptodate(bh); 349 gfs2_remove_from_journal(bh, current->journal_info, 1);
350 gfs2_log_unlock(sdp);
422 unlock_buffer(bh); 351 unlock_buffer(bh);
423
424 brelse(bh); 352 brelse(bh);
425 } 353 }
426 354
@@ -446,10 +374,10 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
446 374
447 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) { 375 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
448 bh_slot = &ip->i_cache[x]; 376 bh_slot = &ip->i_cache[x];
449 if (!*bh_slot) 377 if (*bh_slot) {
450 break; 378 brelse(*bh_slot);
451 brelse(*bh_slot); 379 *bh_slot = NULL;
452 *bh_slot = NULL; 380 }
453 } 381 }
454 382
455 spin_unlock(&ip->i_spin); 383 spin_unlock(&ip->i_spin);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 527bf19d9690..b7048222ebb4 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -50,9 +50,9 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
50 50
51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, 51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
52 int meta); 52 int meta);
53void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); 53
54void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, 54void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
55 struct gfs2_ail *ai); 55 int meta);
56 56
57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); 57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
58 58
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 4864659555d4..b941f9f9f958 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,6 +42,7 @@ enum {
42 Opt_nosuiddir, 42 Opt_nosuiddir,
43 Opt_data_writeback, 43 Opt_data_writeback,
44 Opt_data_ordered, 44 Opt_data_ordered,
45 Opt_err,
45}; 46};
46 47
47static match_table_t tokens = { 48static match_table_t tokens = {
@@ -64,7 +65,8 @@ static match_table_t tokens = {
64 {Opt_suiddir, "suiddir"}, 65 {Opt_suiddir, "suiddir"},
65 {Opt_nosuiddir, "nosuiddir"}, 66 {Opt_nosuiddir, "nosuiddir"},
66 {Opt_data_writeback, "data=writeback"}, 67 {Opt_data_writeback, "data=writeback"},
67 {Opt_data_ordered, "data=ordered"} 68 {Opt_data_ordered, "data=ordered"},
69 {Opt_err, NULL}
68}; 70};
69 71
70/** 72/**
@@ -237,6 +239,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
237 case Opt_data_ordered: 239 case Opt_data_ordered:
238 args->ar_data = GFS2_DATA_ORDERED; 240 args->ar_data = GFS2_DATA_ORDERED;
239 break; 241 break;
242 case Opt_err:
240 default: 243 default:
241 fs_info(sdp, "unknown option: %s\n", o); 244 fs_info(sdp, "unknown option: %s\n", o);
242 error = -EINVAL; 245 error = -EINVAL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 42a5f58f6fca..873a511ef2be 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -90,7 +90,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
90 error = gfs2_block_map(inode, lblock, 0, bh_result); 90 error = gfs2_block_map(inode, lblock, 0, bh_result);
91 if (error) 91 if (error)
92 return error; 92 return error;
93 if (bh_result->b_blocknr == 0) 93 if (!buffer_mapped(bh_result))
94 return -EIO; 94 return -EIO;
95 return 0; 95 return 0;
96} 96}
@@ -414,7 +414,8 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
414 if (ind_blocks || data_blocks) 414 if (ind_blocks || data_blocks)
415 rblocks += RES_STATFS + RES_QUOTA; 415 rblocks += RES_STATFS + RES_QUOTA;
416 416
417 error = gfs2_trans_begin(sdp, rblocks, 0); 417 error = gfs2_trans_begin(sdp, rblocks,
418 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
418 if (error) 419 if (error)
419 goto out_trans_fail; 420 goto out_trans_fail;
420 421
@@ -616,58 +617,50 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
616 return dblock; 617 return dblock;
617} 618}
618 619
619static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh) 620static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
620{ 621{
621 struct gfs2_bufdata *bd; 622 struct gfs2_bufdata *bd;
622 623
624 lock_buffer(bh);
623 gfs2_log_lock(sdp); 625 gfs2_log_lock(sdp);
626 clear_buffer_dirty(bh);
624 bd = bh->b_private; 627 bd = bh->b_private;
625 if (bd) { 628 if (bd) {
626 bd->bd_bh = NULL; 629 if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh))
627 bh->b_private = NULL; 630 list_del_init(&bd->bd_le.le_list);
628 if (!bd->bd_ail && list_empty(&bd->bd_le.le_list)) 631 else
629 kmem_cache_free(gfs2_bufdata_cachep, bd); 632 gfs2_remove_from_journal(bh, current->journal_info, 0);
630 } 633 }
631 gfs2_log_unlock(sdp);
632
633 lock_buffer(bh);
634 clear_buffer_dirty(bh);
635 bh->b_bdev = NULL; 634 bh->b_bdev = NULL;
636 clear_buffer_mapped(bh); 635 clear_buffer_mapped(bh);
637 clear_buffer_req(bh); 636 clear_buffer_req(bh);
638 clear_buffer_new(bh); 637 clear_buffer_new(bh);
639 clear_buffer_delay(bh); 638 gfs2_log_unlock(sdp);
640 unlock_buffer(bh); 639 unlock_buffer(bh);
641} 640}
642 641
643static void gfs2_invalidatepage(struct page *page, unsigned long offset) 642static void gfs2_invalidatepage(struct page *page, unsigned long offset)
644{ 643{
645 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 644 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
646 struct buffer_head *head, *bh, *next; 645 struct buffer_head *bh, *head;
647 unsigned int curr_off = 0; 646 unsigned long pos = 0;
648 647
649 BUG_ON(!PageLocked(page)); 648 BUG_ON(!PageLocked(page));
650 if (offset == 0) 649 if (offset == 0)
651 ClearPageChecked(page); 650 ClearPageChecked(page);
652 if (!page_has_buffers(page)) 651 if (!page_has_buffers(page))
653 return; 652 goto out;
654 653
655 bh = head = page_buffers(page); 654 bh = head = page_buffers(page);
656 do { 655 do {
657 unsigned int next_off = curr_off + bh->b_size; 656 if (offset <= pos)
658 next = bh->b_this_page; 657 gfs2_discard(sdp, bh);
659 658 pos += bh->b_size;
660 if (offset <= curr_off) 659 bh = bh->b_this_page;
661 discard_buffer(sdp, bh);
662
663 curr_off = next_off;
664 bh = next;
665 } while (bh != head); 660 } while (bh != head);
666 661out:
667 if (!offset) 662 if (offset == 0)
668 try_to_release_page(page, 0); 663 try_to_release_page(page, 0);
669
670 return;
671} 664}
672 665
673/** 666/**
@@ -736,59 +729,6 @@ out:
736} 729}
737 730
738/** 731/**
739 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
740 * @bh: the buffer we're stuck on
741 *
742 */
743
744static void stuck_releasepage(struct buffer_head *bh)
745{
746 struct inode *inode = bh->b_page->mapping->host;
747 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
748 struct gfs2_bufdata *bd = bh->b_private;
749 struct gfs2_glock *gl;
750static unsigned limit = 0;
751
752 if (limit > 3)
753 return;
754 limit++;
755
756 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
757 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
758 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
759 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
760 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
761
762 if (!bd)
763 return;
764
765 gl = bd->bd_gl;
766
767 fs_warn(sdp, "gl = (%u, %llu)\n",
768 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
769
770 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
771 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
772 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
773
774 if (gl->gl_ops == &gfs2_inode_glops) {
775 struct gfs2_inode *ip = gl->gl_object;
776 unsigned int x;
777
778 if (!ip)
779 return;
780
781 fs_warn(sdp, "ip = %llu %llu\n",
782 (unsigned long long)ip->i_no_formal_ino,
783 (unsigned long long)ip->i_no_addr);
784
785 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
786 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
787 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
788 }
789}
790
791/**
792 * gfs2_releasepage - free the metadata associated with a page 732 * gfs2_releasepage - free the metadata associated with a page
793 * @page: the page that's being released 733 * @page: the page that's being released
794 * @gfp_mask: passed from Linux VFS, ignored by us 734 * @gfp_mask: passed from Linux VFS, ignored by us
@@ -805,41 +745,39 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
805 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; 745 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
806 struct buffer_head *bh, *head; 746 struct buffer_head *bh, *head;
807 struct gfs2_bufdata *bd; 747 struct gfs2_bufdata *bd;
808 unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
809 748
810 if (!page_has_buffers(page)) 749 if (!page_has_buffers(page))
811 goto out; 750 return 0;
812 751
752 gfs2_log_lock(sdp);
813 head = bh = page_buffers(page); 753 head = bh = page_buffers(page);
814 do { 754 do {
815 while (atomic_read(&bh->b_count)) { 755 if (atomic_read(&bh->b_count))
816 if (!atomic_read(&aspace->i_writecount)) 756 goto cannot_release;
817 return 0; 757 bd = bh->b_private;
818 758 if (bd && bd->bd_ail)
819 if (!(gfp_mask & __GFP_WAIT)) 759 goto cannot_release;
820 return 0;
821
822 if (time_after_eq(jiffies, t)) {
823 stuck_releasepage(bh);
824 /* should we withdraw here? */
825 return 0;
826 }
827
828 yield();
829 }
830
831 gfs2_assert_warn(sdp, !buffer_pinned(bh)); 760 gfs2_assert_warn(sdp, !buffer_pinned(bh));
832 gfs2_assert_warn(sdp, !buffer_dirty(bh)); 761 gfs2_assert_warn(sdp, !buffer_dirty(bh));
762 bh = bh->b_this_page;
763 } while(bh != head);
764 gfs2_log_unlock(sdp);
833 765
766 head = bh = page_buffers(page);
767 do {
834 gfs2_log_lock(sdp); 768 gfs2_log_lock(sdp);
835 bd = bh->b_private; 769 bd = bh->b_private;
836 if (bd) { 770 if (bd) {
837 gfs2_assert_warn(sdp, bd->bd_bh == bh); 771 gfs2_assert_warn(sdp, bd->bd_bh == bh);
838 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); 772 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
839 gfs2_assert_warn(sdp, !bd->bd_ail); 773 if (!list_empty(&bd->bd_le.le_list)) {
840 bd->bd_bh = NULL; 774 if (!buffer_pinned(bh))
841 if (!list_empty(&bd->bd_le.le_list)) 775 list_del_init(&bd->bd_le.le_list);
842 bd = NULL; 776 else
777 bd = NULL;
778 }
779 if (bd)
780 bd->bd_bh = NULL;
843 bh->b_private = NULL; 781 bh->b_private = NULL;
844 } 782 }
845 gfs2_log_unlock(sdp); 783 gfs2_log_unlock(sdp);
@@ -849,8 +787,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
849 bh = bh->b_this_page; 787 bh = bh->b_this_page;
850 } while (bh != head); 788 } while (bh != head);
851 789
852out:
853 return try_to_free_buffers(page); 790 return try_to_free_buffers(page);
791cannot_release:
792 gfs2_log_unlock(sdp);
793 return 0;
854} 794}
855 795
856const struct address_space_operations gfs2_file_aops = { 796const struct address_space_operations gfs2_file_aops = {
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index b8312edee0e4..e2d1347796a9 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -237,7 +237,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
237 237
238 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, 238 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
239 inum->no_addr, 239 inum->no_addr,
240 0); 240 0, 0);
241 if (!inode) 241 if (!inode)
242 goto fail; 242 goto fail;
243 if (IS_ERR(inode)) { 243 if (IS_ERR(inode)) {
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 94d76ace0b95..46a9e10ff17b 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -571,7 +571,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
571 int error = 0; 571 int error = 0;
572 572
573 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 573 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
574 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; 574 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE
575 | GL_FLOCK;
575 576
576 mutex_lock(&fp->f_fl_mutex); 577 mutex_lock(&fp->f_fl_mutex);
577 578
@@ -579,21 +580,19 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
579 if (gl) { 580 if (gl) {
580 if (fl_gh->gh_state == state) 581 if (fl_gh->gh_state == state)
581 goto out; 582 goto out;
582 gfs2_glock_hold(gl);
583 flock_lock_file_wait(file, 583 flock_lock_file_wait(file,
584 &(struct file_lock){.fl_type = F_UNLCK}); 584 &(struct file_lock){.fl_type = F_UNLCK});
585 gfs2_glock_dq_uninit(fl_gh); 585 gfs2_glock_dq_wait(fl_gh);
586 gfs2_holder_reinit(state, flags, fl_gh);
586 } else { 587 } else {
587 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), 588 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
588 ip->i_no_addr, &gfs2_flock_glops, 589 ip->i_no_addr, &gfs2_flock_glops,
589 CREATE, &gl); 590 CREATE, &gl);
590 if (error) 591 if (error)
591 goto out; 592 goto out;
593 gfs2_holder_init(gl, state, flags, fl_gh);
594 gfs2_glock_put(gl);
592 } 595 }
593
594 gfs2_holder_init(gl, state, flags, fl_gh);
595 gfs2_glock_put(gl);
596
597 error = gfs2_glock_nq(fl_gh); 596 error = gfs2_glock_nq(fl_gh);
598 if (error) { 597 if (error) {
599 gfs2_holder_uninit(fl_gh); 598 gfs2_holder_uninit(fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index cf5aa5050548..17de58e83d92 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -28,18 +28,18 @@
28#include "lm.h" 28#include "lm.h"
29#include "mount.h" 29#include "mount.h"
30#include "ops_fstype.h" 30#include "ops_fstype.h"
31#include "ops_dentry.h"
31#include "ops_super.h" 32#include "ops_super.h"
32#include "recovery.h" 33#include "recovery.h"
33#include "rgrp.h" 34#include "rgrp.h"
34#include "super.h" 35#include "super.h"
35#include "sys.h" 36#include "sys.h"
36#include "util.h" 37#include "util.h"
38#include "log.h"
37 39
38#define DO 0 40#define DO 0
39#define UNDO 1 41#define UNDO 1
40 42
41extern struct dentry_operations gfs2_dops;
42
43static struct gfs2_sbd *init_sbd(struct super_block *sb) 43static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{ 44{
45 struct gfs2_sbd *sdp; 45 struct gfs2_sbd *sdp;
@@ -82,13 +82,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
82 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 82 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
83 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 83 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
84 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 84 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
85 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
85 86
86 mutex_init(&sdp->sd_log_reserve_mutex); 87 mutex_init(&sdp->sd_log_reserve_mutex);
87 INIT_LIST_HEAD(&sdp->sd_ail1_list); 88 INIT_LIST_HEAD(&sdp->sd_ail1_list);
88 INIT_LIST_HEAD(&sdp->sd_ail2_list); 89 INIT_LIST_HEAD(&sdp->sd_ail2_list);
89 90
90 init_rwsem(&sdp->sd_log_flush_lock); 91 init_rwsem(&sdp->sd_log_flush_lock);
91 INIT_LIST_HEAD(&sdp->sd_log_flush_list); 92 atomic_set(&sdp->sd_log_in_flight, 0);
93 init_waitqueue_head(&sdp->sd_log_flush_wait);
92 94
93 INIT_LIST_HEAD(&sdp->sd_revoke_list); 95 INIT_LIST_HEAD(&sdp->sd_revoke_list);
94 96
@@ -145,7 +147,8 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
145 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); 147 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
146 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); 148 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
147 149
148 while ((table = strchr(sdp->sd_table_name, '/'))) 150 table = sdp->sd_table_name;
151 while ((table = strchr(table, '/')))
149 *table = '_'; 152 *table = '_';
150 153
151out: 154out:
@@ -161,14 +164,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
161 if (undo) 164 if (undo)
162 goto fail_trans; 165 goto fail_trans;
163 166
164 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
165 error = IS_ERR(p);
166 if (error) {
167 fs_err(sdp, "can't start scand thread: %d\n", error);
168 return error;
169 }
170 sdp->sd_scand_process = p;
171
172 for (sdp->sd_glockd_num = 0; 167 for (sdp->sd_glockd_num = 0;
173 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; 168 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
174 sdp->sd_glockd_num++) { 169 sdp->sd_glockd_num++) {
@@ -229,14 +224,13 @@ fail:
229 while (sdp->sd_glockd_num--) 224 while (sdp->sd_glockd_num--)
230 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); 225 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
231 226
232 kthread_stop(sdp->sd_scand_process);
233 return error; 227 return error;
234} 228}
235 229
236static inline struct inode *gfs2_lookup_root(struct super_block *sb, 230static inline struct inode *gfs2_lookup_root(struct super_block *sb,
237 u64 no_addr) 231 u64 no_addr)
238{ 232{
239 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); 233 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
240} 234}
241 235
242static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) 236static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
@@ -301,8 +295,9 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
301 fs_err(sdp, "can't get root dentry\n"); 295 fs_err(sdp, "can't get root dentry\n");
302 error = -ENOMEM; 296 error = -ENOMEM;
303 iput(inode); 297 iput(inode);
304 } 298 } else
305 sb->s_root->d_op = &gfs2_dops; 299 sb->s_root->d_op = &gfs2_dops;
300
306out: 301out:
307 gfs2_glock_dq_uninit(&sb_gh); 302 gfs2_glock_dq_uninit(&sb_gh);
308 return error; 303 return error;
@@ -368,7 +363,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
368 363
369 ip = GFS2_I(sdp->sd_jdesc->jd_inode); 364 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
370 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 365 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
371 LM_FLAG_NOEXP | GL_EXACT, 366 LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
372 &sdp->sd_jinode_gh); 367 &sdp->sd_jinode_gh);
373 if (error) { 368 if (error) {
374 fs_err(sdp, "can't acquire journal inode glock: %d\n", 369 fs_err(sdp, "can't acquire journal inode glock: %d\n",
@@ -818,7 +813,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
818 struct nameidata nd; 813 struct nameidata nd;
819 struct file_system_type *fstype; 814 struct file_system_type *fstype;
820 struct super_block *sb = NULL, *s; 815 struct super_block *sb = NULL, *s;
821 struct list_head *l;
822 int error; 816 int error;
823 817
824 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); 818 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
@@ -830,8 +824,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
830 error = vfs_getattr(nd.mnt, nd.dentry, &stat); 824 error = vfs_getattr(nd.mnt, nd.dentry, &stat);
831 825
832 fstype = get_fs_type("gfs2"); 826 fstype = get_fs_type("gfs2");
833 list_for_each(l, &fstype->fs_supers) { 827 list_for_each_entry(s, &fstype->fs_supers, s_instances) {
834 s = list_entry(l, struct super_block, s_instances);
835 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || 828 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
836 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) { 829 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
837 sb = s; 830 sb = s;
@@ -861,7 +854,7 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
861 error = -ENOENT; 854 error = -ENOENT;
862 goto error; 855 goto error;
863 } 856 }
864 sdp = (struct gfs2_sbd*) sb->s_fs_info; 857 sdp = sb->s_fs_info;
865 if (sdp->sd_vfs_meta) { 858 if (sdp->sd_vfs_meta) {
866 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); 859 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
867 error = -EBUSY; 860 error = -EBUSY;
@@ -896,7 +889,10 @@ error:
896 889
897static void gfs2_kill_sb(struct super_block *sb) 890static void gfs2_kill_sb(struct super_block *sb)
898{ 891{
899 gfs2_delete_debugfs_file(sb->s_fs_info); 892 if (sb->s_fs_info) {
893 gfs2_delete_debugfs_file(sb->s_fs_info);
894 gfs2_meta_syncfs(sb->s_fs_info);
895 }
900 kill_block_super(sb); 896 kill_block_super(sb);
901} 897}
902 898
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 911c115b5c6c..291f0c7eaa3b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -69,7 +69,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
69 mark_inode_dirty(inode); 69 mark_inode_dirty(inode);
70 break; 70 break;
71 } else if (PTR_ERR(inode) != -EEXIST || 71 } else if (PTR_ERR(inode) != -EEXIST ||
72 (nd->intent.open.flags & O_EXCL)) { 72 (nd && (nd->intent.open.flags & O_EXCL))) {
73 gfs2_holder_uninit(ghs); 73 gfs2_holder_uninit(ghs);
74 return PTR_ERR(inode); 74 return PTR_ERR(inode);
75 } 75 }
@@ -278,17 +278,25 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
278 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 278 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
279 279
280 280
281 error = gfs2_glock_nq_m(3, ghs); 281 error = gfs2_glock_nq(ghs); /* parent */
282 if (error) 282 if (error)
283 goto out; 283 goto out_parent;
284
285 error = gfs2_glock_nq(ghs + 1); /* child */
286 if (error)
287 goto out_child;
288
289 error = gfs2_glock_nq(ghs + 2); /* rgrp */
290 if (error)
291 goto out_rgrp;
284 292
285 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 293 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
286 if (error) 294 if (error)
287 goto out_gunlock; 295 goto out_rgrp;
288 296
289 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 297 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
290 if (error) 298 if (error)
291 goto out_gunlock; 299 goto out_rgrp;
292 300
293 error = gfs2_dir_del(dip, &dentry->d_name); 301 error = gfs2_dir_del(dip, &dentry->d_name);
294 if (error) 302 if (error)
@@ -298,12 +306,15 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
298 306
299out_end_trans: 307out_end_trans:
300 gfs2_trans_end(sdp); 308 gfs2_trans_end(sdp);
301out_gunlock: 309 gfs2_glock_dq(ghs + 2);
302 gfs2_glock_dq_m(3, ghs); 310out_rgrp:
303out:
304 gfs2_holder_uninit(ghs);
305 gfs2_holder_uninit(ghs + 1);
306 gfs2_holder_uninit(ghs + 2); 311 gfs2_holder_uninit(ghs + 2);
312 gfs2_glock_dq(ghs + 1);
313out_child:
314 gfs2_holder_uninit(ghs + 1);
315 gfs2_glock_dq(ghs);
316out_parent:
317 gfs2_holder_uninit(ghs);
307 gfs2_glock_dq_uninit(&ri_gh); 318 gfs2_glock_dq_uninit(&ri_gh);
308 return error; 319 return error;
309} 320}
@@ -894,12 +905,17 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
894static int setattr_size(struct inode *inode, struct iattr *attr) 905static int setattr_size(struct inode *inode, struct iattr *attr)
895{ 906{
896 struct gfs2_inode *ip = GFS2_I(inode); 907 struct gfs2_inode *ip = GFS2_I(inode);
908 struct gfs2_sbd *sdp = GFS2_SB(inode);
897 int error; 909 int error;
898 910
899 if (attr->ia_size != ip->i_di.di_size) { 911 if (attr->ia_size != ip->i_di.di_size) {
900 error = vmtruncate(inode, attr->ia_size); 912 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
901 if (error) 913 if (error)
902 return error; 914 return error;
915 error = vmtruncate(inode, attr->ia_size);
916 gfs2_trans_end(sdp);
917 if (error)
918 return error;
903 } 919 }
904 920
905 error = gfs2_truncatei(ip, attr->ia_size); 921 error = gfs2_truncatei(ip, attr->ia_size);
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 603d940f1159..950f31460e8b 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -92,7 +92,6 @@ static void gfs2_put_super(struct super_block *sb)
92 kthread_stop(sdp->sd_recoverd_process); 92 kthread_stop(sdp->sd_recoverd_process);
93 while (sdp->sd_glockd_num--) 93 while (sdp->sd_glockd_num--)
94 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); 94 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
95 kthread_stop(sdp->sd_scand_process);
96 95
97 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
98 error = gfs2_make_fs_ro(sdp); 97 error = gfs2_make_fs_ro(sdp);
@@ -456,12 +455,15 @@ static void gfs2_delete_inode(struct inode *inode)
456 } 455 }
457 456
458 error = gfs2_dinode_dealloc(ip); 457 error = gfs2_dinode_dealloc(ip);
459 /* 458 if (error)
460 * Must do this before unlock to avoid trying to write back 459 goto out_unlock;
461 * potentially dirty data now that inode no longer exists 460
462 * on disk. 461 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
463 */ 462 if (error)
463 goto out_unlock;
464 /* Needs to be done before glock release & also in a transaction */
464 truncate_inode_pages(&inode->i_data, 0); 465 truncate_inode_pages(&inode->i_data, 0);
466 gfs2_trans_end(sdp);
465 467
466out_unlock: 468out_unlock:
467 gfs2_glock_dq(&ip->i_iopen_gh); 469 gfs2_glock_dq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6e546ee8f3d4..addb51e0f135 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -70,6 +70,7 @@ struct gfs2_quota_host {
70 u64 qu_limit; 70 u64 qu_limit;
71 u64 qu_warn; 71 u64 qu_warn;
72 s64 qu_value; 72 s64 qu_value;
73 u32 qu_ll_next;
73}; 74};
74 75
75struct gfs2_quota_change_host { 76struct gfs2_quota_change_host {
@@ -580,6 +581,7 @@ static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
580 qu->qu_limit = be64_to_cpu(str->qu_limit); 581 qu->qu_limit = be64_to_cpu(str->qu_limit);
581 qu->qu_warn = be64_to_cpu(str->qu_warn); 582 qu->qu_warn = be64_to_cpu(str->qu_warn);
582 qu->qu_value = be64_to_cpu(str->qu_value); 583 qu->qu_value = be64_to_cpu(str->qu_value);
584 qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
583} 585}
584 586
585static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf) 587static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
@@ -589,6 +591,7 @@ static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
589 str->qu_limit = cpu_to_be64(qu->qu_limit); 591 str->qu_limit = cpu_to_be64(qu->qu_limit);
590 str->qu_warn = cpu_to_be64(qu->qu_warn); 592 str->qu_warn = cpu_to_be64(qu->qu_warn);
591 str->qu_value = cpu_to_be64(qu->qu_value); 593 str->qu_value = cpu_to_be64(qu->qu_value);
594 str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
592 memset(&str->qu_reserved, 0, sizeof(str->qu_reserved)); 595 memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
593} 596}
594 597
@@ -614,6 +617,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
614 s64 value; 617 s64 value;
615 int err = -EIO; 618 int err = -EIO;
616 619
620 if (gfs2_is_stuffed(ip)) {
621 struct gfs2_alloc *al = NULL;
622 al = gfs2_alloc_get(ip);
623 /* just request 1 blk */
624 al->al_requested = 1;
625 gfs2_inplace_reserve(ip);
626 gfs2_unstuff_dinode(ip, NULL);
627 gfs2_inplace_release(ip);
628 gfs2_alloc_put(ip);
629 }
617 page = grab_cache_page(mapping, index); 630 page = grab_cache_page(mapping, index);
618 if (!page) 631 if (!page)
619 return -ENOMEM; 632 return -ENOMEM;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 5ada38c99a2c..beb6c7ac0086 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -469,7 +469,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
469 }; 469 };
470 470
471 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 471 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
472 LM_FLAG_NOEXP, &ji_gh); 472 LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh);
473 if (error) 473 if (error)
474 goto fail_gunlock_j; 474 goto fail_gunlock_j;
475 } else { 475 } else {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ce48c4594ec8..708c287e1d0e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -31,6 +31,7 @@
31#include "inode.h" 31#include "inode.h"
32 32
33#define BFITNOENT ((u32)~0) 33#define BFITNOENT ((u32)~0)
34#define NO_BLOCK ((u64)~0)
34 35
35/* 36/*
36 * These routines are used by the resource group routines (rgrp.c) 37 * These routines are used by the resource group routines (rgrp.c)
@@ -116,8 +117,7 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
116 * @buffer: the buffer that holds the bitmaps 117 * @buffer: the buffer that holds the bitmaps
117 * @buflen: the length (in bytes) of the buffer 118 * @buflen: the length (in bytes) of the buffer
118 * @goal: start search at this block's bit-pair (within @buffer) 119 * @goal: start search at this block's bit-pair (within @buffer)
119 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for; 120 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for.
120 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
121 * 121 *
122 * Scope of @goal and returned block number is only within this bitmap buffer, 122 * Scope of @goal and returned block number is only within this bitmap buffer,
123 * not entire rgrp or filesystem. @buffer will be offset from the actual 123 * not entire rgrp or filesystem. @buffer will be offset from the actual
@@ -137,9 +137,13 @@ static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
137 byte = buffer + (goal / GFS2_NBBY); 137 byte = buffer + (goal / GFS2_NBBY);
138 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; 138 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
139 end = buffer + buflen; 139 end = buffer + buflen;
140 alloc = (old_state & 1) ? 0 : 0x55; 140 alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0;
141 141
142 while (byte < end) { 142 while (byte < end) {
143 /* If we're looking for a free block we can eliminate all
144 bitmap settings with 0x55, which represents four data
145 blocks in a row. If we're looking for a data block, we can
146 eliminate 0x00 which corresponds to four free blocks. */
143 if ((*byte & 0x55) == alloc) { 147 if ((*byte & 0x55) == alloc) {
144 blk += (8 - bit) >> 1; 148 blk += (8 - bit) >> 1;
145 149
@@ -859,23 +863,28 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
859static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) 863static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
860{ 864{
861 struct inode *inode; 865 struct inode *inode;
862 u32 goal = 0; 866 u32 goal = 0, block;
863 u64 no_addr; 867 u64 no_addr;
868 struct gfs2_sbd *sdp = rgd->rd_sbd;
864 869
865 for(;;) { 870 for(;;) {
866 if (goal >= rgd->rd_data) 871 if (goal >= rgd->rd_data)
867 break; 872 break;
868 goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 873 down_write(&sdp->sd_log_flush_lock);
869 GFS2_BLKST_UNLINKED); 874 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
870 if (goal == BFITNOENT) 875 GFS2_BLKST_UNLINKED);
876 up_write(&sdp->sd_log_flush_lock);
877 if (block == BFITNOENT)
871 break; 878 break;
872 no_addr = goal + rgd->rd_data0; 879 /* rgblk_search can return a block < goal, so we need to
880 keep it marching forward. */
881 no_addr = block + rgd->rd_data0;
873 goal++; 882 goal++;
874 if (no_addr < *last_unlinked) 883 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
875 continue; 884 continue;
876 *last_unlinked = no_addr; 885 *last_unlinked = no_addr;
877 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 886 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
878 no_addr, -1); 887 no_addr, -1, 1);
879 if (!IS_ERR(inode)) 888 if (!IS_ERR(inode))
880 return inode; 889 return inode;
881 } 890 }
@@ -1152,7 +1161,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1152 struct gfs2_alloc *al = &ip->i_alloc; 1161 struct gfs2_alloc *al = &ip->i_alloc;
1153 struct inode *inode; 1162 struct inode *inode;
1154 int error = 0; 1163 int error = 0;
1155 u64 last_unlinked = 0; 1164 u64 last_unlinked = NO_BLOCK;
1156 1165
1157 if (gfs2_assert_warn(sdp, al->al_requested)) 1166 if (gfs2_assert_warn(sdp, al->al_requested))
1158 return -EINVAL; 1167 return -EINVAL;
@@ -1289,7 +1298,9 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1289 allocatable block anywhere else, we want to be able wrap around and 1298 allocatable block anywhere else, we want to be able wrap around and
1290 search in the first part of our first-searched bit block. */ 1299 search in the first part of our first-searched bit block. */
1291 for (x = 0; x <= length; x++) { 1300 for (x = 0; x <= length; x++) {
1292 if (bi->bi_clone) 1301 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1302 bitmaps, so we must search the originals for that. */
1303 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1293 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset, 1304 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1294 bi->bi_len, goal, old_state); 1305 bi->bi_len, goal, old_state);
1295 else 1306 else
@@ -1305,9 +1316,7 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1305 goal = 0; 1316 goal = 0;
1306 } 1317 }
1307 1318
1308 if (old_state != new_state) { 1319 if (blk != BFITNOENT && old_state != new_state) {
1309 gfs2_assert_withdraw(rgd->rd_sbd, blk != BFITNOENT);
1310
1311 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1320 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1312 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, 1321 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1313 bi->bi_len, blk, new_state); 1322 bi->bi_len, blk, new_state);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index a2da76b5ae4c..dd3e737f528e 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -58,7 +58,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
58 gt->gt_incore_log_blocks = 1024; 58 gt->gt_incore_log_blocks = 1024;
59 gt->gt_log_flush_secs = 60; 59 gt->gt_log_flush_secs = 60;
60 gt->gt_jindex_refresh_secs = 60; 60 gt->gt_jindex_refresh_secs = 60;
61 gt->gt_scand_secs = 15;
62 gt->gt_recoverd_secs = 60; 61 gt->gt_recoverd_secs = 60;
63 gt->gt_logd_secs = 1; 62 gt->gt_logd_secs = 1;
64 gt->gt_quotad_secs = 5; 63 gt->gt_quotad_secs = 5;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 640cb6a6fc4c..06e0b7768d97 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -441,7 +441,6 @@ TUNE_ATTR(quota_simul_sync, 1);
441TUNE_ATTR(quota_cache_secs, 1); 441TUNE_ATTR(quota_cache_secs, 1);
442TUNE_ATTR(stall_secs, 1); 442TUNE_ATTR(stall_secs, 1);
443TUNE_ATTR(statfs_quantum, 1); 443TUNE_ATTR(statfs_quantum, 1);
444TUNE_ATTR_DAEMON(scand_secs, scand_process);
445TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 444TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
446TUNE_ATTR_DAEMON(logd_secs, logd_process); 445TUNE_ATTR_DAEMON(logd_secs, logd_process);
447TUNE_ATTR_DAEMON(quotad_secs, quotad_process); 446TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
@@ -463,7 +462,6 @@ static struct attribute *tune_attrs[] = {
463 &tune_attr_quota_cache_secs.attr, 462 &tune_attr_quota_cache_secs.attr,
464 &tune_attr_stall_secs.attr, 463 &tune_attr_stall_secs.attr,
465 &tune_attr_statfs_quantum.attr, 464 &tune_attr_statfs_quantum.attr,
466 &tune_attr_scand_secs.attr,
467 &tune_attr_recoverd_secs.attr, 465 &tune_attr_recoverd_secs.attr,
468 &tune_attr_logd_secs.attr, 466 &tune_attr_logd_secs.attr,
469 &tune_attr_quotad_secs.attr, 467 &tune_attr_quotad_secs.attr,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index f8dabf8446bb..717983e2c2ae 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -142,25 +142,25 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
142 lops_add(sdp, &bd->bd_le); 142 lops_add(sdp, &bd->bd_le);
143} 143}
144 144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno) 145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
146{ 146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke), 147 BUG_ON(!list_empty(&bd->bd_le.le_list));
148 GFP_NOFS | __GFP_NOFAIL); 148 BUG_ON(!list_empty(&bd->bd_ail_st_list));
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops); 149 BUG_ON(!list_empty(&bd->bd_ail_gl_list));
150 rv->rv_blkno = blkno; 150 lops_init_le(&bd->bd_le, &gfs2_revoke_lops);
151 lops_add(sdp, &rv->rv_le); 151 lops_add(sdp, &bd->bd_le);
152} 152}
153 153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno) 154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
155{ 155{
156 struct gfs2_revoke *rv; 156 struct gfs2_bufdata *bd;
157 int found = 0; 157 int found = 0;
158 158
159 gfs2_log_lock(sdp); 159 gfs2_log_lock(sdp);
160 160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) { 161 list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) {
162 if (rv->rv_blkno == blkno) { 162 if (bd->bd_blkno == blkno) {
163 list_del(&rv->rv_le.le_list); 163 list_del_init(&bd->bd_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); 164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--; 165 sdp->sd_log_num_revoke--;
166 found = 1; 166 found = 1;
@@ -172,7 +172,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
172 172
173 if (found) { 173 if (found) {
174 struct gfs2_trans *tr = current->journal_info; 174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv); 175 kmem_cache_free(gfs2_bufdata_cachep, bd);
176 tr->tr_num_revoke_rm++; 176 tr->tr_num_revoke_rm++;
177 } 177 }
178} 178}
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 23d4cbe1de5b..043d5f4b9c4c 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp);
32 32
33void gfs2_trans_add_gl(struct gfs2_glock *gl); 33void gfs2_trans_add_gl(struct gfs2_glock *gl);
34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); 34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno); 35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); 36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
37void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); 37void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
38 38
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index af4ef808fa94..345798ebd366 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -17,6 +17,18 @@ ToDo/Notes:
17 happen is unclear however so it is worth waiting until someone hits 17 happen is unclear however so it is worth waiting until someone hits
18 the problem. 18 the problem.
19 19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
 26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
202.1.28 - Fix a deadlock. 322.1.28 - Fix a deadlock.
21 33
22 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey 34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 825508385565..58b6be992544 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.28\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 6e5c2534f4bc..cfdc7900d271 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
2 * aops.c - NTFS kernel address space operations and page cache handling. 2 * aops.c - NTFS kernel address space operations and page cache handling.
3 * Part of the Linux-NTFS project. 3 * Part of the Linux-NTFS project.
4 * 4 *
5 * Copyright (c) 2001-2006 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon 6 * Copyright (c) 2002 Richard Russon
7 * 7 *
8 * This program/include file is free software; you can redistribute it and/or 8 * This program/include file is free software; you can redistribute it and/or
@@ -396,7 +396,7 @@ static int ntfs_readpage(struct file *file, struct page *page)
396 loff_t i_size; 396 loff_t i_size;
397 struct inode *vi; 397 struct inode *vi;
398 ntfs_inode *ni, *base_ni; 398 ntfs_inode *ni, *base_ni;
399 u8 *kaddr; 399 u8 *addr;
400 ntfs_attr_search_ctx *ctx; 400 ntfs_attr_search_ctx *ctx;
401 MFT_RECORD *mrec; 401 MFT_RECORD *mrec;
402 unsigned long flags; 402 unsigned long flags;
@@ -491,15 +491,15 @@ retry_readpage:
491 /* Race with shrinking truncate. */ 491 /* Race with shrinking truncate. */
492 attr_len = i_size; 492 attr_len = i_size;
493 } 493 }
494 kaddr = kmap_atomic(page, KM_USER0); 494 addr = kmap_atomic(page, KM_USER0);
495 /* Copy the data to the page. */ 495 /* Copy the data to the page. */
496 memcpy(kaddr, (u8*)ctx->attr + 496 memcpy(addr, (u8*)ctx->attr +
497 le16_to_cpu(ctx->attr->data.resident.value_offset), 497 le16_to_cpu(ctx->attr->data.resident.value_offset),
498 attr_len); 498 attr_len);
499 /* Zero the remainder of the page. */ 499 /* Zero the remainder of the page. */
500 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); 500 memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
501 flush_dcache_page(page); 501 flush_dcache_page(page);
502 kunmap_atomic(kaddr, KM_USER0); 502 kunmap_atomic(addr, KM_USER0);
503put_unm_err_out: 503put_unm_err_out:
504 ntfs_attr_put_search_ctx(ctx); 504 ntfs_attr_put_search_ctx(ctx);
505unm_err_out: 505unm_err_out:
@@ -1344,7 +1344,7 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
1344 loff_t i_size; 1344 loff_t i_size;
1345 struct inode *vi = page->mapping->host; 1345 struct inode *vi = page->mapping->host;
1346 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); 1346 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1347 char *kaddr; 1347 char *addr;
1348 ntfs_attr_search_ctx *ctx = NULL; 1348 ntfs_attr_search_ctx *ctx = NULL;
1349 MFT_RECORD *m = NULL; 1349 MFT_RECORD *m = NULL;
1350 u32 attr_len; 1350 u32 attr_len;
@@ -1484,14 +1484,14 @@ retry_writepage:
1484 /* Shrinking cannot fail. */ 1484 /* Shrinking cannot fail. */
1485 BUG_ON(err); 1485 BUG_ON(err);
1486 } 1486 }
1487 kaddr = kmap_atomic(page, KM_USER0); 1487 addr = kmap_atomic(page, KM_USER0);
1488 /* Copy the data from the page to the mft record. */ 1488 /* Copy the data from the page to the mft record. */
1489 memcpy((u8*)ctx->attr + 1489 memcpy((u8*)ctx->attr +
1490 le16_to_cpu(ctx->attr->data.resident.value_offset), 1490 le16_to_cpu(ctx->attr->data.resident.value_offset),
1491 kaddr, attr_len); 1491 addr, attr_len);
1492 /* Zero out of bounds area in the page cache page. */ 1492 /* Zero out of bounds area in the page cache page. */
1493 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); 1493 memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1494 kunmap_atomic(kaddr, KM_USER0); 1494 kunmap_atomic(addr, KM_USER0);
1495 flush_dcache_page(page); 1495 flush_dcache_page(page);
1496 flush_dcache_mft_record_page(ctx->ntfs_ino); 1496 flush_dcache_mft_record_page(ctx->ntfs_ino);
1497 /* We are done with the page. */ 1497 /* We are done with the page. */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1c08fefe487a..92dabdcf2b80 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. 2 * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2007 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2500,7 +2500,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2500 struct page *page; 2500 struct page *page;
2501 u8 *kaddr; 2501 u8 *kaddr;
2502 pgoff_t idx, end; 2502 pgoff_t idx, end;
2503 unsigned int start_ofs, end_ofs, size; 2503 unsigned start_ofs, end_ofs, size;
2504 2504
2505 ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", 2505 ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
2506 (long long)ofs, (long long)cnt, val); 2506 (long long)ofs, (long long)cnt, val);
@@ -2548,6 +2548,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2548 kunmap_atomic(kaddr, KM_USER0); 2548 kunmap_atomic(kaddr, KM_USER0);
2549 set_page_dirty(page); 2549 set_page_dirty(page);
2550 page_cache_release(page); 2550 page_cache_release(page);
2551 balance_dirty_pages_ratelimited(mapping);
2552 cond_resched();
2551 if (idx == end) 2553 if (idx == end)
2552 goto done; 2554 goto done;
2553 idx++; 2555 idx++;
@@ -2604,6 +2606,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2604 kunmap_atomic(kaddr, KM_USER0); 2606 kunmap_atomic(kaddr, KM_USER0);
2605 set_page_dirty(page); 2607 set_page_dirty(page);
2606 page_cache_release(page); 2608 page_cache_release(page);
2609 balance_dirty_pages_ratelimited(mapping);
2610 cond_resched();
2607 } 2611 }
2608done: 2612done:
2609 ntfs_debug("Done."); 2613 ntfs_debug("Done.");
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ffcc504a1667..c814204d4ea0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2007 Anton Altaparmakov
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -26,7 +26,6 @@
26#include <linux/swap.h> 26#include <linux/swap.h>
27#include <linux/uio.h> 27#include <linux/uio.h>
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/sched.h>
30 29
31#include <asm/page.h> 30#include <asm/page.h>
32#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -362,7 +361,7 @@ static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
362 volatile char c; 361 volatile char c;
363 362
364 /* Set @end to the first byte outside the last page we care about. */ 363 /* Set @end to the first byte outside the last page we care about. */
365 end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes); 364 end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
366 365
367 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end)) 366 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
368 ; 367 ;
@@ -532,7 +531,8 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
532 blocksize_bits = vol->sb->s_blocksize_bits; 531 blocksize_bits = vol->sb->s_blocksize_bits;
533 u = 0; 532 u = 0;
534 do { 533 do {
535 struct page *page = pages[u]; 534 page = pages[u];
535 BUG_ON(!page);
536 /* 536 /*
537 * create_empty_buffers() will create uptodate/dirty buffers if 537 * create_empty_buffers() will create uptodate/dirty buffers if
538 * the page is uptodate/dirty. 538 * the page is uptodate/dirty.
@@ -1291,7 +1291,7 @@ static inline size_t ntfs_copy_from_user(struct page **pages,
1291 size_t bytes) 1291 size_t bytes)
1292{ 1292{
1293 struct page **last_page = pages + nr_pages; 1293 struct page **last_page = pages + nr_pages;
1294 char *kaddr; 1294 char *addr;
1295 size_t total = 0; 1295 size_t total = 0;
1296 unsigned len; 1296 unsigned len;
1297 int left; 1297 int left;
@@ -1300,13 +1300,13 @@ static inline size_t ntfs_copy_from_user(struct page **pages,
1300 len = PAGE_CACHE_SIZE - ofs; 1300 len = PAGE_CACHE_SIZE - ofs;
1301 if (len > bytes) 1301 if (len > bytes)
1302 len = bytes; 1302 len = bytes;
1303 kaddr = kmap_atomic(*pages, KM_USER0); 1303 addr = kmap_atomic(*pages, KM_USER0);
1304 left = __copy_from_user_inatomic(kaddr + ofs, buf, len); 1304 left = __copy_from_user_inatomic(addr + ofs, buf, len);
1305 kunmap_atomic(kaddr, KM_USER0); 1305 kunmap_atomic(addr, KM_USER0);
1306 if (unlikely(left)) { 1306 if (unlikely(left)) {
1307 /* Do it the slow way. */ 1307 /* Do it the slow way. */
1308 kaddr = kmap(*pages); 1308 addr = kmap(*pages);
1309 left = __copy_from_user(kaddr + ofs, buf, len); 1309 left = __copy_from_user(addr + ofs, buf, len);
1310 kunmap(*pages); 1310 kunmap(*pages);
1311 if (unlikely(left)) 1311 if (unlikely(left))
1312 goto err_out; 1312 goto err_out;
@@ -1408,26 +1408,26 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1408 size_t *iov_ofs, size_t bytes) 1408 size_t *iov_ofs, size_t bytes)
1409{ 1409{
1410 struct page **last_page = pages + nr_pages; 1410 struct page **last_page = pages + nr_pages;
1411 char *kaddr; 1411 char *addr;
1412 size_t copied, len, total = 0; 1412 size_t copied, len, total = 0;
1413 1413
1414 do { 1414 do {
1415 len = PAGE_CACHE_SIZE - ofs; 1415 len = PAGE_CACHE_SIZE - ofs;
1416 if (len > bytes) 1416 if (len > bytes)
1417 len = bytes; 1417 len = bytes;
1418 kaddr = kmap_atomic(*pages, KM_USER0); 1418 addr = kmap_atomic(*pages, KM_USER0);
1419 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs, 1419 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1420 *iov, *iov_ofs, len); 1420 *iov, *iov_ofs, len);
1421 kunmap_atomic(kaddr, KM_USER0); 1421 kunmap_atomic(addr, KM_USER0);
1422 if (unlikely(copied != len)) { 1422 if (unlikely(copied != len)) {
1423 /* Do it the slow way. */ 1423 /* Do it the slow way. */
1424 kaddr = kmap(*pages); 1424 addr = kmap(*pages);
1425 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs, 1425 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1426 *iov, *iov_ofs, len); 1426 *iov, *iov_ofs, len);
1427 /* 1427 /*
1428 * Zero the rest of the target like __copy_from_user(). 1428 * Zero the rest of the target like __copy_from_user().
1429 */ 1429 */
1430 memset(kaddr + ofs + copied, 0, len - copied); 1430 memset(addr + ofs + copied, 0, len - copied);
1431 kunmap(*pages); 1431 kunmap(*pages);
1432 if (unlikely(copied != len)) 1432 if (unlikely(copied != len))
1433 goto err_out; 1433 goto err_out;
@@ -1735,8 +1735,6 @@ static int ntfs_commit_pages_after_write(struct page **pages,
1735 read_unlock_irqrestore(&ni->size_lock, flags); 1735 read_unlock_irqrestore(&ni->size_lock, flags);
1736 BUG_ON(initialized_size != i_size); 1736 BUG_ON(initialized_size != i_size);
1737 if (end > initialized_size) { 1737 if (end > initialized_size) {
1738 unsigned long flags;
1739
1740 write_lock_irqsave(&ni->size_lock, flags); 1738 write_lock_irqsave(&ni->size_lock, flags);
1741 ni->initialized_size = end; 1739 ni->initialized_size = end;
1742 i_size_write(vi, end); 1740 i_size_write(vi, end);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index b532a730cec2..e9da092e2772 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -34,7 +34,6 @@
34#include "dir.h" 34#include "dir.h"
35#include "debug.h" 35#include "debug.h"
36#include "inode.h" 36#include "inode.h"
37#include "attrib.h"
38#include "lcnalloc.h" 37#include "lcnalloc.h"
39#include "malloc.h" 38#include "malloc.h"
40#include "mft.h" 39#include "mft.h"
@@ -2500,8 +2499,6 @@ retry_truncate:
2500 /* Resize the attribute record to best fit the new attribute size. */ 2499 /* Resize the attribute record to best fit the new attribute size. */
2501 if (new_size < vol->mft_record_size && 2500 if (new_size < vol->mft_record_size &&
2502 !ntfs_resident_attr_value_resize(m, a, new_size)) { 2501 !ntfs_resident_attr_value_resize(m, a, new_size)) {
2503 unsigned long flags;
2504
2505 /* The resize succeeded! */ 2502 /* The resize succeeded! */
2506 flush_dcache_mft_record_page(ctx->ntfs_ino); 2503 flush_dcache_mft_record_page(ctx->ntfs_ino);
2507 mark_mft_record_dirty(ctx->ntfs_ino); 2504 mark_mft_record_dirty(ctx->ntfs_ino);
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index acfed325f4ec..d7932e95b1fd 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. 2 * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2002-2005 Anton Altaparmakov 4 * Copyright (c) 2002-2007 Anton Altaparmakov
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -724,24 +724,139 @@ bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
724 */ 724 */
725bool ntfs_empty_logfile(struct inode *log_vi) 725bool ntfs_empty_logfile(struct inode *log_vi)
726{ 726{
727 ntfs_volume *vol = NTFS_SB(log_vi->i_sb); 727 VCN vcn, end_vcn;
728 ntfs_inode *log_ni = NTFS_I(log_vi);
729 ntfs_volume *vol = log_ni->vol;
730 struct super_block *sb = vol->sb;
731 runlist_element *rl;
732 unsigned long flags;
733 unsigned block_size, block_size_bits;
734 int err;
735 bool should_wait = true;
728 736
729 ntfs_debug("Entering."); 737 ntfs_debug("Entering.");
730 if (!NVolLogFileEmpty(vol)) { 738 if (NVolLogFileEmpty(vol)) {
731 int err; 739 ntfs_debug("Done.");
732 740 return true;
733 err = ntfs_attr_set(NTFS_I(log_vi), 0, i_size_read(log_vi),
734 0xff);
735 if (unlikely(err)) {
736 ntfs_error(vol->sb, "Failed to fill $LogFile with "
737 "0xff bytes (error code %i).", err);
738 return false;
739 }
740 /* Set the flag so we do not have to do it again on remount. */
741 NVolSetLogFileEmpty(vol);
742 } 741 }
742 /*
743 * We cannot use ntfs_attr_set() because we may be still in the middle
744 * of a mount operation. Thus we do the emptying by hand by first
745 * zapping the page cache pages for the $LogFile/$DATA attribute and
746 * then emptying each of the buffers in each of the clusters specified
747 * by the runlist by hand.
748 */
749 block_size = sb->s_blocksize;
750 block_size_bits = sb->s_blocksize_bits;
751 vcn = 0;
752 read_lock_irqsave(&log_ni->size_lock, flags);
753 end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
754 vol->cluster_size_bits;
755 read_unlock_irqrestore(&log_ni->size_lock, flags);
756 truncate_inode_pages(log_vi->i_mapping, 0);
757 down_write(&log_ni->runlist.lock);
758 rl = log_ni->runlist.rl;
759 if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
760map_vcn:
761 err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
762 if (err) {
763 ntfs_error(sb, "Failed to map runlist fragment (error "
764 "%d).", -err);
765 goto err;
766 }
767 rl = log_ni->runlist.rl;
768 BUG_ON(!rl || vcn < rl->vcn || !rl->length);
769 }
770 /* Seek to the runlist element containing @vcn. */
771 while (rl->length && vcn >= rl[1].vcn)
772 rl++;
773 do {
774 LCN lcn;
775 sector_t block, end_block;
776 s64 len;
777
778 /*
779 * If this run is not mapped map it now and start again as the
780 * runlist will have been updated.
781 */
782 lcn = rl->lcn;
783 if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
784 vcn = rl->vcn;
785 goto map_vcn;
786 }
787 /* If this run is not valid abort with an error. */
788 if (unlikely(!rl->length || lcn < LCN_HOLE))
789 goto rl_err;
790 /* Skip holes. */
791 if (lcn == LCN_HOLE)
792 continue;
793 block = lcn << vol->cluster_size_bits >> block_size_bits;
794 len = rl->length;
795 if (rl[1].vcn > end_vcn)
796 len = end_vcn - rl->vcn;
797 end_block = (lcn + len) << vol->cluster_size_bits >>
798 block_size_bits;
799 /* Iterate over the blocks in the run and empty them. */
800 do {
801 struct buffer_head *bh;
802
803 /* Obtain the buffer, possibly not uptodate. */
804 bh = sb_getblk(sb, block);
805 BUG_ON(!bh);
806 /* Setup buffer i/o submission. */
807 lock_buffer(bh);
808 bh->b_end_io = end_buffer_write_sync;
809 get_bh(bh);
810 /* Set the entire contents of the buffer to 0xff. */
811 memset(bh->b_data, -1, block_size);
812 if (!buffer_uptodate(bh))
813 set_buffer_uptodate(bh);
814 if (buffer_dirty(bh))
815 clear_buffer_dirty(bh);
816 /*
817 * Submit the buffer and wait for i/o to complete but
818 * only for the first buffer so we do not miss really
819 * serious i/o errors. Once the first buffer has
820 * completed ignore errors afterwards as we can assume
821 * that if one buffer worked all of them will work.
822 */
823 submit_bh(WRITE, bh);
824 if (should_wait) {
825 should_wait = false;
826 wait_on_buffer(bh);
827 if (unlikely(!buffer_uptodate(bh)))
828 goto io_err;
829 }
830 brelse(bh);
831 } while (++block < end_block);
832 } while ((++rl)->vcn < end_vcn);
833 up_write(&log_ni->runlist.lock);
834 /*
835 * Zap the pages again just in case any got instantiated whilst we were
836 * emptying the blocks by hand. FIXME: We may not have completed
837 * writing to all the buffer heads yet so this may happen too early.
838 * We really should use a kernel thread to do the emptying
839 * asynchronously and then we can also set the volume dirty and output
840 * an error message if emptying should fail.
841 */
842 truncate_inode_pages(log_vi->i_mapping, 0);
843 /* Set the flag so we do not have to do it again on remount. */
844 NVolSetLogFileEmpty(vol);
743 ntfs_debug("Done."); 845 ntfs_debug("Done.");
744 return true; 846 return true;
847io_err:
848 ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk.");
849 goto dirty_err;
850rl_err:
851 ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk.");
852dirty_err:
853 NVolSetErrors(vol);
854 err = -EIO;
855err:
856 up_write(&log_ni->runlist.lock);
857 ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
858 -err);
859 return false;
745} 860}
746 861
747#endif /* NTFS_RW */ 862#endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 9afd72c7ad0d..56a9a6d25a2a 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. 2 * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2005 Anton Altaparmakov 4 * Copyright (c) 2001-2007 Anton Altaparmakov
5 * Copyright (c) 2002-2005 Richard Russon 5 * Copyright (c) 2002-2005 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -1714,7 +1714,7 @@ extend_hole:
1714 sizeof(*rl)); 1714 sizeof(*rl));
1715 /* Adjust the beginning of the tail if necessary. */ 1715 /* Adjust the beginning of the tail if necessary. */
1716 if (end > rl->vcn) { 1716 if (end > rl->vcn) {
1717 s64 delta = end - rl->vcn; 1717 delta = end - rl->vcn;
1718 rl->vcn = end; 1718 rl->vcn = end;
1719 rl->length -= delta; 1719 rl->length -= delta;
1720 /* Only adjust the lcn if it is real. */ 1720 /* Only adjust the lcn if it is real. */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 778a850b4634..4ba7f0bdc248 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -354,7 +354,6 @@ struct ocfs2_insert_type {
354 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
355 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
356 int ins_contig_index; 356 int ins_contig_index;
357 int ins_free_records;
358 int ins_tree_depth; 357 int ins_tree_depth;
359}; 358};
360 359
@@ -362,7 +361,6 @@ struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type; 361 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent; 362 int c_has_empty_extent;
364 int c_split_covers_rec; 363 int c_split_covers_rec;
365 int c_used_tail_recs;
366}; 364};
367 365
368/* 366/*
@@ -2808,36 +2806,28 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2808 struct ocfs2_merge_ctxt *ctxt) 2806 struct ocfs2_merge_ctxt *ctxt)
2809 2807
2810{ 2808{
2811 int ret = 0, delete_tail_recs = 0; 2809 int ret = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path); 2810 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 2811 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814 2812
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 2813 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816 2814
2817 if (ctxt->c_split_covers_rec) { 2815 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
2818 delete_tail_recs++; 2816 /*
2819 2817 * The merge code will need to create an empty
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT || 2818 * extent to take the place of the newly
2821 ctxt->c_has_empty_extent) 2819 * emptied slot. Remove any pre-existing empty
2822 delete_tail_recs++; 2820 * extents - having more than one in a leaf is
2823 2821 * illegal.
2824 if (ctxt->c_has_empty_extent) { 2822 */
2825 /* 2823 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2826 * The merge code will need to create an empty 2824 dealloc);
2827 * extent to take the place of the newly 2825 if (ret) {
2828 * emptied slot. Remove any pre-existing empty 2826 mlog_errno(ret);
2829 * extents - having more than one in a leaf is 2827 goto out;
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 } 2828 }
2829 split_index--;
2830 rec = &el->l_recs[split_index];
2841 } 2831 }
2842 2832
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { 2833 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
@@ -3593,6 +3583,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
3593 struct buffer_head *di_bh, 3583 struct buffer_head *di_bh,
3594 struct buffer_head **last_eb_bh, 3584 struct buffer_head **last_eb_bh,
3595 struct ocfs2_extent_rec *insert_rec, 3585 struct ocfs2_extent_rec *insert_rec,
3586 int *free_records,
3596 struct ocfs2_insert_type *insert) 3587 struct ocfs2_insert_type *insert)
3597{ 3588{
3598 int ret; 3589 int ret;
@@ -3633,7 +3624,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
3633 * XXX: This test is simplistic, we can search for empty 3624 * XXX: This test is simplistic, we can search for empty
3634 * extent records too. 3625 * extent records too.
3635 */ 3626 */
3636 insert->ins_free_records = le16_to_cpu(el->l_count) - 3627 *free_records = le16_to_cpu(el->l_count) -
3637 le16_to_cpu(el->l_next_free_rec); 3628 le16_to_cpu(el->l_next_free_rec);
3638 3629
3639 if (!insert->ins_tree_depth) { 3630 if (!insert->ins_tree_depth) {
@@ -3730,10 +3721,13 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
3730 struct ocfs2_alloc_context *meta_ac) 3721 struct ocfs2_alloc_context *meta_ac)
3731{ 3722{
3732 int status; 3723 int status;
3724 int uninitialized_var(free_records);
3733 struct buffer_head *last_eb_bh = NULL; 3725 struct buffer_head *last_eb_bh = NULL;
3734 struct ocfs2_insert_type insert = {0, }; 3726 struct ocfs2_insert_type insert = {0, };
3735 struct ocfs2_extent_rec rec; 3727 struct ocfs2_extent_rec rec;
3736 3728
3729 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
3730
3737 mlog(0, "add %u clusters at position %u to inode %llu\n", 3731 mlog(0, "add %u clusters at position %u to inode %llu\n",
3738 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 3732 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3739 3733
@@ -3752,7 +3746,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
3752 rec.e_flags = flags; 3746 rec.e_flags = flags;
3753 3747
3754 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 3748 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
3755 &insert); 3749 &free_records, &insert);
3756 if (status < 0) { 3750 if (status < 0) {
3757 mlog_errno(status); 3751 mlog_errno(status);
3758 goto bail; 3752 goto bail;
@@ -3762,9 +3756,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
3762 "Insert.contig_index: %d, Insert.free_records: %d, " 3756 "Insert.contig_index: %d, Insert.free_records: %d, "
3763 "Insert.tree_depth: %d\n", 3757 "Insert.tree_depth: %d\n",
3764 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 3758 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
3765 insert.ins_free_records, insert.ins_tree_depth); 3759 free_records, insert.ins_tree_depth);
3766 3760
3767 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) { 3761 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
3768 status = ocfs2_grow_tree(inode, handle, fe_bh, 3762 status = ocfs2_grow_tree(inode, handle, fe_bh,
3769 &insert.ins_tree_depth, &last_eb_bh, 3763 &insert.ins_tree_depth, &last_eb_bh,
3770 meta_ac); 3764 meta_ac);
@@ -3847,26 +3841,17 @@ leftright:
3847 3841
3848 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 3842 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3849 le16_to_cpu(rightmost_el->l_count)) { 3843 le16_to_cpu(rightmost_el->l_count)) {
3850 int old_depth = depth;
3851
3852 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, 3844 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3853 meta_ac); 3845 meta_ac);
3854 if (ret) { 3846 if (ret) {
3855 mlog_errno(ret); 3847 mlog_errno(ret);
3856 goto out; 3848 goto out;
3857 } 3849 }
3858
3859 if (old_depth != depth) {
3860 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3861 rightmost_el = &eb->h_list;
3862 }
3863 } 3850 }
3864 3851
3865 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 3852 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3866 insert.ins_appending = APPEND_NONE; 3853 insert.ins_appending = APPEND_NONE;
3867 insert.ins_contig = CONTIG_NONE; 3854 insert.ins_contig = CONTIG_NONE;
3868 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3869 - le16_to_cpu(rightmost_el->l_next_free_rec);
3870 insert.ins_tree_depth = depth; 3855 insert.ins_tree_depth = depth;
3871 3856
3872 insert_range = le32_to_cpu(split_rec.e_cpos) + 3857 insert_range = le32_to_cpu(split_rec.e_cpos) +
@@ -4015,11 +4000,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4015 } else 4000 } else
4016 rightmost_el = path_root_el(path); 4001 rightmost_el = path_root_el(path);
4017 4002
4018 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4019 if (ctxt.c_used_tail_recs > 0 &&
4020 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4021 ctxt.c_used_tail_recs--;
4022
4023 if (rec->e_cpos == split_rec->e_cpos && 4003 if (rec->e_cpos == split_rec->e_cpos &&
4024 rec->e_leaf_clusters == split_rec->e_leaf_clusters) 4004 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4025 ctxt.c_split_covers_rec = 1; 4005 ctxt.c_split_covers_rec = 1;
@@ -4028,10 +4008,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4028 4008
4029 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 4009 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4030 4010
4031 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, " 4011 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4032 "has_empty: %u, split_covers: %u\n", split_index, 4012 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4033 ctxt.c_contig_type, ctxt.c_used_tail_recs, 4013 ctxt.c_split_covers_rec);
4034 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4035 4014
4036 if (ctxt.c_contig_type == CONTIG_NONE) { 4015 if (ctxt.c_contig_type == CONTIG_NONE) {
4037 if (ctxt.c_split_covers_rec) 4016 if (ctxt.c_split_covers_rec)
@@ -4180,27 +4159,18 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4180 4159
4181 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4160 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4182 le16_to_cpu(rightmost_el->l_count)) { 4161 le16_to_cpu(rightmost_el->l_count)) {
4183 int old_depth = depth;
4184
4185 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, 4162 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4186 meta_ac); 4163 meta_ac);
4187 if (ret) { 4164 if (ret) {
4188 mlog_errno(ret); 4165 mlog_errno(ret);
4189 goto out; 4166 goto out;
4190 } 4167 }
4191
4192 if (old_depth != depth) {
4193 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4194 rightmost_el = &eb->h_list;
4195 }
4196 } 4168 }
4197 4169
4198 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4170 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4199 insert.ins_appending = APPEND_NONE; 4171 insert.ins_appending = APPEND_NONE;
4200 insert.ins_contig = CONTIG_NONE; 4172 insert.ins_contig = CONTIG_NONE;
4201 insert.ins_split = SPLIT_RIGHT; 4173 insert.ins_split = SPLIT_RIGHT;
4202 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4203 - le16_to_cpu(rightmost_el->l_next_free_rec);
4204 insert.ins_tree_depth = depth; 4174 insert.ins_tree_depth = depth;
4205 4175
4206 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); 4176 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
@@ -5665,12 +5635,50 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
5665 return ocfs2_journal_dirty_data(handle, bh); 5635 return ocfs2_journal_dirty_data(handle, bh);
5666} 5636}
5667 5637
5638static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
5639 unsigned int from, unsigned int to,
5640 struct page *page, int zero, u64 *phys)
5641{
5642 int ret, partial = 0;
5643
5644 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
5645 if (ret)
5646 mlog_errno(ret);
5647
5648 if (zero)
5649 zero_user_page(page, from, to - from, KM_USER0);
5650
5651 /*
5652 * Need to set the buffers we zero'd into uptodate
5653 * here if they aren't - ocfs2_map_page_blocks()
5654 * might've skipped some
5655 */
5656 if (ocfs2_should_order_data(inode)) {
5657 ret = walk_page_buffers(handle,
5658 page_buffers(page),
5659 from, to, &partial,
5660 ocfs2_ordered_zero_func);
5661 if (ret < 0)
5662 mlog_errno(ret);
5663 } else {
5664 ret = walk_page_buffers(handle, page_buffers(page),
5665 from, to, &partial,
5666 ocfs2_writeback_zero_func);
5667 if (ret < 0)
5668 mlog_errno(ret);
5669 }
5670
5671 if (!partial)
5672 SetPageUptodate(page);
5673
5674 flush_dcache_page(page);
5675}
5676
5668static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, 5677static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
5669 loff_t end, struct page **pages, 5678 loff_t end, struct page **pages,
5670 int numpages, u64 phys, handle_t *handle) 5679 int numpages, u64 phys, handle_t *handle)
5671{ 5680{
5672 int i, ret, partial = 0; 5681 int i;
5673 void *kaddr;
5674 struct page *page; 5682 struct page *page;
5675 unsigned int from, to = PAGE_CACHE_SIZE; 5683 unsigned int from, to = PAGE_CACHE_SIZE;
5676 struct super_block *sb = inode->i_sb; 5684 struct super_block *sb = inode->i_sb;
@@ -5691,87 +5699,31 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
5691 BUG_ON(from > PAGE_CACHE_SIZE); 5699 BUG_ON(from > PAGE_CACHE_SIZE);
5692 BUG_ON(to > PAGE_CACHE_SIZE); 5700 BUG_ON(to > PAGE_CACHE_SIZE);
5693 5701
5694 ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); 5702 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
5695 if (ret) 5703 &phys);
5696 mlog_errno(ret);
5697
5698 kaddr = kmap_atomic(page, KM_USER0);
5699 memset(kaddr + from, 0, to - from);
5700 kunmap_atomic(kaddr, KM_USER0);
5701
5702 /*
5703 * Need to set the buffers we zero'd into uptodate
5704 * here if they aren't - ocfs2_map_page_blocks()
5705 * might've skipped some
5706 */
5707 if (ocfs2_should_order_data(inode)) {
5708 ret = walk_page_buffers(handle,
5709 page_buffers(page),
5710 from, to, &partial,
5711 ocfs2_ordered_zero_func);
5712 if (ret < 0)
5713 mlog_errno(ret);
5714 } else {
5715 ret = walk_page_buffers(handle, page_buffers(page),
5716 from, to, &partial,
5717 ocfs2_writeback_zero_func);
5718 if (ret < 0)
5719 mlog_errno(ret);
5720 }
5721
5722 if (!partial)
5723 SetPageUptodate(page);
5724
5725 flush_dcache_page(page);
5726 5704
5727 start = (page->index + 1) << PAGE_CACHE_SHIFT; 5705 start = (page->index + 1) << PAGE_CACHE_SHIFT;
5728 } 5706 }
5729out: 5707out:
5730 if (pages) { 5708 if (pages)
5731 for (i = 0; i < numpages; i++) { 5709 ocfs2_unlock_and_free_pages(pages, numpages);
5732 page = pages[i];
5733 unlock_page(page);
5734 mark_page_accessed(page);
5735 page_cache_release(page);
5736 }
5737 }
5738} 5710}
5739 5711
5740static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 5712static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
5741 struct page **pages, int *num, u64 *phys) 5713 struct page **pages, int *num)
5742{ 5714{
5743 int i, numpages = 0, ret = 0; 5715 int numpages, ret = 0;
5744 unsigned int ext_flags;
5745 struct super_block *sb = inode->i_sb; 5716 struct super_block *sb = inode->i_sb;
5746 struct address_space *mapping = inode->i_mapping; 5717 struct address_space *mapping = inode->i_mapping;
5747 unsigned long index; 5718 unsigned long index;
5748 loff_t last_page_bytes; 5719 loff_t last_page_bytes;
5749 5720
5750 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5751 BUG_ON(start > end); 5721 BUG_ON(start > end);
5752 5722
5753 if (start == end)
5754 goto out;
5755
5756 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != 5723 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5757 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); 5724 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5758 5725
5759 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits, 5726 numpages = 0;
5760 phys, NULL, &ext_flags);
5761 if (ret) {
5762 mlog_errno(ret);
5763 goto out;
5764 }
5765
5766 /* Tail is a hole. */
5767 if (*phys == 0)
5768 goto out;
5769
5770 /* Tail is marked as unwritten, we can count on write to zero
5771 * in that case. */
5772 if (ext_flags & OCFS2_EXT_UNWRITTEN)
5773 goto out;
5774
5775 last_page_bytes = PAGE_ALIGN(end); 5727 last_page_bytes = PAGE_ALIGN(end);
5776 index = start >> PAGE_CACHE_SHIFT; 5728 index = start >> PAGE_CACHE_SHIFT;
5777 do { 5729 do {
@@ -5788,14 +5740,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
5788 5740
5789out: 5741out:
5790 if (ret != 0) { 5742 if (ret != 0) {
5791 if (pages) { 5743 if (pages)
5792 for (i = 0; i < numpages; i++) { 5744 ocfs2_unlock_and_free_pages(pages, numpages);
5793 if (pages[i]) {
5794 unlock_page(pages[i]);
5795 page_cache_release(pages[i]);
5796 }
5797 }
5798 }
5799 numpages = 0; 5745 numpages = 0;
5800 } 5746 }
5801 5747
@@ -5816,18 +5762,20 @@ out:
5816int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 5762int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
5817 u64 range_start, u64 range_end) 5763 u64 range_start, u64 range_end)
5818{ 5764{
5819 int ret, numpages; 5765 int ret = 0, numpages;
5820 struct page **pages = NULL; 5766 struct page **pages = NULL;
5821 u64 phys; 5767 u64 phys;
5768 unsigned int ext_flags;
5769 struct super_block *sb = inode->i_sb;
5822 5770
5823 /* 5771 /*
5824 * File systems which don't support sparse files zero on every 5772 * File systems which don't support sparse files zero on every
5825 * extend. 5773 * extend.
5826 */ 5774 */
5827 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 5775 if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
5828 return 0; 5776 return 0;
5829 5777
5830 pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), 5778 pages = kcalloc(ocfs2_pages_per_cluster(sb),
5831 sizeof(struct page *), GFP_NOFS); 5779 sizeof(struct page *), GFP_NOFS);
5832 if (pages == NULL) { 5780 if (pages == NULL) {
5833 ret = -ENOMEM; 5781 ret = -ENOMEM;
@@ -5835,16 +5783,31 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
5835 goto out; 5783 goto out;
5836 } 5784 }
5837 5785
5838 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, 5786 if (range_start == range_end)
5839 &numpages, &phys); 5787 goto out;
5788
5789 ret = ocfs2_extent_map_get_blocks(inode,
5790 range_start >> sb->s_blocksize_bits,
5791 &phys, NULL, &ext_flags);
5840 if (ret) { 5792 if (ret) {
5841 mlog_errno(ret); 5793 mlog_errno(ret);
5842 goto out; 5794 goto out;
5843 } 5795 }
5844 5796
5845 if (numpages == 0) 5797 /*
5798 * Tail is a hole, or is marked unwritten. In either case, we
5799 * can count on read and write to return/push zero's.
5800 */
5801 if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
5846 goto out; 5802 goto out;
5847 5803
5804 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5805 &numpages);
5806 if (ret) {
5807 mlog_errno(ret);
5808 goto out;
5809 }
5810
5848 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, 5811 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
5849 numpages, phys, handle); 5812 numpages, phys, handle);
5850 5813
@@ -5865,6 +5828,178 @@ out:
5865 return ret; 5828 return ret;
5866} 5829}
5867 5830
5831static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
5832{
5833 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
5834
5835 memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
5836}
5837
5838void ocfs2_dinode_new_extent_list(struct inode *inode,
5839 struct ocfs2_dinode *di)
5840{
5841 ocfs2_zero_dinode_id2(inode, di);
5842 di->id2.i_list.l_tree_depth = 0;
5843 di->id2.i_list.l_next_free_rec = 0;
5844 di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
5845}
5846
5847void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
5848{
5849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
5850 struct ocfs2_inline_data *idata = &di->id2.i_data;
5851
5852 spin_lock(&oi->ip_lock);
5853 oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
5854 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
5855 spin_unlock(&oi->ip_lock);
5856
5857 /*
5858 * We clear the entire i_data structure here so that all
5859 * fields can be properly initialized.
5860 */
5861 ocfs2_zero_dinode_id2(inode, di);
5862
5863 idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
5864}
5865
5866int ocfs2_convert_inline_data_to_extents(struct inode *inode,
5867 struct buffer_head *di_bh)
5868{
5869 int ret, i, has_data, num_pages = 0;
5870 handle_t *handle;
5871 u64 uninitialized_var(block);
5872 struct ocfs2_inode_info *oi = OCFS2_I(inode);
5873 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5874 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
5875 struct ocfs2_alloc_context *data_ac = NULL;
5876 struct page **pages = NULL;
5877 loff_t end = osb->s_clustersize;
5878
5879 has_data = i_size_read(inode) ? 1 : 0;
5880
5881 if (has_data) {
5882 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
5883 sizeof(struct page *), GFP_NOFS);
5884 if (pages == NULL) {
5885 ret = -ENOMEM;
5886 mlog_errno(ret);
5887 goto out;
5888 }
5889
5890 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
5891 if (ret) {
5892 mlog_errno(ret);
5893 goto out;
5894 }
5895 }
5896
5897 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
5898 if (IS_ERR(handle)) {
5899 ret = PTR_ERR(handle);
5900 mlog_errno(ret);
5901 goto out_unlock;
5902 }
5903
5904 ret = ocfs2_journal_access(handle, inode, di_bh,
5905 OCFS2_JOURNAL_ACCESS_WRITE);
5906 if (ret) {
5907 mlog_errno(ret);
5908 goto out_commit;
5909 }
5910
5911 if (has_data) {
5912 u32 bit_off, num;
5913 unsigned int page_end;
5914 u64 phys;
5915
5916 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
5917 &num);
5918 if (ret) {
5919 mlog_errno(ret);
5920 goto out_commit;
5921 }
5922
5923 /*
5924 * Save two copies, one for insert, and one that can
5925 * be changed by ocfs2_map_and_dirty_page() below.
5926 */
5927 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
5928
5929 /*
5930 * Non sparse file systems zero on extend, so no need
5931 * to do that now.
5932 */
5933 if (!ocfs2_sparse_alloc(osb) &&
5934 PAGE_CACHE_SIZE < osb->s_clustersize)
5935 end = PAGE_CACHE_SIZE;
5936
5937 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
5938 if (ret) {
5939 mlog_errno(ret);
5940 goto out_commit;
5941 }
5942
5943 /*
5944 * This should populate the 1st page for us and mark
5945 * it up to date.
5946 */
5947 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
5948 if (ret) {
5949 mlog_errno(ret);
5950 goto out_commit;
5951 }
5952
5953 page_end = PAGE_CACHE_SIZE;
5954 if (PAGE_CACHE_SIZE > osb->s_clustersize)
5955 page_end = osb->s_clustersize;
5956
5957 for (i = 0; i < num_pages; i++)
5958 ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
5959 pages[i], i > 0, &phys);
5960 }
5961
5962 spin_lock(&oi->ip_lock);
5963 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
5964 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
5965 spin_unlock(&oi->ip_lock);
5966
5967 ocfs2_dinode_new_extent_list(inode, di);
5968
5969 ocfs2_journal_dirty(handle, di_bh);
5970
5971 if (has_data) {
5972 /*
5973 * An error at this point should be extremely rare. If
5974 * this proves to be false, we could always re-build
5975 * the in-inode data from our pages.
5976 */
5977 ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
5978 0, block, 1, 0, NULL);
5979 if (ret) {
5980 mlog_errno(ret);
5981 goto out_commit;
5982 }
5983
5984 inode->i_blocks = ocfs2_inode_sector_count(inode);
5985 }
5986
5987out_commit:
5988 ocfs2_commit_trans(osb, handle);
5989
5990out_unlock:
5991 if (data_ac)
5992 ocfs2_free_alloc_context(data_ac);
5993
5994out:
5995 if (pages) {
5996 ocfs2_unlock_and_free_pages(pages, num_pages);
5997 kfree(pages);
5998 }
5999
6000 return ret;
6001}
6002
5868/* 6003/*
5869 * It is expected, that by the time you call this function, 6004 * It is expected, that by the time you call this function,
5870 * inode->i_size and fe->i_size have been adjusted. 6005 * inode->i_size and fe->i_size have been adjusted.
@@ -6090,6 +6225,81 @@ bail:
6090 return status; 6225 return status;
6091} 6226}
6092 6227
6228/*
6229 * 'start' is inclusive, 'end' is not.
6230 */
6231int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
6232 unsigned int start, unsigned int end, int trunc)
6233{
6234 int ret;
6235 unsigned int numbytes;
6236 handle_t *handle;
6237 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6238 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6239 struct ocfs2_inline_data *idata = &di->id2.i_data;
6240
6241 if (end > i_size_read(inode))
6242 end = i_size_read(inode);
6243
6244 BUG_ON(start >= end);
6245
6246 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
6247 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
6248 !ocfs2_supports_inline_data(osb)) {
6249 ocfs2_error(inode->i_sb,
6250 "Inline data flags for inode %llu don't agree! "
6251 "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
6252 (unsigned long long)OCFS2_I(inode)->ip_blkno,
6253 le16_to_cpu(di->i_dyn_features),
6254 OCFS2_I(inode)->ip_dyn_features,
6255 osb->s_feature_incompat);
6256 ret = -EROFS;
6257 goto out;
6258 }
6259
6260 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
6261 if (IS_ERR(handle)) {
6262 ret = PTR_ERR(handle);
6263 mlog_errno(ret);
6264 goto out;
6265 }
6266
6267 ret = ocfs2_journal_access(handle, inode, di_bh,
6268 OCFS2_JOURNAL_ACCESS_WRITE);
6269 if (ret) {
6270 mlog_errno(ret);
6271 goto out_commit;
6272 }
6273
6274 numbytes = end - start;
6275 memset(idata->id_data + start, 0, numbytes);
6276
6277 /*
6278 * No need to worry about the data page here - it's been
6279 * truncated already and inline data doesn't need it for
6280 * pushing zero's to disk, so we'll let readpage pick it up
6281 * later.
6282 */
6283 if (trunc) {
6284 i_size_write(inode, start);
6285 di->i_size = cpu_to_le64(start);
6286 }
6287
6288 inode->i_blocks = ocfs2_inode_sector_count(inode);
6289 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
6290
6291 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
6292 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
6293
6294 ocfs2_journal_dirty(handle, di_bh);
6295
6296out_commit:
6297 ocfs2_commit_trans(osb, handle);
6298
6299out:
6300 return ret;
6301}
6302
6093static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6303static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6094{ 6304{
6095 /* 6305 /*
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 990df48ae8d3..42ff94bd8011 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -62,6 +62,11 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
62 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; 62 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
63} 63}
64 64
65void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
66void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di);
67int ocfs2_convert_inline_data_to_extents(struct inode *inode,
68 struct buffer_head *di_bh);
69
65int ocfs2_truncate_log_init(struct ocfs2_super *osb); 70int ocfs2_truncate_log_init(struct ocfs2_super *osb);
66void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); 71void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
67void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 72void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
@@ -115,6 +120,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
115 struct inode *inode, 120 struct inode *inode,
116 struct buffer_head *fe_bh, 121 struct buffer_head *fe_bh,
117 struct ocfs2_truncate_context *tc); 122 struct ocfs2_truncate_context *tc);
123int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
124 unsigned int start, unsigned int end, int trunc);
118 125
119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 126int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
120 u32 cpos, struct buffer_head **leaf_bh); 127 u32 cpos, struct buffer_head **leaf_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f37f25c931f5..34d10452c56d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -206,9 +206,70 @@ bail:
206 return err; 206 return err;
207} 207}
208 208
209int ocfs2_read_inline_data(struct inode *inode, struct page *page,
210 struct buffer_head *di_bh)
211{
212 void *kaddr;
213 unsigned int size;
214 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
215
216 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
217 ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
218 (unsigned long long)OCFS2_I(inode)->ip_blkno);
219 return -EROFS;
220 }
221
222 size = i_size_read(inode);
223
224 if (size > PAGE_CACHE_SIZE ||
225 size > ocfs2_max_inline_data(inode->i_sb)) {
226 ocfs2_error(inode->i_sb,
227 "Inode %llu has with inline data has bad size: %u",
228 (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
229 return -EROFS;
230 }
231
232 kaddr = kmap_atomic(page, KM_USER0);
233 if (size)
234 memcpy(kaddr, di->id2.i_data.id_data, size);
235 /* Clear the remaining part of the page */
236 memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
237 flush_dcache_page(page);
238 kunmap_atomic(kaddr, KM_USER0);
239
240 SetPageUptodate(page);
241
242 return 0;
243}
244
245static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
246{
247 int ret;
248 struct buffer_head *di_bh = NULL;
249 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
250
251 BUG_ON(!PageLocked(page));
252 BUG_ON(!OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
253
254 ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
255 OCFS2_BH_CACHED, inode);
256 if (ret) {
257 mlog_errno(ret);
258 goto out;
259 }
260
261 ret = ocfs2_read_inline_data(inode, page, di_bh);
262out:
263 unlock_page(page);
264
265 brelse(di_bh);
266 return ret;
267}
268
209static int ocfs2_readpage(struct file *file, struct page *page) 269static int ocfs2_readpage(struct file *file, struct page *page)
210{ 270{
211 struct inode *inode = page->mapping->host; 271 struct inode *inode = page->mapping->host;
272 struct ocfs2_inode_info *oi = OCFS2_I(inode);
212 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 273 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
213 int ret, unlock = 1; 274 int ret, unlock = 1;
214 275
@@ -222,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
222 goto out; 283 goto out;
223 } 284 }
224 285
225 if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) { 286 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
226 ret = AOP_TRUNCATED_PAGE; 287 ret = AOP_TRUNCATED_PAGE;
227 goto out_meta_unlock; 288 goto out_meta_unlock;
228 } 289 }
@@ -252,7 +313,10 @@ static int ocfs2_readpage(struct file *file, struct page *page)
252 goto out_alloc; 313 goto out_alloc;
253 } 314 }
254 315
255 ret = block_read_full_page(page, ocfs2_get_block); 316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
317 ret = ocfs2_readpage_inline(inode, page);
318 else
319 ret = block_read_full_page(page, ocfs2_get_block);
256 unlock = 0; 320 unlock = 0;
257 321
258 ocfs2_data_unlock(inode, 0); 322 ocfs2_data_unlock(inode, 0);
@@ -301,12 +365,8 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
301{ 365{
302 int ret; 366 int ret;
303 367
304 down_read(&OCFS2_I(inode)->ip_alloc_sem);
305
306 ret = block_prepare_write(page, from, to, ocfs2_get_block); 368 ret = block_prepare_write(page, from, to, ocfs2_get_block);
307 369
308 up_read(&OCFS2_I(inode)->ip_alloc_sem);
309
310 return ret; 370 return ret;
311} 371}
312 372
@@ -401,7 +461,9 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
401 down_read(&OCFS2_I(inode)->ip_alloc_sem); 461 down_read(&OCFS2_I(inode)->ip_alloc_sem);
402 } 462 }
403 463
404 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); 464 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
465 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
466 NULL);
405 467
406 if (!INODE_JOURNAL(inode)) { 468 if (!INODE_JOURNAL(inode)) {
407 up_read(&OCFS2_I(inode)->ip_alloc_sem); 469 up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -415,7 +477,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
415 goto bail; 477 goto bail;
416 } 478 }
417 479
418
419bail: 480bail:
420 status = err ? 0 : p_blkno; 481 status = err ? 0 : p_blkno;
421 482
@@ -570,6 +631,13 @@ static ssize_t ocfs2_direct_IO(int rw,
570 631
571 mlog_entry_void(); 632 mlog_entry_void();
572 633
634 /*
635 * Fallback to buffered I/O if we see an inode without
636 * extents.
637 */
638 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
639 return 0;
640
573 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 641 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
574 /* 642 /*
575 * We get PR data locks even for O_DIRECT. This 643 * We get PR data locks even for O_DIRECT. This
@@ -834,18 +902,22 @@ struct ocfs2_write_ctxt {
834 struct ocfs2_cached_dealloc_ctxt w_dealloc; 902 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835}; 903};
836 904
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 905void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
838{ 906{
839 int i; 907 int i;
840 908
841 for(i = 0; i < wc->w_num_pages; i++) { 909 for(i = 0; i < num_pages; i++) {
842 if (wc->w_pages[i] == NULL) 910 if (pages[i]) {
843 continue; 911 unlock_page(pages[i]);
844 912 mark_page_accessed(pages[i]);
845 unlock_page(wc->w_pages[i]); 913 page_cache_release(pages[i]);
846 mark_page_accessed(wc->w_pages[i]); 914 }
847 page_cache_release(wc->w_pages[i]);
848 } 915 }
916}
917
918static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
919{
920 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
849 921
850 brelse(wc->w_di_bh); 922 brelse(wc->w_di_bh);
851 kfree(wc); 923 kfree(wc);
@@ -1360,6 +1432,160 @@ out:
1360 return ret; 1432 return ret;
1361} 1433}
1362 1434
1435static int ocfs2_write_begin_inline(struct address_space *mapping,
1436 struct inode *inode,
1437 struct ocfs2_write_ctxt *wc)
1438{
1439 int ret;
1440 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1441 struct page *page;
1442 handle_t *handle;
1443 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1444
1445 page = find_or_create_page(mapping, 0, GFP_NOFS);
1446 if (!page) {
1447 ret = -ENOMEM;
1448 mlog_errno(ret);
1449 goto out;
1450 }
1451 /*
1452 * If we don't set w_num_pages then this page won't get unlocked
1453 * and freed on cleanup of the write context.
1454 */
1455 wc->w_pages[0] = wc->w_target_page = page;
1456 wc->w_num_pages = 1;
1457
1458 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1459 if (IS_ERR(handle)) {
1460 ret = PTR_ERR(handle);
1461 mlog_errno(ret);
1462 goto out;
1463 }
1464
1465 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1466 OCFS2_JOURNAL_ACCESS_WRITE);
1467 if (ret) {
1468 ocfs2_commit_trans(osb, handle);
1469
1470 mlog_errno(ret);
1471 goto out;
1472 }
1473
1474 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1475 ocfs2_set_inode_data_inline(inode, di);
1476
1477 if (!PageUptodate(page)) {
1478 ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
1479 if (ret) {
1480 ocfs2_commit_trans(osb, handle);
1481
1482 goto out;
1483 }
1484 }
1485
1486 wc->w_handle = handle;
1487out:
1488 return ret;
1489}
1490
1491int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
1492{
1493 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1494
1495 if (new_size < le16_to_cpu(di->id2.i_data.id_count))
1496 return 1;
1497 return 0;
1498}
1499
/*
 * Decide whether this write can be serviced from the inode's inline
 * data region, converting the inode to/from inline storage as needed.
 *
 * Returns 1 when the caller may write inline (the write context has
 * then been prepared by ocfs2_write_begin_inline()), 0 when the
 * regular extent-based write path must be used, and negative errno on
 * failure.
 */
static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
					  struct inode *inode, loff_t pos,
					  unsigned len, struct page *mmap_page,
					  struct ocfs2_write_ctxt *wc)
{
	int ret, written = 0;
	loff_t end = pos + len;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
	     oi->ip_dyn_features);

	/*
	 * Handle inodes which already have inline data 1st.
	 */
	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * NOTE(review): a non-NULL mmap_page forces conversion
		 * to extents even when the size would fit — presumably
		 * mmap writes can't be redirected into the inode block;
		 * confirm against the mmap write path.
		 */
		if (mmap_page == NULL &&
		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
			goto do_inline_write;

		/*
		 * The write won't fit - we have to give this inode an
		 * inline extent list now.
		 */
		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Check whether the inode can accept inline data.
	 */
	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
		return 0;

	/*
	 * Check whether the write can fit.
	 */
	if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
		return 0;

do_inline_write:
	ret = ocfs2_write_begin_inline(mapping, inode, wc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This signals to the caller that the data can be written
	 * inline.
	 */
	written = 1;
out:
	/* written == 1 wins; otherwise propagate ret (0 or -errno). */
	return written ? written : ret;
}
1558
1559/*
1560 * This function only does anything for file systems which can't
1561 * handle sparse files.
1562 *
1563 * What we want to do here is fill in any hole between the current end
1564 * of allocation and the end of our write. That way the rest of the
1565 * write path can treat it as an non-allocating write, which has no
1566 * special case code for sparse/nonsparse files.
1567 */
1568static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1569 unsigned len,
1570 struct ocfs2_write_ctxt *wc)
1571{
1572 int ret;
1573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1574 loff_t newsize = pos + len;
1575
1576 if (ocfs2_sparse_alloc(osb))
1577 return 0;
1578
1579 if (newsize <= i_size_read(inode))
1580 return 0;
1581
1582 ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
1583 if (ret)
1584 mlog_errno(ret);
1585
1586 return ret;
1587}
1588
1363int ocfs2_write_begin_nolock(struct address_space *mapping, 1589int ocfs2_write_begin_nolock(struct address_space *mapping,
1364 loff_t pos, unsigned len, unsigned flags, 1590 loff_t pos, unsigned len, unsigned flags,
1365 struct page **pagep, void **fsdata, 1591 struct page **pagep, void **fsdata,
@@ -1381,6 +1607,25 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1381 return ret; 1607 return ret;
1382 } 1608 }
1383 1609
1610 if (ocfs2_supports_inline_data(osb)) {
1611 ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
1612 mmap_page, wc);
1613 if (ret == 1) {
1614 ret = 0;
1615 goto success;
1616 }
1617 if (ret < 0) {
1618 mlog_errno(ret);
1619 goto out;
1620 }
1621 }
1622
1623 ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
1624 if (ret) {
1625 mlog_errno(ret);
1626 goto out;
1627 }
1628
1384 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, 1629 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1385 &extents_to_split); 1630 &extents_to_split);
1386 if (ret) { 1631 if (ret) {
@@ -1462,6 +1707,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1462 if (meta_ac) 1707 if (meta_ac)
1463 ocfs2_free_alloc_context(meta_ac); 1708 ocfs2_free_alloc_context(meta_ac);
1464 1709
1710success:
1465 *pagep = wc->w_target_page; 1711 *pagep = wc->w_target_page;
1466 *fsdata = wc; 1712 *fsdata = wc;
1467 return 0; 1713 return 0;
@@ -1529,6 +1775,31 @@ out_fail:
1529 return ret; 1775 return ret;
1530} 1776}
1531 1777
1778static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1779 unsigned len, unsigned *copied,
1780 struct ocfs2_dinode *di,
1781 struct ocfs2_write_ctxt *wc)
1782{
1783 void *kaddr;
1784
1785 if (unlikely(*copied < len)) {
1786 if (!PageUptodate(wc->w_target_page)) {
1787 *copied = 0;
1788 return;
1789 }
1790 }
1791
1792 kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
1793 memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1794 kunmap_atomic(kaddr, KM_USER0);
1795
1796 mlog(0, "Data written to inode at offset %llu. "
1797 "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
1798 (unsigned long long)pos, *copied,
1799 le16_to_cpu(di->id2.i_data.id_count),
1800 le16_to_cpu(di->i_dyn_features));
1801}
1802
1532int ocfs2_write_end_nolock(struct address_space *mapping, 1803int ocfs2_write_end_nolock(struct address_space *mapping,
1533 loff_t pos, unsigned len, unsigned copied, 1804 loff_t pos, unsigned len, unsigned copied,
1534 struct page *page, void *fsdata) 1805 struct page *page, void *fsdata)
@@ -1542,6 +1813,11 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1542 handle_t *handle = wc->w_handle; 1813 handle_t *handle = wc->w_handle;
1543 struct page *tmppage; 1814 struct page *tmppage;
1544 1815
1816 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1817 ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
1818 goto out_write_size;
1819 }
1820
1545 if (unlikely(copied < len)) { 1821 if (unlikely(copied < len)) {
1546 if (!PageUptodate(wc->w_target_page)) 1822 if (!PageUptodate(wc->w_target_page))
1547 copied = 0; 1823 copied = 0;
@@ -1579,6 +1855,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1579 block_commit_write(tmppage, from, to); 1855 block_commit_write(tmppage, from, to);
1580 } 1856 }
1581 1857
1858out_write_size:
1582 pos += copied; 1859 pos += copied;
1583 if (pos > inode->i_size) { 1860 if (pos > inode->i_size) {
1584 i_size_write(inode, pos); 1861 i_size_write(inode, pos);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 389579bd64e3..113560877dbb 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -34,6 +34,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
34 struct inode *inode, unsigned int from, 34 struct inode *inode, unsigned int from,
35 unsigned int to, int new); 35 unsigned int to, int new);
36 36
37void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages);
38
37int walk_page_buffers( handle_t *handle, 39int walk_page_buffers( handle_t *handle,
38 struct buffer_head *head, 40 struct buffer_head *head,
39 unsigned from, 41 unsigned from,
@@ -59,6 +61,10 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
59 struct page **pagep, void **fsdata, 61 struct page **pagep, void **fsdata,
60 struct buffer_head *di_bh, struct page *mmap_page); 62 struct buffer_head *di_bh, struct page *mmap_page);
61 63
64int ocfs2_read_inline_data(struct inode *inode, struct page *page,
65 struct buffer_head *di_bh);
66int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
67
62/* all ocfs2_dio_end_io()'s fault */ 68/* all ocfs2_dio_end_io()'s fault */
63#define ocfs2_iocb_is_rw_locked(iocb) \ 69#define ocfs2_iocb_is_rw_locked(iocb) \
64 test_bit(0, (unsigned long *)&iocb->private) 70 test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 0d5fdde959c8..7453b70c1a19 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -55,10 +55,16 @@
55#include "journal.h" 55#include "journal.h"
56#include "namei.h" 56#include "namei.h"
57#include "suballoc.h" 57#include "suballoc.h"
58#include "super.h"
58#include "uptodate.h" 59#include "uptodate.h"
59 60
60#include "buffer_head_io.h" 61#include "buffer_head_io.h"
61 62
63#define NAMEI_RA_CHUNKS 2
64#define NAMEI_RA_BLOCKS 4
65#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
66#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
67
62static unsigned char ocfs2_filetype_table[] = { 68static unsigned char ocfs2_filetype_table[] = {
63 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 69 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
64}; 70};
@@ -66,12 +72,614 @@ static unsigned char ocfs2_filetype_table[] = {
66static int ocfs2_extend_dir(struct ocfs2_super *osb, 72static int ocfs2_extend_dir(struct ocfs2_super *osb,
67 struct inode *dir, 73 struct inode *dir,
68 struct buffer_head *parent_fe_bh, 74 struct buffer_head *parent_fe_bh,
75 unsigned int blocks_wanted,
69 struct buffer_head **new_de_bh); 76 struct buffer_head **new_de_bh);
77static int ocfs2_do_extend_dir(struct super_block *sb,
78 handle_t *handle,
79 struct inode *dir,
80 struct buffer_head *parent_fe_bh,
81 struct ocfs2_alloc_context *data_ac,
82 struct ocfs2_alloc_context *meta_ac,
83 struct buffer_head **new_bh);
84
70/* 85/*
71 * ocfs2_readdir() 86 * bh passed here can be an inode block or a dir data block, depending
87 * on the inode inline data flag.
88 */
/*
 * Sanity-check a single directory entry.  Returns 1 when the entry
 * looks valid, 0 (after logging the reason) when it is corrupt.  The
 * cross-block check measures from the start of bh's data, which is an
 * inode block for inline directories or a dir data block otherwise.
 */
static int ocfs2_check_dir_entry(struct inode * dir,
				 struct ocfs2_dir_entry * de,
				 struct buffer_head * bh,
				 unsigned long offset)
{
	const char *error_msg = NULL;
	const int rlen = le16_to_cpu(de->rec_len);

	/* Checks are ordered; the first failure decides the message. */
	if (rlen < OCFS2_DIR_REC_LEN(1))
		error_msg = "rec_len is smaller than minimal";
	else if (rlen % 4 != 0)
		error_msg = "rec_len % 4 != 0";
	else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
		error_msg = "rec_len is too small for name_len";
	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
		error_msg = "directory entry across blocks";

	if (error_msg != NULL)
		mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
		     "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
		     (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
		     offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
		     de->name_len);
	return error_msg == NULL ? 1 : 0;
}
114
115static inline int ocfs2_match(int len,
116 const char * const name,
117 struct ocfs2_dir_entry *de)
118{
119 if (len != de->name_len)
120 return 0;
121 if (!de->inode)
122 return 0;
123 return !memcmp(name, de->name, len);
124}
125
126/*
127 * Returns 0 if not found, -1 on failure, and 1 on success
128 */
static int inline ocfs2_search_dirblock(struct buffer_head *bh,
					struct inode *dir,
					const char *name, int namelen,
					unsigned long offset,
					char *first_de,
					unsigned int bytes,
					struct ocfs2_dir_entry **res_dir)
{
	struct ocfs2_dir_entry *de;
	char *dlimit, *de_buf;
	int de_len;
	int ret = 0;

	mlog_entry_void();

	/* Walk the 'bytes' bytes of packed entries starting at first_de;
	 * 'offset' is only carried along for error reporting in
	 * ocfs2_check_dir_entry(). */
	de_buf = first_de;
	dlimit = de_buf + bytes;

	while (de_buf < dlimit) {
		/* this code is executed quadratically often */
		/* do minimal checking `by hand' */

		de = (struct ocfs2_dir_entry *) de_buf;

		/* The namelen bound guards ocfs2_match()'s memcmp from
		 * reading past the region. */
		if (de_buf + namelen <= dlimit &&
		    ocfs2_match(namelen, name, de)) {
			/* found a match - just to be sure, do a full check */
			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
				ret = -1;
				goto bail;
			}
			*res_dir = de;
			ret = 1;
			goto bail;
		}

		/* prevent looping on a bad block */
		de_len = le16_to_cpu(de->rec_len);
		if (de_len <= 0) {
			ret = -1;
			goto bail;
		}

		de_buf += de_len;
		offset += de_len;
	}

bail:
	mlog_exit(ret);
	return ret;
}
180
/*
 * Look up 'name' in an inline-data directory.  The entries live in the
 * inode's id_data area, so on success the directory's own inode block
 * is returned (caller must brelse it) with *res_dir pointing into it;
 * NULL means not found or the inode block couldn't be read.
 */
static struct buffer_head *ocfs2_find_entry_id(const char *name,
					       int namelen,
					       struct inode *dir,
					       struct ocfs2_dir_entry **res_dir)
{
	int ret, found;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_inline_data *data;

	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
			       &di_bh, OCFS2_BH_CACHED, dir);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	di = (struct ocfs2_dinode *)di_bh->b_data;
	data = &di->id2.i_data;

	/* Inline entries occupy the first i_size bytes of id_data. */
	found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
				      data->id_data, i_size_read(dir), res_dir);
	if (found == 1)
		return di_bh;

	brelse(di_bh);
out:
	return NULL;
}
210
/*
 * Look up 'name' in an extent-list (block based) directory.  Scans the
 * directory blocks with a small read-ahead ring, starting at the block
 * where the last successful lookup ended (ip_dir_start_lookup) and
 * wrapping around.  On success returns the data block holding the
 * entry (caller must brelse it) with *res_dir pointing into it; NULL
 * when the name is not present.
 */
struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
					struct inode *dir,
					struct ocfs2_dir_entry **res_dir)
{
	struct super_block *sb;
	struct buffer_head *bh_use[NAMEI_RA_SIZE];
	struct buffer_head *bh, *ret = NULL;
	unsigned long start, block, b;
	int ra_max = 0;		/* Number of bh's in the readahead
				   buffer, bh_use[] */
	int ra_ptr = 0;		/* Current index into readahead
				   buffer */
	int num = 0;
	int nblocks, i, err;

	mlog_entry_void();

	sb = dir->i_sb;

	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
	start = OCFS2_I(dir)->ip_dir_start_lookup;
	if (start >= nblocks)
		start = 0;
	block = start;

restart:
	do {
		/*
		 * We deal with the read-ahead logic here.
		 */
		if (ra_ptr >= ra_max) {
			/* Refill the readahead buffer */
			ra_ptr = 0;
			b = block;
			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
				/*
				 * Terminate if we reach the end of the
				 * directory and must wrap, or if our
				 * search has finished at this block.
				 */
				if (b >= nblocks || (num && block == start)) {
					bh_use[ra_max] = NULL;
					break;
				}
				num++;

				/* Issued async (last arg); waited on below. */
				bh = ocfs2_bread(dir, b++, &err, 1);
				bh_use[ra_max] = bh;
			}
		}
		if ((bh = bh_use[ra_ptr++]) == NULL)
			goto next;
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			/* read error, skip block & hope for the best */
			ocfs2_error(dir->i_sb, "reading directory %llu, "
				    "offset %lu\n",
				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
				    block);
			brelse(bh);
			goto next;
		}
		i = ocfs2_search_dirblock(bh, dir, name, namelen,
					  block << sb->s_blocksize_bits,
					  bh->b_data, sb->s_blocksize,
					  res_dir);
		if (i == 1) {
			/* Remember where we found it to seed next lookup. */
			OCFS2_I(dir)->ip_dir_start_lookup = block;
			ret = bh;
			goto cleanup_and_exit;
		} else {
			brelse(bh);
			if (i < 0)
				goto cleanup_and_exit;
		}
	next:
		if (++block >= nblocks)
			block = 0;
	} while (block != start);

	/*
	 * If the directory has grown while we were searching, then
	 * search the last part of the directory before giving up.
	 */
	block = nblocks;
	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
	if (block < nblocks) {
		start = 0;
		goto restart;
	}

cleanup_and_exit:
	/* Clean up the read-ahead blocks */
	for (; ra_ptr < ra_max; ra_ptr++)
		brelse(bh_use[ra_ptr]);

	mlog_exit_ptr(ret);
	return ret;
}
310
311/*
312 * Try to find an entry of the provided name within 'dir'.
72 * 313 *
314 * If nothing was found, NULL is returned. Otherwise, a buffer_head
315 * and pointer to the dir entry are passed back.
316 *
317 * Caller can NOT assume anything about the contents of the
318 * buffer_head - it is passed back only so that it can be passed into
319 * any one of the manipulation functions (add entry, delete entry,
320 * etc). As an example, bh in the extent directory case is a data
321 * block, in the inline-data case it actually points to an inode.
73 */ 322 */
74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) 323struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
324 struct inode *dir,
325 struct ocfs2_dir_entry **res_dir)
326{
327 *res_dir = NULL;
328
329 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
330 return ocfs2_find_entry_id(name, namelen, dir, res_dir);
331
332 return ocfs2_find_entry_el(name, namelen, dir, res_dir);
333}
334
335/*
336 * Update inode number and type of a previously found directory entry.
337 */
338int ocfs2_update_entry(struct inode *dir, handle_t *handle,
339 struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
340 struct inode *new_entry_inode)
341{
342 int ret;
343
344 /*
345 * The same code works fine for both inline-data and extent
346 * based directories, so no need to split this up.
347 */
348
349 ret = ocfs2_journal_access(handle, dir, de_bh,
350 OCFS2_JOURNAL_ACCESS_WRITE);
351 if (ret) {
352 mlog_errno(ret);
353 goto out;
354 }
355
356 de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
357 ocfs2_set_de_type(de, new_entry_inode->i_mode);
358
359 ocfs2_journal_dirty(handle, de_bh);
360
361out:
362 return ret;
363}
364
/*
 * Remove entry 'de_del' from the run of entries starting at first_de
 * ('bytes' bytes long, living in bh).  Deletion merges the record into
 * its predecessor by growing pde->rec_len; the very first record has
 * no predecessor, so it is instead marked unused by zeroing its inode.
 * Returns 0 on success, -ENOENT if de_del wasn't found, -EIO on a
 * corrupt entry or journal failure.
 */
static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
				struct ocfs2_dir_entry *de_del,
				struct buffer_head *bh, char *first_de,
				unsigned int bytes)
{
	struct ocfs2_dir_entry *de, *pde;
	int i, status = -ENOENT;

	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);

	i = 0;
	pde = NULL;
	de = (struct ocfs2_dir_entry *) first_de;
	while (i < bytes) {
		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
			status = -EIO;
			mlog_errno(status);
			goto bail;
		}
		/* Match by address: caller found 'de_del' in this buffer. */
		if (de == de_del) {
			status = ocfs2_journal_access(handle, dir, bh,
						      OCFS2_JOURNAL_ACCESS_WRITE);
			if (status < 0) {
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			if (pde)
				/* Absorb the victim into the previous record. */
				pde->rec_len =
					cpu_to_le16(le16_to_cpu(pde->rec_len) +
						    le16_to_cpu(de->rec_len));
			else
				/* First record: just mark it unused. */
				de->inode = 0;
			/* Bump i_version so in-flight readdirs revalidate. */
			dir->i_version++;
			status = ocfs2_journal_dirty(handle, bh);
			goto bail;
		}
		i += le16_to_cpu(de->rec_len);
		pde = de;
		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
	}
bail:
	mlog_exit(status);
	return status;
}
410
/*
 * Delete an entry from an inline-data directory: the entries live in
 * the inode block's id_data area, bounded by i_size.  'bh' is the
 * buffer the caller found the entry in (the inode block) and is what
 * gets journaled by __ocfs2_delete_entry().
 */
static inline int ocfs2_delete_entry_id(handle_t *handle,
					struct inode *dir,
					struct ocfs2_dir_entry *de_del,
					struct buffer_head *bh)
{
	int ret;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_inline_data *data;

	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
			       &di_bh, OCFS2_BH_CACHED, dir);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	di = (struct ocfs2_dinode *)di_bh->b_data;
	data = &di->id2.i_data;

	ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data,
				   i_size_read(dir));

	brelse(di_bh);
out:
	return ret;
}
438
/*
 * Delete an entry from an extent based directory: entries fill the
 * whole data block, so scan all of bh->b_data.
 */
static inline int ocfs2_delete_entry_el(handle_t *handle,
					struct inode *dir,
					struct ocfs2_dir_entry *de_del,
					struct buffer_head *bh)
{
	return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data,
				    bh->b_size);
}
447
448/*
449 * ocfs2_delete_entry deletes a directory entry by merging it with the
450 * previous entry
451 */
452int ocfs2_delete_entry(handle_t *handle,
453 struct inode *dir,
454 struct ocfs2_dir_entry *de_del,
455 struct buffer_head *bh)
456{
457 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
458 return ocfs2_delete_entry_id(handle, dir, de_del, bh);
459
460 return ocfs2_delete_entry_el(handle, dir, de_del, bh);
461}
462
463/*
464 * Check whether 'de' has enough room to hold an entry of
465 * 'new_rec_len' bytes.
466 */
467static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
468 unsigned int new_rec_len)
469{
470 unsigned int de_really_used;
471
472 /* Check whether this is an empty record with enough space */
473 if (le64_to_cpu(de->inode) == 0 &&
474 le16_to_cpu(de->rec_len) >= new_rec_len)
475 return 1;
476
477 /*
478 * Record might have free space at the end which we can
479 * use.
480 */
481 de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
482 if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
483 return 1;
484
485 return 0;
486}
487
488/* we don't always have a dentry for what we want to add, so people
489 * like orphan dir can call this instead.
490 *
491 * If you pass me insert_bh, I'll skip the search of the other dir
492 * blocks and put the record in there.
493 */
494int __ocfs2_add_entry(handle_t *handle,
495 struct inode *dir,
496 const char *name, int namelen,
497 struct inode *inode, u64 blkno,
498 struct buffer_head *parent_fe_bh,
499 struct buffer_head *insert_bh)
500{
501 unsigned long offset;
502 unsigned short rec_len;
503 struct ocfs2_dir_entry *de, *de1;
504 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
505 struct super_block *sb = dir->i_sb;
506 int retval, status;
507 unsigned int size = sb->s_blocksize;
508 char *data_start = insert_bh->b_data;
509
510 mlog_entry_void();
511
512 if (!namelen)
513 return -EINVAL;
514
515 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
516 data_start = di->id2.i_data.id_data;
517 size = i_size_read(dir);
518
519 BUG_ON(insert_bh != parent_fe_bh);
520 }
521
522 rec_len = OCFS2_DIR_REC_LEN(namelen);
523 offset = 0;
524 de = (struct ocfs2_dir_entry *) data_start;
525 while (1) {
526 BUG_ON((char *)de >= (size + data_start));
527
528 /* These checks should've already been passed by the
529 * prepare function, but I guess we can leave them
530 * here anyway. */
531 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
532 retval = -ENOENT;
533 goto bail;
534 }
535 if (ocfs2_match(namelen, name, de)) {
536 retval = -EEXIST;
537 goto bail;
538 }
539
540 if (ocfs2_dirent_would_fit(de, rec_len)) {
541 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
542 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
543 if (retval < 0) {
544 mlog_errno(retval);
545 goto bail;
546 }
547
548 status = ocfs2_journal_access(handle, dir, insert_bh,
549 OCFS2_JOURNAL_ACCESS_WRITE);
550 /* By now the buffer is marked for journaling */
551 offset += le16_to_cpu(de->rec_len);
552 if (le64_to_cpu(de->inode)) {
553 de1 = (struct ocfs2_dir_entry *)((char *) de +
554 OCFS2_DIR_REC_LEN(de->name_len));
555 de1->rec_len =
556 cpu_to_le16(le16_to_cpu(de->rec_len) -
557 OCFS2_DIR_REC_LEN(de->name_len));
558 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
559 de = de1;
560 }
561 de->file_type = OCFS2_FT_UNKNOWN;
562 if (blkno) {
563 de->inode = cpu_to_le64(blkno);
564 ocfs2_set_de_type(de, inode->i_mode);
565 } else
566 de->inode = 0;
567 de->name_len = namelen;
568 memcpy(de->name, name, namelen);
569
570 dir->i_version++;
571 status = ocfs2_journal_dirty(handle, insert_bh);
572 retval = 0;
573 goto bail;
574 }
575 offset += le16_to_cpu(de->rec_len);
576 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
577 }
578
579 /* when you think about it, the assert above should prevent us
580 * from ever getting here. */
581 retval = -ENOSPC;
582bail:
583
584 mlog_exit(retval);
585 return retval;
586}
587
/*
 * Iterate the entries of an inline-data directory, calling filldir for
 * each live entry starting at *f_pos.  *f_pos and *f_version are
 * updated as iteration proceeds; a nonzero filldir return stops the
 * walk and is reported through *filldir_err when provided.
 *
 * NOTE(review): always returns 0, even when the inode block can't be
 * read — the error is only logged.  Confirm callers rely on f_pos /
 * filldir_err rather than the return value.
 */
static int ocfs2_dir_foreach_blk_id(struct inode *inode,
				    unsigned long *f_version,
				    loff_t *f_pos, void *priv,
				    filldir_t filldir, int *filldir_err)
{
	int ret, i, filldir_ret;
	unsigned long offset = *f_pos;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_inline_data *data;
	struct ocfs2_dir_entry *de;

	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
			       &di_bh, OCFS2_BH_CACHED, inode);
	if (ret) {
		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		goto out;
	}

	di = (struct ocfs2_dinode *)di_bh->b_data;
	data = &di->id2.i_data;

	while (*f_pos < i_size_read(inode)) {
revalidate:
		/* If the dir block has changed since the last call to
		 * readdir(2), then we might be pointing to an invalid
		 * dirent right now.  Scan from the start of the block
		 * to make sure. */
		if (*f_version != inode->i_version) {
			for (i = 0; i < i_size_read(inode) && i < offset; ) {
				de = (struct ocfs2_dir_entry *)
					(data->id_data + i);
				/* It's too expensive to do a full
				 * dirent test each time round this
				 * loop, but we do have to test at
				 * least that it is non-zero.  A
				 * failure will be detected in the
				 * dirent test below. */
				if (le16_to_cpu(de->rec_len) <
				    OCFS2_DIR_REC_LEN(1))
					break;
				i += le16_to_cpu(de->rec_len);
			}
			/* Resume at the last record boundary <= offset. */
			*f_pos = offset = i;
			*f_version = inode->i_version;
		}

		de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
		if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
			/* On error, skip the f_pos to the end. */
			*f_pos = i_size_read(inode);
			goto out;
		}
		offset += le16_to_cpu(de->rec_len);
		if (le64_to_cpu(de->inode)) {
			/* We might block in the next section
			 * if the data destination is
			 * currently swapped out. So, use a
			 * version stamp to detect whether or
			 * not the directory has been modified
			 * during the copy operation.
			 */
			unsigned long version = *f_version;
			unsigned char d_type = DT_UNKNOWN;

			if (de->file_type < OCFS2_FT_MAX)
				d_type = ocfs2_filetype_table[de->file_type];

			filldir_ret = filldir(priv, de->name,
					      de->name_len,
					      *f_pos,
					      le64_to_cpu(de->inode),
					      d_type);
			if (filldir_ret) {
				if (filldir_err)
					*filldir_err = filldir_ret;
				break;
			}
			if (version != *f_version)
				goto revalidate;
		}
		*f_pos += le16_to_cpu(de->rec_len);
	}

out:
	brelse(di_bh);

	return 0;
}
678
679static int ocfs2_dir_foreach_blk_el(struct inode *inode,
680 unsigned long *f_version,
681 loff_t *f_pos, void *priv,
682 filldir_t filldir, int *filldir_err)
75{ 683{
76 int error = 0; 684 int error = 0;
77 unsigned long offset, blk, last_ra_blk = 0; 685 unsigned long offset, blk, last_ra_blk = 0;
@@ -79,45 +687,23 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
79 struct buffer_head * bh, * tmp; 687 struct buffer_head * bh, * tmp;
80 struct ocfs2_dir_entry * de; 688 struct ocfs2_dir_entry * de;
81 int err; 689 int err;
82 struct inode *inode = filp->f_path.dentry->d_inode;
83 struct super_block * sb = inode->i_sb; 690 struct super_block * sb = inode->i_sb;
84 unsigned int ra_sectors = 16; 691 unsigned int ra_sectors = 16;
85 int lock_level = 0;
86
87 mlog_entry("dirino=%llu\n",
88 (unsigned long long)OCFS2_I(inode)->ip_blkno);
89 692
90 stored = 0; 693 stored = 0;
91 bh = NULL; 694 bh = NULL;
92 695
93 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 696 offset = (*f_pos) & (sb->s_blocksize - 1);
94 if (lock_level && error >= 0) {
95 /* We release EX lock which used to update atime
96 * and get PR lock again to reduce contention
97 * on commonly accessed directories. */
98 ocfs2_meta_unlock(inode, 1);
99 lock_level = 0;
100 error = ocfs2_meta_lock(inode, NULL, 0);
101 }
102 if (error < 0) {
103 if (error != -ENOENT)
104 mlog_errno(error);
105 /* we haven't got any yet, so propagate the error. */
106 stored = error;
107 goto bail_nolock;
108 }
109 697
110 offset = filp->f_pos & (sb->s_blocksize - 1); 698 while (!error && !stored && *f_pos < i_size_read(inode)) {
111 699 blk = (*f_pos) >> sb->s_blocksize_bits;
112 while (!error && !stored && filp->f_pos < i_size_read(inode)) {
113 blk = (filp->f_pos) >> sb->s_blocksize_bits;
114 bh = ocfs2_bread(inode, blk, &err, 0); 700 bh = ocfs2_bread(inode, blk, &err, 0);
115 if (!bh) { 701 if (!bh) {
116 mlog(ML_ERROR, 702 mlog(ML_ERROR,
117 "directory #%llu contains a hole at offset %lld\n", 703 "directory #%llu contains a hole at offset %lld\n",
118 (unsigned long long)OCFS2_I(inode)->ip_blkno, 704 (unsigned long long)OCFS2_I(inode)->ip_blkno,
119 filp->f_pos); 705 *f_pos);
120 filp->f_pos += sb->s_blocksize - offset; 706 *f_pos += sb->s_blocksize - offset;
121 continue; 707 continue;
122 } 708 }
123 709
@@ -143,7 +729,7 @@ revalidate:
143 * readdir(2), then we might be pointing to an invalid 729 * readdir(2), then we might be pointing to an invalid
144 * dirent right now. Scan from the start of the block 730 * dirent right now. Scan from the start of the block
145 * to make sure. */ 731 * to make sure. */
146 if (filp->f_version != inode->i_version) { 732 if (*f_version != inode->i_version) {
147 for (i = 0; i < sb->s_blocksize && i < offset; ) { 733 for (i = 0; i < sb->s_blocksize && i < offset; ) {
148 de = (struct ocfs2_dir_entry *) (bh->b_data + i); 734 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
149 /* It's too expensive to do a full 735 /* It's too expensive to do a full
@@ -158,21 +744,20 @@ revalidate:
158 i += le16_to_cpu(de->rec_len); 744 i += le16_to_cpu(de->rec_len);
159 } 745 }
160 offset = i; 746 offset = i;
161 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 747 *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
162 | offset; 748 | offset;
163 filp->f_version = inode->i_version; 749 *f_version = inode->i_version;
164 } 750 }
165 751
166 while (!error && filp->f_pos < i_size_read(inode) 752 while (!error && *f_pos < i_size_read(inode)
167 && offset < sb->s_blocksize) { 753 && offset < sb->s_blocksize) {
168 de = (struct ocfs2_dir_entry *) (bh->b_data + offset); 754 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
169 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 755 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
170 /* On error, skip the f_pos to the 756 /* On error, skip the f_pos to the
171 next block. */ 757 next block. */
172 filp->f_pos = (filp->f_pos | 758 *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
173 (sb->s_blocksize - 1)) + 1;
174 brelse(bh); 759 brelse(bh);
175 goto bail; 760 goto out;
176 } 761 }
177 offset += le16_to_cpu(de->rec_len); 762 offset += le16_to_cpu(de->rec_len);
178 if (le64_to_cpu(de->inode)) { 763 if (le64_to_cpu(de->inode)) {
@@ -183,36 +768,109 @@ revalidate:
183 * not the directory has been modified 768 * not the directory has been modified
184 * during the copy operation. 769 * during the copy operation.
185 */ 770 */
186 unsigned long version = filp->f_version; 771 unsigned long version = *f_version;
187 unsigned char d_type = DT_UNKNOWN; 772 unsigned char d_type = DT_UNKNOWN;
188 773
189 if (de->file_type < OCFS2_FT_MAX) 774 if (de->file_type < OCFS2_FT_MAX)
190 d_type = ocfs2_filetype_table[de->file_type]; 775 d_type = ocfs2_filetype_table[de->file_type];
191 error = filldir(dirent, de->name, 776 error = filldir(priv, de->name,
192 de->name_len, 777 de->name_len,
193 filp->f_pos, 778 *f_pos,
194 ino_from_blkno(sb, le64_to_cpu(de->inode)), 779 le64_to_cpu(de->inode),
195 d_type); 780 d_type);
196 if (error) 781 if (error) {
782 if (filldir_err)
783 *filldir_err = error;
197 break; 784 break;
198 if (version != filp->f_version) 785 }
786 if (version != *f_version)
199 goto revalidate; 787 goto revalidate;
200 stored ++; 788 stored ++;
201 } 789 }
202 filp->f_pos += le16_to_cpu(de->rec_len); 790 *f_pos += le16_to_cpu(de->rec_len);
203 } 791 }
204 offset = 0; 792 offset = 0;
205 brelse(bh); 793 brelse(bh);
206 } 794 }
207 795
208 stored = 0; 796 stored = 0;
209bail: 797out:
798 return stored;
799}
800
801static int ocfs2_dir_foreach_blk(struct inode *inode, unsigned long *f_version,
802 loff_t *f_pos, void *priv, filldir_t filldir,
803 int *filldir_err)
804{
805 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
806 return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
807 filldir, filldir_err);
808
809 return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
810 filldir_err);
811}
812
813/*
814 * This is intended to be called from inside other kernel functions,
815 * so we fake some arguments.
816 */
817int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
818 filldir_t filldir)
819{
820 int ret = 0, filldir_err = 0;
821 unsigned long version = inode->i_version;
822
823 while (*f_pos < i_size_read(inode)) {
824 ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
825 filldir, &filldir_err);
826 if (ret || filldir_err)
827 break;
828 }
829
830 if (ret > 0)
831 ret = -EIO;
832
833 return 0;
834}
835
836/*
837 * ocfs2_readdir()
838 *
839 */
840int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
841{
842 int error = 0;
843 struct inode *inode = filp->f_path.dentry->d_inode;
844 int lock_level = 0;
845
846 mlog_entry("dirino=%llu\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno);
848
849 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
850 if (lock_level && error >= 0) {
851 /* We release EX lock which used to update atime
852 * and get PR lock again to reduce contention
853 * on commonly accessed directories. */
854 ocfs2_meta_unlock(inode, 1);
855 lock_level = 0;
856 error = ocfs2_meta_lock(inode, NULL, 0);
857 }
858 if (error < 0) {
859 if (error != -ENOENT)
860 mlog_errno(error);
861 /* we haven't got any yet, so propagate the error. */
862 goto bail_nolock;
863 }
864
865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
866 dirent, filldir, NULL);
867
210 ocfs2_meta_unlock(inode, lock_level); 868 ocfs2_meta_unlock(inode, lock_level);
211 869
212bail_nolock: 870bail_nolock:
213 mlog_exit(stored); 871 mlog_exit(error);
214 872
215 return stored; 873 return error;
216} 874}
217 875
218/* 876/*
@@ -252,6 +910,23 @@ leave:
252 return status; 910 return status;
253} 911}
254 912
913/*
914 * Convenience function for callers which just want the block number
915 * mapped to a name and don't require the full dirent info, etc.
916 */
917int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
918 int namelen, u64 *blkno)
919{
920 int ret;
921 struct buffer_head *bh = NULL;
922 struct ocfs2_dir_entry *dirent = NULL;
923
924 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
925 brelse(bh);
926
927 return ret;
928}
929
255/* Check for a name within a directory. 930/* Check for a name within a directory.
256 * 931 *
257 * Return 0 if the name does not exist 932 * Return 0 if the name does not exist
@@ -284,77 +959,414 @@ bail:
284 return ret; 959 return ret;
285} 960}
286 961
962struct ocfs2_empty_dir_priv {
963 unsigned seen_dot;
964 unsigned seen_dot_dot;
965 unsigned seen_other;
966};
967static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
968 loff_t pos, u64 ino, unsigned type)
969{
970 struct ocfs2_empty_dir_priv *p = priv;
971
972 /*
973 * Check the positions of "." and ".." records to be sure
974 * they're in the correct place.
975 */
976 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
977 p->seen_dot = 1;
978 return 0;
979 }
980
981 if (name_len == 2 && !strncmp("..", name, 2) &&
982 pos == OCFS2_DIR_REC_LEN(1)) {
983 p->seen_dot_dot = 1;
984 return 0;
985 }
986
987 p->seen_other = 1;
988 return 1;
989}
287/* 990/*
288 * routine to check that the specified directory is empty (for rmdir) 991 * routine to check that the specified directory is empty (for rmdir)
992 *
993 * Returns 1 if dir is empty, zero otherwise.
289 */ 994 */
290int ocfs2_empty_dir(struct inode *inode) 995int ocfs2_empty_dir(struct inode *inode)
291{ 996{
292 unsigned long offset; 997 int ret;
293 struct buffer_head * bh; 998 loff_t start = 0;
294 struct ocfs2_dir_entry * de, * de1; 999 struct ocfs2_empty_dir_priv priv;
295 struct super_block * sb; 1000
296 int err; 1001 memset(&priv, 0, sizeof(priv));
1002
1003 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
1004 if (ret)
1005 mlog_errno(ret);
297 1006
298 sb = inode->i_sb; 1007 if (!priv.seen_dot || !priv.seen_dot_dot) {
299 if ((i_size_read(inode) < 1008 mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
300 (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
301 !(bh = ocfs2_bread(inode, 0, &err, 0))) {
302 mlog(ML_ERROR, "bad directory (dir #%llu) - no data block\n",
303 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1009 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1010 /*
1011 * XXX: Is it really safe to allow an unlink to continue?
1012 */
304 return 1; 1013 return 1;
305 } 1014 }
306 1015
307 de = (struct ocfs2_dir_entry *) bh->b_data; 1016 return !priv.seen_other;
308 de1 = (struct ocfs2_dir_entry *) 1017}
309 ((char *)de + le16_to_cpu(de->rec_len)); 1018
310 if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) || 1019static void ocfs2_fill_initial_dirents(struct inode *inode,
311 !le64_to_cpu(de1->inode) || 1020 struct inode *parent,
312 strcmp(".", de->name) || 1021 char *start, unsigned int size)
313 strcmp("..", de1->name)) { 1022{
314 mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n", 1023 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
315 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1024
316 brelse(bh); 1025 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
317 return 1; 1026 de->name_len = 1;
1027 de->rec_len =
1028 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1029 strcpy(de->name, ".");
1030 ocfs2_set_de_type(de, S_IFDIR);
1031
1032 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
1033 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
1034 de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
1035 de->name_len = 2;
1036 strcpy(de->name, "..");
1037 ocfs2_set_de_type(de, S_IFDIR);
1038}
1039
1040/*
1041 * This works together with code in ocfs2_mknod_locked() which sets
1042 * the inline-data flag and initializes the inline-data section.
1043 */
1044static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
1045 handle_t *handle,
1046 struct inode *parent,
1047 struct inode *inode,
1048 struct buffer_head *di_bh)
1049{
1050 int ret;
1051 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1052 struct ocfs2_inline_data *data = &di->id2.i_data;
1053 unsigned int size = le16_to_cpu(data->id_count);
1054
1055 ret = ocfs2_journal_access(handle, inode, di_bh,
1056 OCFS2_JOURNAL_ACCESS_WRITE);
1057 if (ret) {
1058 mlog_errno(ret);
1059 goto out;
318 } 1060 }
319 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); 1061
320 de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len)); 1062 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
321 while (offset < i_size_read(inode) ) { 1063
322 if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) { 1064 ocfs2_journal_dirty(handle, di_bh);
323 brelse(bh); 1065 if (ret) {
324 bh = ocfs2_bread(inode, 1066 mlog_errno(ret);
325 offset >> sb->s_blocksize_bits, &err, 0); 1067 goto out;
326 if (!bh) { 1068 }
327 mlog(ML_ERROR, "dir %llu has a hole at %lu\n", 1069
328 (unsigned long long)OCFS2_I(inode)->ip_blkno, offset); 1070 i_size_write(inode, size);
329 offset += sb->s_blocksize; 1071 inode->i_nlink = 2;
330 continue; 1072 inode->i_blocks = ocfs2_inode_sector_count(inode);
331 } 1073
332 de = (struct ocfs2_dir_entry *) bh->b_data; 1074 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
333 } 1075 if (ret < 0)
334 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 1076 mlog_errno(ret);
335 brelse(bh); 1077
336 return 1; 1078out:
1079 return ret;
1080}
1081
1082static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1083 handle_t *handle,
1084 struct inode *parent,
1085 struct inode *inode,
1086 struct buffer_head *fe_bh,
1087 struct ocfs2_alloc_context *data_ac)
1088{
1089 int status;
1090 struct buffer_head *new_bh = NULL;
1091
1092 mlog_entry_void();
1093
1094 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
1095 data_ac, NULL, &new_bh);
1096 if (status < 0) {
1097 mlog_errno(status);
1098 goto bail;
1099 }
1100
1101 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1102
1103 status = ocfs2_journal_access(handle, inode, new_bh,
1104 OCFS2_JOURNAL_ACCESS_CREATE);
1105 if (status < 0) {
1106 mlog_errno(status);
1107 goto bail;
1108 }
1109 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1110
1111 ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
1112 osb->sb->s_blocksize);
1113
1114 status = ocfs2_journal_dirty(handle, new_bh);
1115 if (status < 0) {
1116 mlog_errno(status);
1117 goto bail;
1118 }
1119
1120 i_size_write(inode, inode->i_sb->s_blocksize);
1121 inode->i_nlink = 2;
1122 inode->i_blocks = ocfs2_inode_sector_count(inode);
1123 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1124 if (status < 0) {
1125 mlog_errno(status);
1126 goto bail;
1127 }
1128
1129 status = 0;
1130bail:
1131 if (new_bh)
1132 brelse(new_bh);
1133
1134 mlog_exit(status);
1135 return status;
1136}
1137
1138int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1139 handle_t *handle,
1140 struct inode *parent,
1141 struct inode *inode,
1142 struct buffer_head *fe_bh,
1143 struct ocfs2_alloc_context *data_ac)
1144{
1145 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1146
1147 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1148 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1149
1150 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1151 data_ac);
1152}
1153
1154static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1155 unsigned int new_size)
1156{
1157 struct ocfs2_dir_entry *de;
1158 struct ocfs2_dir_entry *prev_de;
1159 char *de_buf, *limit;
1160 unsigned int bytes = new_size - old_size;
1161
1162 limit = start + old_size;
1163 de_buf = start;
1164 de = (struct ocfs2_dir_entry *)de_buf;
1165 do {
1166 prev_de = de;
1167 de_buf += le16_to_cpu(de->rec_len);
1168 de = (struct ocfs2_dir_entry *)de_buf;
1169 } while (de_buf < limit);
1170
1171 le16_add_cpu(&prev_de->rec_len, bytes);
1172}
1173
1174/*
1175 * We allocate enough clusters to fulfill "blocks_wanted", but set
1176 * i_size to exactly one block. Ocfs2_extend_dir() will handle the
1177 * rest automatically for us.
1178 *
1179 * *first_block_bh is a pointer to the 1st data block allocated to the
1180 * directory.
1181 */
1182static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1183 unsigned int blocks_wanted,
1184 struct buffer_head **first_block_bh)
1185{
1186 int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
1187 u32 alloc, bit_off, len;
1188 struct super_block *sb = dir->i_sb;
1189 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
1190 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1191 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1192 struct ocfs2_alloc_context *data_ac;
1193 struct buffer_head *dirdata_bh = NULL;
1194 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1195 handle_t *handle;
1196
1197 alloc = ocfs2_clusters_for_bytes(sb, bytes);
1198
1199 /*
1200 * We should never need more than 2 clusters for this -
1201 * maximum dirent size is far less than one block. In fact,
1202 * the only time we'd need more than one cluster is if
1203 * blocksize == clustersize and the dirent won't fit in the
1204 * extra space that the expansion to a single block gives. As
1205 * of today, that only happens on 4k/4k file systems.
1206 */
1207 BUG_ON(alloc > 2);
1208
1209 ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
1210 if (ret) {
1211 mlog_errno(ret);
1212 goto out;
1213 }
1214
1215 down_write(&oi->ip_alloc_sem);
1216
1217 /*
1218 * Prepare for worst case allocation scenario of two seperate
1219 * extents.
1220 */
1221 if (alloc == 2)
1222 credits += OCFS2_SUBALLOC_ALLOC;
1223
1224 handle = ocfs2_start_trans(osb, credits);
1225 if (IS_ERR(handle)) {
1226 ret = PTR_ERR(handle);
1227 mlog_errno(ret);
1228 goto out_sem;
1229 }
1230
1231 /*
1232 * Try to claim as many clusters as the bitmap can give though
1233 * if we only get one now, that's enough to continue. The rest
1234 * will be claimed after the conversion to extents.
1235 */
1236 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
1237 if (ret) {
1238 mlog_errno(ret);
1239 goto out_commit;
1240 }
1241
1242 /*
1243 * Operations are carefully ordered so that we set up the new
1244 * data block first. The conversion from inline data to
1245 * extents follows.
1246 */
1247 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
1248 dirdata_bh = sb_getblk(sb, blkno);
1249 if (!dirdata_bh) {
1250 ret = -EIO;
1251 mlog_errno(ret);
1252 goto out_commit;
1253 }
1254
1255 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
1256
1257 ret = ocfs2_journal_access(handle, dir, dirdata_bh,
1258 OCFS2_JOURNAL_ACCESS_CREATE);
1259 if (ret) {
1260 mlog_errno(ret);
1261 goto out_commit;
1262 }
1263
1264 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1265 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1266 sb->s_blocksize - i_size_read(dir));
1267 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
1268 sb->s_blocksize);
1269
1270 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1271 if (ret) {
1272 mlog_errno(ret);
1273 goto out_commit;
1274 }
1275
1276 /*
1277 * Set extent, i_size, etc on the directory. After this, the
1278 * inode should contain the same exact dirents as before and
1279 * be fully accessible from system calls.
1280 *
1281 * We let the later dirent insert modify c/mtime - to the user
1282 * the data hasn't changed.
1283 */
1284 ret = ocfs2_journal_access(handle, dir, di_bh,
1285 OCFS2_JOURNAL_ACCESS_CREATE);
1286 if (ret) {
1287 mlog_errno(ret);
1288 goto out_commit;
1289 }
1290
1291 spin_lock(&oi->ip_lock);
1292 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
1293 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1294 spin_unlock(&oi->ip_lock);
1295
1296 ocfs2_dinode_new_extent_list(dir, di);
1297
1298 i_size_write(dir, sb->s_blocksize);
1299 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1300
1301 di->i_size = cpu_to_le64(sb->s_blocksize);
1302 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
1303 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
1304 dir->i_blocks = ocfs2_inode_sector_count(dir);
1305
1306 /*
1307 * This should never fail as our extent list is empty and all
1308 * related blocks have been journaled already.
1309 */
1310 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
1311 NULL);
1312 if (ret) {
1313 mlog_errno(ret);
1314 goto out;
1315 }
1316
1317 ret = ocfs2_journal_dirty(handle, di_bh);
1318 if (ret) {
1319 mlog_errno(ret);
1320 goto out_commit;
1321 }
1322
1323 /*
1324 * We asked for two clusters, but only got one in the 1st
1325 * pass. Claim the 2nd cluster as a separate extent.
1326 */
1327 if (alloc > len) {
1328 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
1329 &len);
1330 if (ret) {
1331 mlog_errno(ret);
1332 goto out_commit;
337 } 1333 }
338 if (le64_to_cpu(de->inode)) { 1334 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
339 brelse(bh); 1335
340 return 0; 1336 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
1337 len, 0, NULL);
1338 if (ret) {
1339 mlog_errno(ret);
1340 goto out;
341 } 1341 }
342 offset += le16_to_cpu(de->rec_len);
343 de = (struct ocfs2_dir_entry *)
344 ((char *)de + le16_to_cpu(de->rec_len));
345 } 1342 }
346 brelse(bh); 1343
347 return 1; 1344 *first_block_bh = dirdata_bh;
1345 dirdata_bh = NULL;
1346
1347out_commit:
1348 ocfs2_commit_trans(osb, handle);
1349
1350out_sem:
1351 up_write(&oi->ip_alloc_sem);
1352
1353out:
1354 if (data_ac)
1355 ocfs2_free_alloc_context(data_ac);
1356
1357 brelse(dirdata_bh);
1358
1359 return ret;
348} 1360}
349 1361
350/* returns a bh of the 1st new block in the allocation. */ 1362/* returns a bh of the 1st new block in the allocation. */
351int ocfs2_do_extend_dir(struct super_block *sb, 1363static int ocfs2_do_extend_dir(struct super_block *sb,
352 handle_t *handle, 1364 handle_t *handle,
353 struct inode *dir, 1365 struct inode *dir,
354 struct buffer_head *parent_fe_bh, 1366 struct buffer_head *parent_fe_bh,
355 struct ocfs2_alloc_context *data_ac, 1367 struct ocfs2_alloc_context *data_ac,
356 struct ocfs2_alloc_context *meta_ac, 1368 struct ocfs2_alloc_context *meta_ac,
357 struct buffer_head **new_bh) 1369 struct buffer_head **new_bh)
358{ 1370{
359 int status; 1371 int status;
360 int extend; 1372 int extend;
@@ -396,10 +1408,18 @@ bail:
396 return status; 1408 return status;
397} 1409}
398 1410
399/* assumes you already have a cluster lock on the directory. */ 1411/*
1412 * Assumes you already have a cluster lock on the directory.
1413 *
1414 * 'blocks_wanted' is only used if we have an inline directory which
1415 * is to be turned into an extent based one. The size of the dirent to
1416 * insert might be larger than the space gained by growing to just one
1417 * block, so we may have to grow the inode by two blocks in that case.
1418 */
400static int ocfs2_extend_dir(struct ocfs2_super *osb, 1419static int ocfs2_extend_dir(struct ocfs2_super *osb,
401 struct inode *dir, 1420 struct inode *dir,
402 struct buffer_head *parent_fe_bh, 1421 struct buffer_head *parent_fe_bh,
1422 unsigned int blocks_wanted,
403 struct buffer_head **new_de_bh) 1423 struct buffer_head **new_de_bh)
404{ 1424{
405 int status = 0; 1425 int status = 0;
@@ -415,6 +1435,38 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
415 1435
416 mlog_entry_void(); 1436 mlog_entry_void();
417 1437
1438 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1439 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1440 blocks_wanted, &new_bh);
1441 if (status) {
1442 mlog_errno(status);
1443 goto bail;
1444 }
1445
1446 if (blocks_wanted == 1) {
1447 /*
1448 * If the new dirent will fit inside the space
1449 * created by pushing out to one block, then
1450 * we can complete the operation
1451 * here. Otherwise we have to expand i_size
1452 * and format the 2nd block below.
1453 */
1454 BUG_ON(new_bh == NULL);
1455 goto bail_bh;
1456 }
1457
1458 /*
1459 * Get rid of 'new_bh' - we want to format the 2nd
1460 * data block and return that instead.
1461 */
1462 brelse(new_bh);
1463 new_bh = NULL;
1464
1465 dir_i_size = i_size_read(dir);
1466 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
1467 goto do_extend;
1468 }
1469
418 dir_i_size = i_size_read(dir); 1470 dir_i_size = i_size_read(dir);
419 mlog(0, "extending dir %llu (i_size = %lld)\n", 1471 mlog(0, "extending dir %llu (i_size = %lld)\n",
420 (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); 1472 (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
@@ -452,6 +1504,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
452 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 1504 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
453 } 1505 }
454 1506
1507do_extend:
455 down_write(&OCFS2_I(dir)->ip_alloc_sem); 1508 down_write(&OCFS2_I(dir)->ip_alloc_sem);
456 drop_alloc_sem = 1; 1509 drop_alloc_sem = 1;
457 1510
@@ -497,6 +1550,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
497 goto bail; 1550 goto bail;
498 } 1551 }
499 1552
1553bail_bh:
500 *new_de_bh = new_bh; 1554 *new_de_bh = new_bh;
501 get_bh(*new_de_bh); 1555 get_bh(*new_de_bh);
502bail: 1556bail:
@@ -517,41 +1571,71 @@ bail:
517 return status; 1571 return status;
518} 1572}
519 1573
520/* 1574static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
521 * Search the dir for a good spot, extending it if necessary. The 1575 const char *name, int namelen,
522 * block containing an appropriate record is returned in ret_de_bh. 1576 struct buffer_head **ret_de_bh,
523 */ 1577 unsigned int *blocks_wanted)
524int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
525 struct inode *dir,
526 struct buffer_head *parent_fe_bh,
527 const char *name,
528 int namelen,
529 struct buffer_head **ret_de_bh)
530{ 1578{
531 unsigned long offset; 1579 int ret;
532 struct buffer_head * bh = NULL; 1580 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
533 unsigned short rec_len; 1581 struct ocfs2_dir_entry *de, *last_de = NULL;
534 struct ocfs2_dinode *fe; 1582 char *de_buf, *limit;
535 struct ocfs2_dir_entry *de; 1583 unsigned long offset = 0;
536 struct super_block *sb; 1584 unsigned int rec_len, new_rec_len;
537 int status; 1585
1586 de_buf = di->id2.i_data.id_data;
1587 limit = de_buf + i_size_read(dir);
1588 rec_len = OCFS2_DIR_REC_LEN(namelen);
538 1589
539 mlog_entry_void(); 1590 while (de_buf < limit) {
1591 de = (struct ocfs2_dir_entry *)de_buf;
540 1592
541 mlog(0, "getting ready to insert namelen %d into dir %llu\n", 1593 if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
542 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); 1594 ret = -ENOENT;
1595 goto out;
1596 }
1597 if (ocfs2_match(namelen, name, de)) {
1598 ret = -EEXIST;
1599 goto out;
1600 }
1601 if (ocfs2_dirent_would_fit(de, rec_len)) {
1602 /* Ok, we found a spot. Return this bh and let
1603 * the caller actually fill it in. */
1604 *ret_de_bh = di_bh;
1605 get_bh(*ret_de_bh);
1606 ret = 0;
1607 goto out;
1608 }
543 1609
544 BUG_ON(!S_ISDIR(dir->i_mode)); 1610 last_de = de;
545 fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1611 de_buf += le16_to_cpu(de->rec_len);
546 BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir)); 1612 offset += le16_to_cpu(de->rec_len);
1613 }
547 1614
548 sb = dir->i_sb; 1615 /*
1616 * We're going to require expansion of the directory - figure
1617 * out how many blocks we'll need so that a place for the
1618 * dirent can be found.
1619 */
1620 *blocks_wanted = 1;
1621 new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
1622 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
1623 *blocks_wanted = 2;
1624
1625 ret = -ENOSPC;
1626out:
1627 return ret;
1628}
549 1629
550 if (!namelen) { 1630static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
551 status = -EINVAL; 1631 int namelen, struct buffer_head **ret_de_bh)
552 mlog_errno(status); 1632{
553 goto bail; 1633 unsigned long offset;
554 } 1634 struct buffer_head *bh = NULL;
1635 unsigned short rec_len;
1636 struct ocfs2_dir_entry *de;
1637 struct super_block *sb = dir->i_sb;
1638 int status;
555 1639
556 bh = ocfs2_bread(dir, 0, &status, 0); 1640 bh = ocfs2_bread(dir, 0, &status, 0);
557 if (!bh) { 1641 if (!bh) {
@@ -568,17 +1652,11 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
568 bh = NULL; 1652 bh = NULL;
569 1653
570 if (i_size_read(dir) <= offset) { 1654 if (i_size_read(dir) <= offset) {
571 status = ocfs2_extend_dir(osb, 1655 /*
572 dir, 1656 * Caller will have to expand this
573 parent_fe_bh, 1657 * directory.
574 &bh); 1658 */
575 if (status < 0) { 1659 status = -ENOSPC;
576 mlog_errno(status);
577 goto bail;
578 }
579 BUG_ON(!bh);
580 *ret_de_bh = bh;
581 get_bh(*ret_de_bh);
582 goto bail; 1660 goto bail;
583 } 1661 }
584 bh = ocfs2_bread(dir, 1662 bh = ocfs2_bread(dir,
@@ -600,10 +1678,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
600 status = -EEXIST; 1678 status = -EEXIST;
601 goto bail; 1679 goto bail;
602 } 1680 }
603 if (((le64_to_cpu(de->inode) == 0) && 1681 if (ocfs2_dirent_would_fit(de, rec_len)) {
604 (le16_to_cpu(de->rec_len) >= rec_len)) ||
605 (le16_to_cpu(de->rec_len) >=
606 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
607 /* Ok, we found a spot. Return this bh and let 1682 /* Ok, we found a spot. Return this bh and let
608 * the caller actually fill it in. */ 1683 * the caller actually fill it in. */
609 *ret_de_bh = bh; 1684 *ret_de_bh = bh;
@@ -623,3 +1698,61 @@ bail:
623 mlog_exit(status); 1698 mlog_exit(status);
624 return status; 1699 return status;
625} 1700}
1701
1702int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1703 struct inode *dir,
1704 struct buffer_head *parent_fe_bh,
1705 const char *name,
1706 int namelen,
1707 struct buffer_head **ret_de_bh)
1708{
1709 int ret;
1710 unsigned int blocks_wanted = 1;
1711 struct buffer_head *bh = NULL;
1712
1713 mlog(0, "getting ready to insert namelen %d into dir %llu\n",
1714 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
1715
1716 *ret_de_bh = NULL;
1717
1718 if (!namelen) {
1719 ret = -EINVAL;
1720 mlog_errno(ret);
1721 goto out;
1722 }
1723
1724 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1725 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
1726 namelen, &bh, &blocks_wanted);
1727 } else
1728 ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
1729
1730 if (ret && ret != -ENOSPC) {
1731 mlog_errno(ret);
1732 goto out;
1733 }
1734
1735 if (ret == -ENOSPC) {
1736 /*
1737 * We have to expand the directory to add this name.
1738 */
1739 BUG_ON(bh);
1740
1741 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
1742 &bh);
1743 if (ret) {
1744 if (ret != -ENOSPC)
1745 mlog_errno(ret);
1746 goto out;
1747 }
1748
1749 BUG_ON(!bh);
1750 }
1751
1752 *ret_de_bh = bh;
1753 bh = NULL;
1754out:
1755 if (bh)
1756 brelse(bh);
1757 return ret;
1758}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 3f67e146864a..ce48b9080d87 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,17 +26,49 @@
26#ifndef OCFS2_DIR_H 26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H 27#define OCFS2_DIR_H
28 28
29struct buffer_head *ocfs2_find_entry(const char *name,
30 int namelen,
31 struct inode *dir,
32 struct ocfs2_dir_entry **res_dir);
33int ocfs2_delete_entry(handle_t *handle,
34 struct inode *dir,
35 struct ocfs2_dir_entry *de_del,
36 struct buffer_head *bh);
37int __ocfs2_add_entry(handle_t *handle,
38 struct inode *dir,
39 const char *name, int namelen,
40 struct inode *inode, u64 blkno,
41 struct buffer_head *parent_fe_bh,
42 struct buffer_head *insert_bh);
43static inline int ocfs2_add_entry(handle_t *handle,
44 struct dentry *dentry,
45 struct inode *inode, u64 blkno,
46 struct buffer_head *parent_fe_bh,
47 struct buffer_head *insert_bh)
48{
49 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
50 dentry->d_name.name, dentry->d_name.len,
51 inode, blkno, parent_fe_bh, insert_bh);
52}
53int ocfs2_update_entry(struct inode *dir, handle_t *handle,
54 struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
55 struct inode *new_entry_inode);
56
29int ocfs2_check_dir_for_entry(struct inode *dir, 57int ocfs2_check_dir_for_entry(struct inode *dir,
30 const char *name, 58 const char *name,
31 int namelen); 59 int namelen);
32int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */ 60int ocfs2_empty_dir(struct inode *inode);
33int ocfs2_find_files_on_disk(const char *name, 61int ocfs2_find_files_on_disk(const char *name,
34 int namelen, 62 int namelen,
35 u64 *blkno, 63 u64 *blkno,
36 struct inode *inode, 64 struct inode *inode,
37 struct buffer_head **dirent_bh, 65 struct buffer_head **dirent_bh,
38 struct ocfs2_dir_entry **dirent); 66 struct ocfs2_dir_entry **dirent);
67int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
68 int namelen, u64 *blkno);
39int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); 69int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
70int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
71 filldir_t filldir);
40int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 72int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
41 struct inode *dir, 73 struct inode *dir,
42 struct buffer_head *parent_fe_bh, 74 struct buffer_head *parent_fe_bh,
@@ -44,11 +76,11 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
44 int namelen, 76 int namelen,
45 struct buffer_head **ret_de_bh); 77 struct buffer_head **ret_de_bh);
46struct ocfs2_alloc_context; 78struct ocfs2_alloc_context;
47int ocfs2_do_extend_dir(struct super_block *sb, 79int ocfs2_fill_new_dir(struct ocfs2_super *osb,
48 handle_t *handle, 80 handle_t *handle,
49 struct inode *dir, 81 struct inode *parent,
50 struct buffer_head *parent_fe_bh, 82 struct inode *inode,
51 struct ocfs2_alloc_context *data_ac, 83 struct buffer_head *fe_bh,
52 struct ocfs2_alloc_context *meta_ac, 84 struct ocfs2_alloc_context *data_ac);
53 struct buffer_head **new_bh); 85
54#endif /* OCFS2_DIR_H */ 86#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f71250ed166f..41c76ff2fcfb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1482,6 +1482,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1482 lvb->lvb_imtime_packed = 1482 lvb->lvb_imtime_packed =
1483 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); 1483 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1484 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); 1484 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
1485 lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
1485 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); 1486 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
1486 1487
1487out: 1488out:
@@ -1515,6 +1516,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1515 i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); 1516 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1516 1517
1517 oi->ip_attr = be32_to_cpu(lvb->lvb_iattr); 1518 oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
1519 oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
1518 ocfs2_set_inode_flags(inode); 1520 ocfs2_set_inode_flags(inode);
1519 1521
1520 /* fast-symlinks are a special case */ 1522 /* fast-symlinks are a special case */
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 492bad32a8c0..87a785e41205 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -29,12 +29,12 @@
29 29
30#include "dcache.h" 30#include "dcache.h"
31 31
32#define OCFS2_LVB_VERSION 4 32#define OCFS2_LVB_VERSION 5
33 33
34struct ocfs2_meta_lvb { 34struct ocfs2_meta_lvb {
35 __u8 lvb_version; 35 __u8 lvb_version;
36 __u8 lvb_reserved0; 36 __u8 lvb_reserved0;
37 __be16 lvb_reserved1; 37 __be16 lvb_idynfeatures;
38 __be32 lvb_iclusters; 38 __be32 lvb_iclusters;
39 __be32 lvb_iuid; 39 __be32 lvb_iuid;
40 __be32 lvb_igid; 40 __be32 lvb_igid;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index bc48177bd183..c3bbc198f9ce 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -88,8 +88,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
88 struct dentry *parent; 88 struct dentry *parent;
89 struct inode *inode; 89 struct inode *inode;
90 struct inode *dir = child->d_inode; 90 struct inode *dir = child->d_inode;
91 struct buffer_head *dirent_bh = NULL;
92 struct ocfs2_dir_entry *dirent;
93 91
94 mlog_entry("(0x%p, '%.*s')\n", child, 92 mlog_entry("(0x%p, '%.*s')\n", child,
95 child->d_name.len, child->d_name.name); 93 child->d_name.len, child->d_name.name);
@@ -105,8 +103,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
105 goto bail; 103 goto bail;
106 } 104 }
107 105
108 status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh, 106 status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
109 &dirent);
110 if (status < 0) { 107 if (status < 0) {
111 parent = ERR_PTR(-ENOENT); 108 parent = ERR_PTR(-ENOENT);
112 goto bail_unlock; 109 goto bail_unlock;
@@ -131,9 +128,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
131bail_unlock: 128bail_unlock:
132 ocfs2_meta_unlock(dir, 0); 129 ocfs2_meta_unlock(dir, 0);
133 130
134 if (dirent_bh)
135 brelse(dirent_bh);
136
137bail: 131bail:
138 mlog_exit_ptr(parent); 132 mlog_exit_ptr(parent);
139 133
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 03c1d365c78b..c58668a326fe 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -387,6 +387,12 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
387 struct ocfs2_extent_rec *rec; 387 struct ocfs2_extent_rec *rec;
388 u32 coff; 388 u32 coff;
389 389
390 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
391 ret = -ERANGE;
392 mlog_errno(ret);
393 goto out;
394 }
395
390 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, 396 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
391 num_clusters, extent_flags); 397 num_clusters, extent_flags);
392 if (ret == 0) 398 if (ret == 0)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f3bc3658e7a5..a62b14eb4065 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -397,6 +397,15 @@ static int ocfs2_truncate_file(struct inode *inode,
397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
398 truncate_inode_pages(inode->i_mapping, new_i_size); 398 truncate_inode_pages(inode->i_mapping, new_i_size);
399 399
400 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
401 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
402 i_size_read(inode), 0);
403 if (status)
404 mlog_errno(status);
405
406 goto bail_unlock_data;
407 }
408
400 /* alright, we're going to need to do a full blown alloc size 409 /* alright, we're going to need to do a full blown alloc size
401 * change. Orphan the inode so that recovery can complete the 410 * change. Orphan the inode so that recovery can complete the
402 * truncate if necessary. This does the task of marking 411 * truncate if necessary. This does the task of marking
@@ -779,25 +788,6 @@ leave:
779 return status; 788 return status;
780} 789}
781 790
782static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
783 u32 clusters_to_add, int mark_unwritten)
784{
785 int ret;
786
787 /*
788 * The alloc sem blocks peope in read/write from reading our
789 * allocation until we're done changing it. We depend on
790 * i_mutex to block other extend/truncate calls while we're
791 * here.
792 */
793 down_write(&OCFS2_I(inode)->ip_alloc_sem);
794 ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
795 mark_unwritten);
796 up_write(&OCFS2_I(inode)->ip_alloc_sem);
797
798 return ret;
799}
800
801/* Some parts of this taken from generic_cont_expand, which turned out 791/* Some parts of this taken from generic_cont_expand, which turned out
802 * to be too fragile to do exactly what we need without us having to 792 * to be too fragile to do exactly what we need without us having to
803 * worry about recursive locking in ->prepare_write() and 793 * worry about recursive locking in ->prepare_write() and
@@ -889,25 +879,48 @@ out:
889 return ret; 879 return ret;
890} 880}
891 881
892/* 882int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
893 * A tail_to_skip value > 0 indicates that we're being called from 883{
894 * ocfs2_file_aio_write(). This has the following implications: 884 int ret;
895 * 885 u32 clusters_to_add;
896 * - we don't want to update i_size 886 struct ocfs2_inode_info *oi = OCFS2_I(inode);
897 * - di_bh will be NULL, which is fine because it's only used in the 887
898 * case where we want to update i_size. 888 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
899 * - ocfs2_zero_extend() will then only be filling the hole created 889 if (clusters_to_add < oi->ip_clusters)
900 * between i_size and the start of the write. 890 clusters_to_add = 0;
901 */ 891 else
892 clusters_to_add -= oi->ip_clusters;
893
894 if (clusters_to_add) {
895 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
896 clusters_to_add, 0);
897 if (ret) {
898 mlog_errno(ret);
899 goto out;
900 }
901 }
902
903 /*
904 * Call this even if we don't add any clusters to the tree. We
905 * still need to zero the area between the old i_size and the
906 * new i_size.
907 */
908 ret = ocfs2_zero_extend(inode, zero_to);
909 if (ret < 0)
910 mlog_errno(ret);
911
912out:
913 return ret;
914}
915
902static int ocfs2_extend_file(struct inode *inode, 916static int ocfs2_extend_file(struct inode *inode,
903 struct buffer_head *di_bh, 917 struct buffer_head *di_bh,
904 u64 new_i_size, 918 u64 new_i_size)
905 size_t tail_to_skip)
906{ 919{
907 int ret = 0; 920 int ret = 0, data_locked = 0;
908 u32 clusters_to_add = 0; 921 struct ocfs2_inode_info *oi = OCFS2_I(inode);
909 922
910 BUG_ON(!tail_to_skip && !di_bh); 923 BUG_ON(!di_bh);
911 924
912 /* setattr sometimes calls us like this. */ 925 /* setattr sometimes calls us like this. */
913 if (new_i_size == 0) 926 if (new_i_size == 0)
@@ -917,13 +930,18 @@ static int ocfs2_extend_file(struct inode *inode,
917 goto out; 930 goto out;
918 BUG_ON(new_i_size < i_size_read(inode)); 931 BUG_ON(new_i_size < i_size_read(inode));
919 932
920 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 933 /*
921 BUG_ON(tail_to_skip != 0); 934 * Fall through for converting inline data, even if the fs
935 * supports sparse files.
936 *
937 * The check for inline data here is legal - nobody can add
938 * the feature since we have i_mutex. We must check it again
939 * after acquiring ip_alloc_sem though, as paths like mmap
940 * might have raced us to converting the inode to extents.
941 */
942 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
943 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
922 goto out_update_size; 944 goto out_update_size;
923 }
924
925 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
926 OCFS2_I(inode)->ip_clusters;
927 945
928 /* 946 /*
929 * protect the pages that ocfs2_zero_extend is going to be 947 * protect the pages that ocfs2_zero_extend is going to be
@@ -937,39 +955,52 @@ static int ocfs2_extend_file(struct inode *inode,
937 mlog_errno(ret); 955 mlog_errno(ret);
938 goto out; 956 goto out;
939 } 957 }
958 data_locked = 1;
959
960 /*
961 * The alloc sem blocks people in read/write from reading our
962 * allocation until we're done changing it. We depend on
963 * i_mutex to block other extend/truncate calls while we're
964 * here.
965 */
966 down_write(&oi->ip_alloc_sem);
967
968 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
969 /*
970 * We can optimize small extends by keeping the inodes
971 * inline data.
972 */
973 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
974 up_write(&oi->ip_alloc_sem);
975 goto out_update_size;
976 }
977
978 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
979 if (ret) {
980 up_write(&oi->ip_alloc_sem);
940 981
941 if (clusters_to_add) {
942 ret = ocfs2_extend_allocation(inode,
943 OCFS2_I(inode)->ip_clusters,
944 clusters_to_add, 0);
945 if (ret < 0) {
946 mlog_errno(ret); 982 mlog_errno(ret);
947 goto out_unlock; 983 goto out_unlock;
948 } 984 }
949 } 985 }
950 986
951 /* 987 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
952 * Call this even if we don't add any clusters to the tree. We 988 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
953 * still need to zero the area between the old i_size and the 989
954 * new i_size. 990 up_write(&oi->ip_alloc_sem);
955 */ 991
956 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
957 if (ret < 0) { 992 if (ret < 0) {
958 mlog_errno(ret); 993 mlog_errno(ret);
959 goto out_unlock; 994 goto out_unlock;
960 } 995 }
961 996
962out_update_size: 997out_update_size:
963 if (!tail_to_skip) { 998 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
964 /* We're being called from ocfs2_setattr() which wants 999 if (ret < 0)
965 * us to update i_size */ 1000 mlog_errno(ret);
966 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
967 if (ret < 0)
968 mlog_errno(ret);
969 }
970 1001
971out_unlock: 1002out_unlock:
972 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1003 if (data_locked)
973 ocfs2_data_unlock(inode, 1); 1004 ocfs2_data_unlock(inode, 1);
974 1005
975out: 1006out:
@@ -1035,7 +1066,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1035 if (i_size_read(inode) > attr->ia_size) 1066 if (i_size_read(inode) > attr->ia_size)
1036 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1067 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1037 else 1068 else
1038 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 1069 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1039 if (status < 0) { 1070 if (status < 0) {
1040 if (status != -ENOSPC) 1071 if (status != -ENOSPC)
1041 mlog_errno(status); 1072 mlog_errno(status);
@@ -1243,6 +1274,31 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1243{ 1274{
1244 int ret; 1275 int ret;
1245 u32 cpos, phys_cpos, clusters, alloc_size; 1276 u32 cpos, phys_cpos, clusters, alloc_size;
1277 u64 end = start + len;
1278 struct buffer_head *di_bh = NULL;
1279
1280 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1281 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1282 OCFS2_I(inode)->ip_blkno, &di_bh,
1283 OCFS2_BH_CACHED, inode);
1284 if (ret) {
1285 mlog_errno(ret);
1286 goto out;
1287 }
1288
1289 /*
1290 * Nothing to do if the requested reservation range
1291 * fits within the inode.
1292 */
1293 if (ocfs2_size_fits_inline_data(di_bh, end))
1294 goto out;
1295
1296 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1297 if (ret) {
1298 mlog_errno(ret);
1299 goto out;
1300 }
1301 }
1246 1302
1247 /* 1303 /*
1248 * We consider both start and len to be inclusive. 1304 * We consider both start and len to be inclusive.
@@ -1288,6 +1344,8 @@ next:
1288 1344
1289 ret = 0; 1345 ret = 0;
1290out: 1346out:
1347
1348 brelse(di_bh);
1291 return ret; 1349 return ret;
1292} 1350}
1293 1351
@@ -1469,6 +1527,14 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1469 if (byte_len == 0) 1527 if (byte_len == 0)
1470 return 0; 1528 return 0;
1471 1529
1530 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1531 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1532 byte_start + byte_len, 1);
1533 if (ret)
1534 mlog_errno(ret);
1535 return ret;
1536 }
1537
1472 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1538 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1473 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1539 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1474 if (trunc_len >= trunc_start) 1540 if (trunc_len >= trunc_start)
@@ -1713,15 +1779,13 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1713 int appending, 1779 int appending,
1714 int *direct_io) 1780 int *direct_io)
1715{ 1781{
1716 int ret = 0, meta_level = appending; 1782 int ret = 0, meta_level = 0;
1717 struct inode *inode = dentry->d_inode; 1783 struct inode *inode = dentry->d_inode;
1718 u32 clusters; 1784 loff_t saved_pos, end;
1719 loff_t newsize, saved_pos;
1720 1785
1721 /* 1786 /*
1722 * We sample i_size under a read level meta lock to see if our write 1787 * We start with a read level meta lock and only jump to an ex
1723 * is extending the file, if it is we back off and get a write level 1788 * if we need to make modifications here.
1724 * meta lock.
1725 */ 1789 */
1726 for(;;) { 1790 for(;;) {
1727 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1791 ret = ocfs2_meta_lock(inode, NULL, meta_level);
@@ -1763,87 +1827,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1763 saved_pos = *ppos; 1827 saved_pos = *ppos;
1764 } 1828 }
1765 1829
1766 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 1830 end = saved_pos + count;
1767 loff_t end = saved_pos + count;
1768 1831
1769 /* 1832 /*
1770 * Skip the O_DIRECT checks if we don't need 1833 * Skip the O_DIRECT checks if we don't need
1771 * them. 1834 * them.
1772 */ 1835 */
1773 if (!direct_io || !(*direct_io)) 1836 if (!direct_io || !(*direct_io))
1774 break;
1775
1776 /*
1777 * Allowing concurrent direct writes means
1778 * i_size changes wouldn't be synchronized, so
1779 * one node could wind up truncating another
1780 * nodes writes.
1781 */
1782 if (end > i_size_read(inode)) {
1783 *direct_io = 0;
1784 break;
1785 }
1786
1787 /*
1788 * We don't fill holes during direct io, so
1789 * check for them here. If any are found, the
1790 * caller will have to retake some cluster
1791 * locks and initiate the io as buffered.
1792 */
1793 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1794 count);
1795 if (ret == 1) {
1796 *direct_io = 0;
1797 ret = 0;
1798 } else if (ret < 0)
1799 mlog_errno(ret);
1800 break; 1837 break;
1801 }
1802 1838
1803 /* 1839 /*
1804 * The rest of this loop is concerned with legacy file 1840 * There's no sane way to do direct writes to an inode
1805 * systems which don't support sparse files. 1841 * with inline data.
1806 */ 1842 */
1807 1843 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1808 newsize = count + saved_pos; 1844 *direct_io = 0;
1809
1810 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1811 (long long) saved_pos, (long long) newsize,
1812 (long long) i_size_read(inode));
1813
1814 /* No need for a higher level metadata lock if we're
1815 * never going past i_size. */
1816 if (newsize <= i_size_read(inode))
1817 break; 1845 break;
1818
1819 if (meta_level == 0) {
1820 ocfs2_meta_unlock(inode, meta_level);
1821 meta_level = 1;
1822 continue;
1823 } 1846 }
1824 1847
1825 spin_lock(&OCFS2_I(inode)->ip_lock); 1848 /*
1826 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1849 * Allowing concurrent direct writes means
1827 OCFS2_I(inode)->ip_clusters; 1850 * i_size changes wouldn't be synchronized, so
1828 spin_unlock(&OCFS2_I(inode)->ip_lock); 1851 * one node could wind up truncating another
1829 1852 * nodes writes.
1830 mlog(0, "Writing at EOF, may need more allocation: " 1853 */
1831 "i_size = %lld, newsize = %lld, need %u clusters\n", 1854 if (end > i_size_read(inode)) {
1832 (long long) i_size_read(inode), (long long) newsize, 1855 *direct_io = 0;
1833 clusters);
1834
1835 /* We only want to continue the rest of this loop if
1836 * our extend will actually require more
1837 * allocation. */
1838 if (!clusters)
1839 break; 1856 break;
1840
1841 ret = ocfs2_extend_file(inode, NULL, newsize, count);
1842 if (ret < 0) {
1843 if (ret != -ENOSPC)
1844 mlog_errno(ret);
1845 goto out_unlock;
1846 } 1857 }
1858
1859 /*
1860 * We don't fill holes during direct io, so
1861 * check for them here. If any are found, the
1862 * caller will have to retake some cluster
1863 * locks and initiate the io as buffered.
1864 */
1865 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
1866 if (ret == 1) {
1867 *direct_io = 0;
1868 ret = 0;
1869 } else if (ret < 0)
1870 mlog_errno(ret);
1847 break; 1871 break;
1848 } 1872 }
1849 1873
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 36fe27f268ee..066f14add3a8 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -47,6 +47,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
47 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
48 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
49 enum ocfs2_alloc_restarted *reason_ret); 49 enum ocfs2_alloc_restarted *reason_ret);
50int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
51 u64 zero_to);
50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 52int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
51 u32 clusters_to_add, u32 extents_to_split, 53 u32 clusters_to_add, u32 extents_to_split,
52 struct ocfs2_alloc_context **data_ac, 54 struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c53a6763bbbe..1d5e0cb0fda1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -241,6 +241,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
241 241
242 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 242 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
243 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 243 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
244 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
244 245
245 inode->i_version = 1; 246 inode->i_version = 1;
246 inode->i_generation = le32_to_cpu(fe->i_generation); 247 inode->i_generation = le32_to_cpu(fe->i_generation);
@@ -513,6 +514,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
513 514
514 fe = (struct ocfs2_dinode *) fe_bh->b_data; 515 fe = (struct ocfs2_dinode *) fe_bh->b_data;
515 516
517 /*
518 * This check will also skip truncate of inodes with inline
519 * data and fast symlinks.
520 */
516 if (fe->i_clusters) { 521 if (fe->i_clusters) {
517 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 522 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
518 if (IS_ERR(handle)) { 523 if (IS_ERR(handle)) {
@@ -1220,6 +1225,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1220 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1225 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1221 ocfs2_get_inode_flags(OCFS2_I(inode)); 1226 ocfs2_get_inode_flags(OCFS2_I(inode));
1222 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); 1227 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
1228 fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
1223 spin_unlock(&OCFS2_I(inode)->ip_lock); 1229 spin_unlock(&OCFS2_I(inode)->ip_lock);
1224 1230
1225 fe->i_size = cpu_to_le64(i_size_read(inode)); 1231 fe->i_size = cpu_to_le64(i_size_read(inode));
@@ -1257,6 +1263,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1257 1263
1258 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1264 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1259 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 1265 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
1266 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1260 ocfs2_set_inode_flags(inode); 1267 ocfs2_set_inode_flags(inode);
1261 i_size_write(inode, le64_to_cpu(fe->i_size)); 1268 i_size_write(inode, le64_to_cpu(fe->i_size));
1262 inode->i_nlink = le16_to_cpu(fe->i_links_count); 1269 inode->i_nlink = le16_to_cpu(fe->i_links_count);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a41d0817121b..70e881c55536 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -51,6 +51,7 @@ struct ocfs2_inode_info
51 51
52 u32 ip_flags; /* see below */ 52 u32 ip_flags; /* see below */
53 u32 ip_attr; /* inode attributes */ 53 u32 ip_attr; /* inode attributes */
54 u16 ip_dyn_features;
54 55
55 /* protected by recovery_lock. */ 56 /* protected by recovery_lock. */
56 struct inode *ip_next_orphan; 57 struct inode *ip_next_orphan;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dbfb20bb27ea..f9d01e25298d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,13 +35,13 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "dir.h"
38#include "dlmglue.h" 39#include "dlmglue.h"
39#include "extent_map.h" 40#include "extent_map.h"
40#include "heartbeat.h" 41#include "heartbeat.h"
41#include "inode.h" 42#include "inode.h"
42#include "journal.h" 43#include "journal.h"
43#include "localalloc.h" 44#include "localalloc.h"
44#include "namei.h"
45#include "slot_map.h" 45#include "slot_map.h"
46#include "super.h" 46#include "super.h"
47#include "vote.h" 47#include "vote.h"
@@ -1213,17 +1213,49 @@ bail:
1213 return status; 1213 return status;
1214} 1214}
1215 1215
1216struct ocfs2_orphan_filldir_priv {
1217 struct inode *head;
1218 struct ocfs2_super *osb;
1219};
1220
1221static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
1222 loff_t pos, u64 ino, unsigned type)
1223{
1224 struct ocfs2_orphan_filldir_priv *p = priv;
1225 struct inode *iter;
1226
1227 if (name_len == 1 && !strncmp(".", name, 1))
1228 return 0;
1229 if (name_len == 2 && !strncmp("..", name, 2))
1230 return 0;
1231
1232 /* Skip bad inodes so that recovery can continue */
1233 iter = ocfs2_iget(p->osb, ino,
1234 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1235 if (IS_ERR(iter))
1236 return 0;
1237
1238 mlog(0, "queue orphan %llu\n",
1239 (unsigned long long)OCFS2_I(iter)->ip_blkno);
1240 /* No locking is required for the next_orphan queue as there
1241 * is only ever a single process doing orphan recovery. */
1242 OCFS2_I(iter)->ip_next_orphan = p->head;
1243 p->head = iter;
1244
1245 return 0;
1246}
1247
1216static int ocfs2_queue_orphans(struct ocfs2_super *osb, 1248static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1217 int slot, 1249 int slot,
1218 struct inode **head) 1250 struct inode **head)
1219{ 1251{
1220 int status; 1252 int status;
1221 struct inode *orphan_dir_inode = NULL; 1253 struct inode *orphan_dir_inode = NULL;
1222 struct inode *iter; 1254 struct ocfs2_orphan_filldir_priv priv;
1223 unsigned long offset, blk, local; 1255 loff_t pos = 0;
1224 struct buffer_head *bh = NULL; 1256
1225 struct ocfs2_dir_entry *de; 1257 priv.osb = osb;
1226 struct super_block *sb = osb->sb; 1258 priv.head = *head;
1227 1259
1228 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1260 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1229 ORPHAN_DIR_SYSTEM_INODE, 1261 ORPHAN_DIR_SYSTEM_INODE,
@@ -1241,77 +1273,15 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1241 goto out; 1273 goto out;
1242 } 1274 }
1243 1275
1244 offset = 0; 1276 status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
1245 iter = NULL; 1277 ocfs2_orphan_filldir);
1246 while(offset < i_size_read(orphan_dir_inode)) { 1278 if (status) {
1247 blk = offset >> sb->s_blocksize_bits; 1279 mlog_errno(status);
1248 1280 goto out;
1249 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1250 if (!bh)
1251 status = -EINVAL;
1252 if (status < 0) {
1253 if (bh)
1254 brelse(bh);
1255 mlog_errno(status);
1256 goto out_unlock;
1257 }
1258
1259 local = 0;
1260 while(offset < i_size_read(orphan_dir_inode)
1261 && local < sb->s_blocksize) {
1262 de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1263
1264 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1265 de, bh, local)) {
1266 status = -EINVAL;
1267 mlog_errno(status);
1268 brelse(bh);
1269 goto out_unlock;
1270 }
1271
1272 local += le16_to_cpu(de->rec_len);
1273 offset += le16_to_cpu(de->rec_len);
1274
1275 /* I guess we silently fail on no inode? */
1276 if (!le64_to_cpu(de->inode))
1277 continue;
1278 if (de->file_type > OCFS2_FT_MAX) {
1279 mlog(ML_ERROR,
1280 "block %llu contains invalid de: "
1281 "inode = %llu, rec_len = %u, "
1282 "name_len = %u, file_type = %u, "
1283 "name='%.*s'\n",
1284 (unsigned long long)bh->b_blocknr,
1285 (unsigned long long)le64_to_cpu(de->inode),
1286 le16_to_cpu(de->rec_len),
1287 de->name_len,
1288 de->file_type,
1289 de->name_len,
1290 de->name);
1291 continue;
1292 }
1293 if (de->name_len == 1 && !strncmp(".", de->name, 1))
1294 continue;
1295 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1296 continue;
1297
1298 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1299 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1300 if (IS_ERR(iter))
1301 continue;
1302
1303 mlog(0, "queue orphan %llu\n",
1304 (unsigned long long)OCFS2_I(iter)->ip_blkno);
1305 /* No locking is required for the next_orphan
1306 * queue as there is only ever a single
1307 * process doing orphan recovery. */
1308 OCFS2_I(iter)->ip_next_orphan = *head;
1309 *head = iter;
1310 }
1311 brelse(bh);
1312 } 1281 }
1313 1282
1314out_unlock: 1283 *head = priv.head;
1284
1315 ocfs2_meta_unlock(orphan_dir_inode, 0); 1285 ocfs2_meta_unlock(orphan_dir_inode, 0);
1316out: 1286out:
1317 mutex_unlock(&orphan_dir_inode->i_mutex); 1287 mutex_unlock(&orphan_dir_inode->i_mutex);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index ce60aab013aa..4b32e0961568 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -282,6 +282,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
282 * prev. group desc. if we relink. */ 282 * prev. group desc. if we relink. */
283#define OCFS2_SUBALLOC_ALLOC (3) 283#define OCFS2_SUBALLOC_ALLOC (3)
284 284
285#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \
286 + OCFS2_INODE_UPDATE_CREDITS)
287
285/* dinode + group descriptor update. We don't relink on free yet. */ 288/* dinode + group descriptor update. We don't relink on free yet. */
286#define OCFS2_SUBALLOC_FREE (2) 289#define OCFS2_SUBALLOC_FREE (2)
287 290
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 701e6d04ed5d..729259016c18 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -64,29 +64,6 @@
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
67#define NAMEI_RA_CHUNKS 2
68#define NAMEI_RA_BLOCKS 4
69#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
70#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
71
72static int inline ocfs2_search_dirblock(struct buffer_head *bh,
73 struct inode *dir,
74 const char *name, int namelen,
75 unsigned long offset,
76 struct ocfs2_dir_entry **res_dir);
77
78static int ocfs2_delete_entry(handle_t *handle,
79 struct inode *dir,
80 struct ocfs2_dir_entry *de_del,
81 struct buffer_head *bh);
82
83static int __ocfs2_add_entry(handle_t *handle,
84 struct inode *dir,
85 const char *name, int namelen,
86 struct inode *inode, u64 blkno,
87 struct buffer_head *parent_fe_bh,
88 struct buffer_head *insert_bh);
89
90static int ocfs2_mknod_locked(struct ocfs2_super *osb, 67static int ocfs2_mknod_locked(struct ocfs2_super *osb,
91 struct inode *dir, 68 struct inode *dir,
92 struct dentry *dentry, int mode, 69 struct dentry *dentry, int mode,
@@ -97,13 +74,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
97 struct inode **ret_inode, 74 struct inode **ret_inode,
98 struct ocfs2_alloc_context *inode_ac); 75 struct ocfs2_alloc_context *inode_ac);
99 76
100static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
101 handle_t *handle,
102 struct inode *parent,
103 struct inode *inode,
104 struct buffer_head *fe_bh,
105 struct ocfs2_alloc_context *data_ac);
106
107static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 77static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
108 struct inode **ret_orphan_dir, 78 struct inode **ret_orphan_dir,
109 struct inode *inode, 79 struct inode *inode,
@@ -123,17 +93,6 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
123 struct inode *inode, 93 struct inode *inode,
124 const char *symname); 94 const char *symname);
125 95
126static inline int ocfs2_add_entry(handle_t *handle,
127 struct dentry *dentry,
128 struct inode *inode, u64 blkno,
129 struct buffer_head *parent_fe_bh,
130 struct buffer_head *insert_bh)
131{
132 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
133 dentry->d_name.name, dentry->d_name.len,
134 inode, blkno, parent_fe_bh, insert_bh);
135}
136
137/* An orphan dir name is an 8 byte value, printed as a hex string */ 96/* An orphan dir name is an 8 byte value, printed as a hex string */
138#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 97#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
139 98
@@ -142,10 +101,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
142{ 101{
143 int status; 102 int status;
144 u64 blkno; 103 u64 blkno;
145 struct buffer_head *dirent_bh = NULL;
146 struct inode *inode = NULL; 104 struct inode *inode = NULL;
147 struct dentry *ret; 105 struct dentry *ret;
148 struct ocfs2_dir_entry *dirent;
149 struct ocfs2_inode_info *oi; 106 struct ocfs2_inode_info *oi;
150 107
151 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 108 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
@@ -167,9 +124,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
167 goto bail; 124 goto bail;
168 } 125 }
169 126
170 status = ocfs2_find_files_on_disk(dentry->d_name.name, 127 status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name,
171 dentry->d_name.len, &blkno, 128 dentry->d_name.len, &blkno);
172 dir, &dirent_bh, &dirent);
173 if (status < 0) 129 if (status < 0)
174 goto bail_add; 130 goto bail_add;
175 131
@@ -224,83 +180,12 @@ bail_unlock:
224 ocfs2_meta_unlock(dir, 0); 180 ocfs2_meta_unlock(dir, 0);
225 181
226bail: 182bail:
227 if (dirent_bh)
228 brelse(dirent_bh);
229 183
230 mlog_exit_ptr(ret); 184 mlog_exit_ptr(ret);
231 185
232 return ret; 186 return ret;
233} 187}
234 188
235static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
236 handle_t *handle,
237 struct inode *parent,
238 struct inode *inode,
239 struct buffer_head *fe_bh,
240 struct ocfs2_alloc_context *data_ac)
241{
242 int status;
243 struct buffer_head *new_bh = NULL;
244 struct ocfs2_dir_entry *de = NULL;
245
246 mlog_entry_void();
247
248 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
249 data_ac, NULL, &new_bh);
250 if (status < 0) {
251 mlog_errno(status);
252 goto bail;
253 }
254
255 ocfs2_set_new_buffer_uptodate(inode, new_bh);
256
257 status = ocfs2_journal_access(handle, inode, new_bh,
258 OCFS2_JOURNAL_ACCESS_CREATE);
259 if (status < 0) {
260 mlog_errno(status);
261 goto bail;
262 }
263 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
264
265 de = (struct ocfs2_dir_entry *) new_bh->b_data;
266 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
267 de->name_len = 1;
268 de->rec_len =
269 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
270 strcpy(de->name, ".");
271 ocfs2_set_de_type(de, S_IFDIR);
272 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
273 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
274 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
275 OCFS2_DIR_REC_LEN(1));
276 de->name_len = 2;
277 strcpy(de->name, "..");
278 ocfs2_set_de_type(de, S_IFDIR);
279
280 status = ocfs2_journal_dirty(handle, new_bh);
281 if (status < 0) {
282 mlog_errno(status);
283 goto bail;
284 }
285
286 i_size_write(inode, inode->i_sb->s_blocksize);
287 inode->i_nlink = 2;
288 inode->i_blocks = ocfs2_inode_sector_count(inode);
289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
290 if (status < 0) {
291 mlog_errno(status);
292 goto bail;
293 }
294
295 status = 0;
296bail:
297 if (new_bh)
298 brelse(new_bh);
299
300 mlog_exit(status);
301 return status;
302}
303
304static int ocfs2_mknod(struct inode *dir, 189static int ocfs2_mknod(struct inode *dir,
305 struct dentry *dentry, 190 struct dentry *dentry,
306 int mode, 191 int mode,
@@ -365,9 +250,8 @@ static int ocfs2_mknod(struct inode *dir,
365 goto leave; 250 goto leave;
366 } 251 }
367 252
368 /* are we making a directory? If so, reserve a cluster for his 253 /* Reserve a cluster if creating an extent based directory. */
369 * 1st extent. */ 254 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
370 if (S_ISDIR(mode)) {
371 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 255 status = ocfs2_reserve_clusters(osb, 1, &data_ac);
372 if (status < 0) { 256 if (status < 0) {
373 if (status != -ENOSPC) 257 if (status != -ENOSPC)
@@ -564,10 +448,21 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
564 cpu_to_le32(CURRENT_TIME.tv_nsec); 448 cpu_to_le32(CURRENT_TIME.tv_nsec);
565 fe->i_dtime = 0; 449 fe->i_dtime = 0;
566 450
567 fel = &fe->id2.i_list; 451 /*
568 fel->l_tree_depth = 0; 452 * If supported, directories start with inline data.
569 fel->l_next_free_rec = 0; 453 */
570 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 454 if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
455 u16 feat = le16_to_cpu(fe->i_dyn_features);
456
457 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
458
459 fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb));
460 } else {
461 fel = &fe->id2.i_list;
462 fel->l_tree_depth = 0;
463 fel->l_next_free_rec = 0;
464 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
465 }
571 466
572 status = ocfs2_journal_dirty(handle, *new_fe_bh); 467 status = ocfs2_journal_dirty(handle, *new_fe_bh);
573 if (status < 0) { 468 if (status < 0) {
@@ -1048,11 +943,6 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
1048 ocfs2_meta_unlock(inode2, 1); 943 ocfs2_meta_unlock(inode2, 1);
1049} 944}
1050 945
1051#define PARENT_INO(buffer) \
1052 ((struct ocfs2_dir_entry *) \
1053 ((char *)buffer + \
1054 le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
1055
1056static int ocfs2_rename(struct inode *old_dir, 946static int ocfs2_rename(struct inode *old_dir,
1057 struct dentry *old_dentry, 947 struct dentry *old_dentry,
1058 struct inode *new_dir, 948 struct inode *new_dir,
@@ -1070,12 +960,12 @@ static int ocfs2_rename(struct inode *old_dir,
1070 struct buffer_head *old_inode_bh = NULL; 960 struct buffer_head *old_inode_bh = NULL;
1071 struct buffer_head *insert_entry_bh = NULL; 961 struct buffer_head *insert_entry_bh = NULL;
1072 struct ocfs2_super *osb = NULL; 962 struct ocfs2_super *osb = NULL;
1073 u64 newfe_blkno; 963 u64 newfe_blkno, old_de_ino;
1074 handle_t *handle = NULL; 964 handle_t *handle = NULL;
1075 struct buffer_head *old_dir_bh = NULL; 965 struct buffer_head *old_dir_bh = NULL;
1076 struct buffer_head *new_dir_bh = NULL; 966 struct buffer_head *new_dir_bh = NULL;
1077 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry 967 struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
1078 // and new_dentry 968 *new_de = NULL;
1079 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above 969 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1080 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, 970 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1081 // this is the 1st dirent bh 971 // this is the 1st dirent bh
@@ -1159,27 +1049,35 @@ static int ocfs2_rename(struct inode *old_dir,
1159 } 1049 }
1160 1050
1161 if (S_ISDIR(old_inode->i_mode)) { 1051 if (S_ISDIR(old_inode->i_mode)) {
1162 status = -EIO; 1052 u64 old_inode_parent;
1163 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); 1053
1164 if (!old_inode_de_bh) 1054 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
1055 old_inode, &old_inode_de_bh,
1056 &old_inode_dot_dot_de);
1057 if (status) {
1058 status = -EIO;
1165 goto bail; 1059 goto bail;
1060 }
1166 1061
1167 status = -EIO; 1062 if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) {
1168 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != 1063 status = -EIO;
1169 OCFS2_I(old_dir)->ip_blkno)
1170 goto bail; 1064 goto bail;
1171 status = -EMLINK; 1065 }
1172 if (!new_inode && new_dir!=old_dir && 1066
1173 new_dir->i_nlink >= OCFS2_LINK_MAX) 1067 if (!new_inode && new_dir != old_dir &&
1068 new_dir->i_nlink >= OCFS2_LINK_MAX) {
1069 status = -EMLINK;
1174 goto bail; 1070 goto bail;
1071 }
1175 } 1072 }
1176 1073
1177 status = -ENOENT; 1074 status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
1178 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, 1075 old_dentry->d_name.len,
1179 old_dentry->d_name.len, 1076 &old_de_ino);
1180 old_dir, &old_de); 1077 if (status) {
1181 if (!old_de_bh) 1078 status = -ENOENT;
1182 goto bail; 1079 goto bail;
1080 }
1183 1081
1184 /* 1082 /*
1185 * Check for inode number is _not_ due to possible IO errors. 1083 * Check for inode number is _not_ due to possible IO errors.
@@ -1187,8 +1085,10 @@ static int ocfs2_rename(struct inode *old_dir,
1187 * and merrily kill the link to whatever was created under the 1085 * and merrily kill the link to whatever was created under the
1188 * same name. Goodbye sticky bit ;-< 1086 * same name. Goodbye sticky bit ;-<
1189 */ 1087 */
1190 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno) 1088 if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) {
1089 status = -ENOENT;
1191 goto bail; 1090 goto bail;
1091 }
1192 1092
1193 /* check if the target already exists (in which case we need 1093 /* check if the target already exists (in which case we need
1194 * to delete it */ 1094 * to delete it */
@@ -1321,20 +1221,13 @@ static int ocfs2_rename(struct inode *old_dir,
1321 } 1221 }
1322 1222
1323 /* change the dirent to point to the correct inode */ 1223 /* change the dirent to point to the correct inode */
1324 status = ocfs2_journal_access(handle, new_dir, new_de_bh, 1224 status = ocfs2_update_entry(new_dir, handle, new_de_bh,
1325 OCFS2_JOURNAL_ACCESS_WRITE); 1225 new_de, old_inode);
1326 if (status < 0) { 1226 if (status < 0) {
1327 mlog_errno(status); 1227 mlog_errno(status);
1328 goto bail; 1228 goto bail;
1329 } 1229 }
1330 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
1331 new_de->file_type = old_de->file_type;
1332 new_dir->i_version++; 1230 new_dir->i_version++;
1333 status = ocfs2_journal_dirty(handle, new_de_bh);
1334 if (status < 0) {
1335 mlog_errno(status);
1336 goto bail;
1337 }
1338 1231
1339 if (S_ISDIR(new_inode->i_mode)) 1232 if (S_ISDIR(new_inode->i_mode))
1340 newfe->i_links_count = 0; 1233 newfe->i_links_count = 0;
@@ -1370,7 +1263,21 @@ static int ocfs2_rename(struct inode *old_dir,
1370 } else 1263 } else
1371 mlog_errno(status); 1264 mlog_errno(status);
1372 1265
1373 /* now that the name has been added to new_dir, remove the old name */ 1266 /*
1267 * Now that the name has been added to new_dir, remove the old name.
1268 *
1269 * We don't keep any directory entry context around until now
1270 * because the insert might have changed the type of directory
1271 * we're dealing with.
1272 */
1273 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
1274 old_dentry->d_name.len,
1275 old_dir, &old_de);
1276 if (!old_de_bh) {
1277 status = -EIO;
1278 goto bail;
1279 }
1280
1374 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); 1281 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
1375 if (status < 0) { 1282 if (status < 0) {
1376 mlog_errno(status); 1283 mlog_errno(status);
@@ -1383,12 +1290,8 @@ static int ocfs2_rename(struct inode *old_dir,
1383 } 1290 }
1384 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1291 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1385 if (old_inode_de_bh) { 1292 if (old_inode_de_bh) {
1386 status = ocfs2_journal_access(handle, old_inode, 1293 status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
1387 old_inode_de_bh, 1294 old_inode_dot_dot_de, new_dir);
1388 OCFS2_JOURNAL_ACCESS_WRITE);
1389 PARENT_INO(old_inode_de_bh->b_data) =
1390 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
1391 status = ocfs2_journal_dirty(handle, old_inode_de_bh);
1392 old_dir->i_nlink--; 1295 old_dir->i_nlink--;
1393 if (new_inode) { 1296 if (new_inode) {
1394 new_inode->i_nlink--; 1297 new_inode->i_nlink--;
@@ -1767,329 +1670,6 @@ bail:
1767 return status; 1670 return status;
1768} 1671}
1769 1672
1770int ocfs2_check_dir_entry(struct inode * dir,
1771 struct ocfs2_dir_entry * de,
1772 struct buffer_head * bh,
1773 unsigned long offset)
1774{
1775 const char *error_msg = NULL;
1776 const int rlen = le16_to_cpu(de->rec_len);
1777
1778 if (rlen < OCFS2_DIR_REC_LEN(1))
1779 error_msg = "rec_len is smaller than minimal";
1780 else if (rlen % 4 != 0)
1781 error_msg = "rec_len % 4 != 0";
1782 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
1783 error_msg = "rec_len is too small for name_len";
1784 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
1785 error_msg = "directory entry across blocks";
1786
1787 if (error_msg != NULL)
1788 mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
1789 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
1790 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
1791 offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
1792 de->name_len);
1793 return error_msg == NULL ? 1 : 0;
1794}
1795
1796/* we don't always have a dentry for what we want to add, so people
1797 * like orphan dir can call this instead.
1798 *
1799 * If you pass me insert_bh, I'll skip the search of the other dir
1800 * blocks and put the record in there.
1801 */
1802static int __ocfs2_add_entry(handle_t *handle,
1803 struct inode *dir,
1804 const char *name, int namelen,
1805 struct inode *inode, u64 blkno,
1806 struct buffer_head *parent_fe_bh,
1807 struct buffer_head *insert_bh)
1808{
1809 unsigned long offset;
1810 unsigned short rec_len;
1811 struct ocfs2_dir_entry *de, *de1;
1812 struct super_block *sb;
1813 int retval, status;
1814
1815 mlog_entry_void();
1816
1817 sb = dir->i_sb;
1818
1819 if (!namelen)
1820 return -EINVAL;
1821
1822 rec_len = OCFS2_DIR_REC_LEN(namelen);
1823 offset = 0;
1824 de = (struct ocfs2_dir_entry *) insert_bh->b_data;
1825 while (1) {
1826 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
1827 /* These checks should've already been passed by the
1828 * prepare function, but I guess we can leave them
1829 * here anyway. */
1830 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1831 retval = -ENOENT;
1832 goto bail;
1833 }
1834 if (ocfs2_match(namelen, name, de)) {
1835 retval = -EEXIST;
1836 goto bail;
1837 }
1838 if (((le64_to_cpu(de->inode) == 0) &&
1839 (le16_to_cpu(de->rec_len) >= rec_len)) ||
1840 (le16_to_cpu(de->rec_len) >=
1841 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
1842 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1843 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
1844 if (retval < 0) {
1845 mlog_errno(retval);
1846 goto bail;
1847 }
1848
1849 status = ocfs2_journal_access(handle, dir, insert_bh,
1850 OCFS2_JOURNAL_ACCESS_WRITE);
1851 /* By now the buffer is marked for journaling */
1852 offset += le16_to_cpu(de->rec_len);
1853 if (le64_to_cpu(de->inode)) {
1854 de1 = (struct ocfs2_dir_entry *)((char *) de +
1855 OCFS2_DIR_REC_LEN(de->name_len));
1856 de1->rec_len =
1857 cpu_to_le16(le16_to_cpu(de->rec_len) -
1858 OCFS2_DIR_REC_LEN(de->name_len));
1859 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1860 de = de1;
1861 }
1862 de->file_type = OCFS2_FT_UNKNOWN;
1863 if (blkno) {
1864 de->inode = cpu_to_le64(blkno);
1865 ocfs2_set_de_type(de, inode->i_mode);
1866 } else
1867 de->inode = 0;
1868 de->name_len = namelen;
1869 memcpy(de->name, name, namelen);
1870
1871 dir->i_version++;
1872 status = ocfs2_journal_dirty(handle, insert_bh);
1873 retval = 0;
1874 goto bail;
1875 }
1876 offset += le16_to_cpu(de->rec_len);
1877 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1878 }
1879
1880 /* when you think about it, the assert above should prevent us
1881 * from ever getting here. */
1882 retval = -ENOSPC;
1883bail:
1884
1885 mlog_exit(retval);
1886 return retval;
1887}
1888
1889
1890/*
1891 * ocfs2_delete_entry deletes a directory entry by merging it with the
1892 * previous entry
1893 */
1894static int ocfs2_delete_entry(handle_t *handle,
1895 struct inode *dir,
1896 struct ocfs2_dir_entry *de_del,
1897 struct buffer_head *bh)
1898{
1899 struct ocfs2_dir_entry *de, *pde;
1900 int i, status = -ENOENT;
1901
1902 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1903
1904 i = 0;
1905 pde = NULL;
1906 de = (struct ocfs2_dir_entry *) bh->b_data;
1907 while (i < bh->b_size) {
1908 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1909 status = -EIO;
1910 mlog_errno(status);
1911 goto bail;
1912 }
1913 if (de == de_del) {
1914 status = ocfs2_journal_access(handle, dir, bh,
1915 OCFS2_JOURNAL_ACCESS_WRITE);
1916 if (status < 0) {
1917 status = -EIO;
1918 mlog_errno(status);
1919 goto bail;
1920 }
1921 if (pde)
1922 pde->rec_len =
1923 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1924 le16_to_cpu(de->rec_len));
1925 else
1926 de->inode = 0;
1927 dir->i_version++;
1928 status = ocfs2_journal_dirty(handle, bh);
1929 goto bail;
1930 }
1931 i += le16_to_cpu(de->rec_len);
1932 pde = de;
1933 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1934 }
1935bail:
1936 mlog_exit(status);
1937 return status;
1938}
1939
1940/*
1941 * Returns 0 if not found, -1 on failure, and 1 on success
1942 */
1943static int inline ocfs2_search_dirblock(struct buffer_head *bh,
1944 struct inode *dir,
1945 const char *name, int namelen,
1946 unsigned long offset,
1947 struct ocfs2_dir_entry **res_dir)
1948{
1949 struct ocfs2_dir_entry *de;
1950 char *dlimit, *de_buf;
1951 int de_len;
1952 int ret = 0;
1953
1954 mlog_entry_void();
1955
1956 de_buf = bh->b_data;
1957 dlimit = de_buf + dir->i_sb->s_blocksize;
1958
1959 while (de_buf < dlimit) {
1960 /* this code is executed quadratically often */
1961 /* do minimal checking `by hand' */
1962
1963 de = (struct ocfs2_dir_entry *) de_buf;
1964
1965 if (de_buf + namelen <= dlimit &&
1966 ocfs2_match(namelen, name, de)) {
1967 /* found a match - just to be sure, do a full check */
1968 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
1969 ret = -1;
1970 goto bail;
1971 }
1972 *res_dir = de;
1973 ret = 1;
1974 goto bail;
1975 }
1976
1977 /* prevent looping on a bad block */
1978 de_len = le16_to_cpu(de->rec_len);
1979 if (de_len <= 0) {
1980 ret = -1;
1981 goto bail;
1982 }
1983
1984 de_buf += de_len;
1985 offset += de_len;
1986 }
1987
1988bail:
1989 mlog_exit(ret);
1990 return ret;
1991}
1992
1993struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
1994 struct inode *dir,
1995 struct ocfs2_dir_entry **res_dir)
1996{
1997 struct super_block *sb;
1998 struct buffer_head *bh_use[NAMEI_RA_SIZE];
1999 struct buffer_head *bh, *ret = NULL;
2000 unsigned long start, block, b;
2001 int ra_max = 0; /* Number of bh's in the readahead
2002 buffer, bh_use[] */
2003 int ra_ptr = 0; /* Current index into readahead
2004 buffer */
2005 int num = 0;
2006 int nblocks, i, err;
2007
2008 mlog_entry_void();
2009
2010 *res_dir = NULL;
2011 sb = dir->i_sb;
2012
2013 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2014 start = OCFS2_I(dir)->ip_dir_start_lookup;
2015 if (start >= nblocks)
2016 start = 0;
2017 block = start;
2018
2019restart:
2020 do {
2021 /*
2022 * We deal with the read-ahead logic here.
2023 */
2024 if (ra_ptr >= ra_max) {
2025 /* Refill the readahead buffer */
2026 ra_ptr = 0;
2027 b = block;
2028 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
2029 /*
2030 * Terminate if we reach the end of the
2031 * directory and must wrap, or if our
2032 * search has finished at this block.
2033 */
2034 if (b >= nblocks || (num && block == start)) {
2035 bh_use[ra_max] = NULL;
2036 break;
2037 }
2038 num++;
2039
2040 bh = ocfs2_bread(dir, b++, &err, 1);
2041 bh_use[ra_max] = bh;
2042 }
2043 }
2044 if ((bh = bh_use[ra_ptr++]) == NULL)
2045 goto next;
2046 wait_on_buffer(bh);
2047 if (!buffer_uptodate(bh)) {
2048 /* read error, skip block & hope for the best */
2049 ocfs2_error(dir->i_sb, "reading directory %llu, "
2050 "offset %lu\n",
2051 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2052 block);
2053 brelse(bh);
2054 goto next;
2055 }
2056 i = ocfs2_search_dirblock(bh, dir, name, namelen,
2057 block << sb->s_blocksize_bits,
2058 res_dir);
2059 if (i == 1) {
2060 OCFS2_I(dir)->ip_dir_start_lookup = block;
2061 ret = bh;
2062 goto cleanup_and_exit;
2063 } else {
2064 brelse(bh);
2065 if (i < 0)
2066 goto cleanup_and_exit;
2067 }
2068 next:
2069 if (++block >= nblocks)
2070 block = 0;
2071 } while (block != start);
2072
2073 /*
2074 * If the directory has grown while we were searching, then
2075 * search the last part of the directory before giving up.
2076 */
2077 block = nblocks;
2078 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2079 if (block < nblocks) {
2080 start = 0;
2081 goto restart;
2082 }
2083
2084cleanup_and_exit:
2085 /* Clean up the read-ahead blocks */
2086 for (; ra_ptr < ra_max; ra_ptr++)
2087 brelse(bh_use[ra_ptr]);
2088
2089 mlog_exit_ptr(ret);
2090 return ret;
2091}
2092
2093static int ocfs2_blkno_stringify(u64 blkno, char *name) 1673static int ocfs2_blkno_stringify(u64 blkno, char *name)
2094{ 1674{
2095 int status, namelen; 1675 int status, namelen;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 0975c7b7212b..688aef64c879 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -30,29 +30,10 @@ extern const struct inode_operations ocfs2_dir_iops;
30 30
31struct dentry *ocfs2_get_parent(struct dentry *child); 31struct dentry *ocfs2_get_parent(struct dentry *child);
32 32
33int ocfs2_check_dir_entry (struct inode *dir,
34 struct ocfs2_dir_entry *de,
35 struct buffer_head *bh,
36 unsigned long offset);
37struct buffer_head *ocfs2_find_entry(const char *name,
38 int namelen,
39 struct inode *dir,
40 struct ocfs2_dir_entry **res_dir);
41int ocfs2_orphan_del(struct ocfs2_super *osb, 33int ocfs2_orphan_del(struct ocfs2_super *osb,
42 handle_t *handle, 34 handle_t *handle,
43 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
44 struct inode *inode, 36 struct inode *inode,
45 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh);
46 38
47static inline int ocfs2_match(int len,
48 const char * const name,
49 struct ocfs2_dir_entry *de)
50{
51 if (len != de->name_len)
52 return 0;
53 if (!de->inode)
54 return 0;
55 return !memcmp(name, de->name, len);
56}
57
58#endif /* OCFS2_NAMEI_H */ 39#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 58307853fb4a..60a23e1906b0 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -319,6 +319,13 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
319 return 0; 319 return 0;
320} 320}
321 321
322static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
323{
324 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
325 return 1;
326 return 0;
327}
328
322/* set / clear functions because cluster events can make these happen 329/* set / clear functions because cluster events can make these happen
323 * in parallel so we want the transitions to be atomic. this also 330 * in parallel so we want the transitions to be atomic. this also
324 * means that any future flags osb_flags must be protected by spinlock 331 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 82f8a75b207e..6ef876759a73 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -87,7 +87,8 @@
87 87
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 92#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 93
93/* 94/*
@@ -111,6 +112,20 @@
111#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 112#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010
112 113
113/* 114/*
115 * Tunefs sets this incompat flag before starting an operation which
116 * would require cleanup on abort. This is done to protect users from
117 * inadvertently mounting the fs after an aborted run without
118 * fsck-ing.
119 *
120 * s_tunefs_flags on the super block describes precisely which
121 * operations were in progress.
122 */
123#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020
124
125/* Support for data packed into inode blocks */
126#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
127
128/*
114 * backup superblock flag is used to indicate that this volume 129 * backup superblock flag is used to indicate that this volume
115 * has backup superblocks. 130 * has backup superblocks.
116 */ 131 */
@@ -130,6 +145,11 @@
130#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 145#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6
131 146
132/* 147/*
148 * Flags on ocfs2_super_block.s_tunefs_flags
149 */
150#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */
151
152/*
133 * Flags on ocfs2_dinode.i_flags 153 * Flags on ocfs2_dinode.i_flags
134 */ 154 */
135#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ 155#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
@@ -146,6 +166,17 @@
146#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 166#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
147#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 167#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
148 168
169/*
170 * Flags on ocfs2_dinode.i_dyn_features
171 *
172 * These can change much more often than i_flags. When adding flags,
173 * keep in mind that i_dyn_features is only 16 bits wide.
174 */
175#define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */
176#define OCFS2_HAS_XATTR_FL (0x0002)
177#define OCFS2_INLINE_XATTR_FL (0x0004)
178#define OCFS2_INDEXED_DIR_FL (0x0008)
179
149/* Inode attributes, keep in sync with EXT2 */ 180/* Inode attributes, keep in sync with EXT2 */
150#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 181#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
151#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 182#define OCFS2_UNRM_FL (0x00000002) /* Undelete */
@@ -447,8 +478,8 @@ struct ocfs2_super_block {
447 __le32 s_clustersize_bits; /* Clustersize for this fs */ 478 __le32 s_clustersize_bits; /* Clustersize for this fs */
448/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts 479/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
449 before tunefs required */ 480 before tunefs required */
450 __le16 s_reserved1; 481 __le16 s_tunefs_flag;
451 __le32 s_reserved2; 482 __le32 s_reserved1;
452 __le64 s_first_cluster_group; /* Block offset of 1st cluster 483 __le64 s_first_cluster_group; /* Block offset of 1st cluster
453 * group header */ 484 * group header */
454/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 485/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@@ -471,6 +502,19 @@ struct ocfs2_local_alloc
471}; 502};
472 503
473/* 504/*
505 * Data-in-inode header. This is only used if i_dyn_features has
506 * OCFS2_INLINE_DATA_FL set.
507 */
508struct ocfs2_inline_data
509{
510/*00*/ __le16 id_count; /* Number of bytes that can be used
511 * for data, starting at id_data */
512 __le16 id_reserved0;
513 __le32 id_reserved1;
514 __u8 id_data[0]; /* Start of user data */
515};
516
517/*
474 * On disk inode for OCFS2 518 * On disk inode for OCFS2
475 */ 519 */
476struct ocfs2_dinode { 520struct ocfs2_dinode {
@@ -502,7 +546,7 @@ struct ocfs2_dinode {
502 __le32 i_attr; 546 __le32 i_attr;
503 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL 547 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
504 was set in i_flags */ 548 was set in i_flags */
505 __le16 i_reserved1; 549 __le16 i_dyn_features;
506/*70*/ __le64 i_reserved2[8]; 550/*70*/ __le64 i_reserved2[8];
507/*B8*/ union { 551/*B8*/ union {
508 __le64 i_pad1; /* Generic way to refer to this 552 __le64 i_pad1; /* Generic way to refer to this
@@ -528,6 +572,7 @@ struct ocfs2_dinode {
528 struct ocfs2_chain_list i_chain; 572 struct ocfs2_chain_list i_chain;
529 struct ocfs2_extent_list i_list; 573 struct ocfs2_extent_list i_list;
530 struct ocfs2_truncate_log i_dealloc; 574 struct ocfs2_truncate_log i_dealloc;
575 struct ocfs2_inline_data i_data;
531 __u8 i_symlink[0]; 576 __u8 i_symlink[0];
532 } id2; 577 } id2;
533/* Actual on-disk size is one block */ 578/* Actual on-disk size is one block */
@@ -577,6 +622,12 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
577 offsetof(struct ocfs2_dinode, id2.i_symlink); 622 offsetof(struct ocfs2_dinode, id2.i_symlink);
578} 623}
579 624
625static inline int ocfs2_max_inline_data(struct super_block *sb)
626{
627 return sb->s_blocksize -
628 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
629}
630
580static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) 631static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
581{ 632{
582 int size; 633 int size;
@@ -656,6 +707,11 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
656 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); 707 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
657} 708}
658 709
710static inline int ocfs2_max_inline_data(int blocksize)
711{
712 return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data);
713}
714
659static inline int ocfs2_extent_recs_per_inode(int blocksize) 715static inline int ocfs2_extent_recs_per_inode(int blocksize)
660{ 716{
661 int size; 717 int size;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c034b5129c1e..0e2a1b45bf92 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -39,6 +39,7 @@
39#include <linux/parser.h> 39#include <linux/parser.h>
40#include <linux/crc32.h> 40#include <linux/crc32.h>
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h>
42 43
43#include <cluster/nodemanager.h> 44#include <cluster/nodemanager.h>
44 45
@@ -91,6 +92,7 @@ struct mount_options
91static int ocfs2_parse_options(struct super_block *sb, char *options, 92static int ocfs2_parse_options(struct super_block *sb, char *options,
92 struct mount_options *mopt, 93 struct mount_options *mopt,
93 int is_remount); 94 int is_remount);
95static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
94static void ocfs2_put_super(struct super_block *sb); 96static void ocfs2_put_super(struct super_block *sb);
95static int ocfs2_mount_volume(struct super_block *sb); 97static int ocfs2_mount_volume(struct super_block *sb);
96static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 98static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -105,7 +107,7 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
105 107
106static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); 108static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
107static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); 109static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
108static int ocfs2_release_system_inodes(struct ocfs2_super *osb); 110static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
109static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); 111static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
110static int ocfs2_check_volume(struct ocfs2_super *osb); 112static int ocfs2_check_volume(struct ocfs2_super *osb);
111static int ocfs2_verify_volume(struct ocfs2_dinode *di, 113static int ocfs2_verify_volume(struct ocfs2_dinode *di,
@@ -133,6 +135,7 @@ static const struct super_operations ocfs2_sops = {
133 .write_super = ocfs2_write_super, 135 .write_super = ocfs2_write_super,
134 .put_super = ocfs2_put_super, 136 .put_super = ocfs2_put_super,
135 .remount_fs = ocfs2_remount, 137 .remount_fs = ocfs2_remount,
138 .show_options = ocfs2_show_options,
136}; 139};
137 140
138enum { 141enum {
@@ -177,7 +180,7 @@ static void ocfs2_write_super(struct super_block *sb)
177 180
178static int ocfs2_sync_fs(struct super_block *sb, int wait) 181static int ocfs2_sync_fs(struct super_block *sb, int wait)
179{ 182{
180 int status = 0; 183 int status;
181 tid_t target; 184 tid_t target;
182 struct ocfs2_super *osb = OCFS2_SB(sb); 185 struct ocfs2_super *osb = OCFS2_SB(sb);
183 186
@@ -275,9 +278,9 @@ bail:
275 return status; 278 return status;
276} 279}
277 280
278static int ocfs2_release_system_inodes(struct ocfs2_super *osb) 281static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
279{ 282{
280 int status = 0, i; 283 int i;
281 struct inode *inode; 284 struct inode *inode;
282 285
283 mlog_entry_void(); 286 mlog_entry_void();
@@ -302,8 +305,7 @@ static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
302 osb->root_inode = NULL; 305 osb->root_inode = NULL;
303 } 306 }
304 307
305 mlog_exit(status); 308 mlog_exit(0);
306 return status;
307} 309}
308 310
309/* We're allocating fs objects, use GFP_NOFS */ 311/* We're allocating fs objects, use GFP_NOFS */
@@ -453,7 +455,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
453 struct buffer_head **bh, 455 struct buffer_head **bh,
454 int *sector_size) 456 int *sector_size)
455{ 457{
456 int status = 0, tmpstat; 458 int status, tmpstat;
457 struct ocfs1_vol_disk_hdr *hdr; 459 struct ocfs1_vol_disk_hdr *hdr;
458 struct ocfs2_dinode *di; 460 struct ocfs2_dinode *di;
459 int blksize; 461 int blksize;
@@ -830,6 +832,41 @@ bail:
830 return status; 832 return status;
831} 833}
832 834
835static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
836{
837 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
838 unsigned long opts = osb->s_mount_opt;
839
840 if (opts & OCFS2_MOUNT_HB_LOCAL)
841 seq_printf(s, ",_netdev,heartbeat=local");
842 else
843 seq_printf(s, ",heartbeat=none");
844
845 if (opts & OCFS2_MOUNT_NOINTR)
846 seq_printf(s, ",nointr");
847
848 if (opts & OCFS2_MOUNT_DATA_WRITEBACK)
849 seq_printf(s, ",data=writeback");
850 else
851 seq_printf(s, ",data=ordered");
852
853 if (opts & OCFS2_MOUNT_BARRIER)
854 seq_printf(s, ",barrier=1");
855
856 if (opts & OCFS2_MOUNT_ERRORS_PANIC)
857 seq_printf(s, ",errors=panic");
858 else
859 seq_printf(s, ",errors=remount-ro");
860
861 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
862 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
863
864 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
865 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
866
867 return 0;
868}
869
833static int __init ocfs2_init(void) 870static int __init ocfs2_init(void)
834{ 871{
835 int status; 872 int status;
@@ -1209,12 +1246,13 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1209 tmp = ocfs2_request_umount_vote(osb); 1246 tmp = ocfs2_request_umount_vote(osb);
1210 if (tmp < 0) 1247 if (tmp < 0)
1211 mlog_errno(tmp); 1248 mlog_errno(tmp);
1249 }
1212 1250
1213 if (osb->slot_num != OCFS2_INVALID_SLOT) 1251 if (osb->slot_num != OCFS2_INVALID_SLOT)
1214 ocfs2_put_slot(osb); 1252 ocfs2_put_slot(osb);
1215 1253
1254 if (osb->dlm)
1216 ocfs2_super_unlock(osb, 1); 1255 ocfs2_super_unlock(osb, 1);
1217 }
1218 1256
1219 ocfs2_release_system_inodes(osb); 1257 ocfs2_release_system_inodes(osb);
1220 1258
@@ -1275,7 +1313,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1275 struct buffer_head *bh, 1313 struct buffer_head *bh,
1276 int sector_size) 1314 int sector_size)
1277{ 1315{
1278 int status = 0; 1316 int status;
1279 int i, cbits, bbits; 1317 int i, cbits, bbits;
1280 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1318 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1281 struct inode *inode = NULL; 1319 struct inode *inode = NULL;
@@ -1596,7 +1634,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1596 1634
1597static int ocfs2_check_volume(struct ocfs2_super *osb) 1635static int ocfs2_check_volume(struct ocfs2_super *osb)
1598{ 1636{
1599 int status = 0; 1637 int status;
1600 int dirty; 1638 int dirty;
1601 int local; 1639 int local;
1602 struct ocfs2_dinode *local_alloc = NULL; /* only used if we 1640 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 5df6e35d09b1..fd2e846e3e6f 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -100,17 +100,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
100 char namebuf[40]; 100 char namebuf[40];
101 struct inode *inode = NULL; 101 struct inode *inode = NULL;
102 u64 blkno; 102 u64 blkno;
103 struct buffer_head *dirent_bh = NULL;
104 struct ocfs2_dir_entry *de = NULL;
105 int status = 0; 103 int status = 0;
106 104
107 ocfs2_sprintf_system_inode_name(namebuf, 105 ocfs2_sprintf_system_inode_name(namebuf,
108 sizeof(namebuf), 106 sizeof(namebuf),
109 type, slot); 107 type, slot);
110 108
111 status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf), 109 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
112 &blkno, osb->sys_root_inode, 110 strlen(namebuf), &blkno);
113 &dirent_bh, &de);
114 if (status < 0) { 111 if (status < 0) {
115 goto bail; 112 goto bail;
116 } 113 }
@@ -122,8 +119,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
122 goto bail; 119 goto bail;
123 } 120 }
124bail: 121bail:
125 if (dirent_bh) 122
126 brelse(dirent_bh);
127 return inode; 123 return inode;
128} 124}
129 125