aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/dlmglue.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/dlmglue.c')
-rw-r--r--fs/ocfs2/dlmglue.c546
1 files changed, 403 insertions, 143 deletions
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4e97dcceaf8f..3867244fb144 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -55,7 +55,6 @@
55#include "slot_map.h" 55#include "slot_map.h"
56#include "super.h" 56#include "super.h"
57#include "uptodate.h" 57#include "uptodate.h"
58#include "vote.h"
59 58
60#include "buffer_head_io.h" 59#include "buffer_head_io.h"
61 60
@@ -69,6 +68,7 @@ struct ocfs2_mask_waiter {
69 68
70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 69static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
71static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
72 72
73/* 73/*
74 * Return value from ->downconvert_worker functions. 74 * Return value from ->downconvert_worker functions.
@@ -153,10 +153,10 @@ struct ocfs2_lock_res_ops {
153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); 153 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
154 154
155 /* 155 /*
156 * Optionally called in the downconvert (or "vote") thread 156 * Optionally called in the downconvert thread after a
157 * after a successful downconvert. The lockres will not be 157 * successful downconvert. The lockres will not be referenced
158 * referenced after this callback is called, so it is safe to 158 * after this callback is called, so it is safe to free
159 * free memory, etc. 159 * memory, etc.
160 * 160 *
161 * The exact semantics of when this is called are controlled 161 * The exact semantics of when this is called are controlled
162 * by ->downconvert_worker() 162 * by ->downconvert_worker()
@@ -225,17 +225,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
225 .flags = 0, 225 .flags = 0,
226}; 226};
227 227
228static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 228static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
229 .get_osb = ocfs2_get_inode_osb, 229 .get_osb = ocfs2_get_inode_osb,
230 .check_downconvert = ocfs2_check_meta_downconvert, 230 .check_downconvert = ocfs2_check_meta_downconvert,
231 .set_lvb = ocfs2_set_meta_lvb, 231 .set_lvb = ocfs2_set_meta_lvb,
232 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
233};
234
235static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
236 .get_osb = ocfs2_get_inode_osb,
237 .downconvert_worker = ocfs2_data_convert_worker, 232 .downconvert_worker = ocfs2_data_convert_worker,
238 .flags = 0, 233 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
239}; 234};
240 235
241static struct ocfs2_lock_res_ops ocfs2_super_lops = { 236static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -258,10 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
258 .flags = 0, 253 .flags = 0,
259}; 254};
260 255
256static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
257 .get_osb = ocfs2_get_file_osb,
258 .flags = 0,
259};
260
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 262{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 263 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
264 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
265 lockres->l_type == OCFS2_LOCK_TYPE_RW || 264 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
266 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 265 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
267} 266}
@@ -310,12 +309,24 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
310 "resource %s: %s\n", dlm_errname(_stat), _func, \ 309 "resource %s: %s\n", dlm_errname(_stat), _func, \
311 _lockres->l_name, dlm_errmsg(_stat)); \ 310 _lockres->l_name, dlm_errmsg(_stat)); \
312} while (0) 311} while (0)
313static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 312static int ocfs2_downconvert_thread(void *arg);
314 struct ocfs2_lock_res *lockres); 313static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
315static int ocfs2_meta_lock_update(struct inode *inode, 314 struct ocfs2_lock_res *lockres);
315static int ocfs2_inode_lock_update(struct inode *inode,
316 struct buffer_head **bh); 316 struct buffer_head **bh);
317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 317static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
318static inline int ocfs2_highest_compat_lock_level(int level); 318static inline int ocfs2_highest_compat_lock_level(int level);
319static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
320 int new_level);
321static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
322 struct ocfs2_lock_res *lockres,
323 int new_level,
324 int lvb);
325static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
326 struct ocfs2_lock_res *lockres);
327static int ocfs2_cancel_convert(struct ocfs2_super *osb,
328 struct ocfs2_lock_res *lockres);
329
319 330
320static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 331static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
321 u64 blkno, 332 u64 blkno,
@@ -402,10 +413,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
402 ops = &ocfs2_inode_rw_lops; 413 ops = &ocfs2_inode_rw_lops;
403 break; 414 break;
404 case OCFS2_LOCK_TYPE_META: 415 case OCFS2_LOCK_TYPE_META:
405 ops = &ocfs2_inode_meta_lops; 416 ops = &ocfs2_inode_inode_lops;
406 break;
407 case OCFS2_LOCK_TYPE_DATA:
408 ops = &ocfs2_inode_data_lops;
409 break; 417 break;
410 case OCFS2_LOCK_TYPE_OPEN: 418 case OCFS2_LOCK_TYPE_OPEN:
411 ops = &ocfs2_inode_open_lops; 419 ops = &ocfs2_inode_open_lops;
@@ -428,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
428 return OCFS2_SB(inode->i_sb); 436 return OCFS2_SB(inode->i_sb);
429} 437}
430 438
439static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
440{
441 struct ocfs2_file_private *fp = lockres->l_priv;
442
443 return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
444}
445
431static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) 446static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
432{ 447{
433 __be64 inode_blkno_be; 448 __be64 inode_blkno_be;
@@ -508,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
508 &ocfs2_rename_lops, osb); 523 &ocfs2_rename_lops, osb);
509} 524}
510 525
526void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
527 struct ocfs2_file_private *fp)
528{
529 struct inode *inode = fp->fp_file->f_mapping->host;
530 struct ocfs2_inode_info *oi = OCFS2_I(inode);
531
532 ocfs2_lock_res_init_once(lockres);
533 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
534 inode->i_generation, lockres->l_name);
535 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
536 OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
537 fp);
538 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
539}
540
511void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 541void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
512{ 542{
513 mlog_entry_void(); 543 mlog_entry_void();
@@ -724,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
724 lockres->l_name, level, lockres->l_level, 754 lockres->l_name, level, lockres->l_level,
725 ocfs2_lock_type_string(lockres->l_type)); 755 ocfs2_lock_type_string(lockres->l_type));
726 756
757 /*
758 * We can skip the bast for locks which don't enable caching -
759 * they'll be dropped at the earliest possible time anyway.
760 */
761 if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
762 return;
763
727 spin_lock_irqsave(&lockres->l_lock, flags); 764 spin_lock_irqsave(&lockres->l_lock, flags);
728 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 765 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
729 if (needs_downconvert) 766 if (needs_downconvert)
@@ -732,7 +769,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
732 769
733 wake_up(&lockres->l_event); 770 wake_up(&lockres->l_event);
734 771
735 ocfs2_kick_vote_thread(osb); 772 ocfs2_wake_downconvert_thread(osb);
736} 773}
737 774
738static void ocfs2_locking_ast(void *opaque) 775static void ocfs2_locking_ast(void *opaque)
@@ -935,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
935 972
936} 973}
937 974
975static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
976 struct ocfs2_lock_res *lockres)
977{
978 int ret;
979
980 ret = wait_for_completion_interruptible(&mw->mw_complete);
981 if (ret)
982 lockres_remove_mask_waiter(lockres, mw);
983 else
984 ret = mw->mw_status;
985 /* Re-arm the completion in case we want to wait on it again */
986 INIT_COMPLETION(mw->mw_complete);
987 return ret;
988}
989
938static int ocfs2_cluster_lock(struct ocfs2_super *osb, 990static int ocfs2_cluster_lock(struct ocfs2_super *osb,
939 struct ocfs2_lock_res *lockres, 991 struct ocfs2_lock_res *lockres,
940 int level, 992 int level,
@@ -1089,7 +1141,7 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1089 mlog_entry_void(); 1141 mlog_entry_void();
1090 spin_lock_irqsave(&lockres->l_lock, flags); 1142 spin_lock_irqsave(&lockres->l_lock, flags);
1091 ocfs2_dec_holders(lockres, level); 1143 ocfs2_dec_holders(lockres, level);
1092 ocfs2_vote_on_unlock(osb, lockres); 1144 ocfs2_downconvert_on_unlock(osb, lockres);
1093 spin_unlock_irqrestore(&lockres->l_lock, flags); 1145 spin_unlock_irqrestore(&lockres->l_lock, flags);
1094 mlog_exit_void(); 1146 mlog_exit_void();
1095} 1147}
@@ -1147,13 +1199,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1147 * We don't want to use LKM_LOCAL on a meta data lock as they 1199 * We don't want to use LKM_LOCAL on a meta data lock as they
1148 * don't use a generation in their lock names. 1200 * don't use a generation in their lock names.
1149 */ 1201 */
1150 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); 1202 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
1151 if (ret) {
1152 mlog_errno(ret);
1153 goto bail;
1154 }
1155
1156 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1157 if (ret) { 1203 if (ret) {
1158 mlog_errno(ret); 1204 mlog_errno(ret);
1159 goto bail; 1205 goto bail;
@@ -1311,76 +1357,221 @@ out:
1311 mlog_exit_void(); 1357 mlog_exit_void();
1312} 1358}
1313 1359
1314int ocfs2_data_lock_full(struct inode *inode, 1360static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
1315 int write, 1361 int level)
1316 int arg_flags)
1317{ 1362{
1318 int status = 0, level; 1363 int ret;
1319 struct ocfs2_lock_res *lockres; 1364 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1320 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 unsigned long flags;
1366 struct ocfs2_mask_waiter mw;
1321 1367
1322 BUG_ON(!inode); 1368 ocfs2_init_mask_waiter(&mw);
1323 1369
1324 mlog_entry_void(); 1370retry_cancel:
1371 spin_lock_irqsave(&lockres->l_lock, flags);
1372 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
1373 ret = ocfs2_prepare_cancel_convert(osb, lockres);
1374 if (ret) {
1375 spin_unlock_irqrestore(&lockres->l_lock, flags);
1376 ret = ocfs2_cancel_convert(osb, lockres);
1377 if (ret < 0) {
1378 mlog_errno(ret);
1379 goto out;
1380 }
1381 goto retry_cancel;
1382 }
1383 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1384 spin_unlock_irqrestore(&lockres->l_lock, flags);
1325 1385
1326 mlog(0, "inode %llu take %s DATA lock\n", 1386 ocfs2_wait_for_mask(&mw);
1327 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1387 goto retry_cancel;
1328 write ? "EXMODE" : "PRMODE"); 1388 }
1329 1389
1330 /* We'll allow faking a readonly data lock for 1390 ret = -ERESTARTSYS;
1331 * rodevices. */ 1391 /*
1332 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { 1392 * We may still have gotten the lock, in which case there's no
1333 if (write) { 1393 * point to restarting the syscall.
1334 status = -EROFS; 1394 */
1335 mlog_errno(status); 1395 if (lockres->l_level == level)
1396 ret = 0;
1397
1398 mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
1399 lockres->l_flags, lockres->l_level, lockres->l_action);
1400
1401 spin_unlock_irqrestore(&lockres->l_lock, flags);
1402
1403out:
1404 return ret;
1405}
1406
1407/*
1408 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1409 * flock() calls. The locking approach this requires is sufficiently
1410 * different from all other cluster lock types that we implement a
1411 * separate path to the "low-level" dlm calls. In particular:
1412 *
1413 * - No optimization of lock levels is done - we take at exactly
1414 * what's been requested.
1415 *
1416 * - No lock caching is employed. We immediately downconvert to
1417 * no-lock at unlock time. This also means flock locks never go on
1418 * the blocking list.
1419 *
1420 * - Since userspace can trivially deadlock itself with flock, we make
1421 * sure to allow cancellation of a misbehaving application's flock()
1422 * request.
1423 *
1424 * - Access to any flock lockres doesn't require concurrency, so we
1425 * can simplify the code by requiring the caller to guarantee
1426 * serialization of dlmglue flock calls.
1427 */
1428int ocfs2_file_lock(struct file *file, int ex, int trylock)
1429{
1430 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
1431 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
1432 unsigned long flags;
1433 struct ocfs2_file_private *fp = file->private_data;
1434 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1435 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1436 struct ocfs2_mask_waiter mw;
1437
1438 ocfs2_init_mask_waiter(&mw);
1439
1440 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1441 (lockres->l_level > LKM_NLMODE)) {
1442 mlog(ML_ERROR,
1443 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1444 "level: %u\n", lockres->l_name, lockres->l_flags,
1445 lockres->l_level);
1446 return -EINVAL;
1447 }
1448
1449 spin_lock_irqsave(&lockres->l_lock, flags);
1450 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1451 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1452 spin_unlock_irqrestore(&lockres->l_lock, flags);
1453
1454 /*
1455 * Get the lock at NLMODE to start - that way we
1456 * can cancel the upconvert request if need be.
1457 */
1458 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
1459 if (ret < 0) {
1460 mlog_errno(ret);
1461 goto out;
1336 } 1462 }
1337 goto out; 1463
1464 ret = ocfs2_wait_for_mask(&mw);
1465 if (ret) {
1466 mlog_errno(ret);
1467 goto out;
1468 }
1469 spin_lock_irqsave(&lockres->l_lock, flags);
1338 } 1470 }
1339 1471
1340 if (ocfs2_mount_local(osb)) 1472 lockres->l_action = OCFS2_AST_CONVERT;
1341 goto out; 1473 lkm_flags |= LKM_CONVERT;
1474 lockres->l_requested = level;
1475 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1342 1476
1343 lockres = &OCFS2_I(inode)->ip_data_lockres; 1477 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1478 spin_unlock_irqrestore(&lockres->l_lock, flags);
1344 1479
1345 level = write ? LKM_EXMODE : LKM_PRMODE; 1480 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
1481 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1482 ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
1483 if (ret != DLM_NORMAL) {
1484 if (trylock && ret == DLM_NOTQUEUED)
1485 ret = -EAGAIN;
1486 else {
1487 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1488 ret = -EINVAL;
1489 }
1346 1490
1347 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 1491 ocfs2_recover_from_dlm_error(lockres, 1);
1348 0, arg_flags); 1492 lockres_remove_mask_waiter(lockres, &mw);
1349 if (status < 0 && status != -EAGAIN) 1493 goto out;
1350 mlog_errno(status); 1494 }
1495
1496 ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
1497 if (ret == -ERESTARTSYS) {
1498 /*
1499 * Userspace can cause deadlock itself with
1500 * flock(). Current behavior locally is to allow the
1501 * deadlock, but abort the system call if a signal is
1502 * received. We follow this example, otherwise a
1503 * poorly written program could sit in kernel until
1504 * reboot.
1505 *
1506 * Handling this is a bit more complicated for Ocfs2
1507 * though. We can't exit this function with an
1508 * outstanding lock request, so a cancel convert is
1509 * required. We intentionally overwrite 'ret' - if the
1510 * cancel fails and the lock was granted, it's easier
1511 * to just bubble success back up to the user.
1512 */
1513 ret = ocfs2_flock_handle_signal(lockres, level);
1514 }
1351 1515
1352out: 1516out:
1353 mlog_exit(status); 1517
1354 return status; 1518 mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
1519 lockres->l_name, ex, trylock, ret);
1520 return ret;
1355} 1521}
1356 1522
1357/* see ocfs2_meta_lock_with_page() */ 1523void ocfs2_file_unlock(struct file *file)
1358int ocfs2_data_lock_with_page(struct inode *inode,
1359 int write,
1360 struct page *page)
1361{ 1524{
1362 int ret; 1525 int ret;
1526 unsigned long flags;
1527 struct ocfs2_file_private *fp = file->private_data;
1528 struct ocfs2_lock_res *lockres = &fp->fp_flock;
1529 struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
1530 struct ocfs2_mask_waiter mw;
1363 1531
1364 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); 1532 ocfs2_init_mask_waiter(&mw);
1365 if (ret == -EAGAIN) { 1533
1366 unlock_page(page); 1534 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1367 if (ocfs2_data_lock(inode, write) == 0) 1535 return;
1368 ocfs2_data_unlock(inode, write); 1536
1369 ret = AOP_TRUNCATED_PAGE; 1537 if (lockres->l_level == LKM_NLMODE)
1538 return;
1539
1540 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
1541 lockres->l_name, lockres->l_flags, lockres->l_level,
1542 lockres->l_action);
1543
1544 spin_lock_irqsave(&lockres->l_lock, flags);
1545 /*
1546 * Fake a blocking ast for the downconvert code.
1547 */
1548 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1549 lockres->l_blocking = LKM_EXMODE;
1550
1551 ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1552 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1553 spin_unlock_irqrestore(&lockres->l_lock, flags);
1554
1555 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
1556 if (ret) {
1557 mlog_errno(ret);
1558 return;
1370 } 1559 }
1371 1560
1372 return ret; 1561 ret = ocfs2_wait_for_mask(&mw);
1562 if (ret)
1563 mlog_errno(ret);
1373} 1564}
1374 1565
1375static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 1566static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1376 struct ocfs2_lock_res *lockres) 1567 struct ocfs2_lock_res *lockres)
1377{ 1568{
1378 int kick = 0; 1569 int kick = 0;
1379 1570
1380 mlog_entry_void(); 1571 mlog_entry_void();
1381 1572
1382 /* If we know that another node is waiting on our lock, kick 1573 /* If we know that another node is waiting on our lock, kick
1383 * the vote thread pre-emptively when we reach a release 1574 * the downconvert thread pre-emptively when we reach a release
1384 * condition. */ 1575 * condition. */
1385 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1576 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1386 switch(lockres->l_blocking) { 1577 switch(lockres->l_blocking) {
@@ -1398,27 +1589,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1398 } 1589 }
1399 1590
1400 if (kick) 1591 if (kick)
1401 ocfs2_kick_vote_thread(osb); 1592 ocfs2_wake_downconvert_thread(osb);
1402
1403 mlog_exit_void();
1404}
1405
1406void ocfs2_data_unlock(struct inode *inode,
1407 int write)
1408{
1409 int level = write ? LKM_EXMODE : LKM_PRMODE;
1410 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1411 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1412
1413 mlog_entry_void();
1414
1415 mlog(0, "inode %llu drop %s DATA lock\n",
1416 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1417 write ? "EXMODE" : "PRMODE");
1418
1419 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1420 !ocfs2_mount_local(osb))
1421 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1422 1593
1423 mlog_exit_void(); 1594 mlog_exit_void();
1424} 1595}
@@ -1442,11 +1613,11 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
1442 1613
1443/* Call this with the lockres locked. I am reasonably sure we don't 1614/* Call this with the lockres locked. I am reasonably sure we don't
1444 * need ip_lock in this function as anyone who would be changing those 1615 * need ip_lock in this function as anyone who would be changing those
1445 * values is supposed to be blocked in ocfs2_meta_lock right now. */ 1616 * values is supposed to be blocked in ocfs2_inode_lock right now. */
1446static void __ocfs2_stuff_meta_lvb(struct inode *inode) 1617static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1447{ 1618{
1448 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1619 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1449 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1620 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1450 struct ocfs2_meta_lvb *lvb; 1621 struct ocfs2_meta_lvb *lvb;
1451 1622
1452 mlog_entry_void(); 1623 mlog_entry_void();
@@ -1496,7 +1667,7 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
1496static void ocfs2_refresh_inode_from_lvb(struct inode *inode) 1667static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1497{ 1668{
1498 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1669 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1499 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1670 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1500 struct ocfs2_meta_lvb *lvb; 1671 struct ocfs2_meta_lvb *lvb;
1501 1672
1502 mlog_entry_void(); 1673 mlog_entry_void();
@@ -1604,12 +1775,12 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
1604} 1775}
1605 1776
1606/* may or may not return a bh if it went to disk. */ 1777/* may or may not return a bh if it went to disk. */
1607static int ocfs2_meta_lock_update(struct inode *inode, 1778static int ocfs2_inode_lock_update(struct inode *inode,
1608 struct buffer_head **bh) 1779 struct buffer_head **bh)
1609{ 1780{
1610 int status = 0; 1781 int status = 0;
1611 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1782 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1612 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; 1783 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
1613 struct ocfs2_dinode *fe; 1784 struct ocfs2_dinode *fe;
1614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1615 1786
@@ -1721,7 +1892,7 @@ static int ocfs2_assign_bh(struct inode *inode,
1721 * returns < 0 error if the callback will never be called, otherwise 1892 * returns < 0 error if the callback will never be called, otherwise
1722 * the result of the lock will be communicated via the callback. 1893 * the result of the lock will be communicated via the callback.
1723 */ 1894 */
1724int ocfs2_meta_lock_full(struct inode *inode, 1895int ocfs2_inode_lock_full(struct inode *inode,
1725 struct buffer_head **ret_bh, 1896 struct buffer_head **ret_bh,
1726 int ex, 1897 int ex,
1727 int arg_flags) 1898 int arg_flags)
@@ -1756,7 +1927,7 @@ int ocfs2_meta_lock_full(struct inode *inode,
1756 wait_event(osb->recovery_event, 1927 wait_event(osb->recovery_event,
1757 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1928 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1758 1929
1759 lockres = &OCFS2_I(inode)->ip_meta_lockres; 1930 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1760 level = ex ? LKM_EXMODE : LKM_PRMODE; 1931 level = ex ? LKM_EXMODE : LKM_PRMODE;
1761 dlm_flags = 0; 1932 dlm_flags = 0;
1762 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 1933 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
@@ -1795,11 +1966,11 @@ local:
1795 } 1966 }
1796 1967
1797 /* This is fun. The caller may want a bh back, or it may 1968 /* This is fun. The caller may want a bh back, or it may
1798 * not. ocfs2_meta_lock_update definitely wants one in, but 1969 * not. ocfs2_inode_lock_update definitely wants one in, but
1799 * may or may not read one, depending on what's in the 1970 * may or may not read one, depending on what's in the
1800 * LVB. The result of all of this is that we've *only* gone to 1971 * LVB. The result of all of this is that we've *only* gone to
1801 * disk if we have to, so the complexity is worthwhile. */ 1972 * disk if we have to, so the complexity is worthwhile. */
1802 status = ocfs2_meta_lock_update(inode, &local_bh); 1973 status = ocfs2_inode_lock_update(inode, &local_bh);
1803 if (status < 0) { 1974 if (status < 0) {
1804 if (status != -ENOENT) 1975 if (status != -ENOENT)
1805 mlog_errno(status); 1976 mlog_errno(status);
@@ -1821,7 +1992,7 @@ bail:
1821 *ret_bh = NULL; 1992 *ret_bh = NULL;
1822 } 1993 }
1823 if (acquired) 1994 if (acquired)
1824 ocfs2_meta_unlock(inode, ex); 1995 ocfs2_inode_unlock(inode, ex);
1825 } 1996 }
1826 1997
1827 if (local_bh) 1998 if (local_bh)
@@ -1832,19 +2003,20 @@ bail:
1832} 2003}
1833 2004
1834/* 2005/*
1835 * This is working around a lock inversion between tasks acquiring DLM locks 2006 * This is working around a lock inversion between tasks acquiring DLM
1836 * while holding a page lock and the vote thread which blocks dlm lock acquiry 2007 * locks while holding a page lock and the downconvert thread which
1837 * while acquiring page locks. 2008 * blocks dlm lock acquiry while acquiring page locks.
1838 * 2009 *
1839 * ** These _with_page variants are only intended to be called from aop 2010 * ** These _with_page variants are only intended to be called from aop
1840 * methods that hold page locks and return a very specific *positive* error 2011 * methods that hold page locks and return a very specific *positive* error
1841 * code that aop methods pass up to the VFS -- test for errors with != 0. ** 2012 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1842 * 2013 *
1843 * The DLM is called such that it returns -EAGAIN if it would have blocked 2014 * The DLM is called such that it returns -EAGAIN if it would have
1844 * waiting for the vote thread. In that case we unlock our page so the vote 2015 * blocked waiting for the downconvert thread. In that case we unlock
1845 * thread can make progress. Once we've done this we have to return 2016 * our page so the downconvert thread can make progress. Once we've
1846 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up 2017 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
1847 * into the VFS who will then immediately retry the aop call. 2018 * that called us can bubble that back up into the VFS who will then
2019 * immediately retry the aop call.
1848 * 2020 *
1849 * We do a blocking lock and immediate unlock before returning, though, so that 2021 * We do a blocking lock and immediate unlock before returning, though, so that
1850 * the lock has a great chance of being cached on this node by the time the VFS 2022 * the lock has a great chance of being cached on this node by the time the VFS
@@ -1852,32 +2024,32 @@ bail:
1852 * ping locks back and forth, but that's a risk we're willing to take to avoid 2024 * ping locks back and forth, but that's a risk we're willing to take to avoid
1853 * the lock inversion simply. 2025 * the lock inversion simply.
1854 */ 2026 */
1855int ocfs2_meta_lock_with_page(struct inode *inode, 2027int ocfs2_inode_lock_with_page(struct inode *inode,
1856 struct buffer_head **ret_bh, 2028 struct buffer_head **ret_bh,
1857 int ex, 2029 int ex,
1858 struct page *page) 2030 struct page *page)
1859{ 2031{
1860 int ret; 2032 int ret;
1861 2033
1862 ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); 2034 ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
1863 if (ret == -EAGAIN) { 2035 if (ret == -EAGAIN) {
1864 unlock_page(page); 2036 unlock_page(page);
1865 if (ocfs2_meta_lock(inode, ret_bh, ex) == 0) 2037 if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
1866 ocfs2_meta_unlock(inode, ex); 2038 ocfs2_inode_unlock(inode, ex);
1867 ret = AOP_TRUNCATED_PAGE; 2039 ret = AOP_TRUNCATED_PAGE;
1868 } 2040 }
1869 2041
1870 return ret; 2042 return ret;
1871} 2043}
1872 2044
1873int ocfs2_meta_lock_atime(struct inode *inode, 2045int ocfs2_inode_lock_atime(struct inode *inode,
1874 struct vfsmount *vfsmnt, 2046 struct vfsmount *vfsmnt,
1875 int *level) 2047 int *level)
1876{ 2048{
1877 int ret; 2049 int ret;
1878 2050
1879 mlog_entry_void(); 2051 mlog_entry_void();
1880 ret = ocfs2_meta_lock(inode, NULL, 0); 2052 ret = ocfs2_inode_lock(inode, NULL, 0);
1881 if (ret < 0) { 2053 if (ret < 0) {
1882 mlog_errno(ret); 2054 mlog_errno(ret);
1883 return ret; 2055 return ret;
@@ -1890,8 +2062,8 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1890 if (ocfs2_should_update_atime(inode, vfsmnt)) { 2062 if (ocfs2_should_update_atime(inode, vfsmnt)) {
1891 struct buffer_head *bh = NULL; 2063 struct buffer_head *bh = NULL;
1892 2064
1893 ocfs2_meta_unlock(inode, 0); 2065 ocfs2_inode_unlock(inode, 0);
1894 ret = ocfs2_meta_lock(inode, &bh, 1); 2066 ret = ocfs2_inode_lock(inode, &bh, 1);
1895 if (ret < 0) { 2067 if (ret < 0) {
1896 mlog_errno(ret); 2068 mlog_errno(ret);
1897 return ret; 2069 return ret;
@@ -1908,11 +2080,11 @@ int ocfs2_meta_lock_atime(struct inode *inode,
1908 return ret; 2080 return ret;
1909} 2081}
1910 2082
1911void ocfs2_meta_unlock(struct inode *inode, 2083void ocfs2_inode_unlock(struct inode *inode,
1912 int ex) 2084 int ex)
1913{ 2085{
1914 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2086 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1915 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; 2087 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
1916 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1917 2089
1918 mlog_entry_void(); 2090 mlog_entry_void();
@@ -2320,11 +2492,11 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2320 goto bail; 2492 goto bail;
2321 } 2493 }
2322 2494
2323 /* launch vote thread */ 2495 /* launch downconvert thread */
2324 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote"); 2496 osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
2325 if (IS_ERR(osb->vote_task)) { 2497 if (IS_ERR(osb->dc_task)) {
2326 status = PTR_ERR(osb->vote_task); 2498 status = PTR_ERR(osb->dc_task);
2327 osb->vote_task = NULL; 2499 osb->dc_task = NULL;
2328 mlog_errno(status); 2500 mlog_errno(status);
2329 goto bail; 2501 goto bail;
2330 } 2502 }
@@ -2353,8 +2525,8 @@ local:
2353bail: 2525bail:
2354 if (status < 0) { 2526 if (status < 0) {
2355 ocfs2_dlm_shutdown_debug(osb); 2527 ocfs2_dlm_shutdown_debug(osb);
2356 if (osb->vote_task) 2528 if (osb->dc_task)
2357 kthread_stop(osb->vote_task); 2529 kthread_stop(osb->dc_task);
2358 } 2530 }
2359 2531
2360 mlog_exit(status); 2532 mlog_exit(status);
@@ -2369,9 +2541,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2369 2541
2370 ocfs2_drop_osb_locks(osb); 2542 ocfs2_drop_osb_locks(osb);
2371 2543
2372 if (osb->vote_task) { 2544 if (osb->dc_task) {
2373 kthread_stop(osb->vote_task); 2545 kthread_stop(osb->dc_task);
2374 osb->vote_task = NULL; 2546 osb->dc_task = NULL;
2375 } 2547 }
2376 2548
2377 ocfs2_lock_res_free(&osb->osb_super_lockres); 2549 ocfs2_lock_res_free(&osb->osb_super_lockres);
@@ -2527,7 +2699,7 @@ out:
2527 2699
2528/* Mark the lockres as being dropped. It will no longer be 2700/* Mark the lockres as being dropped. It will no longer be
2529 * queued if blocking, but we still may have to wait on it 2701 * queued if blocking, but we still may have to wait on it
2530 * being dequeued from the vote thread before we can consider 2702 * being dequeued from the downconvert thread before we can consider
2531 * it safe to drop. 2703 * it safe to drop.
2532 * 2704 *
2533 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 2705 * You can *not* attempt to call cluster_lock on this lockres anymore. */
@@ -2590,14 +2762,7 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2590 status = err; 2762 status = err;
2591 2763
2592 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2764 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2593 &OCFS2_I(inode)->ip_data_lockres); 2765 &OCFS2_I(inode)->ip_inode_lockres);
2594 if (err < 0)
2595 mlog_errno(err);
2596 if (err < 0 && !status)
2597 status = err;
2598
2599 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2600 &OCFS2_I(inode)->ip_meta_lockres);
2601 if (err < 0) 2766 if (err < 0)
2602 mlog_errno(err); 2767 mlog_errno(err);
2603 if (err < 0 && !status) 2768 if (err < 0 && !status)
@@ -2850,6 +3015,9 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2850 inode = ocfs2_lock_res_inode(lockres); 3015 inode = ocfs2_lock_res_inode(lockres);
2851 mapping = inode->i_mapping; 3016 mapping = inode->i_mapping;
2852 3017
3018 if (S_ISREG(inode->i_mode))
3019 goto out;
3020
2853 /* 3021 /*
2854 * We need this before the filemap_fdatawrite() so that it can 3022 * We need this before the filemap_fdatawrite() so that it can
2855 * transfer the dirty bit from the PTE to the 3023 * transfer the dirty bit from the PTE to the
@@ -2875,6 +3043,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2875 filemap_fdatawait(mapping); 3043 filemap_fdatawait(mapping);
2876 } 3044 }
2877 3045
3046out:
2878 return UNBLOCK_CONTINUE; 3047 return UNBLOCK_CONTINUE;
2879} 3048}
2880 3049
@@ -2903,7 +3072,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2903 3072
2904/* 3073/*
2905 * Does the final reference drop on our dentry lock. Right now this 3074 * Does the final reference drop on our dentry lock. Right now this
2906 * happens in the vote thread, but we could choose to simplify the 3075 * happens in the downconvert thread, but we could choose to simplify the
2907 * dlmglue API and push these off to the ocfs2_wq in the future. 3076 * dlmglue API and push these off to the ocfs2_wq in the future.
2908 */ 3077 */
2909static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 3078static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -3042,7 +3211,7 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3042 mlog(0, "lockres %s blocked.\n", lockres->l_name); 3211 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3043 3212
3044 /* Detect whether a lock has been marked as going away while 3213 /* Detect whether a lock has been marked as going away while
3045 * the vote thread was processing other things. A lock can 3214 * the downconvert thread was processing other things. A lock can
3046 * still be marked with OCFS2_LOCK_FREEING after this check, 3215 * still be marked with OCFS2_LOCK_FREEING after this check,
3047 * but short circuiting here will still save us some 3216 * but short circuiting here will still save us some
3048 * performance. */ 3217 * performance. */
@@ -3091,13 +3260,104 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3091 3260
3092 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); 3261 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3093 3262
3094 spin_lock(&osb->vote_task_lock); 3263 spin_lock(&osb->dc_task_lock);
3095 if (list_empty(&lockres->l_blocked_list)) { 3264 if (list_empty(&lockres->l_blocked_list)) {
3096 list_add_tail(&lockres->l_blocked_list, 3265 list_add_tail(&lockres->l_blocked_list,
3097 &osb->blocked_lock_list); 3266 &osb->blocked_lock_list);
3098 osb->blocked_lock_count++; 3267 osb->blocked_lock_count++;
3099 } 3268 }
3100 spin_unlock(&osb->vote_task_lock); 3269 spin_unlock(&osb->dc_task_lock);
3270
3271 mlog_exit_void();
3272}
3273
3274static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
3275{
3276 unsigned long processed;
3277 struct ocfs2_lock_res *lockres;
3278
3279 mlog_entry_void();
3280
3281 spin_lock(&osb->dc_task_lock);
3282 /* grab this early so we know to try again if a state change and
3283 * wake happens part-way through our work */
3284 osb->dc_work_sequence = osb->dc_wake_sequence;
3285
3286 processed = osb->blocked_lock_count;
3287 while (processed) {
3288 BUG_ON(list_empty(&osb->blocked_lock_list));
3289
3290 lockres = list_entry(osb->blocked_lock_list.next,
3291 struct ocfs2_lock_res, l_blocked_list);
3292 list_del_init(&lockres->l_blocked_list);
3293 osb->blocked_lock_count--;
3294 spin_unlock(&osb->dc_task_lock);
3295
3296 BUG_ON(!processed);
3297 processed--;
3298
3299 ocfs2_process_blocked_lock(osb, lockres);
3300
3301 spin_lock(&osb->dc_task_lock);
3302 }
3303 spin_unlock(&osb->dc_task_lock);
3101 3304
3102 mlog_exit_void(); 3305 mlog_exit_void();
3103} 3306}
3307
3308static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
3309{
3310 int empty = 0;
3311
3312 spin_lock(&osb->dc_task_lock);
3313 if (list_empty(&osb->blocked_lock_list))
3314 empty = 1;
3315
3316 spin_unlock(&osb->dc_task_lock);
3317 return empty;
3318}
3319
3320static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
3321{
3322 int should_wake = 0;
3323
3324 spin_lock(&osb->dc_task_lock);
3325 if (osb->dc_work_sequence != osb->dc_wake_sequence)
3326 should_wake = 1;
3327 spin_unlock(&osb->dc_task_lock);
3328
3329 return should_wake;
3330}
3331
3332int ocfs2_downconvert_thread(void *arg)
3333{
3334 int status = 0;
3335 struct ocfs2_super *osb = arg;
3336
3337 /* only quit once we've been asked to stop and there is no more
3338 * work available */
3339 while (!(kthread_should_stop() &&
3340 ocfs2_downconvert_thread_lists_empty(osb))) {
3341
3342 wait_event_interruptible(osb->dc_event,
3343 ocfs2_downconvert_thread_should_wake(osb) ||
3344 kthread_should_stop());
3345
3346 mlog(0, "downconvert_thread: awoken\n");
3347
3348 ocfs2_downconvert_thread_do_work(osb);
3349 }
3350
3351 osb->dc_task = NULL;
3352 return status;
3353}
3354
3355void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
3356{
3357 spin_lock(&osb->dc_task_lock);
3358 /* make sure the voting thread gets a swipe at whatever changes
3359 * the caller may have made to the voting state */
3360 osb->dc_wake_sequence++;
3361 spin_unlock(&osb->dc_task_lock);
3362 wake_up(&osb->dc_event);
3363}