author    Brian Foster <bfoster@redhat.com>    2015-05-28 19:18:32 -0400
committer Dave Chinner <david@fromorbit.com>   2015-05-28 19:18:32 -0400
commit    56d1115c9bc7853e143f59fb5976cf3de609f657 (patch)
tree      88f085d3e92fb3861cc7c31b3152ae663b16f0a7
parent    4148c347a42a2aba31f6f4d9a31c647c2d475697 (diff)
xfs: allocate sparse inode chunks on full chunk allocation failure
xfs_ialloc_ag_alloc() makes several attempts to allocate a full inode chunk. If all else fails, reduce the allocation to the sparse length and alignment and attempt to allocate a sparse inode chunk.

If sparse chunk allocation succeeds, check whether an inobt record already exists that can track the chunk. If so, inherit and update the existing record. Otherwise, insert a new record for the sparse chunk.

Create helpers to align sparse chunk inode records and insert or update existing records in the inode btrees. The xfs_inobt_insert_sprec() helper implements the merge or update semantics required for sparse inode records with respect to both the inobt and finobt. To update the inobt, either insert a new record or merge with an existing record. To update the finobt, use the updated inobt record to either insert or replace an existing record.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r--   fs/xfs/libxfs/xfs_ialloc.c        330
-rw-r--r--   fs/xfs/libxfs/xfs_ialloc_btree.c   31
-rw-r--r--   fs/xfs/libxfs/xfs_ialloc_btree.h    7
-rw-r--r--   fs/xfs/xfs_trace.h                 47
4 files changed, 401 insertions(+), 14 deletions(-)
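
The merge semantics described in the commit message reduce to bitmap arithmetic: ir_holemask and ir_free use 0 bits for allocated inodes, so two non-overlapping sparse records at the same startino combine with a bitwise AND of the masks while the inode counts sum. The standalone sketch below illustrates this under simplified assumptions; the demo_irec struct and demo_* helpers are illustrative stand-ins, not the kernel API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INODES_PER_CHUNK	64	/* stand-in for XFS_INODES_PER_CHUNK */

/* simplified stand-in for struct xfs_inobt_rec_incore */
struct demo_irec {
	uint32_t	startino;	/* first inode covered by the record */
	uint16_t	holemask;	/* 1 bits = missing (sparse) regions */
	uint8_t		count;		/* inodes physically allocated */
	uint8_t		freecount;	/* of those, how many are free */
	uint64_t	free;		/* 1 bits = free; hole bits stay 1 */
};

/*
 * Simplified mirror of __xfs_inobt_can_merge(): same startino, combined
 * count within capacity, and no overlap of the physically present ranges
 * (present == clear holemask bits, so invert before intersecting).
 */
static bool demo_can_merge(const struct demo_irec *t, const struct demo_irec *s)
{
	if (t->startino != s->startino)
		return false;
	if (t->count + s->count > INODES_PER_CHUNK)
		return false;
	return (~t->holemask & ~s->holemask & 0xffff) == 0;
}

/* mirror of __xfs_inobt_rec_merge(): counts add, 0-is-allocated masks AND */
static void demo_rec_merge(struct demo_irec *t, const struct demo_irec *s)
{
	t->count += s->count;
	t->freecount += s->freecount;
	t->holemask &= s->holemask;
	t->free &= s->free;
}

int main(void)
{
	/*
	 * Two half chunks at the same aligned startino, all inodes free.
	 * As in the patch, ir_free starts as XFS_INOBT_ALL_FREE (all ones),
	 * so the bits covering holes are set and the AND is safe.
	 */
	struct demo_irec lo = { 0, 0xff00, 32, 32, ~0ULL };
	struct demo_irec hi = { 0, 0x00ff, 32, 32, ~0ULL };

	if (demo_can_merge(&lo, &hi))
		demo_rec_merge(&lo, &hi);

	/* prints: holemask 0x0 count 64 -- the chunk has filled to full */
	printf("holemask 0x%x count %u\n", lo.holemask, lo.count);
	return 0;
}

Compiled with any C99 compiler, this prints "holemask 0x0 count 64": two half chunks fill to a full chunk, exactly the "fill to full chunks over time" behaviour the patch aims for.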
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d79e41c16114..90594b880653 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -378,6 +378,214 @@ xfs_ialloc_inode_init(
 }
 
 /*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+	struct xfs_mount	*mp,
+	xfs_agino_t		*startino,
+	uint16_t		*allocmask)
+{
+	xfs_agblock_t		agbno;
+	xfs_agblock_t		mod;
+	int			offset;
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+	mod = agbno % mp->m_sb.sb_inoalignmt;
+	if (!mod)
+		return;
+
+	/* calculate the inode offset and align startino */
+	offset = mod << mp->m_sb.sb_inopblog;
+	*startino -= offset;
+
+	/*
+	 * Since startino has been aligned down, left shift allocmask such that
+	 * it continues to represent the same physical inodes relative to the
+	 * new startino.
+	 */
+	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
+	struct xfs_inobt_rec_incore	*srec)	/* src record */
+{
+	uint64_t			talloc;
+	uint64_t			salloc;
+
+	/* records must cover the same inode range */
+	if (trec->ir_startino != srec->ir_startino)
+		return false;
+
+	/* both records must be sparse */
+	if (!xfs_inobt_issparse(trec->ir_holemask) ||
+	    !xfs_inobt_issparse(srec->ir_holemask))
+		return false;
+
+	/* both records must track some inodes */
+	if (!trec->ir_count || !srec->ir_count)
+		return false;
+
+	/* can't exceed capacity of a full record */
+	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+		return false;
+
+	/* verify there is no allocation overlap */
+	talloc = xfs_inobt_irec_to_allocmask(trec);
+	salloc = xfs_inobt_irec_to_allocmask(srec);
+	if (talloc & salloc)
+		return false;
+
+	return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* target */
+	struct xfs_inobt_rec_incore	*srec)	/* src */
+{
+	ASSERT(trec->ir_startino == srec->ir_startino);
+
+	/* combine the counts */
+	trec->ir_count += srec->ir_count;
+	trec->ir_freecount += srec->ir_freecount;
+
+	/*
+	 * Merge the holemask and free mask. For both fields, 0 bits refer to
+	 * allocated inodes. We combine the allocated ranges with bitwise AND.
+	 */
+	trec->ir_holemask &= srec->ir_holemask;
+	trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	int				btnum,
+	struct xfs_inobt_rec_incore	*nrec,	/* in/out: new/merged rec. */
+	bool				merge)	/* merge or replace */
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	int				error;
+	int				i;
+	struct xfs_inobt_rec_incore	rec;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	/* the new record is pre-aligned so we know where to look */
+	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	/* if nothing there, insert a new record and return */
+	if (i == 0) {
+		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+					     nrec->ir_count, nrec->ir_freecount,
+					     nrec->ir_free, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+		goto out;
+	}
+
+	/*
+	 * A record exists at this startino. Merge or replace the record
+	 * depending on what we've been asked to do.
+	 */
+	if (merge) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+					rec.ir_startino == nrec->ir_startino,
+					error);
+
+		/*
+		 * This should never fail. If we have coexisting records that
+		 * cannot merge, something is seriously wrong.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+					error);
+
+		trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+					 rec.ir_holemask, nrec->ir_startino,
+					 nrec->ir_holemask);
+
+		/* merge to nrec to output the updated record */
+		__xfs_inobt_rec_merge(nrec, &rec);
+
+		trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+					  nrec->ir_holemask);
+
+		error = xfs_inobt_rec_check_count(mp, nrec);
+		if (error)
+			goto error;
+	}
+
+	error = xfs_inobt_update(cur, nrec);
+	if (error)
+		goto error;
+
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
  */
@@ -395,6 +603,8 @@ xfs_ialloc_ag_alloc(
 	xfs_agino_t	newlen;		/* new number of inodes */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
+	uint16_t	allocmask = (uint16_t) -1; /* init. to full chunk */
+	struct xfs_inobt_rec_incore rec;
 	struct xfs_perag *pag;
 
 	memset(&args, 0, sizeof(args));
@@ -511,6 +721,45 @@ xfs_ialloc_ag_alloc(
 		return error;
 	}
 
+	/*
+	 * Finally, try a sparse allocation if the filesystem supports it and
+	 * the sparse allocation length is smaller than a full chunk.
+	 */
+	if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+	    args.fsbno == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.alignment = args.mp->m_sb.sb_spino_align;
+		args.prod = 1;
+
+		args.minlen = args.mp->m_ialloc_min_blks;
+		args.maxlen = args.minlen;
+
+		/*
+		 * The inode record will be aligned to full chunk size. We must
+		 * prevent sparse allocation from AG boundaries that result in
+		 * invalid inode records, such as records that start at agbno 0
+		 * or extend beyond the AG.
+		 *
+		 * Set min agbno to the first aligned, non-zero agbno and max to
+		 * the last aligned agbno that is at least one full chunk from
+		 * the end of the AG.
+		 */
+		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+					    args.mp->m_sb.sb_inoalignmt) -
+				 args.mp->m_ialloc_blks;
+
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			return error;
+
+		newlen = args.len << args.mp->m_sb.sb_inopblog;
+		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+	}
+
 	if (args.fsbno == NULLFSBLOCK) {
 		*alloc = 0;
 		return 0;
@@ -535,6 +784,73 @@ xfs_ialloc_ag_alloc(
 	 * Convert the results.
 	 */
 	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+	if (xfs_inobt_issparse(~allocmask)) {
+		/*
+		 * We've allocated a sparse chunk. Align the startino and mask.
+		 */
+		xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+		rec.ir_startino = newino;
+		rec.ir_holemask = ~allocmask;
+		rec.ir_count = newlen;
+		rec.ir_freecount = newlen;
+		rec.ir_free = XFS_INOBT_ALL_FREE;
+
+		/*
+		 * Insert the sparse record into the inobt and allow for a merge
+		 * if necessary. If a merge does occur, rec is updated to the
+		 * merged record.
+		 */
+		error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+					       &rec, true);
+		if (error == -EFSCORRUPTED) {
+			xfs_alert(args.mp,
+	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+				  XFS_AGINO_TO_INO(args.mp, agno,
+						   rec.ir_startino),
+				  rec.ir_holemask, rec.ir_count);
+			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+		}
+		if (error)
+			return error;
+
+		/*
+		 * We can't merge the part we've just allocated as for the inobt
+		 * due to finobt semantics. The original record may or may not
+		 * exist independent of whether physical inodes exist in this
+		 * sparse chunk.
+		 *
+		 * We must update the finobt record based on the inobt record.
+		 * rec contains the fully merged and up to date inobt record
+		 * from the previous call. Set merge false to replace any
+		 * existing record with this one.
+		 */
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+						       XFS_BTNUM_FINO, &rec,
+						       false);
+			if (error)
+				return error;
+		}
+	} else {
+		/* full chunk - insert new records to both btrees */
+		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+					 XFS_BTNUM_INO);
+		if (error)
+			return error;
+
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+						 newlen, XFS_BTNUM_FINO);
+			if (error)
+				return error;
+		}
+	}
+
+	/*
+	 * Update AGI counts and newino.
+	 */
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
 	pag = xfs_perag_get(args.mp, agno);
@@ -543,20 +859,6 @@ xfs_ialloc_ag_alloc(
 	agi->agi_newino = cpu_to_be32(newino);
 
 	/*
-	 * Insert records describing the new inode chunk into the btrees.
-	 */
-	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-				 XFS_BTNUM_INO);
-	if (error)
-		return error;
-
-	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-					 XFS_BTNUM_FINO);
-		if (error)
-			return error;
-	}
-	/*
 	 * Log allocation group header fields
 	 */
 	xfs_ialloc_log_agi(tp, agbp,
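
To make the startino/allocmask alignment in xfs_align_sparse_ino() concrete, assume a hypothetical geometry of 8 inodes per block (sb_inopblog = 3) and a chunk alignment of 8 blocks (sb_inoalignmt = 8, i.e. 64-inode chunks). A 16-inode sparse chunk allocated at agbno 100 sits 4 blocks past the previous alignment boundary, so startino moves down by 32 inodes and the mask shifts up by 8 holemask bits. The plain-integer sketch below mirrors that arithmetic; demo_align_sparse() is illustrative, not kernel code.

#include <stdint.h>
#include <stdio.h>

#define INODES_PER_HOLEMASK_BIT	4	/* stand-in for the XFS constant */

/*
 * Plain-integer mirror of xfs_align_sparse_ino(). Assumed geometry:
 * inopblog = 3 (8 inodes per block), inoalignmt = 8 blocks (64 inodes).
 */
static void demo_align_sparse(uint32_t agbno, uint32_t inoalignmt,
			      uint32_t inopblog, uint32_t *startino,
			      uint16_t *allocmask)
{
	uint32_t mod = agbno % inoalignmt;
	uint32_t offset;

	if (!mod)
		return;

	/* convert the block offset into inodes and align startino down */
	offset = mod << inopblog;
	*startino -= offset;

	/* shift allocmask up so it still names the same physical inodes */
	*allocmask <<= offset / INODES_PER_HOLEMASK_BIT;
}

int main(void)
{
	/* a 16-inode sparse chunk allocated at agbno 100 (startino 800) */
	uint32_t startino = 100 << 3;
	uint16_t allocmask = 0x000f;	/* 4 holemask bits x 4 inodes */

	demo_align_sparse(100, 8, 3, &startino, &allocmask);

	/* prints: startino 768 allocmask 0x0f00 (agbno 96, shifted by 8) */
	printf("startino %u allocmask 0x%04x\n",
	       (unsigned int)startino, allocmask);
	return 0;
}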
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index aa13b468a064..674ad8f760be 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -478,3 +478,34 @@ xfs_inobt_irec_to_allocmask(
 
 	return bitmap;
 }
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+	struct xfs_mount		*mp,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int				inocount = 0;
+	int				nextbit = 0;
+	uint64_t			allocbmap;
+	int				wordsz;
+
+	wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+	allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+	nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+	while (nextbit != -1) {
+		inocount++;
+		nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+				       nextbit + 1);
+	}
+
+	if (inocount != rec->ir_count)
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+#endif	/* DEBUG */
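
The debug-only check above walks the expanded allocation bitmap with xfs_next_bit(); the same invariant — that ir_count matches the number of physical inodes implied by ir_holemask — can be expressed outside the kernel with a population count. A hedged sketch, assuming the GCC/Clang __builtin_popcountll() builtin and the 4-inodes-per-holemask-bit granularity; the demo_* names are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INODES_PER_HOLEMASK_BIT	4

/* expand a 16-bit holemask (1 = hole) into a 64-bit allocation bitmap */
static uint64_t demo_irec_to_allocmask(uint16_t holemask)
{
	uint64_t bitmap = 0;
	int i;

	for (i = 0; i < 16; i++) {
		if (holemask & (1u << i))
			continue;	/* hole: no physical inodes here */
		/* each clear holemask bit covers 4 allocated inodes */
		bitmap |= 0xfULL << (i * INODES_PER_HOLEMASK_BIT);
	}
	return bitmap;
}

/* same invariant as xfs_inobt_rec_check_count(): popcount == ir_count */
static bool demo_rec_check_count(uint16_t holemask, int count)
{
	return __builtin_popcountll(demo_irec_to_allocmask(holemask)) == count;
}

int main(void)
{
	/* holemask 0x0f00 leaves 12 bits clear -> 48 physical inodes */
	printf("%s\n", demo_rec_check_count(0x0f00, 48) ? "ok" : "corrupt");
	return 0;
}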
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 2c581ba69cde..bd88453217ce 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -65,4 +65,11 @@ extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 /* ir_holemask to inode allocation bitmap conversion */
 uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
 
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+			      struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec)	0
+#endif	/* DEBUG */
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
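
Returning to the allocation window set up in the xfs_ialloc.c hunk above: min_agbno and max_agbno confine the sparse extent so that the chunk-aligned inode record can neither start at agbno 0 nor extend past the end of the AG. A small sketch of that arithmetic under an assumed geometry (sb_inoalignmt = 8 blocks, m_ialloc_blks = 8); demo_sparse_window() and the power-of-two round_down are illustrative:

#include <stdint.h>
#include <stdio.h>

/* power-of-two round down, as sb_inoalignmt is a power of two */
#define round_down(x, align)	((x) & ~((align) - 1))

/*
 * Mirror of the min/max_agbno computation. Assumed geometry:
 * inoalignmt = 8 blocks (one 64-inode chunk), ialloc_blks = 8.
 */
static void demo_sparse_window(uint32_t agblocks, uint32_t inoalignmt,
			       uint32_t ialloc_blks)
{
	/* first aligned, non-zero agbno a record may start at */
	uint32_t min_agbno = inoalignmt;
	/* last aligned agbno with a full chunk of room before the AG end */
	uint32_t max_agbno = round_down(agblocks, inoalignmt) - ialloc_blks;

	printf("allocate within agbno [%u, %u]\n",
	       (unsigned int)min_agbno, (unsigned int)max_agbno);
}

int main(void)
{
	/* a 1003-block AG: prints "allocate within agbno [8, 992]" */
	demo_sparse_window(1003, 8, 8);
	return 0;
}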
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 615781bf4ee5..8d916d33d93d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
 		  __entry->blocks, __entry->shift, __entry->writeio_blocks)
 )
 
+TRACE_EVENT(xfs_irec_merge_pre,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+	TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(uint16_t, holemask)
+		__field(xfs_agino_t, nagino)
+		__field(uint16_t, nholemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->holemask = holemask;
+		__entry->nagino = nagino;
+		__entry->nholemask = nholemask;
+	),
+	TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+		  __entry->agino, __entry->holemask, __entry->nagino,
+		  __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 uint16_t holemask),
+	TP_ARGS(mp, agno, agino, holemask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(uint16_t, holemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->holemask = holemask;
+	),
+	TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->agno, __entry->agino,
+		  __entry->holemask)
+)
+
 #define DEFINE_IREF_EVENT(name) \
 DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \