aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/fsync.c8
-rw-r--r--fs/ext4/ialloc.c15
-rw-r--r--fs/ext4/inode.c69
-rw-r--r--fs/ext4/mballoc.c77
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/super.c6
6 files changed, 55 insertions, 121 deletions
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5afe4370840b..83cf6415f599 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,10 +28,12 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h> 31
32#include "ext4.h" 32#include "ext4.h"
33#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
34 34
35#include <trace/events/ext4.h>
36
35/* 37/*
36 * akpm: A new design for ext4_sync_file(). 38 * akpm: A new design for ext4_sync_file().
37 * 39 *
@@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
52 54
53 J_ASSERT(ext4_journal_current_handle() == NULL); 55 J_ASSERT(ext4_journal_current_handle() == NULL);
54 56
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", 57 trace_ext4_sync_file(file, dentry, datasync);
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58 58
59 /* 59 /*
60 * data=writeback: 60 * data=writeback:
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3743bd849bce..7d502f3be914 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -23,11 +23,14 @@
23#include <linux/bitops.h> 23#include <linux/bitops.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <asm/byteorder.h> 25#include <asm/byteorder.h>
26
26#include "ext4.h" 27#include "ext4.h"
27#include "ext4_jbd2.h" 28#include "ext4_jbd2.h"
28#include "xattr.h" 29#include "xattr.h"
29#include "acl.h" 30#include "acl.h"
30 31
32#include <trace/events/ext4.h>
33
31/* 34/*
32 * ialloc.c contains the inodes allocation and deallocation routines 35 * ialloc.c contains the inodes allocation and deallocation routines
33 */ 36 */
@@ -208,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
208 211
209 ino = inode->i_ino; 212 ino = inode->i_ino;
210 ext4_debug("freeing inode %lu\n", ino); 213 ext4_debug("freeing inode %lu\n", ino);
211 trace_mark(ext4_free_inode, 214 trace_ext4_free_inode(inode);
212 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
213 sb->s_id, inode->i_ino, inode->i_mode,
214 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
215 (unsigned long long) inode->i_blocks);
216 215
217 /* 216 /*
218 * Note: we must free any quota before locking the superblock, 217 * Note: we must free any quota before locking the superblock,
@@ -815,8 +814,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
815 814
816 sb = dir->i_sb; 815 sb = dir->i_sb;
817 ngroups = ext4_get_groups_count(sb); 816 ngroups = ext4_get_groups_count(sb);
818 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, 817 trace_ext4_request_inode(dir, mode);
819 dir->i_ino, mode);
820 inode = new_inode(sb); 818 inode = new_inode(sb);
821 if (!inode) 819 if (!inode)
822 return ERR_PTR(-ENOMEM); 820 return ERR_PTR(-ENOMEM);
@@ -1047,8 +1045,7 @@ got:
1047 } 1045 }
1048 1046
1049 ext4_debug("allocating inode %lu\n", inode->i_ino); 1047 ext4_debug("allocating inode %lu\n", inode->i_ino);
1050 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", 1048 trace_ext4_allocate_inode(inode, dir, mode);
1051 sb->s_id, inode->i_ino, dir->i_ino, mode);
1052 goto really_out; 1049 goto really_out;
1053fail: 1050fail:
1054 ext4_std_error(sb, err); 1051 ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 875db944b22f..2418ad36eab5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,11 +37,14 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40
40#include "ext4_jbd2.h" 41#include "ext4_jbd2.h"
41#include "xattr.h" 42#include "xattr.h"
42#include "acl.h" 43#include "acl.h"
43#include "ext4_extents.h" 44#include "ext4_extents.h"
44 45
46#include <trace/events/ext4.h>
47
45#define MPAGE_DA_EXTENT_TAIL 0x01 48#define MPAGE_DA_EXTENT_TAIL 0x01
46 49
47static inline int ext4_begin_ordered_truncate(struct inode *inode, 50static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -1466,10 +1469,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1466 pgoff_t index; 1469 pgoff_t index;
1467 unsigned from, to; 1470 unsigned from, to;
1468 1471
1469 trace_mark(ext4_write_begin, 1472 trace_ext4_write_begin(inode, pos, len, flags);
1470 "dev %s ino %lu pos %llu len %u flags %u",
1471 inode->i_sb->s_id, inode->i_ino,
1472 (unsigned long long) pos, len, flags);
1473 /* 1473 /*
1474 * Reserve one block more for addition to orphan list in case 1474 * Reserve one block more for addition to orphan list in case
1475 * we allocate blocks but write fails for some reason 1475 * we allocate blocks but write fails for some reason
@@ -1611,10 +1611,7 @@ static int ext4_ordered_write_end(struct file *file,
1611 struct inode *inode = mapping->host; 1611 struct inode *inode = mapping->host;
1612 int ret = 0, ret2; 1612 int ret = 0, ret2;
1613 1613
1614 trace_mark(ext4_ordered_write_end, 1614 trace_ext4_ordered_write_end(inode, pos, len, copied);
1615 "dev %s ino %lu pos %llu len %u copied %u",
1616 inode->i_sb->s_id, inode->i_ino,
1617 (unsigned long long) pos, len, copied);
1618 ret = ext4_jbd2_file_inode(handle, inode); 1615 ret = ext4_jbd2_file_inode(handle, inode);
1619 1616
1620 if (ret == 0) { 1617 if (ret == 0) {
@@ -1658,10 +1655,7 @@ static int ext4_writeback_write_end(struct file *file,
1658 struct inode *inode = mapping->host; 1655 struct inode *inode = mapping->host;
1659 int ret = 0, ret2; 1656 int ret = 0, ret2;
1660 1657
1661 trace_mark(ext4_writeback_write_end, 1658 trace_ext4_writeback_write_end(inode, pos, len, copied);
1662 "dev %s ino %lu pos %llu len %u copied %u",
1663 inode->i_sb->s_id, inode->i_ino,
1664 (unsigned long long) pos, len, copied);
1665 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1659 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
1666 page, fsdata); 1660 page, fsdata);
1667 copied = ret2; 1661 copied = ret2;
@@ -1705,10 +1699,7 @@ static int ext4_journalled_write_end(struct file *file,
1705 unsigned from, to; 1699 unsigned from, to;
1706 loff_t new_i_size; 1700 loff_t new_i_size;
1707 1701
1708 trace_mark(ext4_journalled_write_end, 1702 trace_ext4_journalled_write_end(inode, pos, len, copied);
1709 "dev %s ino %lu pos %llu len %u copied %u",
1710 inode->i_sb->s_id, inode->i_ino,
1711 (unsigned long long) pos, len, copied);
1712 from = pos & (PAGE_CACHE_SIZE - 1); 1703 from = pos & (PAGE_CACHE_SIZE - 1);
1713 to = from + len; 1704 to = from + len;
1714 1705
@@ -2554,9 +2545,7 @@ static int ext4_da_writepage(struct page *page,
2554 struct buffer_head *page_bufs; 2545 struct buffer_head *page_bufs;
2555 struct inode *inode = page->mapping->host; 2546 struct inode *inode = page->mapping->host;
2556 2547
2557 trace_mark(ext4_da_writepage, 2548 trace_ext4_da_writepage(inode, page);
2558 "dev %s ino %lu page_index %lu",
2559 inode->i_sb->s_id, inode->i_ino, page->index);
2560 size = i_size_read(inode); 2549 size = i_size_read(inode);
2561 if (page->index == size >> PAGE_CACHE_SHIFT) 2550 if (page->index == size >> PAGE_CACHE_SHIFT)
2562 len = size & ~PAGE_CACHE_MASK; 2551 len = size & ~PAGE_CACHE_MASK;
@@ -2667,19 +2656,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2667 int needed_blocks, ret = 0, nr_to_writebump = 0; 2656 int needed_blocks, ret = 0, nr_to_writebump = 0;
2668 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2657 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2669 2658
2670 trace_mark(ext4_da_writepages, 2659 trace_ext4_da_writepages(inode, wbc);
2671 "dev %s ino %lu nr_t_write %ld "
2672 "pages_skipped %ld range_start %llu "
2673 "range_end %llu nonblocking %d "
2674 "for_kupdate %d for_reclaim %d "
2675 "for_writepages %d range_cyclic %d",
2676 inode->i_sb->s_id, inode->i_ino,
2677 wbc->nr_to_write, wbc->pages_skipped,
2678 (unsigned long long) wbc->range_start,
2679 (unsigned long long) wbc->range_end,
2680 wbc->nonblocking, wbc->for_kupdate,
2681 wbc->for_reclaim, wbc->for_writepages,
2682 wbc->range_cyclic);
2683 2660
2684 /* 2661 /*
2685 * No pages to write? This is mainly a kludge to avoid starting 2662 * No pages to write? This is mainly a kludge to avoid starting
@@ -2845,14 +2822,7 @@ out_writepages:
2845 if (!no_nrwrite_index_update) 2822 if (!no_nrwrite_index_update)
2846 wbc->no_nrwrite_index_update = 0; 2823 wbc->no_nrwrite_index_update = 0;
2847 wbc->nr_to_write -= nr_to_writebump; 2824 wbc->nr_to_write -= nr_to_writebump;
2848 trace_mark(ext4_da_writepage_result, 2825 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2849 "dev %s ino %lu ret %d pages_written %d "
2850 "pages_skipped %ld congestion %d "
2851 "more_io %d no_nrwrite_index_update %d",
2852 inode->i_sb->s_id, inode->i_ino, ret,
2853 pages_written, wbc->pages_skipped,
2854 wbc->encountered_congestion, wbc->more_io,
2855 wbc->no_nrwrite_index_update);
2856 return ret; 2826 return ret;
2857} 2827}
2858 2828
@@ -2904,11 +2874,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2904 len, flags, pagep, fsdata); 2874 len, flags, pagep, fsdata);
2905 } 2875 }
2906 *fsdata = (void *)0; 2876 *fsdata = (void *)0;
2907 2877 trace_ext4_da_write_begin(inode, pos, len, flags);
2908 trace_mark(ext4_da_write_begin,
2909 "dev %s ino %lu pos %llu len %u flags %u",
2910 inode->i_sb->s_id, inode->i_ino,
2911 (unsigned long long) pos, len, flags);
2912retry: 2878retry:
2913 /* 2879 /*
2914 * With delayed allocation, we don't log the i_disksize update 2880 * With delayed allocation, we don't log the i_disksize update
@@ -3001,10 +2967,7 @@ static int ext4_da_write_end(struct file *file,
3001 } 2967 }
3002 } 2968 }
3003 2969
3004 trace_mark(ext4_da_write_end, 2970 trace_ext4_da_write_end(inode, pos, len, copied);
3005 "dev %s ino %lu pos %llu len %u copied %u",
3006 inode->i_sb->s_id, inode->i_ino,
3007 (unsigned long long) pos, len, copied);
3008 start = pos & (PAGE_CACHE_SIZE - 1); 2971 start = pos & (PAGE_CACHE_SIZE - 1);
3009 end = start + copied - 1; 2972 end = start + copied - 1;
3010 2973
@@ -3255,9 +3218,7 @@ static int ext4_normal_writepage(struct page *page,
3255 loff_t size = i_size_read(inode); 3218 loff_t size = i_size_read(inode);
3256 loff_t len; 3219 loff_t len;
3257 3220
3258 trace_mark(ext4_normal_writepage, 3221 trace_ext4_normal_writepage(inode, page);
3259 "dev %s ino %lu page_index %lu",
3260 inode->i_sb->s_id, inode->i_ino, page->index);
3261 J_ASSERT(PageLocked(page)); 3222 J_ASSERT(PageLocked(page));
3262 if (page->index == size >> PAGE_CACHE_SHIFT) 3223 if (page->index == size >> PAGE_CACHE_SHIFT)
3263 len = size & ~PAGE_CACHE_MASK; 3224 len = size & ~PAGE_CACHE_MASK;
@@ -3343,9 +3304,7 @@ static int ext4_journalled_writepage(struct page *page,
3343 loff_t size = i_size_read(inode); 3304 loff_t size = i_size_read(inode);
3344 loff_t len; 3305 loff_t len;
3345 3306
3346 trace_mark(ext4_journalled_writepage, 3307 trace_ext4_journalled_writepage(inode, page);
3347 "dev %s ino %lu page_index %lu",
3348 inode->i_sb->s_id, inode->i_ino, page->index);
3349 J_ASSERT(PageLocked(page)); 3308 J_ASSERT(PageLocked(page));
3350 if (page->index == size >> PAGE_CACHE_SHIFT) 3309 if (page->index == size >> PAGE_CACHE_SHIFT)
3351 len = size & ~PAGE_CACHE_MASK; 3310 len = size & ~PAGE_CACHE_MASK;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ed8482e22c0e..8d98070b48fb 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,8 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <trace/events/ext4.h>
26
25/* 27/*
26 * MUSTDO: 28 * MUSTDO:
27 * - test ext4_ext_search_left() and ext4_ext_search_right() 29 * - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
340 ext4_group_t group); 342 ext4_group_t group);
341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 343static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
342 344
343
344
345static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 345static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
346{ 346{
347#if BITS_PER_LONG == 64 347#if BITS_PER_LONG == 64
@@ -2859,9 +2859,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2859 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2859 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2860 + entry->start_blk 2860 + entry->start_blk
2861 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2861 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2862 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", 2862 trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
2863 sb->s_id, (unsigned long long) discard_block, 2863 entry->count);
2864 entry->count);
2865 sb_issue_discard(sb, discard_block, entry->count); 2864 sb_issue_discard(sb, discard_block, entry->count);
2866 2865
2867 kmem_cache_free(ext4_free_ext_cachep, entry); 2866 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -3629,10 +3628,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3629 3628
3630 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3629 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3631 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3630 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3632 trace_mark(ext4_mb_new_inode_pa, 3631 trace_ext4_mb_new_inode_pa(ac, pa);
3633 "dev %s ino %lu pstart %llu len %u lstart %u",
3634 sb->s_id, ac->ac_inode->i_ino,
3635 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3636 3632
3637 ext4_mb_use_inode_pa(ac, pa); 3633 ext4_mb_use_inode_pa(ac, pa);
3638 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3634 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3691,9 +3687,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3691 pa->pa_type = MB_GROUP_PA; 3687 pa->pa_type = MB_GROUP_PA;
3692 3688
3693 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3689 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3694 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3690 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3695 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", 3691 trace_ext4_mb_new_group_pa(ac, pa);
3696 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3697 3692
3698 ext4_mb_use_group_pa(ac, pa); 3693 ext4_mb_use_group_pa(ac, pa);
3699 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3694 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3783,10 +3778,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3783 ext4_mb_store_history(ac); 3778 ext4_mb_store_history(ac);
3784 } 3779 }
3785 3780
3786 trace_mark(ext4_mb_release_inode_pa, 3781 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
3787 "dev %s ino %lu block %llu count %u", 3782 next - bit);
3788 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3789 next - bit);
3790 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3783 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3791 bit = next + 1; 3784 bit = next + 1;
3792 } 3785 }
@@ -3820,8 +3813,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3820 if (ac) 3813 if (ac)
3821 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3814 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3822 3815
3823 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", 3816 trace_ext4_mb_release_group_pa(ac, pa);
3824 sb->s_id, pa->pa_pstart, pa->pa_len);
3825 BUG_ON(pa->pa_deleted == 0); 3817 BUG_ON(pa->pa_deleted == 0);
3826 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3818 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3827 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3819 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3889,6 +3881,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3889 3881
3890 INIT_LIST_HEAD(&list); 3882 INIT_LIST_HEAD(&list);
3891 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3883 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3884 if (ac)
3885 ac->ac_sb = sb;
3892repeat: 3886repeat:
3893 ext4_lock_group(sb, group); 3887 ext4_lock_group(sb, group);
3894 list_for_each_entry_safe(pa, tmp, 3888 list_for_each_entry_safe(pa, tmp,
@@ -3987,12 +3981,15 @@ void ext4_discard_preallocations(struct inode *inode)
3987 } 3981 }
3988 3982
3989 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 3983 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
3990 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, 3984 trace_ext4_discard_preallocations(inode);
3991 inode->i_ino);
3992 3985
3993 INIT_LIST_HEAD(&list); 3986 INIT_LIST_HEAD(&list);
3994 3987
3995 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 3988 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
3989 if (ac) {
3990 ac->ac_sb = sb;
3991 ac->ac_inode = inode;
3992 }
3996repeat: 3993repeat:
3997 /* first, collect all pa's in the inode */ 3994 /* first, collect all pa's in the inode */
3998 spin_lock(&ei->i_prealloc_lock); 3995 spin_lock(&ei->i_prealloc_lock);
@@ -4276,6 +4273,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4276 4273
4277 INIT_LIST_HEAD(&discard_list); 4274 INIT_LIST_HEAD(&discard_list);
4278 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4275 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4276 if (ac)
4277 ac->ac_sb = sb;
4279 4278
4280 spin_lock(&lg->lg_prealloc_lock); 4279 spin_lock(&lg->lg_prealloc_lock);
4281 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4280 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4445,8 +4444,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4445 int ret; 4444 int ret;
4446 int freed = 0; 4445 int freed = 0;
4447 4446
4448 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", 4447 trace_ext4_mb_discard_preallocations(sb, needed);
4449 sb->s_id, needed);
4450 for (i = 0; i < ngroups && needed > 0; i++) { 4448 for (i = 0; i < ngroups && needed > 0; i++) {
4451 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4449 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4452 freed += ret; 4450 freed += ret;
@@ -4475,17 +4473,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4475 sb = ar->inode->i_sb; 4473 sb = ar->inode->i_sb;
4476 sbi = EXT4_SB(sb); 4474 sbi = EXT4_SB(sb);
4477 4475
4478 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " 4476 trace_ext4_request_blocks(ar);
4479 "lblk %llu goal %llu lleft %llu lright %llu "
4480 "pleft %llu pright %llu ",
4481 sb->s_id, ar->flags, ar->len,
4482 ar->inode ? ar->inode->i_ino : 0,
4483 (unsigned long long) ar->logical,
4484 (unsigned long long) ar->goal,
4485 (unsigned long long) ar->lleft,
4486 (unsigned long long) ar->lright,
4487 (unsigned long long) ar->pleft,
4488 (unsigned long long) ar->pright);
4489 4477
4490 /* 4478 /*
4491 * For delayed allocation, we could skip the ENOSPC and 4479 * For delayed allocation, we could skip the ENOSPC and
@@ -4521,7 +4509,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4521 } 4509 }
4522 4510
4523 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4511 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4524 if (!ac) { 4512 if (ac) {
4513 ac->ac_sb = sb;
4514 ac->ac_inode = ar->inode;
4515 } else {
4525 ar->len = 0; 4516 ar->len = 0;
4526 *errp = -ENOMEM; 4517 *errp = -ENOMEM;
4527 goto out1; 4518 goto out1;
@@ -4594,18 +4585,7 @@ out3:
4594 reserv_blks); 4585 reserv_blks);
4595 } 4586 }
4596 4587
4597 trace_mark(ext4_allocate_blocks, 4588 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4598 "dev %s block %llu flags %u len %u ino %lu "
4599 "logical %llu goal %llu lleft %llu lright %llu "
4600 "pleft %llu pright %llu ",
4601 sb->s_id, (unsigned long long) block,
4602 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4603 (unsigned long long) ar->logical,
4604 (unsigned long long) ar->goal,
4605 (unsigned long long) ar->lleft,
4606 (unsigned long long) ar->lright,
4607 (unsigned long long) ar->pleft,
4608 (unsigned long long) ar->pright);
4609 4589
4610 return block; 4590 return block;
4611} 4591}
@@ -4740,10 +4720,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4740 } 4720 }
4741 4721
4742 ext4_debug("freeing block %lu\n", block); 4722 ext4_debug("freeing block %lu\n", block);
4743 trace_mark(ext4_free_blocks, 4723 trace_ext4_free_blocks(inode, block, count, metadata);
4744 "dev %s block %llu count %lu metadata %d ino %lu",
4745 sb->s_id, (unsigned long long) block, count, metadata,
4746 inode ? inode->i_ino : 0);
4747 4724
4748 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4725 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4749 if (ac) { 4726 if (ac) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 75e34f69215b..c96bb19f58f9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h>
23#include <linux/mutex.h> 22#include <linux/mutex.h>
24#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
25#include "ext4.h" 24#include "ext4.h"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 012c4251397e..e8f0b2af4607 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -37,7 +37,6 @@
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/proc_fs.h> 38#include <linux/proc_fs.h>
39#include <linux/ctype.h> 39#include <linux/ctype.h>
40#include <linux/marker.h>
41#include <linux/log2.h> 40#include <linux/log2.h>
42#include <linux/crc16.h> 41#include <linux/crc16.h>
43#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -47,6 +46,9 @@
47#include "xattr.h" 46#include "xattr.h"
48#include "acl.h" 47#include "acl.h"
49 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/ext4.h>
51
50static int default_mb_history_length = 1000; 52static int default_mb_history_length = 1000;
51 53
52module_param_named(default_mb_history_length, default_mb_history_length, 54module_param_named(default_mb_history_length, default_mb_history_length,
@@ -3346,7 +3348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3346 int ret = 0; 3348 int ret = 0;
3347 tid_t target; 3349 tid_t target;
3348 3350
3349 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3351 trace_ext4_sync_fs(sb, wait);
3350 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3352 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
3351 if (wait) 3353 if (wait)
3352 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3354 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
hl opt">->nr_batching; ioc->last_waited = jiffies; } static void __freed_request(struct request_queue *q, int sync) { struct request_list *rl = &q->rq; if (rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); blk_clear_queue_full(q, sync); } } /* * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ static void freed_request(struct request_queue *q, int sync, int priv) { struct request_list *rl = &q->rq; rl->count[sync]--; if (priv) rl->elvpriv--; __freed_request(q, sync); if (unlikely(rl->starved[sync ^ 1])) __freed_request(q, sync ^ 1); } /* * Get a free request, queue_lock must be held. * Returns NULL on failure, with queue_lock held. * Returns !NULL on success, with queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue, priv; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { if (rl->count[is_sync]+1 >= q->nr_requests) { ioc = current_io_context(GFP_ATOMIC, q->node); /* * The queue will fill after this allocation, so set * it as full, and mark this process as "batching". * This process will be allowed to complete a batch of * requests, others will be blocked. */ if (!blk_queue_full(q, is_sync)) { ioc_set_batching(q, ioc); blk_set_queue_full(q, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { /* * The queue is full and the allocating * process is not a "batcher", and not * exempted by the IO scheduler */ goto out; } } } blk_set_queue_congested(q, is_sync); } /* * Only allow batching queuers to allocate up to 50% over the defined * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) goto out; rl->count[is_sync]++; rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); if (priv) rl->elvpriv++; if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. Undo anything * we might have messed up. * * Allocating task should really be put onto the front of the * wait queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); freed_request(q, is_sync, priv); /* * in the very unlikely event that allocation failed and no * requests for this direction was pending, mark us starved * so that freeing of a request in the other direction will * notice us. another possible fix would be to split the * rq mempool into READ and WRITE */ rq_starved: if (unlikely(rl->count[is_sync] == 0)) rl->starved[is_sync] = 1; goto out; } /* * ioc may be NULL here, and ioc_batching will be false. That's * OK, if the queue is under the request limit then requests need * not count toward the nr_batch_requests limit. There will always * be some limit enforced by BLK_BATCH_TIME. */ if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; trace_block_getrq(q, bio, rw_flags & 1); out: return rq; } /* * No available requests for this queue, unplug the device and wait for some * requests to become available. * * Called with q->queue_lock held, and returns with it unlocked. */ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) { const bool is_sync = rw_is_sync(rw_flags) != 0; struct request *rq; rq = get_request(q, rw_flags, bio, GFP_NOIO); while (!rq) { DEFINE_WAIT(wait); struct io_context *ioc; struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); trace_block_sleeprq(q, bio, rw_flags & 1); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); io_schedule(); /* * After sleeping, we become a "batching" process and * will be able to allocate at least one request, and * up to a big batch of them for a small period time. * See ioc_batching, ioc_set_batching */ ioc = current_io_context(GFP_NOIO, q->node); ioc_set_batching(q, ioc); spin_lock_irq(q->queue_lock); finish_wait(&rl->wait[is_sync], &wait); rq = get_request(q, rw_flags, bio, GFP_NOIO); }; return rq; } struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { struct request *rq; BUG_ON(rw != READ && rw != WRITE); spin_lock_irq(q->queue_lock); if (gfp_mask & __GFP_WAIT) { rq = get_request_wait(q, rw, NULL); } else { rq = get_request(q, rw, NULL, gfp_mask); if (!rq) spin_unlock_irq(q->queue_lock); } /* q->queue_lock is unlocked at this point */ return rq; } EXPORT_SYMBOL(blk_get_request); /** * blk_make_request - given a bio, allocate a corresponding struct request. * @q: target request queue * @bio: The bio describing the memory mappings that will be submitted for IO. * It may be a chained-bio properly constructed by block/bio layer. * @gfp_mask: gfp flags to be used for memory allocation * * blk_make_request is the parallel of generic_make_request for BLOCK_PC * type commands. Where the struct request needs to be farther initialized by * the caller. It is passed a &struct bio, which describes the memory info of * the I/O transfer. * * The caller of blk_make_request must make sure that bi_io_vec * are set to describe the memory buffers. That bio_data_dir() will return * the needed direction of the request. (And all bio's in the passed bio-chain * are properly set accordingly) * * If called under none-sleepable conditions, mapped bio buffers must not * need bouncing, by calling the appropriate masked or flagged allocator, * suitable for the target device. Otherwise the call to blk_queue_bounce will * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for * anything but the first bio in the chain. Otherwise you risk waiting for IO * completion of a bio that hasn't been submitted yet, thus resulting in a * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead * of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ struct request *blk_make_request(struct request_queue *q, struct bio *bio, gfp_t gfp_mask) { struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); if (unlikely(!rq)) return ERR_PTR(-ENOMEM); for_each_bio(bio) { struct bio *bounce_bio = bio; int ret; blk_queue_bounce(q, &bounce_bio); ret = blk_rq_append_bio(q, rq, bounce_bio); if (unlikely(ret)) { blk_put_request(rq); return ERR_PTR(ret); } } return rq; } EXPORT_SYMBOL(blk_make_request); /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted * * Description: * Drivers often keep queueing requests until the hardware cannot accept * more, when that condition happens we need to put the request back * on the queue. Must be called with queue lock held. */ void blk_requeue_request(struct request_queue *q, struct request *rq) { blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); elv_requeue_request(q, rq); } EXPORT_SYMBOL(blk_requeue_request); /** * blk_insert_request - insert a special request into a request queue * @q: request queue where request should be inserted * @rq: request to be inserted * @at_head: insert request at head or tail of queue * @data: private data * * Description: * Many block devices need to execute commands asynchronously, so they don't * block the whole kernel from preemption during request execution. This is * accomplished normally by inserting aritficial requests tagged as * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them * be scheduled for actual execution by the request queue. * * We have the option of inserting the head or the tail of the queue. * Typically we use the tail for new ioctls and so forth. We use the head * of the queue for things like a QUEUE_FULL message from a device, or a * host that is unable to accept a particular command. */ void blk_insert_request(struct request_queue *q, struct request *rq, int at_head, void *data) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; unsigned long flags; /* * tell I/O scheduler that this isn't a regular read/write (ie it * must not attempt merges on this) and that it acts as a soft * barrier */ rq->cmd_type = REQ_TYPE_SPECIAL; rq->special = data; spin_lock_irqsave(q->queue_lock, flags); /* * If command is tagged, release the tag */ if (blk_rq_tagged(rq)) blk_queue_end_tag(q, rq); drive_stat_acct(rq, 1); __elv_add_request(q, rq, where, 0); __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_insert_request); /* * add-request adds a request to the linked list. * queue lock is held and interrupts disabled, as we muck with the * request queue list. */ static inline void add_request(struct request_queue *q, struct request *req) { drive_stat_acct(req, 1); /* * elevator indicated where it wants this request to be * inserted at elevator_merge time */ __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); } static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { if (now == part->stamp) return; if (part->in_flight) { __part_stat_add(cpu, part, time_in_queue, part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; } /** * part_round_stats() - Round off the performance stats on a struct disk_stats. * @cpu: cpu number for stats access * @part: target partition * * The average IO queue length and utilisation statistics are maintained * by observing the current state of the queue length and the amount of * time it has been in this state for. * * Normally, that accounting is done on IO completion, but that can result * in more than a second's worth of IO being accounted for within any one * second, leading to >100% utilisation. To deal with that, we call this * function to do a round-off before returning the results when reading * /proc/diskstats. This accounts immediately for all queue usage up to * the current jiffies and restarts the counters again. */ void part_round_stats(int cpu, struct hd_struct *part) { unsigned long now = jiffies; if (part->partno) part_round_stats_single(cpu, &part_to_disk(part)->part0, now); part_round_stats_single(cpu, part, now); } EXPORT_SYMBOL_GPL(part_round_stats); /* * queue lock must be held */ void __blk_put_request(struct request_queue *q, struct request *req) { if (unlikely(!q)) return; if (unlikely(--req->ref_count)) return; elv_completed_request(q, req); /* this is a bio leak */ WARN_ON(req->bio != NULL); /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools */ if (req->cmd_flags & REQ_ALLOCED) { int is_sync = rq_is_sync(req) != 0; int priv = req->cmd_flags & REQ_ELVPRIV; BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); freed_request(q, is_sync, priv); } } EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { unsigned long flags; struct request_queue *q = req->q; spin_lock_irqsave(q->queue_lock, flags); __blk_put_request(q, req); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_put_request); void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; req->cmd_type = REQ_TYPE_FS; /* * Inherit FAILFAST from bio (for read-ahead, and explicit * FAILFAST). FAILFAST flags are identical for req and bio. */ if (bio_rw_flagged(bio, BIO_RW_AHEAD)) req->cmd_flags |= REQ_FAILFAST_MASK; else req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { req->cmd_flags |= REQ_DISCARD; if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_SOFTBARRIER; req->q->prepare_discard_fn(req->q, req); } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) req->cmd_flags |= REQ_HARDBARRIER; if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) req->cmd_flags |= REQ_RW_SYNC; if (bio_rw_flagged(bio, BIO_RW_META)) req->cmd_flags |= REQ_RW_META; if (bio_rw_flagged(bio, BIO_RW_NOIDLE)) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; req->__sector = bio->bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } /* * Only disabling plugging for non-rotational devices if it does tagging * as well, otherwise we do need the proper merging */ static inline bool queue_should_plug(struct request_queue *q) { return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); } static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) && (q->next_ordered == QUEUE_ORDERED_NONE)) { bio_endio(bio, -EOPNOTSUPP); return 0; } /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even * ISA dma in theory) */ blk_queue_bounce(q, &bio); spin_lock_irq(q->queue_lock); if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); switch (el_ret) { case ELEVATOR_BACK_MERGE: BUG_ON(!rq_mergeable(req)); if (!ll_back_merge_fn(q, req, bio)) break; trace_block_bio_backmerge(q, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; case ELEVATOR_FRONT_MERGE: BUG_ON(!rq_mergeable(req)); if (!ll_front_merge_fn(q, req, bio)) break; trace_block_bio_frontmerge(q, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { blk_rq_set_mixed_merge(req); req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= ff; } bio->bi_next = req->bio; req->bio = bio; /* * may not be valid. if the low level driver said * it didn't need a bounce buffer then it better * not touch req->buffer either... */ req->buffer = bio_data(bio); req->__sector = bio->bi_sector; req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; /* ELV_NO_MERGE: elevator says don't/can't merge. */ default: ; } get_rq: /* * This sync check and mask will be re-done in init_request_from_bio(), * but we need to set it earlier to expose the sync flag to the * rq allocator and io schedulers. */ rw_flags = bio_data_dir(bio); if (sync) rw_flags |= REQ_RW_SYNC; /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ req = get_request_wait(q, rw_flags, bio); /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). * We don't worry about that case for efficiency. It won't happen * often, and the elevators are able to handle it. */ init_request_from_bio(req, bio); spin_lock_irq(q->queue_lock); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || bio_flagged(bio, BIO_CPU_AFFINE)) req->cpu = blk_cpu_to_group(smp_processor_id()); if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); add_request(q, req); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); return 0; } /* * If bio->bi_dev is a partition, remap the location */ static inline void blk_partition_remap(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; if (bio_sectors(bio) && bdev != bdev->bd_contains) { struct hd_struct *p = bdev->bd_part; bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, bdev->bd_dev, bio->bi_sector - p->start_sect); } } static void handle_bad_sector(struct bio *bio) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", bdevname(bio->bi_bdev, b), bio->bi_rw, (unsigned long long)bio->bi_sector + bio_sectors(bio), (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); static int should_fail_request(struct bio *bio) { struct hd_struct *part = bio->bi_bdev->bd_part; if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) return should_fail(&fail_make_request, bio->bi_size); return 0; } static int __init fail_make_request_debugfs(void) { return init_fault_attr_dentries(&fail_make_request, "fail_make_request"); } late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline int should_fail_request(struct bio *bio) { return 0; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ /* * Check whether this bio extends beyond the end of the device. */ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) { sector_t maxsector; if (!nr_sectors) return 0; /* Test device or partition size, when known. */ maxsector = bio->bi_bdev->bd_inode->i_size >> 9; if (maxsector) { sector_t sector = bio->bi_sector; if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { /* * This may well happen - the kernel calls bread() * without checking the size of the device, e.g., when * mounting a device. */ handle_bad_sector(bio); return 1; } } return 0; } /** * generic_make_request - hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. * * generic_make_request() is used to make I/O requests of block * devices. It is passed a &struct bio, which describes the I/O that needs * to be done. * * generic_make_request() does not return any status. The * success/failure status of the request, along with notification of * completion, is delivered asynchronously through the bio->bi_end_io * function described (one day) else where. * * The caller of generic_make_request must make sure that bi_io_vec * are set to describe the memory buffer, and that bi_dev and bi_sector are * set to describe the device address, and the * bi_end_io and optionally bi_private are set to describe how * completion notification should be signaled. * * generic_make_request and the drivers it calls may use bi_next if this * bio happens to be merged with someone else, and may change bi_dev and * bi_sector for remaps as it sees fit. So the values of these fields * should NOT be depended on after the call to generic_make_request. */ static inline void __generic_make_request(struct bio *bio) { struct request_queue *q; sector_t old_sector; int ret, nr_sectors = bio_sectors(bio); dev_t old_dev; int err = -EIO; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; /* * Resolve the mapping until finished. (drivers are * still free to implement/resolve their own stacking * by explicitly returning 0) * * NOTE: we don't repeat the blk_size check for each new device. * Stacking drivers are expected to know what they are doing. */ old_sector = -1; old_dev = 0; do { char b[BDEVNAME_SIZE]; q = bdev_get_queue(bio->bi_bdev); if (unlikely(!q)) { printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", bdevname(bio->bi_bdev, b), (long long) bio->bi_sector); goto end_io; } if (unlikely(nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), queue_max_hw_sectors(q)); goto end_io; } if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) goto end_io; if (should_fail_request(bio)) goto end_io; /* * If this device has partitions, remap block n * of partition p to block n+start(p) of the disk. */ blk_partition_remap(bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) goto end_io; if (old_sector != -1) trace_block_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; if (bio_check_eod(bio, nr_sectors)) goto end_io; if (bio_rw_flagged(bio, BIO_RW_DISCARD) && !q->prepare_discard_fn) { err = -EOPNOTSUPP; goto end_io; } trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); } while (ret); return; end_io: bio_endio(bio, err); } /* * We only want one ->make_request_fn to be active at a time, * else stack usage with stacked devices could be a problem. * So use current->bio_{list,tail} to keep a list of requests * submited by a make_request_fn function. * current->bio_tail is also used as a flag to say if * generic_make_request is currently active in this task or not. * If it is NULL, then no make_request is active. If it is non-NULL, * then a make_request is active, and new requests should be added * at the tail */ void generic_make_request(struct bio *bio) { if (current->bio_tail) { /* make_request is active */ *(current->bio_tail) = bio; bio->bi_next = NULL; current->bio_tail = &bio->bi_next; return; } /* following loop may be a bit non-obvious, and so deserves some * explanation. * Before entering the loop, bio->bi_next is NULL (as all callers * ensure that) so we have a list with a single bio. * We pretend that we have just taken it off a longer list, so * we assign bio_list to the next (which is NULL) and bio_tail * to &bio_list, thus initialising the bio_list of new bios to be * added. __generic_make_request may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so fixup bio_list and * bio_tail or bi_next, and call into __generic_make_request again. * * The loop was structured like this to make only one call to * __generic_make_request (which is important as it is large and * inlined) and to keep the structure simple. */ BUG_ON(bio->bi_next); do { current->bio_list = bio->bi_next; if (bio->bi_next == NULL) current->bio_tail = &current->bio_list; else bio->bi_next = NULL; __generic_make_request(bio); bio = current->bio_list; } while (bio); current->bio_tail = NULL; /* deactivate */ } EXPORT_SYMBOL(generic_make_request); /** * submit_bio - submit a bio to the block device layer for I/O * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) * @bio: The &struct bio which describes the I/O * * submit_bio() is very similar in purpose to generic_make_request(), and * uses that function to do most of the work. Both are fairly rough * interfaces; @bio must be presetup and ready for I/O. * */ void submit_bio(int rw, struct bio *bio) { int count = bio_sectors(bio); bio->bi_rw |= rw; /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ if (bio_has_data(bio)) { if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { task_io_account_read(bio->bi_size); count_vm_events(PGPGIN, count); } if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, bdevname(bio->bi_bdev, b)); } } generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); /** * blk_rq_check_limits - Helper function to check a request for the queue limit * @q: the queue * @rq: the request being checked * * Description: * @rq may have been made based on weaker limitations of upper-level queues * in request stacking drivers, and it may violate the limitation of @q. * Since the block layer and the underlying device driver trust @rq * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * * This function should also be useful for request stacking drivers * in some cases below, so export this fuction. * Request stacking drivers like request-based dm may change the queue * limits while requests are in the queue (e.g. dm's table swapping). * Such request stacking drivers should check those requests agaist * the new queue limits again when they dispatch those requests, * although such checkings are also done against the old queue limits * when submitting requests. */ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { if (blk_rq_sectors(rq) > queue_max_sectors(q) || blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } /* * queue's settings related to segment counting like q->bounce_pfn * may differ from that of other stacking queues. * Recalculate it to check the request correctly on this queue's * limitation. */ blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_phys_segments(q) || rq->nr_phys_segments > queue_max_hw_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } return 0; } EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @q: the queue to submit the request * @rq: the request being queued */ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; if (blk_rq_check_limits(q, rq)) return -EIO; #ifdef CONFIG_FAIL_MAKE_REQUEST if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && should_fail(&fail_make_request, blk_rq_bytes(rq))) return -EIO; #endif spin_lock_irqsave(q->queue_lock, flags); /* * Submitting request must be dequeued before calling this function * because it will be linked to another request_queue */ BUG_ON(blk_queued_rq(rq)); drive_stat_acct(rq, 1); __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); spin_unlock_irqrestore(q->queue_lock, flags); return 0; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_err_bytes - determine number of bytes till the next failure boundary * @rq: request to examine * * Description: * A request could be merge of IOs which require different failure * handling. This function determines the number of bytes which * can be failed from the beginning of the request without * crossing into area which need to be retried further. * * Return: * The number of bytes to fail. * * Context: * queue_lock must be held. */ unsigned int blk_rq_err_bytes(const struct request *rq) { unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; unsigned int bytes = 0; struct bio *bio; if (!(rq->cmd_flags & REQ_MIXED_MERGE)) return blk_rq_bytes(rq); /* * Currently the only 'mixing' which can happen is between * different fastfail types. We can safely fail portions * which have all the failfast bits that the first one has - * the ones which are at least as eager to fail as the first * one. */ for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_rw & ff) != ff) break; bytes += bio->bi_size; } /* this could lead to infinite loop */ BUG_ON(blk_rq_bytes(rq) && !bytes); return bytes; } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); struct hd_struct *part; int cpu; cpu = part_stat_lock(); part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } } static void blk_account_io_done(struct request *req) { /* * Account IO completion. bar_rq isn't accounted as a normal * IO on queueing nor completion. Accounting the containing * request is enough. */ if (blk_do_io_stat(req) && req != &req->q->bar_rq) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; int cpu; cpu = part_stat_lock(); part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); part_dec_in_flight(part, rw); part_stat_unlock(); } } /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at * * Description: * Return the request at the top of @q. The returned request * should be started using blk_start_request() before LLD starts * processing it. * * Return: * Pointer to the request at the top of @q if available. Null * otherwise. * * Context: * queue_lock must be held. */ struct request *blk_peek_request(struct request_queue *q) { struct request *rq; int ret; while ((rq = __elv_next_request(q)) != NULL) { if (!(rq->cmd_flags & REQ_STARTED)) { /* * This is the first time the device driver * sees this request (possibly after * requeueing). Notify IO scheduler. */ if (blk_sorted_rq(rq)) elv_activate_rq(q, rq); /* * just mark as started even if we don't start * it, a request that has been delayed should * not be passed by new incoming requests */ rq->cmd_flags |= REQ_STARTED; trace_block_rq_issue(q, rq); } if (!q->boundary_rq || q->boundary_rq == rq) { q->end_sector = rq_end_sector(rq); q->boundary_rq = NULL; } if (rq->cmd_flags & REQ_DONTPREP) break; if (q->dma_drain_size && blk_rq_bytes(rq)) { /* * make sure space for the drain appears we * know we can do this because max_hw_segments * has been adjusted to be one fewer than the * device can handle */ rq->nr_phys_segments++; } if (!q->prep_rq_fn) break; ret = q->prep_rq_fn(q, rq); if (ret == BLKPREP_OK) { break; } else if (ret == BLKPREP_DEFER) { /* * the request may have been (partially) prepped. * we need to keep this request in the front to * avoid resource deadlock. REQ_STARTED will * prevent other fs requests from passing this one. */ if (q->dma_drain_size && blk_rq_bytes(rq) && !(rq->cmd_flags & REQ_DONTPREP)) { /* * remove the space for the drain we added * so that we don't add it again */ --rq->nr_phys_segments; } rq = NULL; break; } else if (ret == BLKPREP_KILL) { rq->cmd_flags |= REQ_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); __blk_end_request_all(rq, -EIO); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; } } return rq; } EXPORT_SYMBOL(blk_peek_request); void blk_dequeue_request(struct request *rq) { struct request_queue *q = rq->q; BUG_ON(list_empty(&rq->queuelist)); BUG_ON(ELV_ON_HASH(rq)); list_del_init(&rq->queuelist); /* * the time frame between a request being removed from the lists * and to it is freed is accounted as io that is in progress at * the driver side. */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; /* * Mark this device as supporting hardware queuing, if * we have more IOs in flight than 4. */ if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) set_bit(QUEUE_FLAG_CQ, &q->queue_flags); } } /** * blk_start_request - start request processing on the driver * @req: request to dequeue * * Description: * Dequeue @req and start timeout timer on it. This hands off the * request to the driver. * * Block internal functions which don't want to start timer should * call blk_dequeue_request(). * * Context: * queue_lock must be held. */ void blk_start_request(struct request *req) { blk_dequeue_request(req); /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. */ req->resid_len = blk_rq_bytes(req); if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); /** * blk_fetch_request - fetch a request from a request queue * @q: request queue to fetch a request from * * Description: * Return the request at the top of @q. The request is started on * return and LLD can start processing it immediately. * * Return: * Pointer to the request at the top of @q if available. Null * otherwise. * * Context: * queue_lock must be held. */ struct request *blk_fetch_request(struct request_queue *q) { struct request *rq; rq = blk_peek_request(q); if (rq) blk_start_request(rq); return rq; } EXPORT_SYMBOL(blk_fetch_request); /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. * Actual device drivers should use blk_end_request instead. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes, bio_nbytes, next_idx = 0; struct bio *bio; if (!req->bio) return false; trace_block_rq_complete(req->q, req); /* * For fs requests, rq is just carrier of independent bio's * and each partial completion should be handled separately. * Reset per-request error on each partial completion. * * TODO: tj: This is too subtle. It would be better to let * low level drivers do what they see fit. */ if (blk_fs_request(req)) req->errors = 0; if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", req->rq_disk ? req->rq_disk->disk_name : "?", (unsigned long long)blk_rq_pos(req)); } blk_account_io_completion(req, nr_bytes); total_bytes = bio_nbytes = 0; while ((bio = req->bio) != NULL) { int nbytes; if (nr_bytes >= bio->bi_size) { req->bio = bio->bi_next; nbytes = bio->bi_size; req_bio_endio(req, bio, nbytes, error); next_idx = 0; bio_nbytes = 0; } else { int idx = bio->bi_idx + next_idx; if (unlikely(idx >= bio->bi_vcnt)) { blk_dump_rq_flags(req, "__end_that"); printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", __func__, idx, bio->bi_vcnt); break; } nbytes = bio_iovec_idx(bio, idx)->bv_len; BIO_BUG_ON(nbytes > bio->bi_size); /* * not a complete bvec done */ if (unlikely(nbytes > nr_bytes)) { bio_nbytes += nr_bytes; total_bytes += nr_bytes; break; } /* * advance to the next vector */ next_idx++; bio_nbytes += nbytes; } total_bytes += nbytes; nr_bytes -= nbytes; bio = req->bio; if (bio) { /* * end more in this run, or just return 'not-done' */ if (unlikely(nr_bytes <= 0)) break; } } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } /* * if the request wasn't completed, update state */ if (bio_nbytes) { req_bio_endio(req, bio, bio_nbytes, error); bio->bi_idx += next_idx; bio_iovec(bio)->bv_offset += nr_bytes; bio_iovec(bio)->bv_len -= nr_bytes; } req->__data_len -= total_bytes; req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ if (blk_fs_request(req) || blk_discard_rq(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->cmd_flags & REQ_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; } /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { printk(KERN_ERR "blk: request botched\n"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ blk_recalc_rq_segments(req); return true; } EXPORT_SYMBOL_GPL(blk_update_request); static bool blk_update_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_request(rq, error, nr_bytes)) return true; /* Bidi request must be completed as a whole */ if (unlikely(blk_bidi_rq(rq)) && blk_update_request(rq->next_rq, error, bidi_bytes)) return true; add_disk_randomness(rq->rq_disk); return false; } /* * queue lock must be held */ static void blk_finish_request(struct request *req, int error) { if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); BUG_ON(blk_queued_rq(req)); if (unlikely(laptop_mode) && blk_fs_request(req)) laptop_io_completion(); blk_delete_timer(req); blk_account_io_done(req); if (req->end_io) req->end_io(req, error); else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); __blk_put_request(req->q, req); } } /** * blk_end_bidi_request - Complete a bidi request * @rq: the request to complete * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. * Drivers that supports bidi can safely call this member for any * type of request, bidi or uni. In the later case @bidi_bytes is * just ignored. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ static bool blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags; if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; spin_lock_irqsave(q->queue_lock, flags); blk_finish_request(rq, error); spin_unlock_irqrestore(q->queue_lock, flags); return false; } /** * __blk_end_bidi_request - Complete a bidi request with queue lock held * @rq: the request to complete * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Identical to blk_end_bidi_request() except that queue lock is * assumed to be locked on entry and remains so on return. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ static bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; blk_finish_request(rq, error); return false; } /** * blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * * Description: * Ends I/O on a number of bytes attached to @rq. * If @rq has leftover, sets it up for the next range of segments. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { return blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish * @error: %0 for success, < %0 for error * * Description: * Completely finish @rq. */ void blk_end_request_all(struct request *rq, int error) { bool pending; unsigned int bidi_bytes = 0; if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } EXPORT_SYMBOL(blk_end_request_all); /** * blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for * @error: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool blk_end_request_cur(struct request *rq, int error) { return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } EXPORT_SYMBOL(blk_end_request_cur); /** * blk_end_request_err - Finish a request till the next failure boundary. * @rq: the request to finish till the next failure boundary for * @error: must be negative errno * * Description: * Complete @rq till the next failure boundary. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool blk_end_request_err(struct request *rq, int error) { WARN_ON(error >= 0); return blk_end_request(rq, error, blk_rq_err_bytes(rq)); } EXPORT_SYMBOL_GPL(blk_end_request_err); /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * * Description: * Must be called with queue lock held unlike blk_end_request(). * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { return __blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish * @error: %0 for success, < %0 for error * * Description: * Completely finish @rq. Must be called with queue lock held. */ void __blk_end_request_all(struct request *rq, int error) { bool pending; unsigned int bidi_bytes = 0; if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } EXPORT_SYMBOL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for * @error: %0 for success, < %0 for error * * Description: * Complete the current consecutively mapped chunk from @rq. Must * be called with queue lock held. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool __blk_end_request_cur(struct request *rq, int error) { return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } EXPORT_SYMBOL(__blk_end_request_cur); /** * __blk_end_request_err - Finish a request till the next failure boundary. * @rq: the request to finish till the next failure boundary for * @error: must be negative errno * * Description: * Complete @rq till the next failure boundary. Must be called * with queue lock held. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool __blk_end_request_err(struct request *rq, int error) { WARN_ON(error >= 0); return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); } EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= bio->bi_rw & REQ_RW; if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); rq->buffer = bio_data(bio); } rq->__data_len = bio->bi_size; rq->bio = rq->biotail = bio; if (bio->bi_bdev) rq->rq_disk = bio->bi_bdev->bd_disk; } /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (q->lld_busy_fn) return q->lld_busy_fn(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); dst->nr_phys_segments = src->nr_phys_segments; dst->ioprio = src->ioprio; dst->extra_len = src->extra_len; } /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio, *bio_src; if (!bs) bs = fs_bio_set; blk_rq_init(NULL, rq); __rq_for_each_bio(bio_src, rq_src) { bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); if (!bio) goto free_and_out; __bio_clone(bio, bio_src); if (bio_integrity(bio_src) && bio_integrity_clone(bio, bio_src, gfp_mask, bs)) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else rq->bio = rq->biotail = bio; } __blk_rq_prep_clone(rq, rq_src); return 0; free_and_out: if (bio) bio_free(bio, bs); blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * sizeof(((struct request *)0)->cmd_flags)); kblockd_workqueue = create_workqueue("kblockd"); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); blk_requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; }