author	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-10 16:06:15 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-10 16:06:15 -0400
commit	7426d62871dafbeeed087d609c6469a515c88389 (patch)
tree	7d935f360eeb5e78ba633238a29e9213c291aad7
parent	4d7696f1b05f4aeb586c74868fe3da2731daca4b (diff)
parent	7fff5e8f727285cf54e6aba10f31b196f207b98a (diff)
Merge tag 'dm-3.12-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device-mapper updates from Mike Snitzer:
 "Add the ability to collect I/O statistics on user-defined regions of
  a device-mapper device. This dm-stats code required the
  reintroduction of a div64_u64_rem() helper, but as a separate method
  that doesn't slow down div64_u64() -- especially on 32-bit systems.

  Allow the error target to replace request-based DM devices
  (e.g. multipath) in addition to bio-based DM devices.

  Various other small code fixes and improvements to thin-provisioning,
  DM cache and the DM ioctl interface"

* tag 'dm-3.12-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm stripe: silence a couple sparse warnings
  dm: add statistics support
  dm thin: always return -ENOSPC if no_free_space is set
  dm ioctl: cleanup error handling in table_load
  dm ioctl: increase granularity of type_lock when loading table
  dm ioctl: prevent rename to empty name or uuid
  dm thin: set pool read-only if breaking_sharing fails block allocation
  dm thin: prefix pool error messages with pool device name
  dm: allow error target to replace bio-based and request-based targets
  math64: New separate div64_u64_rem helper
  dm space map: optimise sm_ll_dec and sm_ll_inc
  dm btree: prefetch child nodes when walking tree for a dm_btree_del
  dm btree: use pop_frame in dm_btree_del to cleanup code
  dm cache: eliminate holes in cache structure
  dm cache: fix stacking of geometry limits
  dm thin: fix stacking of geometry limits
  dm thin: add data block size limits to Documentation
  dm cache: add data block size limits to code and Documentation
  dm cache: document metadata device is exclussive to a cache
  dm: stop using WQ_NON_REENTRANT
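The dm-stats feature described above is driven entirely through target messages. As a rough illustration of the workflow documented in the new Documentation/device-mapper/statistics.txt included in this merge (the device name 'vol' is hypothetical), one might do:

    # create one region covering the whole device, split into 100 areas
    dmsetup message vol 0 @stats_create - /100

    # list regions and print the per-area counters of region 0
    dmsetup message vol 0 @stats_list
    dmsetup message vol 0 @stats_print 0

    # drop the region when finished
    dmsetup message vol 0 @stats_delete 0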
-rw-r--r--  Documentation/device-mapper/cache.txt              |   6
-rw-r--r--  Documentation/device-mapper/statistics.txt         | 186
-rw-r--r--  Documentation/device-mapper/thin-provisioning.txt  |  15
-rw-r--r--  drivers/md/Makefile                                |   2
-rw-r--r--  drivers/md/dm-cache-target.c                       |  59
-rw-r--r--  drivers/md/dm-crypt.c                              |  10
-rw-r--r--  drivers/md/dm-ioctl.c                              |  60
-rw-r--r--  drivers/md/dm-kcopyd.c                             |   3
-rw-r--r--  drivers/md/dm-raid1.c                              |   3
-rw-r--r--  drivers/md/dm-stats.c                              | 969
-rw-r--r--  drivers/md/dm-stats.h                              |  40
-rw-r--r--  drivers/md/dm-stripe.c                             |   1
-rw-r--r--  drivers/md/dm-table.c                              |  20
-rw-r--r--  drivers/md/dm-target.c                             |   9
-rw-r--r--  drivers/md/dm-thin.c                               | 122
-rw-r--r--  drivers/md/dm.c                                    |  70
-rw-r--r--  drivers/md/dm.h                                    |  27
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c      |   5
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h      |   5
-rw-r--r--  drivers/md/persistent-data/dm-btree.c              |  28
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c   |  77
-rw-r--r--  include/linux/device-mapper.h                      |   9
-rw-r--r--  include/linux/math64.h                             |  13
-rw-r--r--  include/uapi/linux/dm-ioctl.h                      |   4
-rw-r--r--  lib/div64.c                                        |  40
25 files changed, 1621 insertions, 162 deletions
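One of the changes pulled here lets the "error" target stand in for request-based devices such as multipath, not only bio-based ones. A hedged sketch of how an administrator might swap a failing multipath map for the error target with dmsetup (the device name mpatha is hypothetical; the error target takes no arguments beyond start and length):

    dmsetup suspend mpatha
    dmsetup load mpatha --table "0 $(blockdev --getsz /dev/mapper/mpatha) error"
    dmsetup resume mpatha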
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index e8cdf7241b66..33d45ee0b737 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -50,14 +50,16 @@ other parameters detailed later):
 which are dirty, and extra hints for use by the policy object.
 This information could be put on the cache device, but having it
 separate allows the volume manager to configure it differently,
-e.g. as a mirror for extra robustness.
+e.g. as a mirror for extra robustness. This metadata device may only
+be used by a single cache device.
 
 Fixed block size
 ----------------
 
 The origin is divided up into blocks of a fixed size. This block size
 is configurable when you first create the cache. Typically we've been
-using block sizes of 256k - 1024k.
+using block sizes of 256KB - 1024KB. The block size must be between 64
+(32KB) and 2097152 (1GB) and a multiple of 64 (32KB).
 
 Having a fixed block size simplifies the target a lot. But it is
 something of a compromise. For instance, a small part of a block may be
diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
new file mode 100644
index 000000000000..2a1673adc200
--- /dev/null
+++ b/Documentation/device-mapper/statistics.txt
@@ -0,0 +1,186 @@
1DM statistics
2=============
3
4Device Mapper supports the collection of I/O statistics on user-defined
5regions of a DM device. If no regions are defined no statistics are
6collected so there isn't any performance impact. Only bio-based DM
7devices are currently supported.
8
9Each user-defined region specifies a starting sector, length and step.
10Individual statistics will be collected for each step-sized area within
11the range specified.
12
13The I/O statistics counters for each step-sized area of a region are
14in the same format as /sys/block/*/stat or /proc/diskstats (see:
15Documentation/iostats.txt). But two extra counters (12 and 13) are
16provided: total time spent reading and writing in milliseconds. All
17these counters may be accessed by sending the @stats_print message to
18the appropriate DM device via dmsetup.
19
20Each region has a corresponding unique identifier, which we call a
21region_id, that is assigned when the region is created. The region_id
22must be supplied when querying statistics about the region, deleting the
23region, etc. Unique region_ids enable multiple userspace programs to
24request and process statistics for the same DM device without stepping
25on each other's data.
26
27The creation of DM statistics will allocate memory via kmalloc or
28fallback to using vmalloc space. At most, 1/4 of the overall system
29memory may be allocated by DM statistics. The admin can see how much
30memory is used by reading
31/sys/module/dm_mod/parameters/stats_current_allocated_bytes
32
33Messages
34========
35
36 @stats_create <range> <step> [<program_id> [<aux_data>]]
37
38 Create a new region and return the region_id.
39
40 <range>
41 "-" - whole device
42 "<start_sector>+<length>" - a range of <length> 512-byte sectors
43 starting with <start_sector>.
44
45 <step>
46 "<area_size>" - the range is subdivided into areas each containing
47 <area_size> sectors.
48 "/<number_of_areas>" - the range is subdivided into the specified
49 number of areas.
50
51 <program_id>
52 An optional parameter. A name that uniquely identifies
53 the userspace owner of the range. This groups ranges together
54 so that userspace programs can identify the ranges they
55 created and ignore those created by others.
56 The kernel returns this string back in the output of
57 @stats_list message, but it doesn't use it for anything else.
58
59 <aux_data>
60 An optional parameter. A word that provides auxiliary data
61 that is useful to the client program that created the range.
62 The kernel returns this string back in the output of
63 @stats_list message, but it doesn't use this value for anything.
64
65 @stats_delete <region_id>
66
67 Delete the region with the specified id.
68
69 <region_id>
70 region_id returned from @stats_create
71
72 @stats_clear <region_id>
73
74 Clear all the counters except the in-flight i/o counters.
75
76 <region_id>
77 region_id returned from @stats_create
78
79 @stats_list [<program_id>]
80
81 List all regions registered with @stats_create.
82
83 <program_id>
84 An optional parameter.
85 If this parameter is specified, only matching regions
86 are returned.
87 If it is not specified, all regions are returned.
88
89 Output format:
90 <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
91
92 @stats_print <region_id> [<starting_line> <number_of_lines>]
93
94 Print counters for each step-sized area of a region.
95
96 <region_id>
97 region_id returned from @stats_create
98
99 <starting_line>
100 The index of the starting line in the output.
101 If omitted, all lines are returned.
102
103 <number_of_lines>
104 The number of lines to include in the output.
105 If omitted, all lines are returned.
106
107 Output format for each step-sized area of a region:
108
109 <start_sector>+<length> counters
110
111 The first 11 counters have the same meaning as
112 /sys/block/*/stat or /proc/diskstats.
113
114 Please refer to Documentation/iostats.txt for details.
115
116 1. the number of reads completed
117 2. the number of reads merged
118 3. the number of sectors read
119 4. the number of milliseconds spent reading
120 5. the number of writes completed
121 6. the number of writes merged
122 7. the number of sectors written
123 8. the number of milliseconds spent writing
124 9. the number of I/Os currently in progress
125 10. the number of milliseconds spent doing I/Os
126 11. the weighted number of milliseconds spent doing I/Os
127
128 Additional counters:
129 12. the total time spent reading in milliseconds
130 13. the total time spent writing in milliseconds
131
132 @stats_print_clear <region_id> [<starting_line> <number_of_lines>]
133
134 Atomically print and then clear all the counters except the
135 in-flight i/o counters. Useful when the client consuming the
136 statistics does not want to lose any statistics (those updated
137 between printing and clearing).
138
139 <region_id>
140 region_id returned from @stats_create
141
142 <starting_line>
143 The index of the starting line in the output.
144 If omitted, all lines are printed and then cleared.
145
146 <number_of_lines>
147 The number of lines to process.
148 If omitted, all lines are printed and then cleared.
149
150 @stats_set_aux <region_id> <aux_data>
151
152 Store auxiliary data aux_data for the specified region.
153
154 <region_id>
155 region_id returned from @stats_create
156
157 <aux_data>
158 The string that identifies data which is useful to the client
159 program that created the range. The kernel returns this
160 string back in the output of @stats_list message, but it
161 doesn't use this value for anything.
162
163Examples
164========
165
166Subdivide the DM device 'vol' into 100 pieces and start collecting
167statistics on them:
168
169 dmsetup message vol 0 @stats_create - /100
170
171Set the auxillary data string to "foo bar baz" (the escape for each
172space must also be escaped, otherwise the shell will consume them):
173
174 dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz
175
176List the statistics:
177
178 dmsetup message vol 0 @stats_list
179
180Print the statistics:
181
182 dmsetup message vol 0 @stats_print 0
183
184Delete the statistics:
185
186 dmsetup message vol 0 @stats_delete 0
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 30b8b83bd333..50c44cf79b0e 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -99,13 +99,14 @@ Using an existing pool device
 $data_block_size $low_water_mark"
 
 $data_block_size gives the smallest unit of disk space that can be
-allocated at a time expressed in units of 512-byte sectors. People
-primarily interested in thin provisioning may want to use a value such
-as 1024 (512KB). People doing lots of snapshotting may want a smaller value
-such as 128 (64KB). If you are not zeroing newly-allocated data,
-a larger $data_block_size in the region of 256000 (128MB) is suggested.
-$data_block_size must be the same for the lifetime of the
-metadata device.
+allocated at a time expressed in units of 512-byte sectors.
+$data_block_size must be between 128 (64KB) and 2097152 (1GB) and a
+multiple of 128 (64KB). $data_block_size cannot be changed after the
+thin-pool is created. People primarily interested in thin provisioning
+may want to use a value such as 1024 (512KB). People doing lots of
+snapshotting may want a smaller value such as 128 (64KB). If you are
+not zeroing newly-allocated data, a larger $data_block_size in the
+region of 256000 (128MB) is suggested.
 
 $low_water_mark is expressed in blocks of size $data_block_size. If
 free space on the data device drops below this level then a dm event
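To put the new limits in concrete terms, a pool table line that stays within them might look like the following sketch (device names and sizes are made up; 128 sectors is the 64KB minimum data block size, and the table format is the one quoted just above in this file):

    dmsetup create pool \
        --table "0 20971520 thin-pool /dev/mapper/pool-metadata /dev/mapper/pool-data 128 32768"

Here 20971520 sectors is a 10GB pool and the low-water mark of 32768 blocks triggers a dm event when free space runs low.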
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5ef78efc27f2..2acc43fe0229 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,7 +3,7 @@
 #
 
 dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
		    dm-snap-persistent.o
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 0df3ec085ebb..29569768ffbf 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -67,9 +67,11 @@ static void free_bitset(unsigned long *bits)
 #define MIGRATION_COUNT_WINDOW 10
 
 /*
- * The block size of the device holding cache data must be >= 32KB
+ * The block size of the device holding cache data must be
+ * between 32KB and 1GB.
  */
 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
+#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
 /*
  * FIXME: the cache is read/write for the time being.
@@ -101,6 +103,8 @@ struct cache {
 	struct dm_target *ti;
 	struct dm_target_callbacks callbacks;
 
+	struct dm_cache_metadata *cmd;
+
 	/*
 	 * Metadata is written to this device.
 	 */
@@ -117,11 +121,6 @@ struct cache {
 	struct dm_dev *cache_dev;
 
 	/*
-	 * Cache features such as write-through.
-	 */
-	struct cache_features features;
-
-	/*
 	 * Size of the origin device in _complete_ blocks and native sectors.
 	 */
 	dm_oblock_t origin_blocks;
@@ -138,8 +137,6 @@ struct cache {
 	uint32_t sectors_per_block;
 	int sectors_per_block_shift;
 
-	struct dm_cache_metadata *cmd;
-
 	spinlock_t lock;
 	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
@@ -148,8 +145,8 @@ struct cache {
 	struct list_head completed_migrations;
 	struct list_head need_commit_migrations;
 	sector_t migration_threshold;
-	atomic_t nr_migrations;
 	wait_queue_head_t migration_wait;
+	atomic_t nr_migrations;
 
 	/*
 	 * cache_size entries, dirty if set
@@ -160,9 +157,16 @@ struct cache {
 	/*
 	 * origin_blocks entries, discarded if set.
 	 */
-	uint32_t discard_block_size; /* a power of 2 times sectors per block */
 	dm_dblock_t discard_nr_blocks;
 	unsigned long *discard_bitset;
+	uint32_t discard_block_size; /* a power of 2 times sectors per block */
+
+	/*
+	 * Rather than reconstructing the table line for the status we just
+	 * save it and regurgitate.
+	 */
+	unsigned nr_ctr_args;
+	const char **ctr_args;
 
 	struct dm_kcopyd_client *copier;
 	struct workqueue_struct *wq;
@@ -187,14 +191,12 @@ struct cache {
 	bool loaded_mappings:1;
 	bool loaded_discards:1;
 
-	struct cache_stats stats;
-
 	/*
-	 * Rather than reconstructing the table line for the status we just
-	 * save it and regurgitate.
+	 * Cache features such as write-through.
 	 */
-	unsigned nr_ctr_args;
-	const char **ctr_args;
+	struct cache_features features;
+
+	struct cache_stats stats;
 };
 
 struct per_bio_data {
@@ -1687,24 +1689,25 @@ static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
 			    char **error)
 {
-	unsigned long tmp;
+	unsigned long block_size;
 
 	if (!at_least_one_arg(as, error))
 		return -EINVAL;
 
-	if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
-	    tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
-	    tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
+	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
+	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
+	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
+	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
 		*error = "Invalid data block size";
 		return -EINVAL;
 	}
 
-	if (tmp > ca->cache_sectors) {
+	if (block_size > ca->cache_sectors) {
 		*error = "Data block size is larger than the cache device";
 		return -EINVAL;
 	}
 
-	ca->block_size = tmp;
+	ca->block_size = block_size;
 
 	return 0;
 }
@@ -2609,9 +2612,17 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct cache *cache = ti->private;
+	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
 
-	blk_limits_io_min(limits, 0);
-	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+	/*
+	 * If the system-determined stacked limits are compatible with the
+	 * cache's blocksize (io_opt is a factor) do not override them.
+	 */
+	if (io_opt_sectors < cache->sectors_per_block ||
+	    do_div(io_opt_sectors, cache->sectors_per_block)) {
+		blk_limits_io_min(limits, 0);
+		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
+	}
 	set_discard_limits(cache, limits);
 }
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6d2d41ae9e32..0fce0bc1a957 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1645,20 +1645,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ret = -ENOMEM;
-	cc->io_queue = alloc_workqueue("kcryptd_io",
-				       WQ_NON_REENTRANT|
-				       WQ_MEM_RECLAIM,
-				       1);
+	cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1);
 	if (!cc->io_queue) {
 		ti->error = "Couldn't create kcryptd io queue";
 		goto bad;
 	}
 
 	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_NON_REENTRANT|
-					  WQ_CPU_INTENSIVE|
-					  WQ_MEM_RECLAIM,
-					  1);
+					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index f1b758675ec7..afe08146f73e 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -877,7 +877,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
 
 	if (new_data < param->data ||
-	    invalid_str(new_data, (void *) param + param_size) ||
+	    invalid_str(new_data, (void *) param + param_size) || !*new_data ||
 	    strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
 		DMWARN("Invalid new mapped device name or uuid string supplied.");
 		return -EINVAL;
@@ -1262,44 +1262,37 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 
 	r = dm_table_create(&t, get_mode(param), param->target_count, md);
 	if (r)
-		goto out;
+		goto err;
 
+	/* Protect md->type and md->queue against concurrent table loads. */
+	dm_lock_md_type(md);
 	r = populate_table(t, param, param_size);
-	if (r) {
-		dm_table_destroy(t);
-		goto out;
-	}
+	if (r)
+		goto err_unlock_md_type;
 
 	immutable_target_type = dm_get_immutable_target_type(md);
 	if (immutable_target_type &&
 	    (immutable_target_type != dm_table_get_immutable_target_type(t))) {
 		DMWARN("can't replace immutable target type %s",
 		       immutable_target_type->name);
-		dm_table_destroy(t);
 		r = -EINVAL;
-		goto out;
+		goto err_unlock_md_type;
 	}
 
-	/* Protect md->type and md->queue against concurrent table loads. */
-	dm_lock_md_type(md);
 	if (dm_get_md_type(md) == DM_TYPE_NONE)
 		/* Initial table load: acquire type of table. */
 		dm_set_md_type(md, dm_table_get_type(t));
 	else if (dm_get_md_type(md) != dm_table_get_type(t)) {
 		DMWARN("can't change device type after initial table load.");
-		dm_table_destroy(t);
-		dm_unlock_md_type(md);
 		r = -EINVAL;
-		goto out;
+		goto err_unlock_md_type;
 	}
 
 	/* setup md->queue to reflect md's type (may block) */
 	r = dm_setup_md_queue(md);
 	if (r) {
 		DMWARN("unable to set up device queue for new table.");
-		dm_table_destroy(t);
-		dm_unlock_md_type(md);
-		goto out;
+		goto err_unlock_md_type;
 	}
 	dm_unlock_md_type(md);
 
@@ -1309,9 +1302,8 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	if (!hc || hc->md != md) {
 		DMWARN("device has been removed from the dev hash table.");
 		up_write(&_hash_lock);
-		dm_table_destroy(t);
 		r = -ENXIO;
-		goto out;
+		goto err_destroy_table;
 	}
 
 	if (hc->new_map)
@@ -1322,7 +1314,6 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	param->flags |= DM_INACTIVE_PRESENT_FLAG;
 	__dev_status(md, param);
 
-out:
 	if (old_map) {
 		dm_sync_table(md);
 		dm_table_destroy(old_map);
@@ -1330,6 +1321,15 @@ out:
 
 	dm_put(md);
 
+	return 0;
+
+err_unlock_md_type:
+	dm_unlock_md_type(md);
+err_destroy_table:
+	dm_table_destroy(t);
+err:
+	dm_put(md);
+
 	return r;
 }
 
@@ -1455,20 +1455,26 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 	return 0;
 }
 
-static bool buffer_test_overflow(char *result, unsigned maxlen)
-{
-	return !maxlen || strlen(result) + 1 >= maxlen;
-}
-
 /*
- * Process device-mapper dependent messages.
+ * Process device-mapper dependent messages. Messages prefixed with '@'
+ * are processed by the DM core. All others are delivered to the target.
  * Returns a number <= 1 if message was processed by device mapper.
  * Returns 2 if message should be delivered to the target.
  */
 static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
 			  char *result, unsigned maxlen)
 {
-	return 2;
+	int r;
+
+	if (**argv != '@')
+		return 2; /* no '@' prefix, deliver to target */
+
+	r = dm_stats_message(md, argc, argv, result, maxlen);
+	if (r < 2)
+		return r;
+
+	DMERR("Unsupported message sent to DM core: %s", argv[0]);
+	return -EINVAL;
 }
 
 /*
@@ -1542,7 +1548,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 
 	if (r == 1) {
 		param->flags |= DM_DATA_OUT_FLAG;
-		if (buffer_test_overflow(result, maxlen))
+		if (dm_message_test_buffer_overflow(result, maxlen))
 			param->flags |= DM_BUFFER_FULL_FLAG;
 		else
 			param->data_size = param->data_start + strlen(result) + 1;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d581fe5d2faf..3a7cade5e27d 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -833,8 +833,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
 		goto bad_slab;
 
 	INIT_WORK(&kc->kcopyd_work, do_work);
-	kc->kcopyd_wq = alloc_workqueue("kcopyd",
-					WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0);
 	if (!kc->kcopyd_wq)
 		goto bad_workqueue;
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 699b5be68d31..9584443c5614 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1080,8 +1080,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
 	ti->discard_zeroes_data_unsupported = true;
 
-	ms->kmirrord_wq = alloc_workqueue("kmirrord",
-					  WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
+	ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
 	if (!ms->kmirrord_wq) {
 		DMERR("couldn't start kmirrord");
 		r = -ENOMEM;
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
new file mode 100644
index 000000000000..8ae31e8d3d64
--- /dev/null
+++ b/drivers/md/dm-stats.c
@@ -0,0 +1,969 @@
1#include <linux/errno.h>
2#include <linux/numa.h>
3#include <linux/slab.h>
4#include <linux/rculist.h>
5#include <linux/threads.h>
6#include <linux/preempt.h>
7#include <linux/irqflags.h>
8#include <linux/vmalloc.h>
9#include <linux/mm.h>
10#include <linux/module.h>
11#include <linux/device-mapper.h>
12
13#include "dm.h"
14#include "dm-stats.h"
15
16#define DM_MSG_PREFIX "stats"
17
18static int dm_stat_need_rcu_barrier;
19
20/*
21 * Using 64-bit values to avoid overflow (which is a
22 * problem that block/genhd.c's IO accounting has).
23 */
24struct dm_stat_percpu {
25 unsigned long long sectors[2];
26 unsigned long long ios[2];
27 unsigned long long merges[2];
28 unsigned long long ticks[2];
29 unsigned long long io_ticks[2];
30 unsigned long long io_ticks_total;
31 unsigned long long time_in_queue;
32};
33
34struct dm_stat_shared {
35 atomic_t in_flight[2];
36 unsigned long stamp;
37 struct dm_stat_percpu tmp;
38};
39
40struct dm_stat {
41 struct list_head list_entry;
42 int id;
43 size_t n_entries;
44 sector_t start;
45 sector_t end;
46 sector_t step;
47 const char *program_id;
48 const char *aux_data;
49 struct rcu_head rcu_head;
50 size_t shared_alloc_size;
51 size_t percpu_alloc_size;
52 struct dm_stat_percpu *stat_percpu[NR_CPUS];
53 struct dm_stat_shared stat_shared[0];
54};
55
56struct dm_stats_last_position {
57 sector_t last_sector;
58 unsigned last_rw;
59};
60
61/*
62 * A typo on the command line could possibly make the kernel run out of memory
63 * and crash. To prevent the crash we account all used memory. We fail if we
64 * exhaust 1/4 of all memory or 1/2 of vmalloc space.
65 */
66#define DM_STATS_MEMORY_FACTOR 4
67#define DM_STATS_VMALLOC_FACTOR 2
68
69static DEFINE_SPINLOCK(shared_memory_lock);
70
71static unsigned long shared_memory_amount;
72
73static bool __check_shared_memory(size_t alloc_size)
74{
75 size_t a;
76
77 a = shared_memory_amount + alloc_size;
78 if (a < shared_memory_amount)
79 return false;
80 if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
81 return false;
82#ifdef CONFIG_MMU
83 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
84 return false;
85#endif
86 return true;
87}
88
89static bool check_shared_memory(size_t alloc_size)
90{
91 bool ret;
92
93 spin_lock_irq(&shared_memory_lock);
94
95 ret = __check_shared_memory(alloc_size);
96
97 spin_unlock_irq(&shared_memory_lock);
98
99 return ret;
100}
101
102static bool claim_shared_memory(size_t alloc_size)
103{
104 spin_lock_irq(&shared_memory_lock);
105
106 if (!__check_shared_memory(alloc_size)) {
107 spin_unlock_irq(&shared_memory_lock);
108 return false;
109 }
110
111 shared_memory_amount += alloc_size;
112
113 spin_unlock_irq(&shared_memory_lock);
114
115 return true;
116}
117
118static void free_shared_memory(size_t alloc_size)
119{
120 unsigned long flags;
121
122 spin_lock_irqsave(&shared_memory_lock, flags);
123
124 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
125 spin_unlock_irqrestore(&shared_memory_lock, flags);
126 DMCRIT("Memory usage accounting bug.");
127 return;
128 }
129
130 shared_memory_amount -= alloc_size;
131
132 spin_unlock_irqrestore(&shared_memory_lock, flags);
133}
134
135static void *dm_kvzalloc(size_t alloc_size, int node)
136{
137 void *p;
138
139 if (!claim_shared_memory(alloc_size))
140 return NULL;
141
142 if (alloc_size <= KMALLOC_MAX_SIZE) {
143 p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
144 if (p)
145 return p;
146 }
147 p = vzalloc_node(alloc_size, node);
148 if (p)
149 return p;
150
151 free_shared_memory(alloc_size);
152
153 return NULL;
154}
155
156static void dm_kvfree(void *ptr, size_t alloc_size)
157{
158 if (!ptr)
159 return;
160
161 free_shared_memory(alloc_size);
162
163 if (is_vmalloc_addr(ptr))
164 vfree(ptr);
165 else
166 kfree(ptr);
167}
168
169static void dm_stat_free(struct rcu_head *head)
170{
171 int cpu;
172 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
173
174 kfree(s->program_id);
175 kfree(s->aux_data);
176 for_each_possible_cpu(cpu)
177 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
178 dm_kvfree(s, s->shared_alloc_size);
179}
180
181static int dm_stat_in_flight(struct dm_stat_shared *shared)
182{
183 return atomic_read(&shared->in_flight[READ]) +
184 atomic_read(&shared->in_flight[WRITE]);
185}
186
187void dm_stats_init(struct dm_stats *stats)
188{
189 int cpu;
190 struct dm_stats_last_position *last;
191
192 mutex_init(&stats->mutex);
193 INIT_LIST_HEAD(&stats->list);
194 stats->last = alloc_percpu(struct dm_stats_last_position);
195 for_each_possible_cpu(cpu) {
196 last = per_cpu_ptr(stats->last, cpu);
197 last->last_sector = (sector_t)ULLONG_MAX;
198 last->last_rw = UINT_MAX;
199 }
200}
201
202void dm_stats_cleanup(struct dm_stats *stats)
203{
204 size_t ni;
205 struct dm_stat *s;
206 struct dm_stat_shared *shared;
207
208 while (!list_empty(&stats->list)) {
209 s = container_of(stats->list.next, struct dm_stat, list_entry);
210 list_del(&s->list_entry);
211 for (ni = 0; ni < s->n_entries; ni++) {
212 shared = &s->stat_shared[ni];
213 if (WARN_ON(dm_stat_in_flight(shared))) {
214 DMCRIT("leaked in-flight counter at index %lu "
215 "(start %llu, end %llu, step %llu): reads %d, writes %d",
216 (unsigned long)ni,
217 (unsigned long long)s->start,
218 (unsigned long long)s->end,
219 (unsigned long long)s->step,
220 atomic_read(&shared->in_flight[READ]),
221 atomic_read(&shared->in_flight[WRITE]));
222 }
223 }
224 dm_stat_free(&s->rcu_head);
225 }
226 free_percpu(stats->last);
227}
228
229static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
230 sector_t step, const char *program_id, const char *aux_data,
231 void (*suspend_callback)(struct mapped_device *),
232 void (*resume_callback)(struct mapped_device *),
233 struct mapped_device *md)
234{
235 struct list_head *l;
236 struct dm_stat *s, *tmp_s;
237 sector_t n_entries;
238 size_t ni;
239 size_t shared_alloc_size;
240 size_t percpu_alloc_size;
241 struct dm_stat_percpu *p;
242 int cpu;
243 int ret_id;
244 int r;
245
246 if (end < start || !step)
247 return -EINVAL;
248
249 n_entries = end - start;
250 if (dm_sector_div64(n_entries, step))
251 n_entries++;
252
253 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
254 return -EOVERFLOW;
255
256 shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
257 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
258 return -EOVERFLOW;
259
260 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
261 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
262 return -EOVERFLOW;
263
264 if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
265 return -ENOMEM;
266
267 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
268 if (!s)
269 return -ENOMEM;
270
271 s->n_entries = n_entries;
272 s->start = start;
273 s->end = end;
274 s->step = step;
275 s->shared_alloc_size = shared_alloc_size;
276 s->percpu_alloc_size = percpu_alloc_size;
277
278 s->program_id = kstrdup(program_id, GFP_KERNEL);
279 if (!s->program_id) {
280 r = -ENOMEM;
281 goto out;
282 }
283 s->aux_data = kstrdup(aux_data, GFP_KERNEL);
284 if (!s->aux_data) {
285 r = -ENOMEM;
286 goto out;
287 }
288
289 for (ni = 0; ni < n_entries; ni++) {
290 atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
291 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
292 }
293
294 for_each_possible_cpu(cpu) {
295 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
296 if (!p) {
297 r = -ENOMEM;
298 goto out;
299 }
300 s->stat_percpu[cpu] = p;
301 }
302
303 /*
304 * Suspend/resume to make sure there is no i/o in flight,
305 * so that newly created statistics will be exact.
306 *
307 * (note: we couldn't suspend earlier because we must not
308 * allocate memory while suspended)
309 */
310 suspend_callback(md);
311
312 mutex_lock(&stats->mutex);
313 s->id = 0;
314 list_for_each(l, &stats->list) {
315 tmp_s = container_of(l, struct dm_stat, list_entry);
316 if (WARN_ON(tmp_s->id < s->id)) {
317 r = -EINVAL;
318 goto out_unlock_resume;
319 }
320 if (tmp_s->id > s->id)
321 break;
322 if (unlikely(s->id == INT_MAX)) {
323 r = -ENFILE;
324 goto out_unlock_resume;
325 }
326 s->id++;
327 }
328 ret_id = s->id;
329 list_add_tail_rcu(&s->list_entry, l);
330 mutex_unlock(&stats->mutex);
331
332 resume_callback(md);
333
334 return ret_id;
335
336out_unlock_resume:
337 mutex_unlock(&stats->mutex);
338 resume_callback(md);
339out:
340 dm_stat_free(&s->rcu_head);
341 return r;
342}
343
344static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
345{
346 struct dm_stat *s;
347
348 list_for_each_entry(s, &stats->list, list_entry) {
349 if (s->id > id)
350 break;
351 if (s->id == id)
352 return s;
353 }
354
355 return NULL;
356}
357
358static int dm_stats_delete(struct dm_stats *stats, int id)
359{
360 struct dm_stat *s;
361 int cpu;
362
363 mutex_lock(&stats->mutex);
364
365 s = __dm_stats_find(stats, id);
366 if (!s) {
367 mutex_unlock(&stats->mutex);
368 return -ENOENT;
369 }
370
371 list_del_rcu(&s->list_entry);
372 mutex_unlock(&stats->mutex);
373
374 /*
375 * vfree can't be called from RCU callback
376 */
377 for_each_possible_cpu(cpu)
378 if (is_vmalloc_addr(s->stat_percpu))
379 goto do_sync_free;
380 if (is_vmalloc_addr(s)) {
381do_sync_free:
382 synchronize_rcu_expedited();
383 dm_stat_free(&s->rcu_head);
384 } else {
385 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
386 call_rcu(&s->rcu_head, dm_stat_free);
387 }
388 return 0;
389}
390
391static int dm_stats_list(struct dm_stats *stats, const char *program,
392 char *result, unsigned maxlen)
393{
394 struct dm_stat *s;
395 sector_t len;
396 unsigned sz = 0;
397
398 /*
399 * Output format:
400 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
401 */
402
403 mutex_lock(&stats->mutex);
404 list_for_each_entry(s, &stats->list, list_entry) {
405 if (!program || !strcmp(program, s->program_id)) {
406 len = s->end - s->start;
407 DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
408 (unsigned long long)s->start,
409 (unsigned long long)len,
410 (unsigned long long)s->step,
411 s->program_id,
412 s->aux_data);
413 }
414 }
415 mutex_unlock(&stats->mutex);
416
417 return 1;
418}
419
420static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
421{
422 /*
423 * This is racy, but so is part_round_stats_single.
424 */
425 unsigned long now = jiffies;
426 unsigned in_flight_read;
427 unsigned in_flight_write;
428 unsigned long difference = now - shared->stamp;
429
430 if (!difference)
431 return;
432 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
433 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
434 if (in_flight_read)
435 p->io_ticks[READ] += difference;
436 if (in_flight_write)
437 p->io_ticks[WRITE] += difference;
438 if (in_flight_read + in_flight_write) {
439 p->io_ticks_total += difference;
440 p->time_in_queue += (in_flight_read + in_flight_write) * difference;
441 }
442 shared->stamp = now;
443}
444
445static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
446 unsigned long bi_rw, sector_t len, bool merged,
447 bool end, unsigned long duration)
448{
449 unsigned long idx = bi_rw & REQ_WRITE;
450 struct dm_stat_shared *shared = &s->stat_shared[entry];
451 struct dm_stat_percpu *p;
452
453 /*
454 * For strict correctness we should use local_irq_disable/enable
455 * instead of preempt_disable/enable.
456 *
457 * This is racy if the driver finishes bios from non-interrupt
458 * context as well as from interrupt context or from more different
459 * interrupts.
460 *
461 * However, the race only results in not counting some events,
462 * so it is acceptable.
463 *
464 * part_stat_lock()/part_stat_unlock() have this race too.
465 */
466 preempt_disable();
467 p = &s->stat_percpu[smp_processor_id()][entry];
468
469 if (!end) {
470 dm_stat_round(shared, p);
471 atomic_inc(&shared->in_flight[idx]);
472 } else {
473 dm_stat_round(shared, p);
474 atomic_dec(&shared->in_flight[idx]);
475 p->sectors[idx] += len;
476 p->ios[idx] += 1;
477 p->merges[idx] += merged;
478 p->ticks[idx] += duration;
479 }
480
481 preempt_enable();
482}
483
484static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
485 sector_t bi_sector, sector_t end_sector,
486 bool end, unsigned long duration,
487 struct dm_stats_aux *stats_aux)
488{
489 sector_t rel_sector, offset, todo, fragment_len;
490 size_t entry;
491
492 if (end_sector <= s->start || bi_sector >= s->end)
493 return;
494 if (unlikely(bi_sector < s->start)) {
495 rel_sector = 0;
496 todo = end_sector - s->start;
497 } else {
498 rel_sector = bi_sector - s->start;
499 todo = end_sector - bi_sector;
500 }
501 if (unlikely(end_sector > s->end))
502 todo -= (end_sector - s->end);
503
504 offset = dm_sector_div64(rel_sector, s->step);
505 entry = rel_sector;
506 do {
507 if (WARN_ON_ONCE(entry >= s->n_entries)) {
508 DMCRIT("Invalid area access in region id %d", s->id);
509 return;
510 }
511 fragment_len = todo;
512 if (fragment_len > s->step - offset)
513 fragment_len = s->step - offset;
514 dm_stat_for_entry(s, entry, bi_rw, fragment_len,
515 stats_aux->merged, end, duration);
516 todo -= fragment_len;
517 entry++;
518 offset = 0;
519 } while (unlikely(todo != 0));
520}
521
522void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
523 sector_t bi_sector, unsigned bi_sectors, bool end,
524 unsigned long duration, struct dm_stats_aux *stats_aux)
525{
526 struct dm_stat *s;
527 sector_t end_sector;
528 struct dm_stats_last_position *last;
529
530 if (unlikely(!bi_sectors))
531 return;
532
533 end_sector = bi_sector + bi_sectors;
534
535 if (!end) {
536 /*
537 * A race condition can at worst result in the merged flag being
538 * misrepresented, so we don't have to disable preemption here.
539 */
540 last = __this_cpu_ptr(stats->last);
541 stats_aux->merged =
542 (bi_sector == (ACCESS_ONCE(last->last_sector) &&
543 ((bi_rw & (REQ_WRITE | REQ_DISCARD)) ==
544 (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD)))
545 ));
546 ACCESS_ONCE(last->last_sector) = end_sector;
547 ACCESS_ONCE(last->last_rw) = bi_rw;
548 }
549
550 rcu_read_lock();
551
552 list_for_each_entry_rcu(s, &stats->list, list_entry)
553 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
554
555 rcu_read_unlock();
556}
557
558static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
559 struct dm_stat *s, size_t x)
560{
561 int cpu;
562 struct dm_stat_percpu *p;
563
564 local_irq_disable();
565 p = &s->stat_percpu[smp_processor_id()][x];
566 dm_stat_round(shared, p);
567 local_irq_enable();
568
569 memset(&shared->tmp, 0, sizeof(shared->tmp));
570 for_each_possible_cpu(cpu) {
571 p = &s->stat_percpu[cpu][x];
572 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
573 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
574 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
575 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
576 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
577 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
578 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
579 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
580 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
581 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
582 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
583 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
584 }
585}
586
587static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
588 bool init_tmp_percpu_totals)
589{
590 size_t x;
591 struct dm_stat_shared *shared;
592 struct dm_stat_percpu *p;
593
594 for (x = idx_start; x < idx_end; x++) {
595 shared = &s->stat_shared[x];
596 if (init_tmp_percpu_totals)
597 __dm_stat_init_temporary_percpu_totals(shared, s, x);
598 local_irq_disable();
599 p = &s->stat_percpu[smp_processor_id()][x];
600 p->sectors[READ] -= shared->tmp.sectors[READ];
601 p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
602 p->ios[READ] -= shared->tmp.ios[READ];
603 p->ios[WRITE] -= shared->tmp.ios[WRITE];
604 p->merges[READ] -= shared->tmp.merges[READ];
605 p->merges[WRITE] -= shared->tmp.merges[WRITE];
606 p->ticks[READ] -= shared->tmp.ticks[READ];
607 p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
608 p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
609 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
610 p->io_ticks_total -= shared->tmp.io_ticks_total;
611 p->time_in_queue -= shared->tmp.time_in_queue;
612 local_irq_enable();
613 }
614}
615
616static int dm_stats_clear(struct dm_stats *stats, int id)
617{
618 struct dm_stat *s;
619
620 mutex_lock(&stats->mutex);
621
622 s = __dm_stats_find(stats, id);
623 if (!s) {
624 mutex_unlock(&stats->mutex);
625 return -ENOENT;
626 }
627
628 __dm_stat_clear(s, 0, s->n_entries, true);
629
630 mutex_unlock(&stats->mutex);
631
632 return 1;
633}
634
635/*
636 * This is like jiffies_to_msec, but works for 64-bit values.
637 */
638static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
639{
640 unsigned long long result = 0;
641 unsigned mult;
642
643 if (j)
644 result = jiffies_to_msecs(j & 0x3fffff);
645 if (j >= 1 << 22) {
646 mult = jiffies_to_msecs(1 << 22);
647 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
648 }
649 if (j >= 1ULL << 44)
650 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
651
652 return result;
653}
654
655static int dm_stats_print(struct dm_stats *stats, int id,
656 size_t idx_start, size_t idx_len,
657 bool clear, char *result, unsigned maxlen)
658{
659 unsigned sz = 0;
660 struct dm_stat *s;
661 size_t x;
662 sector_t start, end, step;
663 size_t idx_end;
664 struct dm_stat_shared *shared;
665
666 /*
667 * Output format:
668 * <start_sector>+<length> counters
669 */
670
671 mutex_lock(&stats->mutex);
672
673 s = __dm_stats_find(stats, id);
674 if (!s) {
675 mutex_unlock(&stats->mutex);
676 return -ENOENT;
677 }
678
679 idx_end = idx_start + idx_len;
680 if (idx_end < idx_start ||
681 idx_end > s->n_entries)
682 idx_end = s->n_entries;
683
684 if (idx_start > idx_end)
685 idx_start = idx_end;
686
687 step = s->step;
688 start = s->start + (step * idx_start);
689
690 for (x = idx_start; x < idx_end; x++, start = end) {
691 shared = &s->stat_shared[x];
692 end = start + step;
693 if (unlikely(end > s->end))
694 end = s->end;
695
696 __dm_stat_init_temporary_percpu_totals(shared, s, x);
697
698 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
699 (unsigned long long)start,
700 (unsigned long long)step,
701 shared->tmp.ios[READ],
702 shared->tmp.merges[READ],
703 shared->tmp.sectors[READ],
704 dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
705 shared->tmp.ios[WRITE],
706 shared->tmp.merges[WRITE],
707 shared->tmp.sectors[WRITE],
708 dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
709 dm_stat_in_flight(shared),
710 dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
711 dm_jiffies_to_msec64(shared->tmp.time_in_queue),
712 dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
713 dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
714
715 if (unlikely(sz + 1 >= maxlen))
716 goto buffer_overflow;
717 }
718
719 if (clear)
720 __dm_stat_clear(s, idx_start, idx_end, false);
721
722buffer_overflow:
723 mutex_unlock(&stats->mutex);
724
725 return 1;
726}
727
728static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
729{
730 struct dm_stat *s;
731 const char *new_aux_data;
732
733 mutex_lock(&stats->mutex);
734
735 s = __dm_stats_find(stats, id);
736 if (!s) {
737 mutex_unlock(&stats->mutex);
738 return -ENOENT;
739 }
740
741 new_aux_data = kstrdup(aux_data, GFP_KERNEL);
742 if (!new_aux_data) {
743 mutex_unlock(&stats->mutex);
744 return -ENOMEM;
745 }
746
747 kfree(s->aux_data);
748 s->aux_data = new_aux_data;
749
750 mutex_unlock(&stats->mutex);
751
752 return 0;
753}
754
755static int message_stats_create(struct mapped_device *md,
756 unsigned argc, char **argv,
757 char *result, unsigned maxlen)
758{
759 int id;
760 char dummy;
761 unsigned long long start, end, len, step;
762 unsigned divisor;
763 const char *program_id, *aux_data;
764
765 /*
766 * Input format:
767 * <range> <step> [<program_id> [<aux_data>]]
768 */
769
770 if (argc < 3 || argc > 5)
771 return -EINVAL;
772
773 if (!strcmp(argv[1], "-")) {
774 start = 0;
775 len = dm_get_size(md);
776 if (!len)
777 len = 1;
778 } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
779 start != (sector_t)start || len != (sector_t)len)
780 return -EINVAL;
781
782 end = start + len;
783 if (start >= end)
784 return -EINVAL;
785
786 if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
787 step = end - start;
788 if (do_div(step, divisor))
789 step++;
790 if (!step)
791 step = 1;
792 } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
793 step != (sector_t)step || !step)
794 return -EINVAL;
795
796 program_id = "-";
797 aux_data = "-";
798
799 if (argc > 3)
800 program_id = argv[3];
801
802 if (argc > 4)
803 aux_data = argv[4];
804
805 /*
806 * If a buffer overflow happens after we created the region,
807 * it's too late (the userspace would retry with a larger
808 * buffer, but the region id that caused the overflow is already
809 * leaked). So we must detect buffer overflow in advance.
810 */
811 snprintf(result, maxlen, "%d", INT_MAX);
812 if (dm_message_test_buffer_overflow(result, maxlen))
813 return 1;
814
815 id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
816 dm_internal_suspend, dm_internal_resume, md);
817 if (id < 0)
818 return id;
819
820 snprintf(result, maxlen, "%d", id);
821
822 return 1;
823}
824
825static int message_stats_delete(struct mapped_device *md,
826 unsigned argc, char **argv)
827{
828 int id;
829 char dummy;
830
831 if (argc != 2)
832 return -EINVAL;
833
834 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
835 return -EINVAL;
836
837 return dm_stats_delete(dm_get_stats(md), id);
838}
839
840static int message_stats_clear(struct mapped_device *md,
841 unsigned argc, char **argv)
842{
843 int id;
844 char dummy;
845
846 if (argc != 2)
847 return -EINVAL;
848
849 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
850 return -EINVAL;
851
852 return dm_stats_clear(dm_get_stats(md), id);
853}
854
855static int message_stats_list(struct mapped_device *md,
856 unsigned argc, char **argv,
857 char *result, unsigned maxlen)
858{
859 int r;
860 const char *program = NULL;
861
862 if (argc < 1 || argc > 2)
863 return -EINVAL;
864
865 if (argc > 1) {
866 program = kstrdup(argv[1], GFP_KERNEL);
867 if (!program)
868 return -ENOMEM;
869 }
870
871 r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
872
873 kfree(program);
874
875 return r;
876}
877
878static int message_stats_print(struct mapped_device *md,
879 unsigned argc, char **argv, bool clear,
880 char *result, unsigned maxlen)
881{
882 int id;
883 char dummy;
884 unsigned long idx_start = 0, idx_len = ULONG_MAX;
885
886 if (argc != 2 && argc != 4)
887 return -EINVAL;
888
889 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
890 return -EINVAL;
891
892 if (argc > 3) {
893 if (strcmp(argv[2], "-") &&
894 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
895 return -EINVAL;
896 if (strcmp(argv[3], "-") &&
897 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
898 return -EINVAL;
899 }
900
901 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
902 result, maxlen);
903}
904
905static int message_stats_set_aux(struct mapped_device *md,
906 unsigned argc, char **argv)
907{
908 int id;
909 char dummy;
910
911 if (argc != 3)
912 return -EINVAL;
913
914 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
915 return -EINVAL;
916
917 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
918}
919
920int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
921 char *result, unsigned maxlen)
922{
923 int r;
924
925 if (dm_request_based(md)) {
926 DMWARN("Statistics are only supported for bio-based devices");
927 return -EOPNOTSUPP;
928 }
929
930 /* All messages here must start with '@' */
931 if (!strcasecmp(argv[0], "@stats_create"))
932 r = message_stats_create(md, argc, argv, result, maxlen);
933 else if (!strcasecmp(argv[0], "@stats_delete"))
934 r = message_stats_delete(md, argc, argv);
935 else if (!strcasecmp(argv[0], "@stats_clear"))
936 r = message_stats_clear(md, argc, argv);
937 else if (!strcasecmp(argv[0], "@stats_list"))
938 r = message_stats_list(md, argc, argv, result, maxlen);
939 else if (!strcasecmp(argv[0], "@stats_print"))
940 r = message_stats_print(md, argc, argv, false, result, maxlen);
941 else if (!strcasecmp(argv[0], "@stats_print_clear"))
942 r = message_stats_print(md, argc, argv, true, result, maxlen);
943 else if (!strcasecmp(argv[0], "@stats_set_aux"))
944 r = message_stats_set_aux(md, argc, argv);
945 else
946 return 2; /* this wasn't a stats message */
947
948 if (r == -EINVAL)
949 DMWARN("Invalid parameters for message %s", argv[0]);
950
951 return r;
952}
953
954int __init dm_statistics_init(void)
955{
956 dm_stat_need_rcu_barrier = 0;
957 return 0;
958}
959
960void dm_statistics_exit(void)
961{
962 if (dm_stat_need_rcu_barrier)
963 rcu_barrier();
964 if (WARN_ON(shared_memory_amount))
965 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
966}
967
968module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
969MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
new file mode 100644
index 000000000000..e7c4984bf235
--- /dev/null
+++ b/drivers/md/dm-stats.h
@@ -0,0 +1,40 @@
1#ifndef DM_STATS_H
2#define DM_STATS_H
3
4#include <linux/types.h>
5#include <linux/mutex.h>
6#include <linux/list.h>
7
8int dm_statistics_init(void);
9void dm_statistics_exit(void);
10
11struct dm_stats {
12 struct mutex mutex;
13 struct list_head list; /* list of struct dm_stat */
14 struct dm_stats_last_position __percpu *last;
15 sector_t last_sector;
16 unsigned last_rw;
17};
18
19struct dm_stats_aux {
20 bool merged;
21};
22
23void dm_stats_init(struct dm_stats *st);
24void dm_stats_cleanup(struct dm_stats *st);
25
26struct mapped_device;
27
28int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
29 char *result, unsigned maxlen);
30
31void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
32 sector_t bi_sector, unsigned bi_sectors, bool end,
33 unsigned long duration, struct dm_stats_aux *aux);
34
35static inline bool dm_stats_used(struct dm_stats *st)
36{
37 return !list_empty(&st->list);
38}
39
40#endif
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index d907ca6227ce..73c1712dad96 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -4,6 +4,7 @@
  * This file is released under the GPL.
  */
 
+#include "dm.h"
 #include <linux/device-mapper.h>
 
 #include <linux/module.h>
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f221812b7dbc..8f8783533ac7 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -860,14 +860,17 @@ EXPORT_SYMBOL(dm_consume_args);
860static int dm_table_set_type(struct dm_table *t) 860static int dm_table_set_type(struct dm_table *t)
861{ 861{
862 unsigned i; 862 unsigned i;
863 unsigned bio_based = 0, request_based = 0; 863 unsigned bio_based = 0, request_based = 0, hybrid = 0;
864 struct dm_target *tgt; 864 struct dm_target *tgt;
865 struct dm_dev_internal *dd; 865 struct dm_dev_internal *dd;
866 struct list_head *devices; 866 struct list_head *devices;
867 unsigned live_md_type;
867 868
868 for (i = 0; i < t->num_targets; i++) { 869 for (i = 0; i < t->num_targets; i++) {
869 tgt = t->targets + i; 870 tgt = t->targets + i;
870 if (dm_target_request_based(tgt)) 871 if (dm_target_hybrid(tgt))
872 hybrid = 1;
873 else if (dm_target_request_based(tgt))
871 request_based = 1; 874 request_based = 1;
872 else 875 else
873 bio_based = 1; 876 bio_based = 1;
@@ -879,6 +882,19 @@ static int dm_table_set_type(struct dm_table *t)
879 } 882 }
880 } 883 }
881 884
885 if (hybrid && !bio_based && !request_based) {
886 /*
887 * The targets can work either way.
888 * Determine the type from the live device.
889 * Default to bio-based if device is new.
890 */
891 live_md_type = dm_get_md_type(t->md);
892 if (live_md_type == DM_TYPE_REQUEST_BASED)
893 request_based = 1;
894 else
895 bio_based = 1;
896 }
897
882 if (bio_based) { 898 if (bio_based) {
883 /* We must use this table as bio-based */ 899 /* We must use this table as bio-based */
884 t->type = DM_TYPE_BIO_BASED; 900 t->type = DM_TYPE_BIO_BASED;
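The new dm_table_set_type() logic counts bio-based, request-based and hybrid targets; when only hybrid targets are present, the table inherits the live device's type and defaults to bio-based for a brand-new device. A compact sketch of that decision follows (enum and field names are made up for the example, and the mixed-table error handling of the real function is omitted).

#include <stdbool.h>
#include <stdio.h>

enum dm_type { TYPE_NONE, TYPE_BIO_BASED, TYPE_REQUEST_BASED };

/*
 * Resolve the table type: hybrid-only tables take the live device's type,
 * defaulting to bio-based when the device is new (TYPE_NONE).
 */
static enum dm_type resolve_type(bool bio_based, bool request_based,
                                 bool hybrid, enum dm_type live_type)
{
        if (hybrid && !bio_based && !request_based)
                return (live_type == TYPE_REQUEST_BASED) ?
                        TYPE_REQUEST_BASED : TYPE_BIO_BASED;

        return bio_based ? TYPE_BIO_BASED : TYPE_REQUEST_BASED;
}

int main(void)
{
        /* A table made only of hybrid targets on a brand-new device. */
        printf("type = %d\n", resolve_type(false, false, true, TYPE_NONE));
        return 0;
}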
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 37ba5db71cd9..242e3cec397a 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -131,12 +131,19 @@ static int io_err_map(struct dm_target *tt, struct bio *bio)
131 return -EIO; 131 return -EIO;
132} 132}
133 133
134static int io_err_map_rq(struct dm_target *ti, struct request *clone,
135 union map_info *map_context)
136{
137 return -EIO;
138}
139
134static struct target_type error_target = { 140static struct target_type error_target = {
135 .name = "error", 141 .name = "error",
136 .version = {1, 1, 0}, 142 .version = {1, 2, 0},
137 .ctr = io_err_ctr, 143 .ctr = io_err_ctr,
138 .dtr = io_err_dtr, 144 .dtr = io_err_dtr,
139 .map = io_err_map, 145 .map = io_err_map,
146 .map_rq = io_err_map_rq,
140}; 147};
141 148
142int __init dm_target_init(void) 149int __init dm_target_init(void)
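Adding io_err_map_rq gives the error target both a .map and a .map_rq method, which is exactly what the new dm_target_hybrid() test in dm.h keys on. A small sketch of that capability check via function-pointer presence (struct and function names here are illustrative only):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct target_type {
        const char *name;
        int (*map)(void);       /* bio-based entry point */
        int (*map_rq)(void);    /* request-based entry point */
};

static int err_map(void)    { return -5; /* behaves like -EIO */ }
static int err_map_rq(void) { return -5; }

static bool bio_based(const struct target_type *t)     { return t->map != NULL; }
static bool request_based(const struct target_type *t) { return t->map_rq != NULL; }
static bool hybrid(const struct target_type *t)
{
        return bio_based(t) && request_based(t);
}

int main(void)
{
        struct target_type error_target = { "error", err_map, err_map_rq };

        printf("%s hybrid: %d\n", error_target.name, hybrid(&error_target));
        return 0;
}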
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 88f2f802d528..ed063427d676 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -887,7 +887,8 @@ static int commit(struct pool *pool)
887 887
888 r = dm_pool_commit_metadata(pool->pmd); 888 r = dm_pool_commit_metadata(pool->pmd);
889 if (r) 889 if (r)
890 DMERR_LIMIT("commit failed: error = %d", r); 890 DMERR_LIMIT("%s: commit failed: error = %d",
891 dm_device_name(pool->pool_md), r);
891 892
892 return r; 893 return r;
893} 894}
@@ -917,6 +918,13 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
917 unsigned long flags; 918 unsigned long flags;
918 struct pool *pool = tc->pool; 919 struct pool *pool = tc->pool;
919 920
921 /*
922 * Once no_free_space is set we must not allow allocation to succeed.
923 * Otherwise it is difficult to explain, debug, test and support.
924 */
925 if (pool->no_free_space)
926 return -ENOSPC;
927
920 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 928 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
921 if (r) 929 if (r)
922 return r; 930 return r;
@@ -931,31 +939,30 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
931 } 939 }
932 940
933 if (!free_blocks) { 941 if (!free_blocks) {
934 if (pool->no_free_space) 942 /*
935 return -ENOSPC; 943 * Try to commit to see if that will free up some
936 else { 944 * more space.
937 /* 945 */
938 * Try to commit to see if that will free up some 946 (void) commit_or_fallback(pool);
939 * more space.
940 */
941 (void) commit_or_fallback(pool);
942 947
943 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 948 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
944 if (r) 949 if (r)
945 return r; 950 return r;
946 951
947 /* 952 /*
948 * If we still have no space we set a flag to avoid 953 * If we still have no space we set a flag to avoid
949 * doing all this checking and return -ENOSPC. 954 * doing all this checking and return -ENOSPC. This
950 */ 955 * flag serves as a latch that disallows allocations from
951 if (!free_blocks) { 956 * this pool until the admin takes action (e.g. resize or
952 DMWARN("%s: no free space available.", 957 * table reload).
953 dm_device_name(pool->pool_md)); 958 */
954 spin_lock_irqsave(&pool->lock, flags); 959 if (!free_blocks) {
955 pool->no_free_space = 1; 960 DMWARN("%s: no free space available.",
956 spin_unlock_irqrestore(&pool->lock, flags); 961 dm_device_name(pool->pool_md));
957 return -ENOSPC; 962 spin_lock_irqsave(&pool->lock, flags);
958 } 963 pool->no_free_space = 1;
964 spin_unlock_irqrestore(&pool->lock, flags);
965 return -ENOSPC;
959 } 966 }
960 } 967 }
961 968
@@ -1085,6 +1092,7 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1085{ 1092{
1086 int r; 1093 int r;
1087 dm_block_t data_block; 1094 dm_block_t data_block;
1095 struct pool *pool = tc->pool;
1088 1096
1089 r = alloc_data_block(tc, &data_block); 1097 r = alloc_data_block(tc, &data_block);
1090 switch (r) { 1098 switch (r) {
@@ -1094,13 +1102,14 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1094 break; 1102 break;
1095 1103
1096 case -ENOSPC: 1104 case -ENOSPC:
1097 no_space(tc->pool, cell); 1105 no_space(pool, cell);
1098 break; 1106 break;
1099 1107
1100 default: 1108 default:
1101 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1109 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1102 __func__, r); 1110 __func__, r);
1103 cell_error(tc->pool, cell); 1111 set_pool_mode(pool, PM_READ_ONLY);
1112 cell_error(pool, cell);
1104 break; 1113 break;
1105 } 1114 }
1106} 1115}
@@ -1386,7 +1395,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1386 1395
1387 switch (mode) { 1396 switch (mode) {
1388 case PM_FAIL: 1397 case PM_FAIL:
1389 DMERR("switching pool to failure mode"); 1398 DMERR("%s: switching pool to failure mode",
1399 dm_device_name(pool->pool_md));
1390 pool->process_bio = process_bio_fail; 1400 pool->process_bio = process_bio_fail;
1391 pool->process_discard = process_bio_fail; 1401 pool->process_discard = process_bio_fail;
1392 pool->process_prepared_mapping = process_prepared_mapping_fail; 1402 pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -1394,10 +1404,12 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1394 break; 1404 break;
1395 1405
1396 case PM_READ_ONLY: 1406 case PM_READ_ONLY:
1397 DMERR("switching pool to read-only mode"); 1407 DMERR("%s: switching pool to read-only mode",
1408 dm_device_name(pool->pool_md));
1398 r = dm_pool_abort_metadata(pool->pmd); 1409 r = dm_pool_abort_metadata(pool->pmd);
1399 if (r) { 1410 if (r) {
1400 DMERR("aborting transaction failed"); 1411 DMERR("%s: aborting transaction failed",
1412 dm_device_name(pool->pool_md));
1401 set_pool_mode(pool, PM_FAIL); 1413 set_pool_mode(pool, PM_FAIL);
1402 } else { 1414 } else {
1403 dm_pool_metadata_read_only(pool->pmd); 1415 dm_pool_metadata_read_only(pool->pmd);
@@ -2156,19 +2168,22 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2156 2168
2157 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2169 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2158 if (r) { 2170 if (r) {
2159 DMERR("failed to retrieve data device size"); 2171 DMERR("%s: failed to retrieve data device size",
2172 dm_device_name(pool->pool_md));
2160 return r; 2173 return r;
2161 } 2174 }
2162 2175
2163 if (data_size < sb_data_size) { 2176 if (data_size < sb_data_size) {
2164 DMERR("pool target (%llu blocks) too small: expected %llu", 2177 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2178 dm_device_name(pool->pool_md),
2165 (unsigned long long)data_size, sb_data_size); 2179 (unsigned long long)data_size, sb_data_size);
2166 return -EINVAL; 2180 return -EINVAL;
2167 2181
2168 } else if (data_size > sb_data_size) { 2182 } else if (data_size > sb_data_size) {
2169 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2183 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2170 if (r) { 2184 if (r) {
2171 DMERR("failed to resize data device"); 2185 DMERR("%s: failed to resize data device",
2186 dm_device_name(pool->pool_md));
2172 set_pool_mode(pool, PM_READ_ONLY); 2187 set_pool_mode(pool, PM_READ_ONLY);
2173 return r; 2188 return r;
2174 } 2189 }
@@ -2192,19 +2207,22 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2192 2207
2193 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); 2208 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2194 if (r) { 2209 if (r) {
2195 DMERR("failed to retrieve data device size"); 2210 DMERR("%s: failed to retrieve metadata device size",
2211 dm_device_name(pool->pool_md));
2196 return r; 2212 return r;
2197 } 2213 }
2198 2214
2199 if (metadata_dev_size < sb_metadata_dev_size) { 2215 if (metadata_dev_size < sb_metadata_dev_size) {
2200 DMERR("metadata device (%llu blocks) too small: expected %llu", 2216 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2217 dm_device_name(pool->pool_md),
2201 metadata_dev_size, sb_metadata_dev_size); 2218 metadata_dev_size, sb_metadata_dev_size);
2202 return -EINVAL; 2219 return -EINVAL;
2203 2220
2204 } else if (metadata_dev_size > sb_metadata_dev_size) { 2221 } else if (metadata_dev_size > sb_metadata_dev_size) {
2205 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 2222 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2206 if (r) { 2223 if (r) {
2207 DMERR("failed to resize metadata device"); 2224 DMERR("%s: failed to resize metadata device",
2225 dm_device_name(pool->pool_md));
2208 return r; 2226 return r;
2209 } 2227 }
2210 2228
@@ -2530,37 +2548,43 @@ static void pool_status(struct dm_target *ti, status_type_t type,
2530 2548
2531 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); 2549 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2532 if (r) { 2550 if (r) {
2533 DMERR("dm_pool_get_metadata_transaction_id returned %d", r); 2551 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
2552 dm_device_name(pool->pool_md), r);
2534 goto err; 2553 goto err;
2535 } 2554 }
2536 2555
2537 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); 2556 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2538 if (r) { 2557 if (r) {
2539 DMERR("dm_pool_get_free_metadata_block_count returned %d", r); 2558 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
2559 dm_device_name(pool->pool_md), r);
2540 goto err; 2560 goto err;
2541 } 2561 }
2542 2562
2543 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2563 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2544 if (r) { 2564 if (r) {
2545 DMERR("dm_pool_get_metadata_dev_size returned %d", r); 2565 DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
2566 dm_device_name(pool->pool_md), r);
2546 goto err; 2567 goto err;
2547 } 2568 }
2548 2569
2549 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); 2570 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2550 if (r) { 2571 if (r) {
2551 DMERR("dm_pool_get_free_block_count returned %d", r); 2572 DMERR("%s: dm_pool_get_free_block_count returned %d",
2573 dm_device_name(pool->pool_md), r);
2552 goto err; 2574 goto err;
2553 } 2575 }
2554 2576
2555 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2577 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2556 if (r) { 2578 if (r) {
2557 DMERR("dm_pool_get_data_dev_size returned %d", r); 2579 DMERR("%s: dm_pool_get_data_dev_size returned %d",
2580 dm_device_name(pool->pool_md), r);
2558 goto err; 2581 goto err;
2559 } 2582 }
2560 2583
2561 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2584 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2562 if (r) { 2585 if (r) {
2563 DMERR("dm_pool_get_metadata_snap returned %d", r); 2586 DMERR("%s: dm_pool_get_metadata_snap returned %d",
2587 dm_device_name(pool->pool_md), r);
2564 goto err; 2588 goto err;
2565 } 2589 }
2566 2590
@@ -2648,9 +2672,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2648{ 2672{
2649 struct pool_c *pt = ti->private; 2673 struct pool_c *pt = ti->private;
2650 struct pool *pool = pt->pool; 2674 struct pool *pool = pt->pool;
2675 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2651 2676
2652 blk_limits_io_min(limits, 0); 2677 /*
2653 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2678 * If the system-determined stacked limits are compatible with the
2679 * pool's blocksize (io_opt is a factor) do not override them.
2680 */
2681 if (io_opt_sectors < pool->sectors_per_block ||
2682 do_div(io_opt_sectors, pool->sectors_per_block)) {
2683 blk_limits_io_min(limits, 0);
2684 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2685 }
2654 2686
2655 /* 2687 /*
2656 * pt->adjusted_pf is a staging area for the actual features to use. 2688 * pt->adjusted_pf is a staging area for the actual features to use.
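pool_io_hints() now keeps the stacked io_opt when it is already a whole multiple of the pool's block size and only overrides the limits otherwise. A small arithmetic sketch of that test, with no block-layer types (the sector counts are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Return true when the stacked io_opt (in sectors) should be replaced by
 * the pool's block size: it is either smaller than one block or not a
 * whole multiple of one.
 */
static bool override_io_opt(uint64_t io_opt_sectors, uint64_t sectors_per_block)
{
        return io_opt_sectors < sectors_per_block ||
               io_opt_sectors % sectors_per_block != 0;
}

int main(void)
{
        /* 512-sector (256KiB) thin-pool blocks, illustrative values. */
        printf("%d\n", override_io_opt(1024, 512));     /* 0: keep stacked limit */
        printf("%d\n", override_io_opt(768, 512));      /* 1: override */
        return 0;
}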
@@ -2669,7 +2701,7 @@ static struct target_type pool_target = {
2669 .name = "thin-pool", 2701 .name = "thin-pool",
2670 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2702 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2671 DM_TARGET_IMMUTABLE, 2703 DM_TARGET_IMMUTABLE,
2672 .version = {1, 8, 0}, 2704 .version = {1, 9, 0},
2673 .module = THIS_MODULE, 2705 .module = THIS_MODULE,
2674 .ctr = pool_ctr, 2706 .ctr = pool_ctr,
2675 .dtr = pool_dtr, 2707 .dtr = pool_dtr,
@@ -2956,7 +2988,7 @@ static int thin_iterate_devices(struct dm_target *ti,
2956 2988
2957static struct target_type thin_target = { 2989static struct target_type thin_target = {
2958 .name = "thin", 2990 .name = "thin",
2959 .version = {1, 8, 0}, 2991 .version = {1, 9, 0},
2960 .module = THIS_MODULE, 2992 .module = THIS_MODULE,
2961 .ctr = thin_ctr, 2993 .ctr = thin_ctr,
2962 .dtr = thin_dtr, 2994 .dtr = thin_dtr,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9e39d2b64bf8..6a5e9ed2fcc3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -60,6 +60,7 @@ struct dm_io {
60 struct bio *bio; 60 struct bio *bio;
61 unsigned long start_time; 61 unsigned long start_time;
62 spinlock_t endio_lock; 62 spinlock_t endio_lock;
63 struct dm_stats_aux stats_aux;
63}; 64};
64 65
65/* 66/*
@@ -198,6 +199,8 @@ struct mapped_device {
198 199
199 /* zero-length flush that will be cloned and submitted to targets */ 200 /* zero-length flush that will be cloned and submitted to targets */
200 struct bio flush_bio; 201 struct bio flush_bio;
202
203 struct dm_stats stats;
201}; 204};
202 205
203/* 206/*
@@ -269,6 +272,7 @@ static int (*_inits[])(void) __initdata = {
269 dm_io_init, 272 dm_io_init,
270 dm_kcopyd_init, 273 dm_kcopyd_init,
271 dm_interface_init, 274 dm_interface_init,
275 dm_statistics_init,
272}; 276};
273 277
274static void (*_exits[])(void) = { 278static void (*_exits[])(void) = {
@@ -279,6 +283,7 @@ static void (*_exits[])(void) = {
279 dm_io_exit, 283 dm_io_exit,
280 dm_kcopyd_exit, 284 dm_kcopyd_exit,
281 dm_interface_exit, 285 dm_interface_exit,
286 dm_statistics_exit,
282}; 287};
283 288
284static int __init dm_init(void) 289static int __init dm_init(void)
@@ -384,6 +389,16 @@ int dm_lock_for_deletion(struct mapped_device *md)
384 return r; 389 return r;
385} 390}
386 391
392sector_t dm_get_size(struct mapped_device *md)
393{
394 return get_capacity(md->disk);
395}
396
397struct dm_stats *dm_get_stats(struct mapped_device *md)
398{
399 return &md->stats;
400}
401
387static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 402static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
388{ 403{
389 struct mapped_device *md = bdev->bd_disk->private_data; 404 struct mapped_device *md = bdev->bd_disk->private_data;
@@ -466,8 +481,9 @@ static int md_in_flight(struct mapped_device *md)
466static void start_io_acct(struct dm_io *io) 481static void start_io_acct(struct dm_io *io)
467{ 482{
468 struct mapped_device *md = io->md; 483 struct mapped_device *md = io->md;
484 struct bio *bio = io->bio;
469 int cpu; 485 int cpu;
470 int rw = bio_data_dir(io->bio); 486 int rw = bio_data_dir(bio);
471 487
472 io->start_time = jiffies; 488 io->start_time = jiffies;
473 489
@@ -476,6 +492,10 @@ static void start_io_acct(struct dm_io *io)
476 part_stat_unlock(); 492 part_stat_unlock();
477 atomic_set(&dm_disk(md)->part0.in_flight[rw], 493 atomic_set(&dm_disk(md)->part0.in_flight[rw],
478 atomic_inc_return(&md->pending[rw])); 494 atomic_inc_return(&md->pending[rw]));
495
496 if (unlikely(dm_stats_used(&md->stats)))
497 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
498 bio_sectors(bio), false, 0, &io->stats_aux);
479} 499}
480 500
481static void end_io_acct(struct dm_io *io) 501static void end_io_acct(struct dm_io *io)
@@ -491,6 +511,10 @@ static void end_io_acct(struct dm_io *io)
491 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 511 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
492 part_stat_unlock(); 512 part_stat_unlock();
493 513
514 if (unlikely(dm_stats_used(&md->stats)))
515 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
516 bio_sectors(bio), true, duration, &io->stats_aux);
517
494 /* 518 /*
495 * After this is decremented the bio must not be touched if it is 519 * After this is decremented the bio must not be touched if it is
496 * a flush. 520 * a flush.
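start_io_acct() and end_io_acct() now call dm_stats_account_io() once when the bio is issued (end = false, duration 0) and once at completion (end = true, with the measured duration), both guarded by unlikely(dm_stats_used()) so the cost is negligible when no regions are defined. A userspace sketch of that pairing; the types and the fixed sector/length values are invented for the example.

#include <stdbool.h>
#include <stdio.h>

struct stats { int nr_regions; };
struct io { unsigned long start_time; };

static bool stats_used(const struct stats *s) { return s->nr_regions != 0; }

static void account_io(unsigned long sector, unsigned sectors, bool end,
                       unsigned long duration)
{
        printf("%s sector=%lu len=%u duration=%lu\n",
               end ? "end" : "start", sector, sectors, duration);
}

static void start_io(struct stats *s, struct io *io, unsigned long now)
{
        io->start_time = now;
        if (stats_used(s))              /* cheap guard on the hot path */
                account_io(2048, 8, false, 0);
}

static void end_io(struct stats *s, struct io *io, unsigned long now)
{
        if (stats_used(s))
                account_io(2048, 8, true, now - io->start_time);
}

int main(void)
{
        struct stats st = { .nr_regions = 1 };
        struct io io;

        start_io(&st, &io, 100);
        end_io(&st, &io, 112);
        return 0;
}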
@@ -1519,7 +1543,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
1519 return; 1543 return;
1520} 1544}
1521 1545
1522static int dm_request_based(struct mapped_device *md) 1546int dm_request_based(struct mapped_device *md)
1523{ 1547{
1524 return blk_queue_stackable(md->queue); 1548 return blk_queue_stackable(md->queue);
1525} 1549}
@@ -1946,8 +1970,7 @@ static struct mapped_device *alloc_dev(int minor)
1946 add_disk(md->disk); 1970 add_disk(md->disk);
1947 format_dev_t(md->name, MKDEV(_major, minor)); 1971 format_dev_t(md->name, MKDEV(_major, minor));
1948 1972
1949 md->wq = alloc_workqueue("kdmflush", 1973 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1950 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1951 if (!md->wq) 1974 if (!md->wq)
1952 goto bad_thread; 1975 goto bad_thread;
1953 1976
@@ -1959,6 +1982,8 @@ static struct mapped_device *alloc_dev(int minor)
1959 md->flush_bio.bi_bdev = md->bdev; 1982 md->flush_bio.bi_bdev = md->bdev;
1960 md->flush_bio.bi_rw = WRITE_FLUSH; 1983 md->flush_bio.bi_rw = WRITE_FLUSH;
1961 1984
1985 dm_stats_init(&md->stats);
1986
1962 /* Populate the mapping, nobody knows we exist yet */ 1987 /* Populate the mapping, nobody knows we exist yet */
1963 spin_lock(&_minor_lock); 1988 spin_lock(&_minor_lock);
1964 old_md = idr_replace(&_minor_idr, md, minor); 1989 old_md = idr_replace(&_minor_idr, md, minor);
@@ -2010,6 +2035,7 @@ static void free_dev(struct mapped_device *md)
2010 2035
2011 put_disk(md->disk); 2036 put_disk(md->disk);
2012 blk_cleanup_queue(md->queue); 2037 blk_cleanup_queue(md->queue);
2038 dm_stats_cleanup(&md->stats);
2013 module_put(THIS_MODULE); 2039 module_put(THIS_MODULE);
2014 kfree(md); 2040 kfree(md);
2015} 2041}
@@ -2151,7 +2177,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2151 /* 2177 /*
2152 * Wipe any geometry if the size of the table changed. 2178 * Wipe any geometry if the size of the table changed.
2153 */ 2179 */
2154 if (size != get_capacity(md->disk)) 2180 if (size != dm_get_size(md))
2155 memset(&md->geometry, 0, sizeof(md->geometry)); 2181 memset(&md->geometry, 0, sizeof(md->geometry));
2156 2182
2157 __set_size(md, size); 2183 __set_size(md, size);
@@ -2236,11 +2262,13 @@ void dm_unlock_md_type(struct mapped_device *md)
2236 2262
2237void dm_set_md_type(struct mapped_device *md, unsigned type) 2263void dm_set_md_type(struct mapped_device *md, unsigned type)
2238{ 2264{
2265 BUG_ON(!mutex_is_locked(&md->type_lock));
2239 md->type = type; 2266 md->type = type;
2240} 2267}
2241 2268
2242unsigned dm_get_md_type(struct mapped_device *md) 2269unsigned dm_get_md_type(struct mapped_device *md)
2243{ 2270{
2271 BUG_ON(!mutex_is_locked(&md->type_lock));
2244 return md->type; 2272 return md->type;
2245} 2273}
2246 2274
@@ -2695,6 +2723,38 @@ out:
2695 return r; 2723 return r;
2696} 2724}
2697 2725
2726/*
2727 * Internal suspend/resume works like userspace-driven suspend. It waits
2728 * until all bios finish and prevents issuing new bios to the target drivers.
2729 * It may be used only from the kernel.
2730 *
2731 * Internal suspend holds md->suspend_lock, which prevents interaction with
2732 * userspace-driven suspend.
2733 */
2734
2735void dm_internal_suspend(struct mapped_device *md)
2736{
2737 mutex_lock(&md->suspend_lock);
2738 if (dm_suspended_md(md))
2739 return;
2740
2741 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2742 synchronize_srcu(&md->io_barrier);
2743 flush_workqueue(md->wq);
2744 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2745}
2746
2747void dm_internal_resume(struct mapped_device *md)
2748{
2749 if (dm_suspended_md(md))
2750 goto done;
2751
2752 dm_queue_flush(md);
2753
2754done:
2755 mutex_unlock(&md->suspend_lock);
2756}
2757
2698/*----------------------------------------------------------------- 2758/*-----------------------------------------------------------------
2699 * Event notification. 2759 * Event notification.
2700 *---------------------------------------------------------------*/ 2760 *---------------------------------------------------------------*/
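dm_internal_suspend() takes md->suspend_lock and deliberately keeps it held, even on the early return, so userspace-driven suspend cannot interleave; dm_internal_resume() is what drops it, so the two must always be paired. A sketch of that asymmetric locking pattern with a pthread mutex (field and function names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct device {
        pthread_mutex_t suspend_lock;
        bool suspended;                 /* set by user-driven suspend */
};

/* Acquire the lock and quiesce; the lock stays held until resume. */
static void internal_suspend(struct device *d)
{
        pthread_mutex_lock(&d->suspend_lock);
        if (d->suspended)
                return;                 /* already quiesced, but keep the lock */
        puts("quiescing I/O");
}

/* Restart I/O (unless user suspend is in force) and release the lock. */
static void internal_resume(struct device *d)
{
        if (!d->suspended)
                puts("restarting I/O");
        pthread_mutex_unlock(&d->suspend_lock);
}

int main(void)
{
        struct device d = { PTHREAD_MUTEX_INITIALIZER, false };

        internal_suspend(&d);   /* e.g. around an internal metadata operation */
        internal_resume(&d);
        return 0;
}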
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 45b97da1bd06..5e604cc7b4aa 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -16,6 +16,8 @@
16#include <linux/blkdev.h> 16#include <linux/blkdev.h>
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18 18
19#include "dm-stats.h"
20
19/* 21/*
20 * Suspend feature flags 22 * Suspend feature flags
21 */ 23 */
@@ -89,10 +91,21 @@ int dm_setup_md_queue(struct mapped_device *md);
89#define dm_target_is_valid(t) ((t)->table) 91#define dm_target_is_valid(t) ((t)->table)
90 92
91/* 93/*
94 * To check whether the target type is bio-based or not (request-based).
95 */
96#define dm_target_bio_based(t) ((t)->type->map != NULL)
97
98/*
92 * To check whether the target type is request-based or not (bio-based). 99 * To check whether the target type is request-based or not (bio-based).
93 */ 100 */
94#define dm_target_request_based(t) ((t)->type->map_rq != NULL) 101#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
95 102
103/*
104 * To check whether the target type is a hybrid (capable of being
105 * either request-based or bio-based).
106 */
107#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t))
108
96/*----------------------------------------------------------------- 109/*-----------------------------------------------------------------
97 * A registry of target types. 110 * A registry of target types.
98 *---------------------------------------------------------------*/ 111 *---------------------------------------------------------------*/
@@ -146,10 +159,16 @@ void dm_destroy(struct mapped_device *md);
146void dm_destroy_immediate(struct mapped_device *md); 159void dm_destroy_immediate(struct mapped_device *md);
147int dm_open_count(struct mapped_device *md); 160int dm_open_count(struct mapped_device *md);
148int dm_lock_for_deletion(struct mapped_device *md); 161int dm_lock_for_deletion(struct mapped_device *md);
162int dm_request_based(struct mapped_device *md);
163sector_t dm_get_size(struct mapped_device *md);
164struct dm_stats *dm_get_stats(struct mapped_device *md);
149 165
150int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 166int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
151 unsigned cookie); 167 unsigned cookie);
152 168
169void dm_internal_suspend(struct mapped_device *md);
170void dm_internal_resume(struct mapped_device *md);
171
153int dm_io_init(void); 172int dm_io_init(void);
154void dm_io_exit(void); 173void dm_io_exit(void);
155 174
@@ -162,4 +181,12 @@ void dm_kcopyd_exit(void);
162struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size); 181struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
163void dm_free_md_mempools(struct dm_md_mempools *pools); 182void dm_free_md_mempools(struct dm_md_mempools *pools);
164 183
184/*
185 * Helpers that are used by DM core
186 */
187static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
188{
189 return !maxlen || strlen(result) + 1 >= maxlen;
190}
191
165#endif 192#endif
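dm.h also gains dm_message_test_buffer_overflow(), which reports when a NUL-terminated result buffer of size maxlen has no room left for further output. A sketch of that guard while appending status text, using only the C library; emit_regions() and its output format are invented for the example.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* True when the result buffer is (nearly) full and output should stop. */
static bool buffer_overflow(const char *result, unsigned maxlen)
{
        return !maxlen || strlen(result) + 1 >= maxlen;
}

static void emit_regions(char *result, unsigned maxlen, int nr_regions)
{
        size_t used;

        result[0] = '\0';
        for (int i = 0; i < nr_regions; i++) {
                if (buffer_overflow(result, maxlen))
                        break;          /* caller sees a truncated listing */
                used = strlen(result);
                snprintf(result + used, maxlen - used, "region%d ", i);
        }
}

int main(void)
{
        char buf[16];

        emit_regions(buf, sizeof(buf), 8);
        puts(buf);
        return 0;
}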
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 81b513890e2b..a7e8bf296388 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -615,6 +615,11 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
615} 615}
616EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock); 616EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
617 617
618void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
619{
620 dm_bufio_prefetch(bm->bufio, b, 1);
621}
622
618void dm_bm_set_read_only(struct dm_block_manager *bm) 623void dm_bm_set_read_only(struct dm_block_manager *bm)
619{ 624{
620 bm->read_only = true; 625 bm->read_only = true;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index be5bff61be28..9a82083a66b6 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -108,6 +108,11 @@ int dm_bm_unlock(struct dm_block *b);
108int dm_bm_flush_and_unlock(struct dm_block_manager *bm, 108int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
109 struct dm_block *superblock); 109 struct dm_block *superblock);
110 110
111 /*
112 * Request data be prefetched into the cache.
113 */
114void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
115
111/* 116/*
112 * Switches the bm to a read only mode. Once read-only mode 117 * Switches the bm to a read only mode. Once read-only mode
113 * has been entered the following functions will return -EPERM. 118 * has been entered the following functions will return -EPERM.
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 35865425e4b4..468e371ee9b2 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -161,6 +161,7 @@ struct frame {
161}; 161};
162 162
163struct del_stack { 163struct del_stack {
164 struct dm_btree_info *info;
164 struct dm_transaction_manager *tm; 165 struct dm_transaction_manager *tm;
165 int top; 166 int top;
166 struct frame spine[MAX_SPINE_DEPTH]; 167 struct frame spine[MAX_SPINE_DEPTH];
@@ -183,6 +184,20 @@ static int unprocessed_frames(struct del_stack *s)
183 return s->top >= 0; 184 return s->top >= 0;
184} 185}
185 186
187static void prefetch_children(struct del_stack *s, struct frame *f)
188{
189 unsigned i;
190 struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
191
192 for (i = 0; i < f->nr_children; i++)
193 dm_bm_prefetch(bm, value64(f->n, i));
194}
195
196static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
197{
198 return f->level < (info->levels - 1);
199}
200
186static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) 201static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
187{ 202{
188 int r; 203 int r;
@@ -205,6 +220,7 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
205 dm_tm_dec(s->tm, b); 220 dm_tm_dec(s->tm, b);
206 221
207 else { 222 else {
223 uint32_t flags;
208 struct frame *f = s->spine + ++s->top; 224 struct frame *f = s->spine + ++s->top;
209 225
210 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); 226 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
@@ -217,6 +233,10 @@ static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
217 f->level = level; 233 f->level = level;
218 f->nr_children = le32_to_cpu(f->n->header.nr_entries); 234 f->nr_children = le32_to_cpu(f->n->header.nr_entries);
219 f->current_child = 0; 235 f->current_child = 0;
236
237 flags = le32_to_cpu(f->n->header.flags);
238 if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
239 prefetch_children(s, f);
220 } 240 }
221 241
222 return 0; 242 return 0;
@@ -230,11 +250,6 @@ static void pop_frame(struct del_stack *s)
230 dm_tm_unlock(s->tm, f->b); 250 dm_tm_unlock(s->tm, f->b);
231} 251}
232 252
233static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
234{
235 return f->level < (info->levels - 1);
236}
237
238int dm_btree_del(struct dm_btree_info *info, dm_block_t root) 253int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
239{ 254{
240 int r; 255 int r;
@@ -243,6 +258,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
243 s = kmalloc(sizeof(*s), GFP_KERNEL); 258 s = kmalloc(sizeof(*s), GFP_KERNEL);
244 if (!s) 259 if (!s)
245 return -ENOMEM; 260 return -ENOMEM;
261 s->info = info;
246 s->tm = info->tm; 262 s->tm = info->tm;
247 s->top = -1; 263 s->top = -1;
248 264
@@ -287,7 +303,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
287 info->value_type.dec(info->value_type.context, 303 info->value_type.dec(info->value_type.context,
288 value_ptr(f->n, i)); 304 value_ptr(f->n, i));
289 } 305 }
290 f->current_child = f->nr_children; 306 pop_frame(s);
291 } 307 }
292 } 308 }
293 309
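The deletion walk now prefetches every child block of an internal node (or of a non-leaf level in a multi-level btree) as soon as the node is pushed onto the spine, so the later reads are more likely to hit the buffer cache. A sketch of that "issue all child prefetches before descending" shape, with a printf standing in for dm_bm_prefetch() and invented struct fields:

#include <stdio.h>

#define MAX_CHILDREN 4

struct node {
        int internal;                   /* has child nodes rather than values */
        unsigned nr_children;
        unsigned long child[MAX_CHILDREN];      /* child block numbers */
};

/* Stand-in for an asynchronous block read hint. */
static void prefetch_block(unsigned long b)
{
        printf("prefetch block %lu\n", b);
}

static void push_frame(const struct node *n)
{
        /* Kick off reads for every child before any of them is visited. */
        if (n->internal)
                for (unsigned i = 0; i < n->nr_children; i++)
                        prefetch_block(n->child[i]);
}

int main(void)
{
        struct node n = { 1, 3, { 17, 42, 99 } };

        push_frame(&n);
        return 0;
}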
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 3e7a88d99eb0..6058569fe86c 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -292,16 +292,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
292 return dm_tm_unlock(ll->tm, blk); 292 return dm_tm_unlock(ll->tm, blk);
293} 293}
294 294
295int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) 295static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b,
296 uint32_t *result)
296{ 297{
297 __le32 le_rc; 298 __le32 le_rc;
298 int r = sm_ll_lookup_bitmap(ll, b, result); 299 int r;
299
300 if (r)
301 return r;
302
303 if (*result != 3)
304 return r;
305 300
306 r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); 301 r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
307 if (r < 0) 302 if (r < 0)
@@ -312,6 +307,19 @@ int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
312 return r; 307 return r;
313} 308}
314 309
310int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
311{
312 int r = sm_ll_lookup_bitmap(ll, b, result);
313
314 if (r)
315 return r;
316
317 if (*result != 3)
318 return r;
319
320 return sm_ll_lookup_big_ref_count(ll, b, result);
321}
322
315int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, 323int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
316 dm_block_t end, dm_block_t *result) 324 dm_block_t end, dm_block_t *result)
317{ 325{
@@ -372,11 +380,12 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
372 return -ENOSPC; 380 return -ENOSPC;
373} 381}
374 382
375int sm_ll_insert(struct ll_disk *ll, dm_block_t b, 383static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
376 uint32_t ref_count, enum allocation_event *ev) 384 uint32_t (*mutator)(void *context, uint32_t old),
385 void *context, enum allocation_event *ev)
377{ 386{
378 int r; 387 int r;
379 uint32_t bit, old; 388 uint32_t bit, old, ref_count;
380 struct dm_block *nb; 389 struct dm_block *nb;
381 dm_block_t index = b; 390 dm_block_t index = b;
382 struct disk_index_entry ie_disk; 391 struct disk_index_entry ie_disk;
@@ -399,6 +408,14 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
399 bm_le = dm_bitmap_data(nb); 408 bm_le = dm_bitmap_data(nb);
400 old = sm_lookup_bitmap(bm_le, bit); 409 old = sm_lookup_bitmap(bm_le, bit);
401 410
411 if (old > 2) {
412 r = sm_ll_lookup_big_ref_count(ll, b, &old);
413 if (r < 0)
414 return r;
415 }
416
417 ref_count = mutator(context, old);
418
402 if (ref_count <= 2) { 419 if (ref_count <= 2) {
403 sm_set_bitmap(bm_le, bit, ref_count); 420 sm_set_bitmap(bm_le, bit, ref_count);
404 421
@@ -448,31 +465,35 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
448 return ll->save_ie(ll, index, &ie_disk); 465 return ll->save_ie(ll, index, &ie_disk);
449} 466}
450 467
451int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) 468static uint32_t set_ref_count(void *context, uint32_t old)
452{ 469{
453 int r; 470 return *((uint32_t *) context);
454 uint32_t rc; 471}
455
456 r = sm_ll_lookup(ll, b, &rc);
457 if (r)
458 return r;
459 472
460 return sm_ll_insert(ll, b, rc + 1, ev); 473int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
474 uint32_t ref_count, enum allocation_event *ev)
475{
476 return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
461} 477}
462 478
463int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) 479static uint32_t inc_ref_count(void *context, uint32_t old)
464{ 480{
465 int r; 481 return old + 1;
466 uint32_t rc; 482}
467 483
468 r = sm_ll_lookup(ll, b, &rc); 484int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
469 if (r) 485{
470 return r; 486 return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
487}
471 488
472 if (!rc) 489static uint32_t dec_ref_count(void *context, uint32_t old)
473 return -EINVAL; 490{
491 return old - 1;
492}
474 493
475 return sm_ll_insert(ll, b, rc - 1, ev); 494int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
495{
496 return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev);
476} 497}
477 498
478int sm_ll_commit(struct ll_disk *ll) 499int sm_ll_commit(struct ll_disk *ll)
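sm_ll_insert/inc/dec are now thin wrappers around a single sm_ll_mutate() that looks up the old reference count once and applies a caller-supplied mutator, so inc and dec no longer pay for a separate lookup before the insert. A sketch of the callback shape in plain C, with an array standing in for the on-disk bitmap and overflow btree:

#include <stdint.h>
#include <stdio.h>

static uint32_t refcount[8];    /* stand-in for bitmap + overflow btree */

static int mutate(unsigned b, uint32_t (*mutator)(void *ctx, uint32_t old),
                  void *ctx)
{
        uint32_t old = refcount[b];     /* single lookup of the old count */

        refcount[b] = mutator(ctx, old);
        return 0;
}

static uint32_t set_ref_count(void *ctx, uint32_t old) { return *(uint32_t *)ctx; }
static uint32_t inc_ref_count(void *ctx, uint32_t old) { return old + 1; }
static uint32_t dec_ref_count(void *ctx, uint32_t old) { return old - 1; }

int main(void)
{
        uint32_t two = 2;

        mutate(3, set_ref_count, &two);         /* like sm_ll_insert(..., 2, ...) */
        mutate(3, inc_ref_count, NULL);         /* like sm_ll_inc() */
        mutate(3, dec_ref_count, NULL);         /* like sm_ll_dec() */
        printf("block 3 refcount = %u\n", refcount[3]);
        return 0;
}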
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e151d4c9298d..653073de09e3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -10,6 +10,7 @@
10 10
11#include <linux/bio.h> 11#include <linux/bio.h>
12#include <linux/blkdev.h> 12#include <linux/blkdev.h>
13#include <linux/math64.h>
13#include <linux/ratelimit.h> 14#include <linux/ratelimit.h>
14 15
15struct dm_dev; 16struct dm_dev;
@@ -550,6 +551,14 @@ extern struct ratelimit_state dm_ratelimit_state;
550#define DM_MAPIO_REMAPPED 1 551#define DM_MAPIO_REMAPPED 1
551#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE 552#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE
552 553
554#define dm_sector_div64(x, y)( \
555{ \
556 u64 _res; \
557 (x) = div64_u64_rem(x, y, &_res); \
558 _res; \
559} \
560)
561
553/* 562/*
554 * Ceiling(n / sz) 563 * Ceiling(n / sz)
555 */ 564 */
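dm_sector_div64() is a GCC statement-expression macro: it divides x in place by y and evaluates to the remainder, built on the new div64_u64_rem(). A userspace sketch of the same shape using native 64-bit division; the macro and helper are re-declared locally for the example and are not the kernel headers.

#include <stdint.h>
#include <stdio.h>

static uint64_t div64_u64_rem(uint64_t dividend, uint64_t divisor,
                              uint64_t *remainder)
{
        *remainder = dividend % divisor;
        return dividend / divisor;
}

/* Divide x in place and yield the remainder, like dm_sector_div64(). */
#define sector_div64(x, y)({                    \
        uint64_t _res;                          \
        (x) = div64_u64_rem(x, y, &_res);       \
        _res;                                   \
})

int main(void)
{
        uint64_t sector = 1000003;              /* offset within a device */
        uint64_t chunk = 4096;                  /* illustrative chunk size */
        uint64_t offset = sector_div64(sector, chunk);

        /* sector is now the chunk index, offset the position within it. */
        printf("chunk %llu, offset %llu\n",
               (unsigned long long)sector, (unsigned long long)offset);
        return 0;
}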
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 2913b86eb12a..69ed5f5e9f6e 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -31,6 +31,15 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
31} 31}
32 32
33/** 33/**
34 * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
35 */
36static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
37{
38 *remainder = dividend % divisor;
39 return dividend / divisor;
40}
41
42/**
34 * div64_u64 - unsigned 64bit divide with 64bit divisor 43 * div64_u64 - unsigned 64bit divide with 64bit divisor
35 */ 44 */
36static inline u64 div64_u64(u64 dividend, u64 divisor) 45static inline u64 div64_u64(u64 dividend, u64 divisor)
@@ -63,6 +72,10 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
63extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); 72extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder);
64#endif 73#endif
65 74
75#ifndef div64_u64_rem
76extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder);
77#endif
78
66#ifndef div64_u64 79#ifndef div64_u64
67extern u64 div64_u64(u64 dividend, u64 divisor); 80extern u64 div64_u64(u64 dividend, u64 divisor);
68#endif 81#endif
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index afd0cbd52edb..f1e12bd40b3b 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 267#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
268 268
269#define DM_VERSION_MAJOR 4 269#define DM_VERSION_MAJOR 4
270#define DM_VERSION_MINOR 25 270#define DM_VERSION_MINOR 26
271#define DM_VERSION_PATCHLEVEL 0 271#define DM_VERSION_PATCHLEVEL 0
272#define DM_VERSION_EXTRA "-ioctl (2013-06-26)" 272#define DM_VERSION_EXTRA "-ioctl (2013-08-15)"
273 273
274/* Status bits */ 274/* Status bits */
275#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 275#define DM_READONLY_FLAG (1 << 0) /* In/Out */
diff --git a/lib/div64.c b/lib/div64.c
index a163b6caef73..4382ad77777e 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -79,6 +79,46 @@ EXPORT_SYMBOL(div_s64_rem);
79#endif 79#endif
80 80
81/** 81/**
82 * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
83 * @dividend: 64bit dividend
84 * @divisor: 64bit divisor
85 * @remainder: 64bit remainder
86 *
87 * This implementation is comparable to the algorithm used by div64_u64.
88 * But this operation, which includes math for calculating the remainder,
89 * is kept distinct to avoid slowing down the div64_u64 operation on 32bit
90 * systems.
91 */
92#ifndef div64_u64_rem
93u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
94{
95 u32 high = divisor >> 32;
96 u64 quot;
97
98 if (high == 0) {
99 u32 rem32;
100 quot = div_u64_rem(dividend, divisor, &rem32);
101 *remainder = rem32;
102 } else {
103 int n = 1 + fls(high);
104 quot = div_u64(dividend >> n, divisor >> n);
105
106 if (quot != 0)
107 quot--;
108
109 *remainder = dividend - quot * divisor;
110 if (*remainder >= divisor) {
111 quot++;
112 *remainder -= divisor;
113 }
114 }
115
116 return quot;
117}
118EXPORT_SYMBOL(div64_u64_rem);
119#endif
120
121/**
82 * div64_u64 - unsigned 64bit divide with 64bit divisor 122 * div64_u64 - unsigned 64bit divide with 64bit divisor
83 * @dividend: 64bit dividend 123 * @dividend: 64bit dividend
84 * @divisor: 64bit divisor 124 * @divisor: 64bit divisor
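The 32-bit fallback above estimates the quotient by shifting both operands right by n = 1 + fls(high) bits; that estimate can be off by at most one, so it is decremented and the remainder check adds the one back if needed. Below is a self-contained userspace re-implementation of the algorithm (with a simple fls()) that verifies itself against native 64-bit division; it is a sketch of the algorithm, not the kernel build.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static int fls32(uint32_t x)    /* position of highest set bit, 1-based */
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

static uint64_t my_div64_u64_rem(uint64_t dividend, uint64_t divisor,
                                 uint64_t *remainder)
{
        uint32_t high = divisor >> 32;
        uint64_t quot;

        if (high == 0) {
                quot = dividend / divisor;      /* kernel uses div_u64_rem() here */
                *remainder = dividend % divisor;
        } else {
                int n = 1 + fls32(high);

                quot = (dividend >> n) / (divisor >> n);        /* estimate */
                if (quot != 0)
                        quot--;                 /* estimate may be one too high */
                *remainder = dividend - quot * divisor;
                if (*remainder >= divisor) {    /* at most one correction needed */
                        quot++;
                        *remainder -= divisor;
                }
        }
        return quot;
}

int main(void)
{
        uint64_t rem, q = my_div64_u64_rem(0xfedcba9876543210ULL,
                                           0x123456789ULL, &rem);

        assert(q == 0xfedcba9876543210ULL / 0x123456789ULL);
        assert(rem == 0xfedcba9876543210ULL % 0x123456789ULL);
        printf("quot=%llu rem=%llu\n", (unsigned long long)q,
               (unsigned long long)rem);
        return 0;
}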