-rw-r--r--  Documentation/device-mapper/statistics.txt | 186
-rw-r--r--  drivers/md/Makefile                        |   2
-rw-r--r--  drivers/md/dm-ioctl.c                      |  22
-rw-r--r--  drivers/md/dm-stats.c                      | 969
-rw-r--r--  drivers/md/dm-stats.h                      |  40
-rw-r--r--  drivers/md/dm.c                            |  65
-rw-r--r--  drivers/md/dm.h                            |  16
-rw-r--r--  include/linux/device-mapper.h              |   9
-rw-r--r--  include/uapi/linux/dm-ioctl.h              |   4
9 files changed, 1299 insertions, 14 deletions
diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
new file mode 100644
index 000000000000..2a1673adc200
--- /dev/null
+++ b/Documentation/device-mapper/statistics.txt
@@ -0,0 +1,186 @@
DM statistics
=============

Device Mapper supports the collection of I/O statistics on user-defined
regions of a DM device. If no regions are defined, no statistics are
collected, so there is no performance impact. Only bio-based DM devices
are currently supported.

Each user-defined region specifies a starting sector, a length and a
step. Individual statistics are collected for each step-sized area
within the range specified.

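For example, a region defined over sectors 0+2048 with a step of 512
covers four 512-sector areas, and a separate set of counters is kept
for each of those four areas.
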
The I/O statistics counters for each step-sized area of a region are
in the same format as /sys/block/*/stat or /proc/diskstats (see
Documentation/iostats.txt), but two extra counters (12 and 13) are
provided: the total time spent reading and writing, in milliseconds.
All of these counters may be accessed by sending the @stats_print
message to the appropriate DM device via dmsetup.

Each region has a corresponding unique identifier, which we call a
region_id, that is assigned when the region is created. The region_id
must be supplied when querying statistics about the region, deleting
the region, and so on. Unique region_ids enable multiple userspace
programs to request and process statistics for the same DM device
without stepping on each other's data.

Creating DM statistics allocates memory via kmalloc or falls back to
vmalloc space. At most 1/4 of the overall system memory may be
allocated by DM statistics. The administrator can see how much memory
is currently used by reading
/sys/module/dm_mod/parameters/stats_current_allocated_bytes

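For example, the current allocation can be checked with a plain read
of that file:

  cat /sys/module/dm_mod/parameters/stats_current_allocated_bytes
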
Messages
========

@stats_create <range> <step> [<program_id> [<aux_data>]]

    Create a new region and return the region_id.

    <range>
      "-" - whole device
      "<start_sector>+<length>" - a range of <length> 512-byte sectors
                                  starting with <start_sector>

    <step>
      "<area_size>" - the range is subdivided into areas each containing
                      <area_size> sectors
      "/<number_of_areas>" - the range is subdivided into the specified
                             number of areas

    <program_id>
      An optional parameter. A name that uniquely identifies the
      userspace owner of the range. This groups ranges together so that
      userspace programs can identify the ranges they created and ignore
      those created by others. The kernel returns this string back in
      the output of the @stats_list message, but it does not use it for
      anything else.

    <aux_data>
      An optional parameter. A word that provides auxiliary data that is
      useful to the client program that created the range. The kernel
      returns this string back in the output of the @stats_list message,
      but it does not use this value for anything.

@stats_delete <region_id>

    Delete the region with the specified id.

    <region_id>
      region_id returned from @stats_create

@stats_clear <region_id>

    Clear all the counters except the in-flight I/O counters.

    <region_id>
      region_id returned from @stats_create

@stats_list [<program_id>]

    List all regions registered with @stats_create.

    <program_id>
      An optional parameter. If this parameter is specified, only
      matching regions are returned. If it is not specified, all
      regions are returned.

    Output format:
      <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>

@stats_print <region_id> [<starting_line> <number_of_lines>]

    Print counters for each step-sized area of a region.

    <region_id>
      region_id returned from @stats_create

    <starting_line>
      The index of the starting line in the output.
      If omitted, all lines are returned.

    <number_of_lines>
      The number of lines to include in the output.
      If omitted, all lines are returned.

    Output format for each step-sized area of a region:

      <start_sector>+<length> counters

    The first 11 counters have the same meaning as /sys/block/*/stat or
    /proc/diskstats; please refer to Documentation/iostats.txt for
    details.

      1. the number of reads completed
      2. the number of reads merged
      3. the number of sectors read
      4. the number of milliseconds spent reading
      5. the number of writes completed
      6. the number of writes merged
      7. the number of sectors written
      8. the number of milliseconds spent writing
      9. the number of I/Os currently in progress
      10. the number of milliseconds spent doing I/Os
      11. the weighted number of milliseconds spent doing I/Os

    Additional counters:

      12. the total time spent reading in milliseconds
      13. the total time spent writing in milliseconds

@stats_print_clear <region_id> [<starting_line> <number_of_lines>]

    Atomically print and then clear all the counters except the
    in-flight I/O counters. Useful when the client consuming the
    statistics does not want to lose any events: with separate print
    and clear messages, events recorded between the two would be
    dropped by the clear.

    <region_id>
      region_id returned from @stats_create

    <starting_line>
      The index of the starting line in the output.
      If omitted, all lines are printed and then cleared.

    <number_of_lines>
      The number of lines to process.
      If omitted, all lines are printed and then cleared.

@stats_set_aux <region_id> <aux_data>

    Store the auxiliary data aux_data for the specified region.

    <region_id>
      region_id returned from @stats_create

    <aux_data>
      A string holding auxiliary data that is useful to the client
      program that created the range. The kernel returns this string
      back in the output of the @stats_list message, but it does not
      use this value for anything.

Examples
========

Subdivide the DM device 'vol' into 100 pieces and start collecting
statistics on them:

  dmsetup message vol 0 @stats_create - /100
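
Create a second region covering only the first 2048 sectors of 'vol',
as a single area, and tag it with a program name (the name used here is
just an example):

  dmsetup message vol 0 @stats_create 0+2048 2048 my_program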

Set the auxiliary data string of region 0 to "foo bar baz" (the escape
for each space must itself be escaped, otherwise the shell will consume
it):

  dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz

List the statistics:

  dmsetup message vol 0 @stats_list
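
With the two regions above, this prints one line per region in the
@stats_list output format (the device size shown here is only
illustrative):

  0: 0+262144 2622 - foo bar baz
  1: 0+2048 2048 my_program -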

Print the statistics:

  dmsetup message vol 0 @stats_print 0
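
Each line of the output describes one step-sized area of the region, in
the format listed under @stats_print above; for example (the counter
values are purely illustrative):

  0+2622 180 0 5888 428 0 0 0 0 0 312 428 312 0

Print only the first ten areas of region 0:

  dmsetup message vol 0 @stats_print 0 0 10

Print the counters and atomically clear them, so that no events are
lost between successive reads:

  dmsetup message vol 0 @stats_print_clear 0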

Delete the statistics:

  dmsetup message vol 0 @stats_delete 0
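
Once every region has been deleted, the memory accounted to statistics
should drop back to zero (freeing may be briefly deferred by RCU):

  cat /sys/module/dm_mod/parameters/stats_current_allocated_bytes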
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5ef78efc27f2..2acc43fe0229 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,7 +3,7 @@
 #
 
 dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		   dm-snap-persistent.o
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index e9c0de75010e..afe08146f73e 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1455,20 +1455,26 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
 	return 0;
 }
 
-static bool buffer_test_overflow(char *result, unsigned maxlen)
-{
-	return !maxlen || strlen(result) + 1 >= maxlen;
-}
-
 /*
- * Process device-mapper dependent messages.
+ * Process device-mapper dependent messages. Messages prefixed with '@'
+ * are processed by the DM core. All others are delivered to the target.
  * Returns a number <= 1 if message was processed by device mapper.
  * Returns 2 if message should be delivered to the target.
  */
 static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
 			  char *result, unsigned maxlen)
 {
-	return 2;
+	int r;
+
+	if (**argv != '@')
+		return 2; /* no '@' prefix, deliver to target */
+
+	r = dm_stats_message(md, argc, argv, result, maxlen);
+	if (r < 2)
+		return r;
+
+	DMERR("Unsupported message sent to DM core: %s", argv[0]);
+	return -EINVAL;
 }
 
 /*
@@ -1542,7 +1548,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 
 	if (r == 1) {
 		param->flags |= DM_DATA_OUT_FLAG;
-		if (buffer_test_overflow(result, maxlen))
+		if (dm_message_test_buffer_overflow(result, maxlen))
 			param->flags |= DM_BUFFER_FULL_FLAG;
 		else
 			param->data_size = param->data_start + strlen(result) + 1;
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
new file mode 100644
index 000000000000..8ae31e8d3d64
--- /dev/null
+++ b/drivers/md/dm-stats.c
@@ -0,0 +1,969 @@
1#include <linux/errno.h>
2#include <linux/numa.h>
3#include <linux/slab.h>
4#include <linux/rculist.h>
5#include <linux/threads.h>
6#include <linux/preempt.h>
7#include <linux/irqflags.h>
8#include <linux/vmalloc.h>
9#include <linux/mm.h>
10#include <linux/module.h>
11#include <linux/device-mapper.h>
12
13#include "dm.h"
14#include "dm-stats.h"
15
16#define DM_MSG_PREFIX "stats"
17
18static int dm_stat_need_rcu_barrier;
19
20/*
21 * Using 64-bit values to avoid overflow (which is a
22 * problem that block/genhd.c's IO accounting has).
23 */
24struct dm_stat_percpu {
25 unsigned long long sectors[2];
26 unsigned long long ios[2];
27 unsigned long long merges[2];
28 unsigned long long ticks[2];
29 unsigned long long io_ticks[2];
30 unsigned long long io_ticks_total;
31 unsigned long long time_in_queue;
32};
33
34struct dm_stat_shared {
35 atomic_t in_flight[2];
36 unsigned long stamp;
37 struct dm_stat_percpu tmp;
38};
39
40struct dm_stat {
41 struct list_head list_entry;
42 int id;
43 size_t n_entries;
44 sector_t start;
45 sector_t end;
46 sector_t step;
47 const char *program_id;
48 const char *aux_data;
49 struct rcu_head rcu_head;
50 size_t shared_alloc_size;
51 size_t percpu_alloc_size;
52 struct dm_stat_percpu *stat_percpu[NR_CPUS];
53 struct dm_stat_shared stat_shared[0];
54};
55
56struct dm_stats_last_position {
57 sector_t last_sector;
58 unsigned last_rw;
59};
60
61/*
62 * A typo on the command line could possibly make the kernel run out of memory
63 * and crash. To prevent the crash we account all used memory. We fail if we
64 * exhaust 1/4 of all memory or 1/2 of vmalloc space.
65 */
66#define DM_STATS_MEMORY_FACTOR 4
67#define DM_STATS_VMALLOC_FACTOR 2
68
69static DEFINE_SPINLOCK(shared_memory_lock);
70
71static unsigned long shared_memory_amount;
72
73static bool __check_shared_memory(size_t alloc_size)
74{
75 size_t a;
76
77 a = shared_memory_amount + alloc_size;
78 if (a < shared_memory_amount)
79 return false;
80 if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR)
81 return false;
82#ifdef CONFIG_MMU
83 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
84 return false;
85#endif
86 return true;
87}
88
89static bool check_shared_memory(size_t alloc_size)
90{
91 bool ret;
92
93 spin_lock_irq(&shared_memory_lock);
94
95 ret = __check_shared_memory(alloc_size);
96
97 spin_unlock_irq(&shared_memory_lock);
98
99 return ret;
100}
101
102static bool claim_shared_memory(size_t alloc_size)
103{
104 spin_lock_irq(&shared_memory_lock);
105
106 if (!__check_shared_memory(alloc_size)) {
107 spin_unlock_irq(&shared_memory_lock);
108 return false;
109 }
110
111 shared_memory_amount += alloc_size;
112
113 spin_unlock_irq(&shared_memory_lock);
114
115 return true;
116}
117
118static void free_shared_memory(size_t alloc_size)
119{
120 unsigned long flags;
121
122 spin_lock_irqsave(&shared_memory_lock, flags);
123
124 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
125 spin_unlock_irqrestore(&shared_memory_lock, flags);
126 DMCRIT("Memory usage accounting bug.");
127 return;
128 }
129
130 shared_memory_amount -= alloc_size;
131
132 spin_unlock_irqrestore(&shared_memory_lock, flags);
133}
134
135static void *dm_kvzalloc(size_t alloc_size, int node)
136{
137 void *p;
138
139 if (!claim_shared_memory(alloc_size))
140 return NULL;
141
142 if (alloc_size <= KMALLOC_MAX_SIZE) {
143 p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node);
144 if (p)
145 return p;
146 }
147 p = vzalloc_node(alloc_size, node);
148 if (p)
149 return p;
150
151 free_shared_memory(alloc_size);
152
153 return NULL;
154}
155
156static void dm_kvfree(void *ptr, size_t alloc_size)
157{
158 if (!ptr)
159 return;
160
161 free_shared_memory(alloc_size);
162
163 if (is_vmalloc_addr(ptr))
164 vfree(ptr);
165 else
166 kfree(ptr);
167}
168
169static void dm_stat_free(struct rcu_head *head)
170{
171 int cpu;
172 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
173
174 kfree(s->program_id);
175 kfree(s->aux_data);
176 for_each_possible_cpu(cpu)
177 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
178 dm_kvfree(s, s->shared_alloc_size);
179}
180
181static int dm_stat_in_flight(struct dm_stat_shared *shared)
182{
183 return atomic_read(&shared->in_flight[READ]) +
184 atomic_read(&shared->in_flight[WRITE]);
185}
186
187void dm_stats_init(struct dm_stats *stats)
188{
189 int cpu;
190 struct dm_stats_last_position *last;
191
192 mutex_init(&stats->mutex);
193 INIT_LIST_HEAD(&stats->list);
194 stats->last = alloc_percpu(struct dm_stats_last_position);
195 for_each_possible_cpu(cpu) {
196 last = per_cpu_ptr(stats->last, cpu);
197 last->last_sector = (sector_t)ULLONG_MAX;
198 last->last_rw = UINT_MAX;
199 }
200}
201
202void dm_stats_cleanup(struct dm_stats *stats)
203{
204 size_t ni;
205 struct dm_stat *s;
206 struct dm_stat_shared *shared;
207
208 while (!list_empty(&stats->list)) {
209 s = container_of(stats->list.next, struct dm_stat, list_entry);
210 list_del(&s->list_entry);
211 for (ni = 0; ni < s->n_entries; ni++) {
212 shared = &s->stat_shared[ni];
213 if (WARN_ON(dm_stat_in_flight(shared))) {
214 DMCRIT("leaked in-flight counter at index %lu "
215 "(start %llu, end %llu, step %llu): reads %d, writes %d",
216 (unsigned long)ni,
217 (unsigned long long)s->start,
218 (unsigned long long)s->end,
219 (unsigned long long)s->step,
220 atomic_read(&shared->in_flight[READ]),
221 atomic_read(&shared->in_flight[WRITE]));
222 }
223 }
224 dm_stat_free(&s->rcu_head);
225 }
226 free_percpu(stats->last);
227}
228
229static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
230 sector_t step, const char *program_id, const char *aux_data,
231 void (*suspend_callback)(struct mapped_device *),
232 void (*resume_callback)(struct mapped_device *),
233 struct mapped_device *md)
234{
235 struct list_head *l;
236 struct dm_stat *s, *tmp_s;
237 sector_t n_entries;
238 size_t ni;
239 size_t shared_alloc_size;
240 size_t percpu_alloc_size;
241 struct dm_stat_percpu *p;
242 int cpu;
243 int ret_id;
244 int r;
245
246 if (end < start || !step)
247 return -EINVAL;
248
249 n_entries = end - start;
250 if (dm_sector_div64(n_entries, step))
251 n_entries++;
252
253 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
254 return -EOVERFLOW;
255
256 shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
257 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
258 return -EOVERFLOW;
259
260 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
261 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
262 return -EOVERFLOW;
263
264 if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
265 return -ENOMEM;
266
267 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
268 if (!s)
269 return -ENOMEM;
270
271 s->n_entries = n_entries;
272 s->start = start;
273 s->end = end;
274 s->step = step;
275 s->shared_alloc_size = shared_alloc_size;
276 s->percpu_alloc_size = percpu_alloc_size;
277
278 s->program_id = kstrdup(program_id, GFP_KERNEL);
279 if (!s->program_id) {
280 r = -ENOMEM;
281 goto out;
282 }
283 s->aux_data = kstrdup(aux_data, GFP_KERNEL);
284 if (!s->aux_data) {
285 r = -ENOMEM;
286 goto out;
287 }
288
289 for (ni = 0; ni < n_entries; ni++) {
290 atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
291 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
292 }
293
294 for_each_possible_cpu(cpu) {
295 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
296 if (!p) {
297 r = -ENOMEM;
298 goto out;
299 }
300 s->stat_percpu[cpu] = p;
301 }
302
303 /*
304 * Suspend/resume to make sure there is no i/o in flight,
305 * so that newly created statistics will be exact.
306 *
307 * (note: we couldn't suspend earlier because we must not
308 * allocate memory while suspended)
309 */
310 suspend_callback(md);
311
312 mutex_lock(&stats->mutex);
313 s->id = 0;
314 list_for_each(l, &stats->list) {
315 tmp_s = container_of(l, struct dm_stat, list_entry);
316 if (WARN_ON(tmp_s->id < s->id)) {
317 r = -EINVAL;
318 goto out_unlock_resume;
319 }
320 if (tmp_s->id > s->id)
321 break;
322 if (unlikely(s->id == INT_MAX)) {
323 r = -ENFILE;
324 goto out_unlock_resume;
325 }
326 s->id++;
327 }
328 ret_id = s->id;
329 list_add_tail_rcu(&s->list_entry, l);
330 mutex_unlock(&stats->mutex);
331
332 resume_callback(md);
333
334 return ret_id;
335
336out_unlock_resume:
337 mutex_unlock(&stats->mutex);
338 resume_callback(md);
339out:
340 dm_stat_free(&s->rcu_head);
341 return r;
342}
343
344static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
345{
346 struct dm_stat *s;
347
348 list_for_each_entry(s, &stats->list, list_entry) {
349 if (s->id > id)
350 break;
351 if (s->id == id)
352 return s;
353 }
354
355 return NULL;
356}
357
358static int dm_stats_delete(struct dm_stats *stats, int id)
359{
360 struct dm_stat *s;
361 int cpu;
362
363 mutex_lock(&stats->mutex);
364
365 s = __dm_stats_find(stats, id);
366 if (!s) {
367 mutex_unlock(&stats->mutex);
368 return -ENOENT;
369 }
370
371 list_del_rcu(&s->list_entry);
372 mutex_unlock(&stats->mutex);
373
374 /*
375 * vfree can't be called from RCU callback
376 */
377 for_each_possible_cpu(cpu)
378 if (is_vmalloc_addr(s->stat_percpu))
379 goto do_sync_free;
380 if (is_vmalloc_addr(s)) {
381do_sync_free:
382 synchronize_rcu_expedited();
383 dm_stat_free(&s->rcu_head);
384 } else {
385 ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
386 call_rcu(&s->rcu_head, dm_stat_free);
387 }
388 return 0;
389}
390
391static int dm_stats_list(struct dm_stats *stats, const char *program,
392 char *result, unsigned maxlen)
393{
394 struct dm_stat *s;
395 sector_t len;
396 unsigned sz = 0;
397
398 /*
399 * Output format:
400 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
401 */
402
403 mutex_lock(&stats->mutex);
404 list_for_each_entry(s, &stats->list, list_entry) {
405 if (!program || !strcmp(program, s->program_id)) {
406 len = s->end - s->start;
407 DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id,
408 (unsigned long long)s->start,
409 (unsigned long long)len,
410 (unsigned long long)s->step,
411 s->program_id,
412 s->aux_data);
413 }
414 }
415 mutex_unlock(&stats->mutex);
416
417 return 1;
418}
419
420static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
421{
422 /*
423 * This is racy, but so is part_round_stats_single.
424 */
425 unsigned long now = jiffies;
426 unsigned in_flight_read;
427 unsigned in_flight_write;
428 unsigned long difference = now - shared->stamp;
429
430 if (!difference)
431 return;
432 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
433 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
434 if (in_flight_read)
435 p->io_ticks[READ] += difference;
436 if (in_flight_write)
437 p->io_ticks[WRITE] += difference;
438 if (in_flight_read + in_flight_write) {
439 p->io_ticks_total += difference;
440 p->time_in_queue += (in_flight_read + in_flight_write) * difference;
441 }
442 shared->stamp = now;
443}
444
445static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
446 unsigned long bi_rw, sector_t len, bool merged,
447 bool end, unsigned long duration)
448{
449 unsigned long idx = bi_rw & REQ_WRITE;
450 struct dm_stat_shared *shared = &s->stat_shared[entry];
451 struct dm_stat_percpu *p;
452
453 /*
454 * For strict correctness we should use local_irq_disable/enable
455 * instead of preempt_disable/enable.
456 *
457 * This is racy if the driver finishes bios from non-interrupt
458 * context as well as from interrupt context or from more different
459 * interrupts.
460 *
461 * However, the race only results in not counting some events,
462 * so it is acceptable.
463 *
464 * part_stat_lock()/part_stat_unlock() have this race too.
465 */
466 preempt_disable();
467 p = &s->stat_percpu[smp_processor_id()][entry];
468
469 if (!end) {
470 dm_stat_round(shared, p);
471 atomic_inc(&shared->in_flight[idx]);
472 } else {
473 dm_stat_round(shared, p);
474 atomic_dec(&shared->in_flight[idx]);
475 p->sectors[idx] += len;
476 p->ios[idx] += 1;
477 p->merges[idx] += merged;
478 p->ticks[idx] += duration;
479 }
480
481 preempt_enable();
482}
483
484static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
485 sector_t bi_sector, sector_t end_sector,
486 bool end, unsigned long duration,
487 struct dm_stats_aux *stats_aux)
488{
489 sector_t rel_sector, offset, todo, fragment_len;
490 size_t entry;
491
492 if (end_sector <= s->start || bi_sector >= s->end)
493 return;
494 if (unlikely(bi_sector < s->start)) {
495 rel_sector = 0;
496 todo = end_sector - s->start;
497 } else {
498 rel_sector = bi_sector - s->start;
499 todo = end_sector - bi_sector;
500 }
501 if (unlikely(end_sector > s->end))
502 todo -= (end_sector - s->end);
503
504 offset = dm_sector_div64(rel_sector, s->step);
505 entry = rel_sector;
506 do {
507 if (WARN_ON_ONCE(entry >= s->n_entries)) {
508 DMCRIT("Invalid area access in region id %d", s->id);
509 return;
510 }
511 fragment_len = todo;
512 if (fragment_len > s->step - offset)
513 fragment_len = s->step - offset;
514 dm_stat_for_entry(s, entry, bi_rw, fragment_len,
515 stats_aux->merged, end, duration);
516 todo -= fragment_len;
517 entry++;
518 offset = 0;
519 } while (unlikely(todo != 0));
520}
521
522void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
523 sector_t bi_sector, unsigned bi_sectors, bool end,
524 unsigned long duration, struct dm_stats_aux *stats_aux)
525{
526 struct dm_stat *s;
527 sector_t end_sector;
528 struct dm_stats_last_position *last;
529
530 if (unlikely(!bi_sectors))
531 return;
532
533 end_sector = bi_sector + bi_sectors;
534
535 if (!end) {
536 /*
537 * A race condition can at worst result in the merged flag being
538 * misrepresented, so we don't have to disable preemption here.
539 */
540 last = __this_cpu_ptr(stats->last);
541 stats_aux->merged =
542 (bi_sector == (ACCESS_ONCE(last->last_sector) &&
543 ((bi_rw & (REQ_WRITE | REQ_DISCARD)) ==
544 (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD)))
545 ));
546 ACCESS_ONCE(last->last_sector) = end_sector;
547 ACCESS_ONCE(last->last_rw) = bi_rw;
548 }
549
550 rcu_read_lock();
551
552 list_for_each_entry_rcu(s, &stats->list, list_entry)
553 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
554
555 rcu_read_unlock();
556}
557
558static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
559 struct dm_stat *s, size_t x)
560{
561 int cpu;
562 struct dm_stat_percpu *p;
563
564 local_irq_disable();
565 p = &s->stat_percpu[smp_processor_id()][x];
566 dm_stat_round(shared, p);
567 local_irq_enable();
568
569 memset(&shared->tmp, 0, sizeof(shared->tmp));
570 for_each_possible_cpu(cpu) {
571 p = &s->stat_percpu[cpu][x];
572 shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
573 shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
574 shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
575 shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
576 shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
577 shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
578 shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
579 shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
580 shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
581 shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
582 shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
583 shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
584 }
585}
586
587static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
588 bool init_tmp_percpu_totals)
589{
590 size_t x;
591 struct dm_stat_shared *shared;
592 struct dm_stat_percpu *p;
593
594 for (x = idx_start; x < idx_end; x++) {
595 shared = &s->stat_shared[x];
596 if (init_tmp_percpu_totals)
597 __dm_stat_init_temporary_percpu_totals(shared, s, x);
598 local_irq_disable();
599 p = &s->stat_percpu[smp_processor_id()][x];
600 p->sectors[READ] -= shared->tmp.sectors[READ];
601 p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
602 p->ios[READ] -= shared->tmp.ios[READ];
603 p->ios[WRITE] -= shared->tmp.ios[WRITE];
604 p->merges[READ] -= shared->tmp.merges[READ];
605 p->merges[WRITE] -= shared->tmp.merges[WRITE];
606 p->ticks[READ] -= shared->tmp.ticks[READ];
607 p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
608 p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
609 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
610 p->io_ticks_total -= shared->tmp.io_ticks_total;
611 p->time_in_queue -= shared->tmp.time_in_queue;
612 local_irq_enable();
613 }
614}
615
616static int dm_stats_clear(struct dm_stats *stats, int id)
617{
618 struct dm_stat *s;
619
620 mutex_lock(&stats->mutex);
621
622 s = __dm_stats_find(stats, id);
623 if (!s) {
624 mutex_unlock(&stats->mutex);
625 return -ENOENT;
626 }
627
628 __dm_stat_clear(s, 0, s->n_entries, true);
629
630 mutex_unlock(&stats->mutex);
631
632 return 1;
633}
634
635/*
636 * This is like jiffies_to_msec, but works for 64-bit values.
637 */
638static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
639{
640 unsigned long long result = 0;
641 unsigned mult;
642
643 if (j)
644 result = jiffies_to_msecs(j & 0x3fffff);
645 if (j >= 1 << 22) {
646 mult = jiffies_to_msecs(1 << 22);
647 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
648 }
649 if (j >= 1ULL << 44)
650 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
651
652 return result;
653}
654
655static int dm_stats_print(struct dm_stats *stats, int id,
656 size_t idx_start, size_t idx_len,
657 bool clear, char *result, unsigned maxlen)
658{
659 unsigned sz = 0;
660 struct dm_stat *s;
661 size_t x;
662 sector_t start, end, step;
663 size_t idx_end;
664 struct dm_stat_shared *shared;
665
666 /*
667 * Output format:
668 * <start_sector>+<length> counters
669 */
670
671 mutex_lock(&stats->mutex);
672
673 s = __dm_stats_find(stats, id);
674 if (!s) {
675 mutex_unlock(&stats->mutex);
676 return -ENOENT;
677 }
678
679 idx_end = idx_start + idx_len;
680 if (idx_end < idx_start ||
681 idx_end > s->n_entries)
682 idx_end = s->n_entries;
683
684 if (idx_start > idx_end)
685 idx_start = idx_end;
686
687 step = s->step;
688 start = s->start + (step * idx_start);
689
690 for (x = idx_start; x < idx_end; x++, start = end) {
691 shared = &s->stat_shared[x];
692 end = start + step;
693 if (unlikely(end > s->end))
694 end = s->end;
695
696 __dm_stat_init_temporary_percpu_totals(shared, s, x);
697
698 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
699 (unsigned long long)start,
700 (unsigned long long)step,
701 shared->tmp.ios[READ],
702 shared->tmp.merges[READ],
703 shared->tmp.sectors[READ],
704 dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
705 shared->tmp.ios[WRITE],
706 shared->tmp.merges[WRITE],
707 shared->tmp.sectors[WRITE],
708 dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
709 dm_stat_in_flight(shared),
710 dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
711 dm_jiffies_to_msec64(shared->tmp.time_in_queue),
712 dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
713 dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
714
715 if (unlikely(sz + 1 >= maxlen))
716 goto buffer_overflow;
717 }
718
719 if (clear)
720 __dm_stat_clear(s, idx_start, idx_end, false);
721
722buffer_overflow:
723 mutex_unlock(&stats->mutex);
724
725 return 1;
726}
727
728static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
729{
730 struct dm_stat *s;
731 const char *new_aux_data;
732
733 mutex_lock(&stats->mutex);
734
735 s = __dm_stats_find(stats, id);
736 if (!s) {
737 mutex_unlock(&stats->mutex);
738 return -ENOENT;
739 }
740
741 new_aux_data = kstrdup(aux_data, GFP_KERNEL);
742 if (!new_aux_data) {
743 mutex_unlock(&stats->mutex);
744 return -ENOMEM;
745 }
746
747 kfree(s->aux_data);
748 s->aux_data = new_aux_data;
749
750 mutex_unlock(&stats->mutex);
751
752 return 0;
753}
754
755static int message_stats_create(struct mapped_device *md,
756 unsigned argc, char **argv,
757 char *result, unsigned maxlen)
758{
759 int id;
760 char dummy;
761 unsigned long long start, end, len, step;
762 unsigned divisor;
763 const char *program_id, *aux_data;
764
765 /*
766 * Input format:
767 * <range> <step> [<program_id> [<aux_data>]]
768 */
769
770 if (argc < 3 || argc > 5)
771 return -EINVAL;
772
773 if (!strcmp(argv[1], "-")) {
774 start = 0;
775 len = dm_get_size(md);
776 if (!len)
777 len = 1;
778 } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
779 start != (sector_t)start || len != (sector_t)len)
780 return -EINVAL;
781
782 end = start + len;
783 if (start >= end)
784 return -EINVAL;
785
786 if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
787 step = end - start;
788 if (do_div(step, divisor))
789 step++;
790 if (!step)
791 step = 1;
792 } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
793 step != (sector_t)step || !step)
794 return -EINVAL;
795
796 program_id = "-";
797 aux_data = "-";
798
799 if (argc > 3)
800 program_id = argv[3];
801
802 if (argc > 4)
803 aux_data = argv[4];
804
805 /*
806 * If a buffer overflow happens after we created the region,
807 * it's too late (the userspace would retry with a larger
808 * buffer, but the region id that caused the overflow is already
809 * leaked). So we must detect buffer overflow in advance.
810 */
811 snprintf(result, maxlen, "%d", INT_MAX);
812 if (dm_message_test_buffer_overflow(result, maxlen))
813 return 1;
814
815 id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
816 dm_internal_suspend, dm_internal_resume, md);
817 if (id < 0)
818 return id;
819
820 snprintf(result, maxlen, "%d", id);
821
822 return 1;
823}
824
825static int message_stats_delete(struct mapped_device *md,
826 unsigned argc, char **argv)
827{
828 int id;
829 char dummy;
830
831 if (argc != 2)
832 return -EINVAL;
833
834 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
835 return -EINVAL;
836
837 return dm_stats_delete(dm_get_stats(md), id);
838}
839
840static int message_stats_clear(struct mapped_device *md,
841 unsigned argc, char **argv)
842{
843 int id;
844 char dummy;
845
846 if (argc != 2)
847 return -EINVAL;
848
849 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
850 return -EINVAL;
851
852 return dm_stats_clear(dm_get_stats(md), id);
853}
854
855static int message_stats_list(struct mapped_device *md,
856 unsigned argc, char **argv,
857 char *result, unsigned maxlen)
858{
859 int r;
860 const char *program = NULL;
861
862 if (argc < 1 || argc > 2)
863 return -EINVAL;
864
865 if (argc > 1) {
866 program = kstrdup(argv[1], GFP_KERNEL);
867 if (!program)
868 return -ENOMEM;
869 }
870
871 r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
872
873 kfree(program);
874
875 return r;
876}
877
878static int message_stats_print(struct mapped_device *md,
879 unsigned argc, char **argv, bool clear,
880 char *result, unsigned maxlen)
881{
882 int id;
883 char dummy;
884 unsigned long idx_start = 0, idx_len = ULONG_MAX;
885
886 if (argc != 2 && argc != 4)
887 return -EINVAL;
888
889 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
890 return -EINVAL;
891
892 if (argc > 3) {
893 if (strcmp(argv[2], "-") &&
894 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
895 return -EINVAL;
896 if (strcmp(argv[3], "-") &&
897 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
898 return -EINVAL;
899 }
900
901 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
902 result, maxlen);
903}
904
905static int message_stats_set_aux(struct mapped_device *md,
906 unsigned argc, char **argv)
907{
908 int id;
909 char dummy;
910
911 if (argc != 3)
912 return -EINVAL;
913
914 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
915 return -EINVAL;
916
917 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
918}
919
920int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
921 char *result, unsigned maxlen)
922{
923 int r;
924
925 if (dm_request_based(md)) {
926 DMWARN("Statistics are only supported for bio-based devices");
927 return -EOPNOTSUPP;
928 }
929
930 /* All messages here must start with '@' */
931 if (!strcasecmp(argv[0], "@stats_create"))
932 r = message_stats_create(md, argc, argv, result, maxlen);
933 else if (!strcasecmp(argv[0], "@stats_delete"))
934 r = message_stats_delete(md, argc, argv);
935 else if (!strcasecmp(argv[0], "@stats_clear"))
936 r = message_stats_clear(md, argc, argv);
937 else if (!strcasecmp(argv[0], "@stats_list"))
938 r = message_stats_list(md, argc, argv, result, maxlen);
939 else if (!strcasecmp(argv[0], "@stats_print"))
940 r = message_stats_print(md, argc, argv, false, result, maxlen);
941 else if (!strcasecmp(argv[0], "@stats_print_clear"))
942 r = message_stats_print(md, argc, argv, true, result, maxlen);
943 else if (!strcasecmp(argv[0], "@stats_set_aux"))
944 r = message_stats_set_aux(md, argc, argv);
945 else
946 return 2; /* this wasn't a stats message */
947
948 if (r == -EINVAL)
949 DMWARN("Invalid parameters for message %s", argv[0]);
950
951 return r;
952}
953
954int __init dm_statistics_init(void)
955{
956 dm_stat_need_rcu_barrier = 0;
957 return 0;
958}
959
960void dm_statistics_exit(void)
961{
962 if (dm_stat_need_rcu_barrier)
963 rcu_barrier();
964 if (WARN_ON(shared_memory_amount))
965 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
966}
967
968module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
969MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
new file mode 100644
index 000000000000..e7c4984bf235
--- /dev/null
+++ b/drivers/md/dm-stats.h
@@ -0,0 +1,40 @@
1#ifndef DM_STATS_H
2#define DM_STATS_H
3
4#include <linux/types.h>
5#include <linux/mutex.h>
6#include <linux/list.h>
7
8int dm_statistics_init(void);
9void dm_statistics_exit(void);
10
11struct dm_stats {
12 struct mutex mutex;
13 struct list_head list; /* list of struct dm_stat */
14 struct dm_stats_last_position __percpu *last;
15 sector_t last_sector;
16 unsigned last_rw;
17};
18
19struct dm_stats_aux {
20 bool merged;
21};
22
23void dm_stats_init(struct dm_stats *st);
24void dm_stats_cleanup(struct dm_stats *st);
25
26struct mapped_device;
27
28int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
29 char *result, unsigned maxlen);
30
31void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
32 sector_t bi_sector, unsigned bi_sectors, bool end,
33 unsigned long duration, struct dm_stats_aux *aux);
34
35static inline bool dm_stats_used(struct dm_stats *st)
36{
37 return !list_empty(&st->list);
38}
39
40#endif
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7faeaa3d4835..6a5e9ed2fcc3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -60,6 +60,7 @@ struct dm_io {
 	struct bio *bio;
 	unsigned long start_time;
 	spinlock_t endio_lock;
+	struct dm_stats_aux stats_aux;
 };
 
 /*
@@ -198,6 +199,8 @@ struct mapped_device {
 
 	/* zero-length flush that will be cloned and submitted to targets */
 	struct bio flush_bio;
+
+	struct dm_stats stats;
 };
 
 /*
@@ -269,6 +272,7 @@ static int (*_inits[])(void) __initdata = {
 	dm_io_init,
 	dm_kcopyd_init,
 	dm_interface_init,
+	dm_statistics_init,
 };
 
 static void (*_exits[])(void) = {
@@ -279,6 +283,7 @@ static void (*_exits[])(void) = {
 	dm_io_exit,
 	dm_kcopyd_exit,
 	dm_interface_exit,
+	dm_statistics_exit,
 };
 
 static int __init dm_init(void)
@@ -384,6 +389,16 @@ int dm_lock_for_deletion(struct mapped_device *md)
 	return r;
 }
 
+sector_t dm_get_size(struct mapped_device *md)
+{
+	return get_capacity(md->disk);
+}
+
+struct dm_stats *dm_get_stats(struct mapped_device *md)
+{
+	return &md->stats;
+}
+
 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
@@ -466,8 +481,9 @@ static int md_in_flight(struct mapped_device *md)
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
+	struct bio *bio = io->bio;
 	int cpu;
-	int rw = bio_data_dir(io->bio);
+	int rw = bio_data_dir(bio);
 
 	io->start_time = jiffies;
 
@@ -476,6 +492,10 @@ static void start_io_acct(struct dm_io *io)
 	part_stat_unlock();
 	atomic_set(&dm_disk(md)->part0.in_flight[rw],
 		   atomic_inc_return(&md->pending[rw]));
+
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+				    bio_sectors(bio), false, 0, &io->stats_aux);
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -491,6 +511,10 @@ static void end_io_acct(struct dm_io *io)
 	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 	part_stat_unlock();
 
+	if (unlikely(dm_stats_used(&md->stats)))
+		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
+				    bio_sectors(bio), true, duration, &io->stats_aux);
+
 	/*
 	 * After this is decremented the bio must not be touched if it is
 	 * a flush.
@@ -1519,7 +1543,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio)
 	return;
 }
 
-static int dm_request_based(struct mapped_device *md)
+int dm_request_based(struct mapped_device *md)
 {
 	return blk_queue_stackable(md->queue);
 }
@@ -1958,6 +1982,8 @@ static struct mapped_device *alloc_dev(int minor)
 	md->flush_bio.bi_bdev = md->bdev;
 	md->flush_bio.bi_rw = WRITE_FLUSH;
 
+	dm_stats_init(&md->stats);
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -2009,6 +2035,7 @@ static void free_dev(struct mapped_device *md)
 
 	put_disk(md->disk);
 	blk_cleanup_queue(md->queue);
+	dm_stats_cleanup(&md->stats);
 	module_put(THIS_MODULE);
 	kfree(md);
 }
@@ -2150,7 +2177,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	/*
 	 * Wipe any geometry if the size of the table changed.
 	 */
-	if (size != get_capacity(md->disk))
+	if (size != dm_get_size(md))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
 	__set_size(md, size);
@@ -2696,6 +2723,38 @@ out:
 	return r;
 }
 
+/*
+ * Internal suspend/resume works like userspace-driven suspend. It waits
+ * until all bios finish and prevents issuing new bios to the target drivers.
+ * It may be used only from the kernel.
+ *
+ * Internal suspend holds md->suspend_lock, which prevents interaction with
+ * userspace-driven suspend.
+ */
+
+void dm_internal_suspend(struct mapped_device *md)
+{
+	mutex_lock(&md->suspend_lock);
+	if (dm_suspended_md(md))
+		return;
+
+	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+	synchronize_srcu(&md->io_barrier);
+	flush_workqueue(md->wq);
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+}
+
+void dm_internal_resume(struct mapped_device *md)
+{
+	if (dm_suspended_md(md))
+		goto done;
+
+	dm_queue_flush(md);
+
+done:
+	mutex_unlock(&md->suspend_lock);
+}
+
 /*-----------------------------------------------------------------
  * Event notification.
  *---------------------------------------------------------------*/
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 8b4c075d9a2f..5e604cc7b4aa 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -16,6 +16,8 @@
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
 
+#include "dm-stats.h"
+
 /*
  * Suspend feature flags
  */
@@ -157,10 +159,16 @@ void dm_destroy(struct mapped_device *md);
 void dm_destroy_immediate(struct mapped_device *md);
 int dm_open_count(struct mapped_device *md);
 int dm_lock_for_deletion(struct mapped_device *md);
+int dm_request_based(struct mapped_device *md);
+sector_t dm_get_size(struct mapped_device *md);
+struct dm_stats *dm_get_stats(struct mapped_device *md);
 
 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 		      unsigned cookie);
 
+void dm_internal_suspend(struct mapped_device *md);
+void dm_internal_resume(struct mapped_device *md);
+
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -173,4 +181,12 @@ void dm_kcopyd_exit(void);
 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
+/*
+ * Helpers that are used by DM core
+ */
+static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
+{
+	return !maxlen || strlen(result) + 1 >= maxlen;
+}
+
 #endif
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e151d4c9298d..653073de09e3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -10,6 +10,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/math64.h>
 #include <linux/ratelimit.h>
 
 struct dm_dev;
@@ -550,6 +551,14 @@ extern struct ratelimit_state dm_ratelimit_state;
 #define DM_MAPIO_REMAPPED	1
 #define DM_MAPIO_REQUEUE	DM_ENDIO_REQUEUE
 
+#define dm_sector_div64(x, y)( \
+{ \
+	u64 _res; \
+	(x) = div64_u64_rem(x, y, &_res); \
+	_res; \
+} \
+)
+
 /*
  * Ceiling(n / sz)
  */
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index afd0cbd52edb..f1e12bd40b3b 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	25
+#define DM_VERSION_MINOR	26
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2013-06-26)"
+#define DM_VERSION_EXTRA	"-ioctl (2013-08-15)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */