author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 22:08:14 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-13 22:08:14 -0500
commit     0910c0bdf7c291a41bc21e40a97389c9d4c1960d (patch)
tree       177c4cb22ece78b18f64f548ae82b9a15edbb99c /lib
parent     2821fe6b00a1e902fd399bb4b7e40bc3041f4d44 (diff)
parent     e37459b8e2c7db6735e39e019e448b76e5e77647 (diff)
Merge branch 'for-3.13/core' of git://git.kernel.dk/linux-block
Pull block IO core updates from Jens Axboe:
 "This is the pull request for the core changes in the block layer for 3.13. It contains:

  - The new blk-mq request interface. This is a new and more scalable queueing model that marries the best part of the request based interface we currently have (which is fully featured, but scales poorly) and the bio based "interface" which the new drivers for high IOPS devices end up using because it's much faster than the request based one.

    The bio interface has no block layer support, since it taps into the stack much earlier. This means that drivers end up having to implement a lot of functionality on their own, like tagging, timeout handling, requeue, etc. The blk-mq interface provides all these. Some drivers even provide a switch to select bio or rq and have code to handle both, since things like merging only work in the rq model and are hence faster for some workloads. This is a huge mess. Conversion of these drivers nets us a substantial code reduction.

    Initial results on converting SCSI to this model even show an 8x improvement on single queue devices. So while the model was intended to work on the newer multiqueue devices, it has substantial improvements for "classic" hardware as well.

    This code has gone through extensive testing and development; it's now ready to go. A pull request to convert virtio-blk to this model will be coming as well, with more drivers scheduled for 3.14 conversion.

  - Two blktrace fixes from Jan and Chen Gang.

  - A plug merge fix from Alireza Haghdoost.

  - Conversion of __get_cpu_var() from Christoph Lameter.

  - Fix for sector_div() with a 64-bit divisor from Geert Uytterhoeven.

  - A fix for a race between request completion and the timeout handling from Jeff Moyer. This is what caused the merge conflict with blk-mq/core, in case you are looking at that.

  - A dm stacking fix from Mike Snitzer.

  - A code consolidation fix and duplicated code removal from Kent Overstreet.

  - A handful of block bug fixes from Mikulas Patocka, fixing a loop crash and memory corruption on blk cg.

  - Elevator switch bug fix from Tomoki Sekiyama.

 A heads-up that I had to rebase this branch. Initially the immutable bio_vecs had been queued up for inclusion, but a week later it became clear that it wasn't fully cooked yet. So the decision was made to pull this out and postpone it until 3.14. It was a straightforward rebase, just pruning out the immutable series and the later fixes of problems with it.
 The rest of the patches applied directly and no further changes were made"

* 'for-3.13/core' of git://git.kernel.dk/linux-block: (31 commits)
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
  block: Do not call sector_div() with a 64-bit divisor
  kernel: trace: blktrace: remove redundent memcpy() in compat_blk_trace_setup()
  block: Consolidate duplicated bio_trim() implementations
  block: Use rw_copy_check_uvector()
  block: Enable sysfs nomerge control for I/O requests in the plug list
  block: properly stack underlying max_segment_size to DM device
  elevator: acquire q->sysfs_lock in elevator_change()
  elevator: Fix a race in elevator switching and md device initialization
  block: Replace __get_cpu_var uses
  bdi: test bdi_init failure
  block: fix a probe argument to blk_register_region
  loop: fix crash if blk_alloc_queue fails
  blk-core: Fix memory corruption if blkcg_init_queue fails
  block: fix race between request completion and timeout handling
  blktrace: Send BLK_TN_PROCESS events to all running traces
  blk-mq: don't disallow request merges for req->special being set
  blk-mq: mq plug list breakage
  blk-mq: fix for flush deadlock
  ...
Diffstat (limited to 'lib')
-rw-r--r--  lib/percpu_counter.c  15
-rw-r--r--  lib/percpu_ida.c      89
2 files changed, 81 insertions(+), 23 deletions(-)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 93c5d5ecff4e..7473ee3b4ee7 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -60,14 +60,15 @@ static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
 	int cpu;
+	unsigned long flags;
 
-	raw_spin_lock(&fbc->lock);
+	raw_spin_lock_irqsave(&fbc->lock, flags);
 	for_each_possible_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		*pcount = 0;
 	}
 	fbc->count = amount;
-	raw_spin_unlock(&fbc->lock);
+	raw_spin_unlock_irqrestore(&fbc->lock, flags);
 }
 EXPORT_SYMBOL(percpu_counter_set);
 
@@ -78,9 +79,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
 	preempt_disable();
 	count = __this_cpu_read(*fbc->counters) + amount;
 	if (count >= batch || count <= -batch) {
-		raw_spin_lock(&fbc->lock);
+		unsigned long flags;
+		raw_spin_lock_irqsave(&fbc->lock, flags);
 		fbc->count += count;
-		raw_spin_unlock(&fbc->lock);
+		raw_spin_unlock_irqrestore(&fbc->lock, flags);
 		__this_cpu_write(*fbc->counters, 0);
 	} else {
 		__this_cpu_write(*fbc->counters, count);
@@ -97,14 +99,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
+	unsigned long flags;
 
-	raw_spin_lock(&fbc->lock);
+	raw_spin_lock_irqsave(&fbc->lock, flags);
 	ret = fbc->count;
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
-	raw_spin_unlock(&fbc->lock);
+	raw_spin_unlock_irqrestore(&fbc->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
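
All three hunks above make the same change: the percpu_counter raw spinlock is now taken with the _irqsave/_irqrestore variants, so a counter that is also updated from interrupt context cannot deadlock against a lock holder that was interrupted on the same CPU. A minimal sketch of the resulting locking pattern, using hypothetical names (my_counter, my_counter_add) rather than the kernel's own:

#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative only: a counter that may be bumped from both task and
 * hard-irq context on the same CPU. */
struct my_counter {
	raw_spinlock_t lock;
	s64 count;
};

static void my_counter_add(struct my_counter *c, s64 amount)
{
	unsigned long flags;

	/* Disable local interrupts for the critical section; a plain
	 * raw_spin_lock() here could deadlock if an interrupt handler
	 * tried to take the same lock on this CPU. */
	raw_spin_lock_irqsave(&c->lock, flags);
	c->count += amount;
	raw_spin_unlock_irqrestore(&c->lock, flags);
}
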
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
index bab1ba2a4c71..b0698ea972c6 100644
--- a/lib/percpu_ida.c
+++ b/lib/percpu_ida.c
@@ -30,15 +30,6 @@
 #include <linux/spinlock.h>
 #include <linux/percpu_ida.h>
 
-/*
- * Number of tags we move between the percpu freelist and the global freelist at
- * a time
- */
-#define IDA_PCPU_BATCH_MOVE	32U
-
-/* Max size of percpu freelist, */
-#define IDA_PCPU_SIZE	((IDA_PCPU_BATCH_MOVE * 3) / 2)
-
 struct percpu_ida_cpu {
 	/*
 	 * Even though this is percpu, we need a lock for tag stealing by remote
@@ -78,7 +69,7 @@ static inline void steal_tags(struct percpu_ida *pool,
 	struct percpu_ida_cpu *remote;
 
 	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
-	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
+	     cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
 	     cpus_have_tags--) {
 		cpu = cpumask_next(cpu, &pool->cpus_have_tags);
 
@@ -123,7 +114,7 @@ static inline void alloc_global_tags(struct percpu_ida *pool,
 {
 	move_tags(tags->freelist, &tags->nr_free,
 		  pool->freelist, &pool->nr_free,
-		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
+		  min(pool->nr_free, pool->percpu_batch_size));
 }
 
 static inline unsigned alloc_local_tag(struct percpu_ida *pool,
@@ -245,17 +236,17 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
 		wake_up(&pool->wait);
 	}
 
-	if (nr_free == IDA_PCPU_SIZE) {
+	if (nr_free == pool->percpu_max_size) {
 		spin_lock(&pool->lock);
 
 		/*
 		 * Global lock held and irqs disabled, don't need percpu
 		 * lock
 		 */
-		if (tags->nr_free == IDA_PCPU_SIZE) {
+		if (tags->nr_free == pool->percpu_max_size) {
 			move_tags(pool->freelist, &pool->nr_free,
 				  tags->freelist, &tags->nr_free,
-				  IDA_PCPU_BATCH_MOVE);
+				  pool->percpu_batch_size);
 
 			wake_up(&pool->wait);
 		}
@@ -292,7 +283,8 @@ EXPORT_SYMBOL_GPL(percpu_ida_destroy);
  * Allocation is percpu, but sharding is limited by nr_tags - for best
  * performance, the workload should not span more cpus than nr_tags / 128.
  */
-int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
+	unsigned long max_size, unsigned long batch_size)
 {
 	unsigned i, cpu, order;
 
@@ -301,6 +293,8 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
 	init_waitqueue_head(&pool->wait);
 	spin_lock_init(&pool->lock);
 	pool->nr_tags = nr_tags;
+	pool->percpu_max_size = max_size;
+	pool->percpu_batch_size = batch_size;
 
 	/* Guard against overflow */
 	if (nr_tags > (unsigned) INT_MAX + 1) {
@@ -319,7 +313,7 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
 	pool->nr_free = nr_tags;
 
 	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
-				       IDA_PCPU_SIZE * sizeof(unsigned),
+				       pool->percpu_max_size * sizeof(unsigned),
 				       sizeof(unsigned));
 	if (!pool->tag_cpu)
 		goto err;
@@ -332,4 +326,65 @@ err:
 	percpu_ida_destroy(pool);
 	return -ENOMEM;
 }
-EXPORT_SYMBOL_GPL(percpu_ida_init);
+EXPORT_SYMBOL_GPL(__percpu_ida_init);
+
+/**
+ * percpu_ida_for_each_free - iterate free ids of a pool
+ * @pool: pool to iterate
+ * @fn: interate callback function
+ * @data: parameter for @fn
+ *
+ * Note, this doesn't guarantee to iterate all free ids restrictly. Some free
+ * ids might be missed, some might be iterated duplicated, and some might
+ * be iterated and not free soon.
+ */
+int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
+	void *data)
+{
+	unsigned long flags;
+	struct percpu_ida_cpu *remote;
+	unsigned cpu, i, err = 0;
+
+	local_irq_save(flags);
+	for_each_possible_cpu(cpu) {
+		remote = per_cpu_ptr(pool->tag_cpu, cpu);
+		spin_lock(&remote->lock);
+		for (i = 0; i < remote->nr_free; i++) {
+			err = fn(remote->freelist[i], data);
+			if (err)
+				break;
+		}
+		spin_unlock(&remote->lock);
+		if (err)
+			goto out;
+	}
+
+	spin_lock(&pool->lock);
+	for (i = 0; i < pool->nr_free; i++) {
+		err = fn(pool->freelist[i], data);
+		if (err)
+			break;
+	}
+	spin_unlock(&pool->lock);
+out:
+	local_irq_restore(flags);
+	return err;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
+
+/**
+ * percpu_ida_free_tags - return free tags number of a specific cpu or global pool
+ * @pool: pool related
+ * @cpu: specific cpu or global pool if @cpu == nr_cpu_ids
+ *
+ * Note: this just returns a snapshot of free tags number.
+ */
+unsigned percpu_ida_free_tags(struct percpu_ida *pool, int cpu)
+{
+	struct percpu_ida_cpu *remote;
+	if (cpu == nr_cpu_ids)
+		return pool->nr_free;
+	remote = per_cpu_ptr(pool->tag_cpu, cpu);
+	return remote->nr_free;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_free_tags);
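
With the cache and batch sizes now carried in the pool, a user of this library can size the per-cpu tag cache at init time instead of inheriting the old hard-coded behaviour (a 48-entry cache with 32-tag batch moves, per the constants removed above). A hedged sketch of a caller exercising the two new exports; my_count_cb, my_driver_setup and the 8/4 sizing are illustrative assumptions, not part of this diff:

#include <linux/percpu_ida.h>

/* Callback for percpu_ida_for_each_free(); a non-zero return stops the walk. */
static int my_count_cb(unsigned id, void *data)
{
	unsigned *count = data;

	(*count)++;
	return 0;
}

static int my_driver_setup(struct percpu_ida *pool, unsigned long nr_tags)
{
	unsigned free_ids = 0;
	int err;

	/* Per-cpu cache holds at most 8 tags; 4 tags are moved to/from the
	 * global freelist at a time. */
	err = __percpu_ida_init(pool, nr_tags, 8, 4);
	if (err)
		return err;

	/* Approximate count of currently free ids (a snapshot only, per the
	 * percpu_ida_for_each_free() kernel-doc above). */
	percpu_ida_for_each_free(pool, my_count_cb, &free_ids);

	return 0;
}
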