author Linus Torvalds <torvalds@g5.osdl.org> 2006-03-23 19:24:24 -0500
committer Linus Torvalds <torvalds@g5.osdl.org> 2006-03-23 19:24:24 -0500
commit cec6062037783a762aa5606b06b8bc5c14d9657f (patch)
tree c862659eb55e44314f6d524f5479204994b91901
parent 88f07ffb63add018bfafd480ec6a294088277f06 (diff)
parent 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 (diff)
Merge branch 'blktrace' of git://brick.kernel.dk/data/git/linux-2.6-block
* 'blktrace' of git://brick.kernel.dk/data/git/linux-2.6-block:
  [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23
  [PATCH] relay: consolidate sendfile() and read() code
  [PATCH] relay: add sendfile() support
  [PATCH] relay: migrate from relayfs to a generic relay API
-rw-r--r-- block/Kconfig                 |   12
-rw-r--r-- block/Makefile                |    2
-rw-r--r-- block/blktrace.c              |  538
-rw-r--r-- block/elevator.c              |    4
-rw-r--r-- block/ioctl.c                 |    6
-rw-r--r-- block/ll_rw_blk.c             |   44
-rw-r--r-- drivers/block/cciss.c         |    2
-rw-r--r-- drivers/md/dm.c               |   13
-rw-r--r-- fs/Kconfig                    |   12
-rw-r--r-- fs/Makefile                   |    1
-rw-r--r-- fs/bio.c                      |    4
-rw-r--r-- fs/compat_ioctl.c             |    1
-rw-r--r-- fs/relayfs/Makefile           |    4
-rw-r--r-- fs/relayfs/buffers.c          |  190
-rw-r--r-- fs/relayfs/buffers.h          |   12
-rw-r--r-- fs/relayfs/inode.c            |  581
-rw-r--r-- fs/relayfs/relay.c            |  482
-rw-r--r-- fs/relayfs/relay.h            |    8
-rw-r--r-- include/linux/blkdev.h        |    3
-rw-r--r-- include/linux/blktrace_api.h  |  277
-rw-r--r-- include/linux/compat_ioctl.h  |    4
-rw-r--r-- include/linux/fs.h            |    4
-rw-r--r-- include/linux/relay.h         |  281
-rw-r--r-- include/linux/sched.h         |    1
-rw-r--r-- init/Kconfig                  |   11
-rw-r--r-- kernel/Makefile               |    1
-rw-r--r-- kernel/fork.c                 |    1
-rw-r--r-- kernel/relay.c                | 1012
-rw-r--r-- mm/highmem.c                  |    3
29 files changed, 2221 insertions(+), 1293 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd20e17..96783645092d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,16 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB. Otherwise say N.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	select RELAY
+	select DEBUG_FS
+	help
+	  Say Y here, if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the user space
+	  support tools needed), fetch the blktrace app from:
+
+	  git://brick.kernel.dk/data/git/blktrace.git
+
 source block/Kconfig.iosched
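
The help text above points at the user-space blktrace tool, but the kernel interface this patch adds is just four new ioctls on the block device. As a rough illustration, a minimal user-space driver could look like the sketch below. This is a hypothetical example, not part of the patch: the struct blk_user_trace_setup field names follow the include/linux/blktrace_api.h added by this merge, and the ioctl numbers come from the include/linux/fs.h additions, but treat both as assumptions here.

/* hypothetical sketch, not part of the patch */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>			/* BLKTRACE* ioctls added by this merge */
#include <linux/blktrace_api.h>		/* struct blk_user_trace_setup */

int main(void)
{
	struct blk_user_trace_setup buts;	/* layout assumed from blktrace_api.h */
	int fd = open("/dev/sda", O_RDONLY | O_NONBLOCK);

	if (fd < 0)
		return 1;

	memset(&buts, 0, sizeof(buts));
	buts.buf_size = 512 * 1024;	/* bytes per relay sub-buffer */
	buts.buf_nr = 4;		/* sub-buffers per CPU */
	/* act_mask/start_lba/end_lba/pid left 0: trace everything */

	if (ioctl(fd, BLKTRACESETUP, &buts) < 0)	/* allocate bt + relay channel */
		return 1;
	ioctl(fd, BLKTRACESTART);	/* trace_state -> Blktrace_running */
	sleep(5);			/* per-CPU trace files fill up */
	ioctl(fd, BLKTRACESTOP);	/* stop tracing and flush relay buffers */
	ioctl(fd, BLKTRACETEARDOWN);	/* free trace structures */
	close(fd);
	return 0;
}

BLKTRACESETUP copies the setup struct in and back out (with the device name filled in), BLKTRACESTART/BLKTRACESTOP toggle the running state, and BLKTRACETEARDOWN frees everything, mirroring blk_trace_ioctl() in the new block/blktrace.c below.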
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e2b44e..c05de0e0037f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 000000000000..36f3a172275f
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <asm/uaccess.h>
+
+static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
+static unsigned int blktrace_seq __read_mostly = 1;
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	struct blk_io_trace *t;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
+	if (t) {
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->device = bt->dev;
+		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
+		t->pid = tsk->pid;
+		t->cpu = smp_processor_id();
+		t->pdu_len = sizeof(tsk->comm);
+		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
+		tsk->btrace_seq = blktrace_seq;
+	}
+}
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
+
+/*
+ * Bio action bits of interest
+ */
+static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
+
+/*
+ * More could be added as needed, taking care to increment the decrementer
+ * to get correct indexing
+ */
+#define trace_barrier_bit(rw)	\
+	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
+#define trace_sync_bit(rw)	\
+	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct blk_io_trace *t;
+	unsigned long flags;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu;
+
+	if (unlikely(bt->trace_state != Blktrace_running))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= bio_act[trace_barrier_bit(rw)];
+	what |= bio_act[trace_sync_bit(rw)];
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes. Once reserved, it's
+	 * enough to get preemption disabled to prevent read of this data
+	 * before we are through filling it. get_cpu()/put_cpu() does this
+	 * for us
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		cpu = smp_processor_id();
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->pid = pid;
+		t->device = bt->dev;
+		t->cpu = cpu;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(__blk_add_trace);
+
+static struct dentry *blk_tree_root;
+static struct mutex blk_tree_mutex;
+static unsigned int root_users;
+
+static inline void blk_remove_root(void)
+{
+	if (blk_tree_root) {
+		debugfs_remove(blk_tree_root);
+		blk_tree_root = NULL;
+	}
+}
+
+static void blk_remove_tree(struct dentry *dir)
+{
+	mutex_lock(&blk_tree_mutex);
+	debugfs_remove(dir);
+	if (--root_users == 0)
+		blk_remove_root();
+	mutex_unlock(&blk_tree_mutex);
+}
+
+static struct dentry *blk_create_tree(const char *blk_name)
+{
+	struct dentry *dir = NULL;
+
+	mutex_lock(&blk_tree_mutex);
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			goto err;
+	}
+
+	dir = debugfs_create_dir(blk_name, blk_tree_root);
+	if (dir)
+		root_users++;
+	else
+		blk_remove_root();
+
+err:
+	mutex_unlock(&blk_tree_mutex);
+	return dir;
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	relay_close(bt->rchan);
+	debugfs_remove(bt->dropped_file);
+	blk_remove_tree(bt->dir);
+	free_percpu(bt->sequence);
+	kfree(bt);
+}
+
+static int blk_trace_remove(request_queue_t *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->u.generic_ip;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static struct file_operations blk_dropped_fops = {
+	.owner = THIS_MODULE,
+	.open = blk_dropped_open,
+	.read = blk_dropped_read,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start = blk_subbuf_start_callback,
+	.create_buf_file = blk_create_buf_file_callback,
+	.remove_buf_file = blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
+			   char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	char b[BDEVNAME_SIZE];
+	int ret, i;
+
+	if (copy_from_user(&buts, arg, sizeof(buts)))
+		return -EFAULT;
+
+	if (!buts.buf_size || !buts.buf_nr)
+		return -EINVAL;
+
+	strcpy(buts.name, bdevname(bdev, b));
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts.name); i++)
+		if (buts.name[i] == '/')
+			buts.name[i] = '_';
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	ret = -ENOENT;
+	dir = blk_create_tree(buts.name);
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = bdev->bd_dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
+	if (!bt->rchan)
+		goto err;
+	bt->rchan->private_data = bt;
+
+	bt->act_mask = buts.act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts.start_lba;
+	bt->end_lba = buts.end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts.pid;
+	bt->trace_state = Blktrace_setup;
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+err:
+	if (dir)
+		blk_remove_tree(dir);
+	if (bt) {
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		if (bt->sequence)
+			free_percpu(bt->sequence);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+static int blk_trace_startstop(request_queue_t *q, int start)
+{
+	struct blk_trace *bt;
+	int ret;
+
+	if ((bt = q->blk_trace) == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev: the block device
+ * @cmd: the ioctl cmd
+ * @arg: the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	request_queue_t *q;
+	int ret, start = 0;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		ret = blk_trace_setup(q, bdev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q: the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(request_queue_t *q)
+{
+	blk_trace_startstop(q, 0);
+	blk_trace_remove(q);
+}
+
+/*
+ * Average offset over two calls to sched_clock() with a gettimeofday()
+ * in the middle
+ */
+static void blk_check_time(unsigned long long *t)
+{
+	unsigned long long a, b;
+	struct timeval tv;
+
+	a = sched_clock();
+	do_gettimeofday(&tv);
+	b = sched_clock();
+
+	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+	*t -= (a + b) / 2;
+}
+
+static void blk_trace_check_cpu_time(void *data)
+{
+	unsigned long long *t;
+	int cpu = get_cpu();
+
+	t = &per_cpu(blk_trace_cpu_offset, cpu);
+
+	/*
+	 * Just call it twice, hopefully the second call will be cache hot
+	 * and a little more precise
+	 */
+	blk_check_time(t);
+	blk_check_time(t);
+
+	put_cpu();
+}
+
+/*
+ * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
+ * timings
+ */
+static void blk_trace_calibrate_offsets(void)
+{
+	unsigned long flags;
+
+	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
+	local_irq_save(flags);
+	blk_trace_check_cpu_time(NULL);
+	local_irq_restore(flags);
+}
+
+static void blk_trace_set_ht_offsets(void)
+{
+#if defined(CONFIG_SCHED_SMT)
+	int cpu, i;
+
+	/*
+	 * now make sure HT siblings have the same time offset
+	 */
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		unsigned long long *cpu_off, *sibling_off;
+
+		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
+			if (i == cpu)
+				continue;
+
+			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
+			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
+			*sibling_off = *cpu_off;
+		}
+	}
+	preempt_enable();
+#endif
+}
+
+static __init int blk_trace_init(void)
+{
+	mutex_init(&blk_tree_mutex);
+	blk_trace_calibrate_offsets();
+	blk_trace_set_ht_offsets();
+
+	return 0;
+}
+
+module_init(blk_trace_init);
+
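On the consumer side, every event that __blk_add_trace() reserves is a fixed-size struct blk_io_trace immediately followed by pdu_len bytes of payload; its timestamp is sched_clock() minus the per-CPU offset that blk_check_time() derives by bracketing one do_gettimeofday() between two sched_clock() reads and subtracting their average. A hedged sketch of a user-space reader for one per-CPU relay file follows; the struct layout is assumed to mirror the include/linux/blktrace_api.h added by this merge and is not restated in this file, so take the exact field order and packing as an assumption:

/* hypothetical sketch, not part of the patch */
#include <stdio.h>
#include <stdint.h>

struct blk_io_trace {		/* assumed to mirror blktrace_api.h */
	uint32_t magic;		/* BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION */
	uint32_t sequence;	/* per-CPU event counter */
	uint64_t time;		/* calibrated sched_clock(), in ns */
	uint64_t sector;
	uint32_t bytes;
	uint32_t action;	/* what | BLK_TC_ACT(...) bits */
	uint32_t pid;
	uint32_t device;
	uint32_t cpu;
	uint16_t error;
	uint16_t pdu_len;	/* payload bytes that follow the record */
};

int main(int argc, char **argv)
{
	struct blk_io_trace t;
	FILE *f;

	if (argc < 2)
		return 1;
	f = fopen(argv[1], "r");	/* e.g. <debugfs>/block/sda/trace0 */
	if (!f)
		return 1;

	while (fread(&t, sizeof(t), 1, f) == 1) {
		printf("seq %u cpu %u action %#x sector %llu bytes %u\n",
		       t.sequence, t.cpu, t.action,
		       (unsigned long long)t.sector, t.bytes);
		fseek(f, t.pdu_len, SEEK_CUR);	/* skip per-event payload */
	}
	fclose(f);
	return 0;
}

If the relay sub-buffers fill up, blk_subbuf_start_callback() above drops the event and bumps bt->dropped, so a reader can compare its event count against the per-device "dropped" debugfs file to detect loss.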
diff --git a/block/elevator.c b/block/elevator.c
index db3d0d8296a0..5e558c4689a4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/blktrace_api.h>
 
 #include <asm/uaccess.h>
 
@@ -333,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
 	struct list_head *pos;
 	unsigned ordseq;
 
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+
 	rq->q = q;
 
 	switch (where) {
@@ -499,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
 		 * not be passed by new incoming requests
 		 */
 		rq->flags |= REQ_STARTED;
+		blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 	}
 
 	if (!q->boundary_rq || q->boundary_rq == rq) {
diff --git a/block/ioctl.c b/block/ioctl.c
index 35fdb7dc6512..9cfa2e1ecb24 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	case BLKTRACESTART:
+	case BLKTRACESTOP:
+	case BLKTRACESETUP:
+	case BLKTRACETEARDOWN:
+		return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
 	}
 	return -ENOIOCTLCMD;
 }
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 6c793b196aa9..062067fa7ead 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/blktrace_api.h>
 
 /*
  * for max sense size
@@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
 	/*
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
-	if (q->unplug_fn)
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+					q->rq.count[READ] + q->rq.count[WRITE]);
+
 		q->unplug_fn(q);
+	}
 }
 
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	q->unplug_fn(q);
 }
 
@@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
 {
 	request_queue_t *q = (request_queue_t *)data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	kblockd_schedule_work(&q->unplug_work);
 }
 
@@ -1753,6 +1766,9 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	if (q->blk_trace)
+		blk_trace_shutdown(q);
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2129,6 +2145,8 @@ rq_starved:
 
 	rq_init(q, rq);
 	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
 	return rq;
 }
@@ -2157,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
 	if (!rq) {
 		struct io_context *ioc;
 
+		blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
 		__generic_unplug_device(q);
 		spin_unlock_irq(q->queue_lock);
 		io_schedule();
@@ -2210,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
@@ -2844,6 +2866,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->back_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2859,6 +2883,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->front_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
 			bio->bi_next = req->bio;
 			req->bio = bio;
 
@@ -2976,6 +3002,7 @@ void generic_make_request(struct bio *bio)
 	request_queue_t *q;
 	sector_t maxsector;
 	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
 
 	might_sleep();
 	/* Test device or partition size, when known. */
@@ -3002,6 +3029,8 @@ void generic_make_request(struct bio *bio)
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
+	maxsector = -1;
+	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
 
@@ -3034,6 +3063,15 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
@@ -3153,6 +3191,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
 	 */
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e29b8926f80e..1f2890989b56 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -38,6 +38,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2331,6 +2332,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 
 	cmd->rq->completion_data = cmd;
 	cmd->rq->errors = status;
+	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
 	blk_complete_request(cmd->rq);
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 26b08ee425c7..8c82373f7ff3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -17,6 +17,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
 
@@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io, int error)
 		/* nudge anyone waiting on suspend queue */
 		wake_up(&io->md->wait);
 
+		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+
 		bio_endio(io->bio, io->bio->bi_size, io->error);
 		free_io(io->md, io);
 	}
@@ -392,6 +395,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		      struct target_io *tio)
 {
 	int r;
+	sector_t sector;
 
 	/*
 	 * Sanity checks.
@@ -407,10 +411,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	 * this io.
 	 */
 	atomic_inc(&tio->io->io_count);
+	sector = clone->bi_sector;
 	r = ti->type->map(ti, clone, &tio->info);
-	if (r > 0)
+	if (r > 0) {
 		/* the bio has been remapped so dispatch it */
+
+		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+				    tio->io->bio->bi_bdev->bd_dev, sector,
+				    clone->bi_sector);
+
 		generic_make_request(clone);
+	}
 
 	else if (r < 0) {
 		/* error the io and bail out */
diff --git a/fs/Kconfig b/fs/Kconfig
index e9749b0eecd8..c8d0a209120c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -859,18 +859,6 @@ config RAMFS
 	  To compile this as a module, choose M here: the module will be called
 	  ramfs.
 
-config RELAYFS_FS
-	tristate "Relayfs file system support"
-	---help---
-	  Relayfs is a high-speed data relay filesystem designed to provide
-	  an efficient mechanism for tools and facilities to relay large
-	  amounts of data from kernel space to user space.
-
-	  To compile this code as a module, choose M here: the module will be
-	  called relayfs.
-
-	  If unsure, say N.
-
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 1db711319c80..080b3867be4d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -91,7 +91,6 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_UDF_FS)		+= udf/
-obj-$(CONFIG_RELAYFS_FS)	+= relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
diff --git a/fs/bio.c b/fs/bio.c
index 8f1d2e815c96..0a8c59cb68f5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
@@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	if (!bp)
 		return bp;
 
+	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+				bi->bi_sector + first_sectors);
+
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c666769a875d..7c031f00fd79 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -72,6 +72,7 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
+#include <linux/blktrace_api.h>
 
 #include <net/sock.h>		/* siocdevprivate_ioctl */
 #include <net/bluetooth/bluetooth.h>
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
deleted file mode 100644
index e76e182cdb38..000000000000
--- a/fs/relayfs/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-obj-$(CONFIG_RELAYFS_FS) += relayfs.o
-
-relayfs-y := relay.o inode.o buffers.o
-
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
deleted file mode 100644
index 10187812771e..000000000000
--- a/fs/relayfs/buffers.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * RelayFS buffer management code.
- *
- * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
- * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
- *
- * This file is released under the GPL.
- */
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-/*
- * close() vm_op implementation for relayfs file mapping.
- */
-static void relay_file_mmap_close(struct vm_area_struct *vma)
-{
-	struct rchan_buf *buf = vma->vm_private_data;
-	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
-}
-
-/*
- * nopage() vm_op implementation for relayfs file mapping.
- */
-static struct page *relay_buf_nopage(struct vm_area_struct *vma,
-				     unsigned long address,
-				     int *type)
-{
-	struct page *page;
-	struct rchan_buf *buf = vma->vm_private_data;
-	unsigned long offset = address - vma->vm_start;
-
-	if (address > vma->vm_end)
-		return NOPAGE_SIGBUS; /* Disallow mremap */
-	if (!buf)
-		return NOPAGE_OOM;
-
-	page = vmalloc_to_page(buf->start + offset);
-	if (!page)
-		return NOPAGE_OOM;
-	get_page(page);
-
-	if (type)
-		*type = VM_FAULT_MINOR;
-
-	return page;
-}
-
-/*
- * vm_ops for relay file mappings.
- */
-static struct vm_operations_struct relay_file_mmap_ops = {
-	.nopage = relay_buf_nopage,
-	.close = relay_file_mmap_close,
-};
-
-/**
- * relay_mmap_buf: - mmap channel buffer to process address space
- * @buf: relay channel buffer
- * @vma: vm_area_struct describing memory to be mapped
- *
- * Returns 0 if ok, negative on error
- *
- * Caller should already have grabbed mmap_sem.
- */
-int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
-{
-	unsigned long length = vma->vm_end - vma->vm_start;
-	struct file *filp = vma->vm_file;
-
-	if (!buf)
-		return -EBADF;
-
-	if (length != (unsigned long)buf->chan->alloc_size)
-		return -EINVAL;
-
-	vma->vm_ops = &relay_file_mmap_ops;
-	vma->vm_private_data = buf;
-	buf->chan->cb->buf_mapped(buf, filp);
-
-	return 0;
-}
-
-/**
- * relay_alloc_buf - allocate a channel buffer
- * @buf: the buffer struct
- * @size: total size of the buffer
- *
- * Returns a pointer to the resulting buffer, NULL if unsuccessful
- */
-static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
-{
-	void *mem;
-	unsigned int i, j, n_pages;
-
-	size = PAGE_ALIGN(size);
-	n_pages = size >> PAGE_SHIFT;
-
-	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
-	if (!buf->page_array)
-		return NULL;
-
-	for (i = 0; i < n_pages; i++) {
-		buf->page_array[i] = alloc_page(GFP_KERNEL);
-		if (unlikely(!buf->page_array[i]))
-			goto depopulate;
-	}
-	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
-	if (!mem)
-		goto depopulate;
-
-	memset(mem, 0, size);
-	buf->page_count = n_pages;
-	return mem;
-
-depopulate:
-	for (j = 0; j < i; j++)
-		__free_page(buf->page_array[j]);
-	kfree(buf->page_array);
-	return NULL;
-}
-
-/**
- * relay_create_buf - allocate and initialize a channel buffer
- * @alloc_size: size of the buffer to allocate
- * @n_subbufs: number of sub-buffers in the channel
- *
- * Returns channel buffer if successful, NULL otherwise
- */
-struct rchan_buf *relay_create_buf(struct rchan *chan)
-{
-	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
-	if (!buf)
-		return NULL;
-
-	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
-	if (!buf->padding)
-		goto free_buf;
-
-	buf->start = relay_alloc_buf(buf, chan->alloc_size);
-	if (!buf->start)
-		goto free_buf;
-
-	buf->chan = chan;
-	kref_get(&buf->chan->kref);
-	return buf;
-
-free_buf:
-	kfree(buf->padding);
-	kfree(buf);
-	return NULL;
-}
-
-/**
- * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
- * @buf: the buffer struct
- */
-void relay_destroy_buf(struct rchan_buf *buf)
-{
-	struct rchan *chan = buf->chan;
-	unsigned int i;
-
-	if (likely(buf->start)) {
-		vunmap(buf->start);
-		for (i = 0; i < buf->page_count; i++)
-			__free_page(buf->page_array[i]);
-		kfree(buf->page_array);
-	}
-	kfree(buf->padding);
-	kfree(buf);
-	kref_put(&chan->kref, relay_destroy_channel);
-}
-
-/**
- * relay_remove_buf - remove a channel buffer
- *
- * Removes the file from the relayfs fileystem, which also frees the
- * rchan_buf_struct and the channel buffer. Should only be called from
- * kref_put().
- */
-void relay_remove_buf(struct kref *kref)
-{
-	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-	buf->chan->cb->remove_buf_file(buf->dentry);
-	relay_destroy_buf(buf);
-}
diff --git a/fs/relayfs/buffers.h b/fs/relayfs/buffers.h
deleted file mode 100644
index 37a12493f641..000000000000
--- a/fs/relayfs/buffers.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _BUFFERS_H
-#define _BUFFERS_H
-
-/* This inspired by rtai/shmem */
-#define FIX_SIZE(x) (((x) - 1) & PAGE_MASK) + PAGE_SIZE
-
-extern int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma);
-extern struct rchan_buf *relay_create_buf(struct rchan *chan);
-extern void relay_destroy_buf(struct rchan_buf *buf);
-extern void relay_remove_buf(struct kref *kref);
-
-#endif/* _BUFFERS_H */
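
The FIX_SIZE() macro removed above rounds a byte count up to a whole number of pages. A standalone demonstration of the arithmetic (hypothetical demo, not part of the patch; PAGE_SIZE is hard-coded to 4096 purely for the example, and the outer parentheses are added for safety):

/* hypothetical demo, not part of the patch */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define FIX_SIZE(x)	((((x) - 1) & PAGE_MASK) + PAGE_SIZE)

int main(void)
{
	unsigned long sizes[] = { 1, 4095, 4096, 4097 };
	int i;

	for (i = 0; i < 4; i++)	/* prints 4096, 4096, 4096, 8192 */
		printf("FIX_SIZE(%lu) = %lu\n", sizes[i], FIX_SIZE(sizes[i]));
	return 0;
}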
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
deleted file mode 100644
index 383523011aad..000000000000
--- a/fs/relayfs/inode.c
+++ /dev/null
@@ -1,581 +0,0 @@
-/*
- * VFS-related code for RelayFS, a high-speed data relay filesystem.
- *
- * Copyright (C) 2003-2005 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
- * Copyright (C) 2003-2005 - Karim Yaghmour <karim@opersys.com>
- *
- * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
- *
- * This file is released under the GPL.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/backing-dev.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-#define RELAYFS_MAGIC 0xF0B4A981
-
-static struct vfsmount * relayfs_mount;
-static int relayfs_mount_count;
-
-static struct backing_dev_info relayfs_backing_dev_info = {
-	.ra_pages = 0,	/* No readahead */
-	.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
-};
-
-static struct inode *relayfs_get_inode(struct super_block *sb,
-					int mode,
-					struct file_operations *fops,
-					void *data)
-{
-	struct inode *inode;
-
-	inode = new_inode(sb);
-	if (!inode)
-		return NULL;
-
-	inode->i_mode = mode;
-	inode->i_uid = 0;
-	inode->i_gid = 0;
-	inode->i_blksize = PAGE_CACHE_SIZE;
-	inode->i_blocks = 0;
-	inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-		inode->i_fop = fops;
-		if (data)
-			inode->u.generic_ip = data;
-		break;
-	case S_IFDIR:
-		inode->i_op = &simple_dir_inode_operations;
-		inode->i_fop = &simple_dir_operations;
-
-		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
-		break;
-	default:
-		break;
-	}
-
-	return inode;
-}
-
-/**
- * relayfs_create_entry - create a relayfs directory or file
- * @name: the name of the file to create
- * @parent: parent directory
- * @mode: mode
- * @fops: file operations to use for the file
- * @data: user-associated data for this file
- *
- * Returns the new dentry, NULL on failure
- *
- * Creates a file or directory with the specifed permissions.
- */
-static struct dentry *relayfs_create_entry(const char *name,
-					   struct dentry *parent,
-					   int mode,
-					   struct file_operations *fops,
-					   void *data)
-{
-	struct dentry *d;
-	struct inode *inode;
-	int error = 0;
-
-	BUG_ON(!name || !(S_ISREG(mode) || S_ISDIR(mode)));
-
-	error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
-	if (error) {
-		printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
-		return NULL;
-	}
-
-	if (!parent && relayfs_mount && relayfs_mount->mnt_sb)
-		parent = relayfs_mount->mnt_sb->s_root;
-
-	if (!parent) {
-		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-		return NULL;
-	}
-
-	parent = dget(parent);
-	mutex_lock(&parent->d_inode->i_mutex);
-	d = lookup_one_len(name, parent, strlen(name));
-	if (IS_ERR(d)) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	if (d->d_inode) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data);
-	if (!inode) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	d_instantiate(d, inode);
-	dget(d);	/* Extra count - pin the dentry in core */
-
-	if (S_ISDIR(mode))
-		parent->d_inode->i_nlink++;
-
-	goto exit;
-
-release_mount:
-	simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-
-exit:
-	mutex_unlock(&parent->d_inode->i_mutex);
-	dput(parent);
-	return d;
-}
-
-/**
- * relayfs_create_file - create a file in the relay filesystem
- * @name: the name of the file to create
- * @parent: parent directory
- * @mode: mode, if not specied the default perms are used
- * @fops: file operations to use for the file
- * @data: user-associated data for this file
- *
- * Returns file dentry if successful, NULL otherwise.
- *
- * The file will be created user r on behalf of current user.
- */
-struct dentry *relayfs_create_file(const char *name,
-				   struct dentry *parent,
-				   int mode,
-				   struct file_operations *fops,
-				   void *data)
-{
-	BUG_ON(!fops);
-
-	if (!mode)
-		mode = S_IRUSR;
-	mode = (mode & S_IALLUGO) | S_IFREG;
-
-	return relayfs_create_entry(name, parent, mode, fops, data);
-}
-
-/**
- * relayfs_create_dir - create a directory in the relay filesystem
- * @name: the name of the directory to create
- * @parent: parent directory, NULL if parent should be fs root
- *
- * Returns directory dentry if successful, NULL otherwise.
- *
- * The directory will be created world rwx on behalf of current user.
- */
-struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
-{
-	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	return relayfs_create_entry(name, parent, mode, NULL, NULL);
-}
-
-/**
- * relayfs_remove - remove a file or directory in the relay filesystem
- * @dentry: file or directory dentry
- *
- * Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove(struct dentry *dentry)
-{
-	struct dentry *parent;
-	int error = 0;
-
-	if (!dentry)
-		return -EINVAL;
-	parent = dentry->d_parent;
-	if (!parent)
-		return -EINVAL;
-
-	parent = dget(parent);
-	mutex_lock(&parent->d_inode->i_mutex);
-	if (dentry->d_inode) {
-		if (S_ISDIR(dentry->d_inode->i_mode))
-			error = simple_rmdir(parent->d_inode, dentry);
-		else
-			error = simple_unlink(parent->d_inode, dentry);
-		if (!error)
-			d_delete(dentry);
-	}
-	if (!error)
-		dput(dentry);
-	mutex_unlock(&parent->d_inode->i_mutex);
-	dput(parent);
-
-	if (!error)
-		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-
-	return error;
-}
-
-/**
- * relayfs_remove_file - remove a file from relay filesystem
- * @dentry: directory dentry
- *
- * Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove_file(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/**
- * relayfs_remove_dir - remove a directory in the relay filesystem
- * @dentry: directory dentry
- *
- * Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove_dir(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/**
- * relay_file_open - open file op for relay files
- * @inode: the inode
- * @filp: the file
- *
- * Increments the channel buffer refcount.
- */
-static int relay_file_open(struct inode *inode, struct file *filp)
-{
-	struct rchan_buf *buf = inode->u.generic_ip;
-	kref_get(&buf->kref);
-	filp->private_data = buf;
-
-	return 0;
-}
-
-/**
- * relay_file_mmap - mmap file op for relay files
- * @filp: the file
- * @vma: the vma describing what to map
- *
- * Calls upon relay_mmap_buf to map the file into user space.
- */
-static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-	struct rchan_buf *buf = filp->private_data;
-	return relay_mmap_buf(buf, vma);
-}
-
-/**
- * relay_file_poll - poll file op for relay files
- * @filp: the file
- * @wait: poll table
- *
- * Poll implemention.
- */
-static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
-{
-	unsigned int mask = 0;
-	struct rchan_buf *buf = filp->private_data;
-
-	if (buf->finalized)
-		return POLLERR;
-
-	if (filp->f_mode & FMODE_READ) {
-		poll_wait(filp, &buf->read_wait, wait);
-		if (!relay_buf_empty(buf))
-			mask |= POLLIN | POLLRDNORM;
-	}
-
-	return mask;
-}
-
-/**
- * relay_file_release - release file op for relay files
- * @inode: the inode
- * @filp: the file
- *
- * Decrements the channel refcount, as the filesystem is
- * no longer using it.
- */
-static int relay_file_release(struct inode *inode, struct file *filp)
-{
-	struct rchan_buf *buf = filp->private_data;
-	kref_put(&buf->kref, relay_remove_buf);
-
-	return 0;
-}
-
-/**
- * relay_file_read_consume - update the consumed count for the buffer
- */
-static void relay_file_read_consume(struct rchan_buf *buf,
-				    size_t read_pos,
-				    size_t bytes_consumed)
-{
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t read_subbuf;
-
-	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
-		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
-		buf->bytes_consumed = 0;
-	}
-
-	buf->bytes_consumed += bytes_consumed;
-	read_subbuf = read_pos / buf->chan->subbuf_size;
-	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
-		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
-		    (buf->offset == subbuf_size))
-			return;
-		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
-		buf->bytes_consumed = 0;
-	}
-}
-
-/**
- * relay_file_read_avail - boolean, are there unconsumed bytes available?
- */
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
-{
-	size_t bytes_produced, bytes_consumed, write_offset;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t produced = buf->subbufs_produced % n_subbufs;
-	size_t consumed = buf->subbufs_consumed % n_subbufs;
-
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
-
-	if (consumed > produced) {
-		if ((produced > n_subbufs) &&
-		    (produced + n_subbufs - consumed <= n_subbufs))
-			produced += n_subbufs;
-	} else if (consumed == produced) {
-		if (buf->offset > subbuf_size) {
-			produced += n_subbufs;
-			if (buf->subbufs_produced == buf->subbufs_consumed)
-				consumed += n_subbufs;
-		}
-	}
-
-	if (buf->offset > subbuf_size)
-		bytes_produced = (produced - 1) * subbuf_size + write_offset;
-	else
-		bytes_produced = produced * subbuf_size + write_offset;
-	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
-
-	if (bytes_produced == bytes_consumed)
-		return 0;
-
-	relay_file_read_consume(buf, read_pos, 0);
-
-	return 1;
-}
-
-/**
- * relay_file_read_subbuf_avail - return bytes available in sub-buffer
- */
-static size_t relay_file_read_subbuf_avail(size_t read_pos,
-					   struct rchan_buf *buf)
-{
-	size_t padding, avail = 0;
-	size_t read_subbuf, read_offset, write_subbuf, write_offset;
-	size_t subbuf_size = buf->chan->subbuf_size;
-
-	write_subbuf = (buf->data - buf->start) / subbuf_size;
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
-	read_subbuf = read_pos / subbuf_size;
-	read_offset = read_pos % subbuf_size;
-	padding = buf->padding[read_subbuf];
-
-	if (read_subbuf == write_subbuf) {
-		if (read_offset + padding < write_offset)
-			avail = write_offset - (read_offset + padding);
-	} else
-		avail = (subbuf_size - padding) - read_offset;
-
-	return avail;
-}
-
-/**
- * relay_file_read_start_pos - find the first available byte to read
- *
- * If the read_pos is in the middle of padding, return the
- * position of the first actually available byte, otherwise
- * return the original value.
- */
-static size_t relay_file_read_start_pos(size_t read_pos,
-					struct rchan_buf *buf)
-{
-	size_t read_subbuf, padding, padding_start, padding_end;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-
-	read_subbuf = read_pos / subbuf_size;
-	padding = buf->padding[read_subbuf];
-	padding_start = (read_subbuf + 1) * subbuf_size - padding;
-	padding_end = (read_subbuf + 1) * subbuf_size;
-	if (read_pos >= padding_start && read_pos < padding_end) {
-		read_subbuf = (read_subbuf + 1) % n_subbufs;
-		read_pos = read_subbuf * subbuf_size;
-	}
-
-	return read_pos;
-}
-
-/**
- * relay_file_read_end_pos - return the new read position
- */
-static size_t relay_file_read_end_pos(struct rchan_buf *buf,
-				      size_t read_pos,
-				      size_t count)
-{
-	size_t read_subbuf, padding, end_pos;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-
-	read_subbuf = read_pos / subbuf_size;
-	padding = buf->padding[read_subbuf];
-	if (read_pos % subbuf_size + count + padding == subbuf_size)
-		end_pos = (read_subbuf + 1) * subbuf_size;
-	else
-		end_pos = read_pos + count;
-	if (end_pos >= subbuf_size * n_subbufs)
-		end_pos = 0;
-
-	return end_pos;
-}
-
-/**
- * relay_file_read - read file op for relay files
- * @filp: the file
- * @buffer: the userspace buffer
- * @count: number of bytes to read
- * @ppos: position to read from
- *
- * Reads count bytes or the number of bytes available in the
- * current sub-buffer being read, whichever is smaller.
- */
-static ssize_t relay_file_read(struct file *filp,
-			       char __user *buffer,
-			       size_t count,
-			       loff_t *ppos)
-{
-	struct rchan_buf *buf = filp->private_data;
-	struct inode *inode = filp->f_dentry->d_inode;
-	size_t read_start, avail;
-	ssize_t ret = 0;
-	void *from;
-
-	mutex_lock(&inode->i_mutex);
-	if(!relay_file_read_avail(buf, *ppos))
-		goto out;
-
-	read_start = relay_file_read_start_pos(*ppos, buf);
-	avail = relay_file_read_subbuf_avail(read_start, buf);
-	if (!avail)
-		goto out;
-
-	from = buf->start + read_start;
-	ret = count = min(count, avail);
-	if (copy_to_user(buffer, from, count)) {
-		ret = -EFAULT;
-		goto out;
-	}
-	relay_file_read_consume(buf, read_start, count);
-	*ppos = relay_file_read_end_pos(buf, read_start, count);
-out:
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-struct file_operations relay_file_operations = {
-	.open = relay_file_open,
-	.poll = relay_file_poll,
-	.mmap = relay_file_mmap,
-	.read = relay_file_read,
-	.llseek = no_llseek,
-	.release = relay_file_release,
-};
-
-static struct super_operations relayfs_ops = {
-	.statfs = simple_statfs,
-	.drop_inode = generic_delete_inode,
-};
-
-static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
-{
-	struct inode *inode;
-	struct dentry *root;
-	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = RELAYFS_MAGIC;
-	sb->s_op = &relayfs_ops;
-	inode = relayfs_get_inode(sb, mode, NULL, NULL);
-
-	if (!inode)
-		return -ENOMEM;
-
-	root = d_alloc_root(inode);
-	if (!root) {
-		iput(inode);
-		return -ENOMEM;
-	}
-	sb->s_root = root;
-
-	return 0;
-}
-
-static struct super_block * relayfs_get_sb(struct file_system_type *fs_type,
-					   int flags, const char *dev_name,
-					   void *data)
-{
-	return get_sb_single(fs_type, flags, data, relayfs_fill_super);
-}
-
-static struct file_system_type relayfs_fs_type = {
-	.owner = THIS_MODULE,
-	.name = "relayfs",
-	.get_sb = relayfs_get_sb,
-	.kill_sb = kill_litter_super,
-};
-
-static int __init init_relayfs_fs(void)
-{
-	return register_filesystem(&relayfs_fs_type);
-}
-
-static void __exit exit_relayfs_fs(void)
-{
-
-
-
-
-
-	unregister_filesystem(&relayfs_fs_type);
-}
-
-module_init(init_relayfs_fs)
-module_exit(exit_relayfs_fs)
-
-EXPORT_SYMBOL_GPL(relay_file_operations);
-EXPORT_SYMBOL_GPL(relayfs_create_dir);
-EXPORT_SYMBOL_GPL(relayfs_remove_dir);
-EXPORT_SYMBOL_GPL(relayfs_create_file);
-EXPORT_SYMBOL_GPL(relayfs_remove_file);
-
-MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
-MODULE_DESCRIPTION("Relay Filesystem");
-MODULE_LICENSE("GPL");
-
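The read-side helpers deleted above (relay_file_read_start_pos() and friends) move to kernel/relay.c essentially unchanged by this merge; their job is to step a reader over the padding a writer leaves at the tail of each sub-buffer. A small self-contained model of that skip logic follows (hypothetical demo; the sizes and padding values are made up):

/* hypothetical demo of the padding-skip arithmetic, not part of the patch */
#include <stdio.h>
#include <stddef.h>

#define SUBBUF_SIZE	16
#define N_SUBBUFS	4

static size_t read_start_pos(size_t read_pos, const size_t *padding)
{
	size_t read_subbuf = read_pos / SUBBUF_SIZE;
	size_t padding_start = (read_subbuf + 1) * SUBBUF_SIZE - padding[read_subbuf];
	size_t padding_end = (read_subbuf + 1) * SUBBUF_SIZE;

	/* inside the padded tail? jump to the next sub-buffer's start */
	if (read_pos >= padding_start && read_pos < padding_end)
		return ((read_subbuf + 1) % N_SUBBUFS) * SUBBUF_SIZE;
	return read_pos;
}

int main(void)
{
	size_t padding[N_SUBBUFS] = { 4, 0, 8, 0 };	/* unused tail bytes */

	/* offset 13 falls in sub-buffer 0's 4-byte padding [12..16) */
	printf("%zu\n", read_start_pos(13, padding));	/* prints 16 */
	/* offset 5 is real data, returned unchanged */
	printf("%zu\n", read_start_pos(5, padding));	/* prints 5 */
	return 0;
}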
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
deleted file mode 100644
index abf3ceaace49..000000000000
--- a/fs/relayfs/relay.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * Public API and common code for RelayFS.
- *
- * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
- *
- * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
- * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
- *
- * This file is released under the GPL.
- */
-
-#include <linux/errno.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-/**
- * relay_buf_empty - boolean, is the channel buffer empty?
- * @buf: channel buffer
- *
- * Returns 1 if the buffer is empty, 0 otherwise.
- */
-int relay_buf_empty(struct rchan_buf *buf)
-{
-	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
-}
-
-/**
- * relay_buf_full - boolean, is the channel buffer full?
- * @buf: channel buffer
- *
- * Returns 1 if the buffer is full, 0 otherwise.
- */
-int relay_buf_full(struct rchan_buf *buf)
-{
-	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
-	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
-}
-
-/*
- * High-level relayfs kernel API and associated functions.
- */
-
-/*
- * rchan_callback implementations defining default channel behavior. Used
- * in place of corresponding NULL values in client callback struct.
- */
-
-/*
- * subbuf_start() default callback. Does nothing.
- */
-static int subbuf_start_default_callback (struct rchan_buf *buf,
-					  void *subbuf,
-					  void *prev_subbuf,
-					  size_t prev_padding)
-{
-	if (relay_buf_full(buf))
-		return 0;
-
-	return 1;
-}
-
-/*
- * buf_mapped() default callback. Does nothing.
- */
-static void buf_mapped_default_callback(struct rchan_buf *buf,
-					struct file *filp)
-{
-}
-
-/*
- * buf_unmapped() default callback. Does nothing.
- */
-static void buf_unmapped_default_callback(struct rchan_buf *buf,
-					  struct file *filp)
-{
-}
-
-/*
- * create_buf_file_create() default callback. Creates file to represent buf.
- */
-static struct dentry *create_buf_file_default_callback(const char *filename,
-						       struct dentry *parent,
-						       int mode,
-						       struct rchan_buf *buf,
-						       int *is_global)
-{
-	return relayfs_create_file(filename, parent, mode,
-				   &relay_file_operations, buf);
-}
-
-/*
- * remove_buf_file() default callback. Removes file representing relay buffer.
- */
-static int remove_buf_file_default_callback(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/* relay channel default callbacks */
-static struct rchan_callbacks default_channel_callbacks = {
-	.subbuf_start = subbuf_start_default_callback,
-	.buf_mapped = buf_mapped_default_callback,
-	.buf_unmapped = buf_unmapped_default_callback,
-	.create_buf_file = create_buf_file_default_callback,
-	.remove_buf_file = remove_buf_file_default_callback,
-};
-
-/**
- * wakeup_readers - wake up readers waiting on a channel
- * @private: the channel buffer
- *
- * This is the work function used to defer reader waking. The
- * reason waking is deferred is that calling directly from write
- * causes problems if you're writing from say the scheduler.
- */
-static void wakeup_readers(void *private)
-{
-	struct rchan_buf *buf = private;
-	wake_up_interruptible(&buf->read_wait);
-}
-
-/**
- * __relay_reset - reset a channel buffer
- * @buf: the channel buffer
- * @init: 1 if this is a first-time initialization
- *
- * See relay_reset for description of effect.
- */
-static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
-{
-	size_t i;
-
-	if (init) {
-		init_waitqueue_head(&buf->read_wait);
-		kref_init(&buf->kref);
-		INIT_WORK(&buf->wake_readers, NULL, NULL);
-	} else {
-		cancel_delayed_work(&buf->wake_readers);
-		flush_scheduled_work();
-	}
-
-	buf->subbufs_produced = 0;
-	buf->subbufs_consumed = 0;
-	buf->bytes_consumed = 0;
-	buf->finalized = 0;
-	buf->data = buf->start;
-	buf->offset = 0;
-
-	for (i = 0; i < buf->chan->n_subbufs; i++)
-		buf->padding[i] = 0;
-
-	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
-}
-
-/**
- * relay_reset - reset the channel
- * @chan: the channel
- *
- * This has the effect of erasing all data from all channel buffers
- * and restarting the channel in its initial state. The buffers
- * are not freed, so any mappings are still in effect.
- *
- * NOTE: Care should be taken that the channel isn't actually
- * being used by anything when this call is made.
- */
-void relay_reset(struct rchan *chan)
-{
-	unsigned int i;
-	struct rchan_buf *prev = NULL;
-
-	if (!chan)
-		return;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i] || chan->buf[i] == prev)
-			break;
-		__relay_reset(chan->buf[i], 0);
-		prev = chan->buf[i];
-	}
-}
-
-/**
- * relay_open_buf - create a new channel buffer in relayfs
- *
- * Internal - used by relay_open().
- */
-static struct rchan_buf *relay_open_buf(struct rchan *chan,
-					const char *filename,
-					struct dentry *parent,
-					int *is_global)
-{
-	struct rchan_buf *buf;
-	struct dentry *dentry;
-
-	if (*is_global)
-		return chan->buf[0];
-
-	buf = relay_create_buf(chan);
-	if (!buf)
-		return NULL;
-
-	/* Create file in fs */
-	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
-					   buf, is_global);
-	if (!dentry) {
-		relay_destroy_buf(buf);
-		return NULL;
-	}
-
-	buf->dentry = dentry;
-	__relay_reset(buf, 1);
-
-	return buf;
-}
-
-/**
- * relay_close_buf - close a channel buffer
- * @buf: channel buffer
- *
- * Marks the buffer finalized and restores the default callbacks.
- * The channel buffer and channel buffer data structure are then freed
- * automatically when the last reference is given up.
- */
-static inline void relay_close_buf(struct rchan_buf *buf)
-{
-	buf->finalized = 1;
-	buf->chan->cb = &default_channel_callbacks;
-	cancel_delayed_work(&buf->wake_readers);
-	flush_scheduled_work();
-	kref_put(&buf->kref, relay_remove_buf);
-}
-
-static inline void setup_callbacks(struct rchan *chan,
-				   struct rchan_callbacks *cb)
-{
-	if (!cb) {
-		chan->cb = &default_channel_callbacks;
-		return;
-	}
-
-	if (!cb->subbuf_start)
-		cb->subbuf_start = subbuf_start_default_callback;
-	if (!cb->buf_mapped)
-		cb->buf_mapped = buf_mapped_default_callback;
-	if (!cb->buf_unmapped)
-		cb->buf_unmapped = buf_unmapped_default_callback;
-	if (!cb->create_buf_file)
-		cb->create_buf_file = create_buf_file_default_callback;
-	if (!cb->remove_buf_file)
-		cb->remove_buf_file = remove_buf_file_default_callback;
-	chan->cb = cb;
-}
-
-/**
- * relay_open - create a new relayfs channel
- * @base_filename: base name of files to create
- * @parent: dentry of parent directory, NULL for root directory
- * @subbuf_size: size of sub-buffers
- * @n_subbufs: number of sub-buffers
- * @cb: client callback functions
- *
- * Returns channel pointer if successful, NULL otherwise.
- *
- * Creates a channel buffer for each cpu using the sizes and
270 * attributes specified. The created channel buffer files
271 * will be named base_filename0...base_filenameN-1. File
272 * permissions will be S_IRUSR.
273 */
274struct rchan *relay_open(const char *base_filename,
275 struct dentry *parent,
276 size_t subbuf_size,
277 size_t n_subbufs,
278 struct rchan_callbacks *cb)
279{
280 unsigned int i;
281 struct rchan *chan;
282 char *tmpname;
283 int is_global = 0;
284
285 if (!base_filename)
286 return NULL;
287
288 if (!(subbuf_size && n_subbufs))
289 return NULL;
290
291 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
292 if (!chan)
293 return NULL;
294
295 chan->version = RELAYFS_CHANNEL_VERSION;
296 chan->n_subbufs = n_subbufs;
297 chan->subbuf_size = subbuf_size;
298 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
299 setup_callbacks(chan, cb);
300 kref_init(&chan->kref);
301
302 tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
303 if (!tmpname)
304 goto free_chan;
305
306 for_each_online_cpu(i) {
307 sprintf(tmpname, "%s%d", base_filename, i);
308 chan->buf[i] = relay_open_buf(chan, tmpname, parent,
309 &is_global);
310		if (!chan->buf[i])
311			goto free_bufs;
312		chan->buf[i]->cpu = i;
313 }
314
315 kfree(tmpname);
316 return chan;
317
318free_bufs:
319 for (i = 0; i < NR_CPUS; i++) {
320 if (!chan->buf[i])
321 break;
322 relay_close_buf(chan->buf[i]);
323 if (is_global)
324 break;
325 }
326 kfree(tmpname);
327
328free_chan:
329 kref_put(&chan->kref, relay_destroy_channel);
330 return NULL;
331}
332
333/**
334 * relay_switch_subbuf - switch to a new sub-buffer
335 * @buf: channel buffer
336 * @length: size of current event
337 *
338 * Returns either the length passed in or 0 if full.
339 *
340 * Performs sub-buffer-switch tasks such as invoking callbacks,
341 * updating padding counts, waking up readers, etc.
342 */
343size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
344{
345 void *old, *new;
346 size_t old_subbuf, new_subbuf;
347
348 if (unlikely(length > buf->chan->subbuf_size))
349 goto toobig;
350
351 if (buf->offset != buf->chan->subbuf_size + 1) {
352 buf->prev_padding = buf->chan->subbuf_size - buf->offset;
353 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
354 buf->padding[old_subbuf] = buf->prev_padding;
355 buf->subbufs_produced++;
356 if (waitqueue_active(&buf->read_wait)) {
357 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
358 schedule_delayed_work(&buf->wake_readers, 1);
359 }
360 }
361
362 old = buf->data;
363 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
364 new = buf->start + new_subbuf * buf->chan->subbuf_size;
365 buf->offset = 0;
366 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
367 buf->offset = buf->chan->subbuf_size + 1;
368 return 0;
369 }
370 buf->data = new;
371 buf->padding[new_subbuf] = 0;
372
373 if (unlikely(length + buf->offset > buf->chan->subbuf_size))
374 goto toobig;
375
376 return length;
377
378toobig:
379 buf->chan->last_toobig = length;
380 return 0;
381}
382
383/**
384 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
385 * @chan: the channel
386 * @cpu: the cpu associated with the channel buffer to update
387 * @subbufs_consumed: number of sub-buffers to add to current buf's count
388 *
389 * Adds to the channel buffer's consumed sub-buffer count.
390 * subbufs_consumed should be the number of sub-buffers newly consumed,
391 * not the total consumed.
392 *
393 * NOTE: kernel clients don't need to call this function if the channel
394 * mode is 'overwrite'.
395 */
396void relay_subbufs_consumed(struct rchan *chan,
397 unsigned int cpu,
398 size_t subbufs_consumed)
399{
400 struct rchan_buf *buf;
401
402 if (!chan)
403 return;
404
405 if (cpu >= NR_CPUS || !chan->buf[cpu])
406 return;
407
408 buf = chan->buf[cpu];
409 buf->subbufs_consumed += subbufs_consumed;
410 if (buf->subbufs_consumed > buf->subbufs_produced)
411 buf->subbufs_consumed = buf->subbufs_produced;
412}
413
414/**
415 * relay_destroy_channel - free the channel struct
416 *
417 * Should only be called from kref_put().
418 */
419void relay_destroy_channel(struct kref *kref)
420{
421 struct rchan *chan = container_of(kref, struct rchan, kref);
422 kfree(chan);
423}
424
425/**
426 * relay_close - close the channel
427 * @chan: the channel
428 *
429 * Closes all channel buffers and frees the channel.
430 */
431void relay_close(struct rchan *chan)
432{
433 unsigned int i;
434 struct rchan_buf *prev = NULL;
435
436 if (!chan)
437 return;
438
439 for (i = 0; i < NR_CPUS; i++) {
440 if (!chan->buf[i] || chan->buf[i] == prev)
441 break;
442 relay_close_buf(chan->buf[i]);
443 prev = chan->buf[i];
444 }
445
446 if (chan->last_toobig)
447 printk(KERN_WARNING "relayfs: one or more items not logged "
448 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
449 chan->last_toobig, chan->subbuf_size);
450
451 kref_put(&chan->kref, relay_destroy_channel);
452}
453
454/**
455 * relay_flush - flush the channel
456 * @chan: the channel
457 *
458 * Flushes all channel buffers, i.e. forces a buffer switch.
459 */
460void relay_flush(struct rchan *chan)
461{
462 unsigned int i;
463 struct rchan_buf *prev = NULL;
464
465 if (!chan)
466 return;
467
468 for (i = 0; i < NR_CPUS; i++) {
469 if (!chan->buf[i] || chan->buf[i] == prev)
470 break;
471 relay_switch_subbuf(chan->buf[i], 0);
472 prev = chan->buf[i];
473 }
474}
475
476EXPORT_SYMBOL_GPL(relay_open);
477EXPORT_SYMBOL_GPL(relay_close);
478EXPORT_SYMBOL_GPL(relay_flush);
479EXPORT_SYMBOL_GPL(relay_reset);
480EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
481EXPORT_SYMBOL_GPL(relay_switch_subbuf);
482EXPORT_SYMBOL_GPL(relay_buf_full);
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
deleted file mode 100644
index 0993d3e5753b..000000000000
--- a/fs/relayfs/relay.h
+++ /dev/null
@@ -1,8 +0,0 @@
1#ifndef _RELAY_H
2#define _RELAY_H
3
4extern int relayfs_remove(struct dentry *dentry);
5extern int relay_buf_empty(struct rchan_buf *buf);
6extern void relay_destroy_channel(struct kref *kref);
7
8#endif /* _RELAY_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56bb6a4e15f3..c179966f1a2f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,6 +22,7 @@ typedef struct request_queue request_queue_t;
22struct elevator_queue; 22struct elevator_queue;
23typedef struct elevator_queue elevator_t; 23typedef struct elevator_queue elevator_t;
24struct request_pm_state; 24struct request_pm_state;
25struct blk_trace;
25 26
26#define BLKDEV_MIN_RQ 4 27#define BLKDEV_MIN_RQ 4
27#define BLKDEV_MAX_RQ 128 /* Default maximum */ 28#define BLKDEV_MAX_RQ 128 /* Default maximum */
@@ -416,6 +417,8 @@ struct request_queue
416 unsigned int sg_reserved_size; 417 unsigned int sg_reserved_size;
417 int node; 418 int node;
418 419
420 struct blk_trace *blk_trace;
421
419 /* 422 /*
420 * reserved for flush operations 423 * reserved for flush operations
421 */ 424 */
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
new file mode 100644
index 000000000000..b34d3e73d5ea
--- /dev/null
+++ b/include/linux/blktrace_api.h
@@ -0,0 +1,277 @@
1#ifndef BLKTRACE_H
2#define BLKTRACE_H
3
4#include <linux/config.h>
5#include <linux/blkdev.h>
6#include <linux/relay.h>
7
8/*
9 * Trace categories
10 */
11enum blktrace_cat {
12 BLK_TC_READ = 1 << 0, /* reads */
13 BLK_TC_WRITE = 1 << 1, /* writes */
14 BLK_TC_BARRIER = 1 << 2, /* barrier */
15	BLK_TC_SYNC	= 1 << 3,	/* sync */
16 BLK_TC_QUEUE = 1 << 4, /* queueing/merging */
17 BLK_TC_REQUEUE = 1 << 5, /* requeueing */
18 BLK_TC_ISSUE = 1 << 6, /* issue */
19 BLK_TC_COMPLETE = 1 << 7, /* completions */
20 BLK_TC_FS = 1 << 8, /* fs requests */
21 BLK_TC_PC = 1 << 9, /* pc requests */
22 BLK_TC_NOTIFY = 1 << 10, /* special message */
23
24	BLK_TC_END	= 1 << 15,	/* only 16 bits, reminder */
25};
26
27#define BLK_TC_SHIFT (16)
28#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT)
29
30/*
31 * Basic trace actions
32 */
33enum blktrace_act {
34 __BLK_TA_QUEUE = 1, /* queued */
35	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
36	__BLK_TA_FRONTMERGE,		/* front merged to existing rq */
37 __BLK_TA_GETRQ, /* allocated new request */
38 __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */
39 __BLK_TA_REQUEUE, /* request requeued */
40 __BLK_TA_ISSUE, /* sent to driver */
41 __BLK_TA_COMPLETE, /* completed by driver */
42 __BLK_TA_PLUG, /* queue was plugged */
43 __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */
44 __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */
45 __BLK_TA_INSERT, /* insert request */
46 __BLK_TA_SPLIT, /* bio was split */
47 __BLK_TA_BOUNCE, /* bio was bounced */
48 __BLK_TA_REMAP, /* bio was remapped */
49};
50
51/*
52 * Trace actions in full. Additionally, read or write is masked
53 */
54#define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
55#define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
56#define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
57#define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
58#define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
59#define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
60#define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
61#define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE | BLK_TC_ACT(BLK_TC_COMPLETE))
62#define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
63#define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
64#define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
65#define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
66#define BLK_TA_SPLIT (__BLK_TA_SPLIT)
67#define BLK_TA_BOUNCE (__BLK_TA_BOUNCE)
68#define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
69
70#define BLK_IO_TRACE_MAGIC 0x65617400
71#define BLK_IO_TRACE_VERSION 0x07
72
73/*
74 * The trace itself
75 */
76struct blk_io_trace {
77 u32 magic; /* MAGIC << 8 | version */
78 u32 sequence; /* event number */
79 u64 time; /* in microseconds */
80 u64 sector; /* disk offset */
81 u32 bytes; /* transfer length */
82 u32 action; /* what happened */
83 u32 pid; /* who did it */
84 u32 device; /* device number */
85 u32 cpu; /* on what cpu did it happen */
86 u16 error; /* completion error */
87 u16 pdu_len; /* length of data after this trace */
88};
89
90/*
91 * The remap event
92 */
93struct blk_io_trace_remap {
94 u32 device;
95 u32 __pad;
96 u64 sector;
97};
98
99enum {
100 Blktrace_setup = 1,
101 Blktrace_running,
102 Blktrace_stopped,
103};
104
105struct blk_trace {
106 int trace_state;
107 struct rchan *rchan;
108 unsigned long *sequence;
109 u16 act_mask;
110 u64 start_lba;
111 u64 end_lba;
112 u32 pid;
113 u32 dev;
114 struct dentry *dir;
115 struct dentry *dropped_file;
116 atomic_t dropped;
117};
118
119/*
120 * User setup structure passed with BLKTRACESETUP
121 */
122struct blk_user_trace_setup {
123 char name[BDEVNAME_SIZE]; /* output */
124 u16 act_mask; /* input */
125 u32 buf_size; /* input */
126 u32 buf_nr; /* input */
127 u64 start_lba;
128 u64 end_lba;
129 u32 pid;
130};
131
132#if defined(CONFIG_BLK_DEV_IO_TRACE)
133extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
134extern void blk_trace_shutdown(request_queue_t *);
135extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
136
137/**
138 * blk_add_trace_rq - Add a trace for a request oriented action
139 * @q: queue the io is for
140 * @rq: the source request
141 * @what: the action
142 *
143 * Description:
144 * Records an action against a request. Will log the bio offset + size.
145 *
146 **/
147static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
148 u32 what)
149{
150 struct blk_trace *bt = q->blk_trace;
151 int rw = rq->flags & 0x07;
152
153 if (likely(!bt))
154 return;
155
156 if (blk_pc_request(rq)) {
157 what |= BLK_TC_ACT(BLK_TC_PC);
158 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
159 } else {
160 what |= BLK_TC_ACT(BLK_TC_FS);
161 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
162 }
163}
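/*
 * Illustrative only, not part of this patch: the call sites added to
 * block/elevator.c and block/ll_rw_blk.c elsewhere in this merge use
 * the helper above in this shape when a request is handed to the driver:
 *
 *	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 */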
164
165/**
166 * blk_add_trace_bio - Add a trace for a bio oriented action
167 * @q: queue the io is for
168 * @bio: the source bio
169 * @what: the action
170 *
171 * Description:
172 * Records an action against a bio. Will log the bio offset + size.
173 *
174 **/
175static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
176 u32 what)
177{
178 struct blk_trace *bt = q->blk_trace;
179
180 if (likely(!bt))
181 return;
182
183 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
184}
185
186/**
187 * blk_add_trace_generic - Add a trace for a generic action
188 * @q: queue the io is for
189 * @bio: the source bio
190 * @rw: the data direction
191 * @what: the action
192 *
193 * Description:
194 * Records a simple trace
195 *
196 **/
197static inline void blk_add_trace_generic(struct request_queue *q,
198 struct bio *bio, int rw, u32 what)
199{
200 struct blk_trace *bt = q->blk_trace;
201
202 if (likely(!bt))
203 return;
204
205 if (bio)
206 blk_add_trace_bio(q, bio, what);
207 else
208 __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
209}
210
211/**
212 * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
213 * @q: queue the io is for
214 * @what: the action
215 * @bio: the source bio
216 * @pdu: the integer payload
217 *
218 * Description:
219 * Adds a trace with some integer payload. This might be an unplug
220 * action, with the queue depth at unplug time given as the payload.
222 *
223 **/
224static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
225 struct bio *bio, unsigned int pdu)
226{
227 struct blk_trace *bt = q->blk_trace;
228 u64 rpdu = cpu_to_be64(pdu);
229
230 if (likely(!bt))
231 return;
232
233 if (bio)
234 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
235 else
236 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
237}
238
239/**
240 * blk_add_trace_remap - Add a trace for a remap operation
241 * @q: queue the io is for
242 * @bio: the source bio
243 * @dev: target device
244 * @from: source sector
245 * @to: target sector
246 *
247 * Description:
248 * Device mapper or raid targets sometimes need to split a bio because
249 * it spans a stripe (or similar). Add a trace for that action.
250 *
251 **/
252static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
253 dev_t dev, sector_t from, sector_t to)
254{
255 struct blk_trace *bt = q->blk_trace;
256 struct blk_io_trace_remap r;
257
258 if (likely(!bt))
259 return;
260
261 r.device = cpu_to_be32(dev);
262 r.sector = cpu_to_be64(to);
263
264 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
265}
266
267#else /* !CONFIG_BLK_DEV_IO_TRACE */
268#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY)
269#define blk_trace_shutdown(q) do { } while (0)
270#define blk_add_trace_rq(q, rq, what) do { } while (0)
271#define blk_add_trace_bio(q, rq, what) do { } while (0)
272#define blk_add_trace_generic(q, rq, rw, what) do { } while (0)
273#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0)
274#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0)
275#endif /* CONFIG_BLK_DEV_IO_TRACE */
276
277#endif
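For orientation, here is a hedged sketch of how user space drives the ioctls above. The real consumer is the blktrace tool named in the Kconfig help; the device path, buffer geometry, and header paths below are illustrative only:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* BLKTRACESETUP and friends */
#include <linux/blktrace_api.h> /* struct blk_user_trace_setup */

static int trace_dev(const char *path)
{
	struct blk_user_trace_setup buts;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;

	memset(&buts, 0, sizeof(buts));
	buts.buf_size = 512 * 1024;	/* relay sub-buffer size */
	buts.buf_nr = 4;		/* number of sub-buffers */
	buts.act_mask = BLK_TC_READ | BLK_TC_WRITE;

	if (ioctl(fd, BLKTRACESETUP, &buts) < 0 ||
	    ioctl(fd, BLKTRACESTART) < 0) {
		close(fd);
		return -1;
	}

	/* buts.name now holds the per-device directory name whose
	 * per-cpu relay files carry struct blk_io_trace records */

	/* ... consume the trace, then ... */
	ioctl(fd, BLKTRACESTOP);
	ioctl(fd, BLKTRACETEARDOWN);
	close(fd);
	return 0;
}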
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index ae7dfb790df3..efb518f16bb3 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART)
97COMPATIBLE_IOCTL(BLKFLSBUF) 97COMPATIBLE_IOCTL(BLKFLSBUF)
98COMPATIBLE_IOCTL(BLKSECTSET) 98COMPATIBLE_IOCTL(BLKSECTSET)
99COMPATIBLE_IOCTL(BLKSSZGET) 99COMPATIBLE_IOCTL(BLKSSZGET)
100COMPATIBLE_IOCTL(BLKTRACESTART)
101COMPATIBLE_IOCTL(BLKTRACESTOP)
102COMPATIBLE_IOCTL(BLKTRACESETUP)
103COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
100ULONG_IOCTL(BLKRASET) 104ULONG_IOCTL(BLKRASET)
101ULONG_IOCTL(BLKFRASET) 105ULONG_IOCTL(BLKFRASET)
102/* RAID */ 106/* RAID */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f9c9dea636d0..9b34a1b03455 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -197,6 +197,10 @@ extern int dir_notify_enable;
197#define BLKBSZGET _IOR(0x12,112,size_t) 197#define BLKBSZGET _IOR(0x12,112,size_t)
198#define BLKBSZSET _IOW(0x12,113,size_t) 198#define BLKBSZSET _IOW(0x12,113,size_t)
199#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ 199#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
200#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
201#define BLKTRACESTART _IO(0x12,116)
202#define BLKTRACESTOP _IO(0x12,117)
203#define BLKTRACETEARDOWN _IO(0x12,118)
200 204
201#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ 205#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
202#define FIBMAP _IO(0x00,1) /* bmap access */ 206#define FIBMAP _IO(0x00,1) /* bmap access */
diff --git a/include/linux/relay.h b/include/linux/relay.h
new file mode 100644
index 000000000000..4bcc1531d6a9
--- /dev/null
+++ b/include/linux/relay.h
@@ -0,0 +1,281 @@
1/*
2 * linux/include/linux/relay.h
3 *
4 * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
5 * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
6 *
7 * CONFIG_RELAY definitions and declarations
8 */
9
10#ifndef _LINUX_RELAY_H
11#define _LINUX_RELAY_H
12
13#include <linux/config.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/wait.h>
17#include <linux/list.h>
18#include <linux/fs.h>
19#include <linux/poll.h>
20#include <linux/kref.h>
21
22/* Needs a _much_ better name... */
23#define FIX_SIZE(x) ((((x) - 1) & PAGE_MASK) + PAGE_SIZE)
24
25/*
26 * Tracks changes to rchan/rchan_buf structs
27 */
28#define RELAYFS_CHANNEL_VERSION 6
29
30/*
31 * Per-cpu relay channel buffer
32 */
33struct rchan_buf
34{
35 void *start; /* start of channel buffer */
36 void *data; /* start of current sub-buffer */
37 size_t offset; /* current offset into sub-buffer */
38 size_t subbufs_produced; /* count of sub-buffers produced */
39 size_t subbufs_consumed; /* count of sub-buffers consumed */
40 struct rchan *chan; /* associated channel */
41 wait_queue_head_t read_wait; /* reader wait queue */
42 struct work_struct wake_readers; /* reader wake-up work struct */
43 struct dentry *dentry; /* channel file dentry */
44 struct kref kref; /* channel buffer refcount */
45 struct page **page_array; /* array of current buffer pages */
46 unsigned int page_count; /* number of current buffer pages */
47 unsigned int finalized; /* buffer has been finalized */
48 size_t *padding; /* padding counts per sub-buffer */
49 size_t prev_padding; /* temporary variable */
50 size_t bytes_consumed; /* bytes consumed in cur read subbuf */
51 unsigned int cpu; /* this buf's cpu */
52} ____cacheline_aligned;
53
54/*
55 * Relay channel data structure
56 */
57struct rchan
58{
59 u32 version; /* the version of this struct */
60 size_t subbuf_size; /* sub-buffer size */
61 size_t n_subbufs; /* number of sub-buffers per buffer */
62 size_t alloc_size; /* total buffer size allocated */
63 struct rchan_callbacks *cb; /* client callbacks */
64 struct kref kref; /* channel refcount */
65 void *private_data; /* for user-defined data */
66 size_t last_toobig; /* tried to log event > subbuf size */
67 struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
68};
69
70/*
71 * Relay channel client callbacks
72 */
73struct rchan_callbacks
74{
75 /*
76 * subbuf_start - called on buffer-switch to a new sub-buffer
77 * @buf: the channel buffer containing the new sub-buffer
78 * @subbuf: the start of the new sub-buffer
79 * @prev_subbuf: the start of the previous sub-buffer
80 * @prev_padding: unused space at the end of previous sub-buffer
81 *
82 * The client should return 1 to continue logging, 0 to stop
83 * logging.
84 *
85 * NOTE: subbuf_start will also be invoked when the buffer is
86 * created, so that the first sub-buffer can be initialized
87 * if necessary. In this case, prev_subbuf will be NULL.
88 *
89 * NOTE: the client can reserve bytes at the beginning of the new
90 * sub-buffer by calling subbuf_start_reserve() in this callback.
91 */
92 int (*subbuf_start) (struct rchan_buf *buf,
93 void *subbuf,
94 void *prev_subbuf,
95 size_t prev_padding);
96
97 /*
98 * buf_mapped - relay buffer mmap notification
99 * @buf: the channel buffer
100 * @filp: relay file pointer
101 *
102 * Called when a relay file is successfully mmapped
103 */
104 void (*buf_mapped)(struct rchan_buf *buf,
105 struct file *filp);
106
107 /*
108 * buf_unmapped - relay buffer unmap notification
109 * @buf: the channel buffer
110 * @filp: relay file pointer
111 *
112 * Called when a relay file is successfully unmapped
113 */
114 void (*buf_unmapped)(struct rchan_buf *buf,
115 struct file *filp);
116 /*
117 * create_buf_file - create file to represent a relay channel buffer
118 * @filename: the name of the file to create
119 * @parent: the parent of the file to create
120 * @mode: the mode of the file to create
121 * @buf: the channel buffer
122 * @is_global: outparam - set non-zero if the buffer should be global
123 *
124 * Called during relay_open(), once for each per-cpu buffer,
125 * to allow the client to create a file to be used to
126 * represent the corresponding channel buffer. If the file is
127 * created outside of relay, the parent must also exist in
128 * that filesystem.
129 *
130 * The callback should return the dentry of the file created
131 * to represent the relay buffer.
132 *
133 * Setting the is_global outparam to a non-zero value will
134 * cause relay_open() to create a single global buffer rather
135 * than the default set of per-cpu buffers.
136 *
137 * See Documentation/filesystems/relayfs.txt for more info.
138 */
139 struct dentry *(*create_buf_file)(const char *filename,
140 struct dentry *parent,
141 int mode,
142 struct rchan_buf *buf,
143 int *is_global);
144
145 /*
146 * remove_buf_file - remove file representing a relay channel buffer
147 * @dentry: the dentry of the file to remove
148 *
149 * Called during relay_close(), once for each per-cpu buffer,
150 * to allow the client to remove a file used to represent a
151 * channel buffer.
152 *
153 * The callback should return 0 if successful, negative if not.
154 */
155 int (*remove_buf_file)(struct dentry *dentry);
156};
157
158/*
159 * CONFIG_RELAY kernel API, kernel/relay.c
160 */
161
162struct rchan *relay_open(const char *base_filename,
163 struct dentry *parent,
164 size_t subbuf_size,
165 size_t n_subbufs,
166 struct rchan_callbacks *cb);
167extern void relay_close(struct rchan *chan);
168extern void relay_flush(struct rchan *chan);
169extern void relay_subbufs_consumed(struct rchan *chan,
170 unsigned int cpu,
171 size_t consumed);
172extern void relay_reset(struct rchan *chan);
173extern int relay_buf_full(struct rchan_buf *buf);
174
175extern size_t relay_switch_subbuf(struct rchan_buf *buf,
176 size_t length);
177
178/**
179 * relay_write - write data into the channel
180 * @chan: relay channel
181 * @data: data to be written
182 * @length: number of bytes to write
183 *
184 * Writes data into the current cpu's channel buffer.
185 *
186 * Protects the buffer by disabling interrupts. Use this
187 * if you might be logging from interrupt context. Try
188 * __relay_write() if you know you won't be logging from
189 * interrupt context.
190 */
191static inline void relay_write(struct rchan *chan,
192 const void *data,
193 size_t length)
194{
195 unsigned long flags;
196 struct rchan_buf *buf;
197
198 local_irq_save(flags);
199 buf = chan->buf[smp_processor_id()];
200 if (unlikely(buf->offset + length > chan->subbuf_size))
201 length = relay_switch_subbuf(buf, length);
202 memcpy(buf->data + buf->offset, data, length);
203 buf->offset += length;
204 local_irq_restore(flags);
205}
206
207/**
208 * __relay_write - write data into the channel
209 * @chan: relay channel
210 * @data: data to be written
211 * @length: number of bytes to write
212 *
213 * Writes data into the current cpu's channel buffer.
214 *
215 * Protects the buffer by disabling preemption. Use
216 * relay_write() if you might be logging from interrupt
217 * context.
218 */
219static inline void __relay_write(struct rchan *chan,
220 const void *data,
221 size_t length)
222{
223 struct rchan_buf *buf;
224
225 buf = chan->buf[get_cpu()];
226 if (unlikely(buf->offset + length > buf->chan->subbuf_size))
227 length = relay_switch_subbuf(buf, length);
228 memcpy(buf->data + buf->offset, data, length);
229 buf->offset += length;
230 put_cpu();
231}
232
233/**
234 * relay_reserve - reserve slot in channel buffer
235 * @chan: relay channel
236 * @length: number of bytes to reserve
237 *
238 * Returns pointer to reserved slot, NULL if full.
239 *
240 * Reserves a slot in the current cpu's channel buffer.
241 * Does not protect the buffer at all - caller must provide
242 * appropriate synchronization.
243 */
244static inline void *relay_reserve(struct rchan *chan, size_t length)
245{
246 void *reserved;
247 struct rchan_buf *buf = chan->buf[smp_processor_id()];
248
249 if (unlikely(buf->offset + length > buf->chan->subbuf_size)) {
250 length = relay_switch_subbuf(buf, length);
251 if (!length)
252 return NULL;
253 }
254 reserved = buf->data + buf->offset;
255 buf->offset += length;
256
257 return reserved;
258}
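/*
 * Illustrative only, not part of this patch: the reserve-then-fill
 * pattern a client such as blktrace uses.  struct example_event and the
 * sequence counter are hypothetical, and the caller must supply its own
 * protection (e.g. local_irq_save()), since relay_reserve() does none.
 */
struct example_event {
	u32 seq;	/* event number */
	u64 time;	/* timestamp, e.g. from sched_clock() */
};

static inline void example_log_event(struct rchan *chan, u32 *seq)
{
	struct example_event *ev;

	ev = relay_reserve(chan, sizeof(*ev));
	if (!ev)	/* buffers full and the client's subbuf_start said stop */
		return;
	ev->seq = (*seq)++;
	ev->time = 0;
}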
259
260/**
261 * subbuf_start_reserve - reserve bytes at the start of a sub-buffer
262 * @buf: relay channel buffer
263 * @length: number of bytes to reserve
264 *
265 * Helper function used to reserve bytes at the beginning of
266 * a sub-buffer in the subbuf_start() callback.
267 */
268static inline void subbuf_start_reserve(struct rchan_buf *buf,
269 size_t length)
270{
271 BUG_ON(length >= buf->chan->subbuf_size - 1);
272 buf->offset = length;
273}
274
275/*
276 * exported relay file operations, kernel/relay.c
277 */
278extern struct file_operations relay_file_operations;
279
280#endif /* _LINUX_RELAY_H */
281
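Putting the API together, here is a hedged sketch of a minimal kernel client. It mirrors what block/blktrace.c does elsewhere in this merge, backing each channel buffer with a debugfs file; every example_* identifier is hypothetical.

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/debugfs.h>
#include <linux/relay.h>

static struct rchan *example_chan;

/* hand each per-cpu buffer to debugfs, served by relay's file_operations */
static struct dentry *example_create_buf_file(const char *filename,
					      struct dentry *parent,
					      int mode,
					      struct rchan_buf *buf,
					      int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int example_remove_buf_file(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct rchan_callbacks example_callbacks = {
	.create_buf_file	= example_create_buf_file,
	.remove_buf_file	= example_remove_buf_file,
};

static int __init example_init(void)
{
	/* four 8K sub-buffers per cpu, files named example0..exampleN-1 */
	example_chan = relay_open("example", NULL, 8192, 4,
				  &example_callbacks);
	return example_chan ? 0 : -ENOMEM;
}

After that, any producer can log with relay_write(example_chan, &ev, sizeof(ev)), and readers consume the per-cpu files through the relay_file_operations defined in kernel/relay.c below.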
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62e6314382f0..e60a91d5b369 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -706,6 +706,7 @@ struct task_struct {
706 prio_array_t *array; 706 prio_array_t *array;
707 707
708 unsigned short ioprio; 708 unsigned short ioprio;
709 unsigned int btrace_seq;
709 710
710 unsigned long sleep_avg; 711 unsigned long sleep_avg;
711 unsigned long long timestamp, last_ran; 712 unsigned long long timestamp, last_ran;
diff --git a/init/Kconfig b/init/Kconfig
index 38416a199def..1d19fd25204b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -214,6 +214,17 @@ config CPUSETS
214 214
215 Say N if unsure. 215 Say N if unsure.
216 216
217config RELAY
218 bool "Kernel->user space relay support (formerly relayfs)"
219 help
220	  This option enables support for the relay interface in
221 certain file systems (such as debugfs).
222 It is designed to provide an efficient mechanism for tools and
223 facilities to relay large amounts of data from kernel space to
224 user space.
225
226 If unsure, say N.
227
217source "usr/Kconfig" 228source "usr/Kconfig"
218 229
219config UID16 230config UID16
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde815d..aebd7a78984e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
34obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 34obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
35obj-$(CONFIG_SECCOMP) += seccomp.o 35obj-$(CONFIG_SECCOMP) += seccomp.o
36obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 36obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
37obj-$(CONFIG_RELAY) += relay.o
37 38
38ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 39ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
39# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 40# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index c79ae0b19a49..c21bae8c93b9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
181 /* One for us, one for whoever does the "release_task()" (usually parent) */ 181 /* One for us, one for whoever does the "release_task()" (usually parent) */
182 atomic_set(&tsk->usage,2); 182 atomic_set(&tsk->usage,2);
183 atomic_set(&tsk->fs_excl, 0); 183 atomic_set(&tsk->fs_excl, 0);
184 tsk->btrace_seq = 0;
184 return tsk; 185 return tsk;
185} 186}
186 187
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 000000000000..33345e73485c
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,1012 @@
1/*
2 * Public API and common code for kernel->userspace relay file support.
3 *
4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
5 *
6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
8 *
9 * Moved to kernel/relay.c by Paul Mundt, 2006.
10 *
11 * This file is released under the GPL.
12 */
13#include <linux/errno.h>
14#include <linux/stddef.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/relay.h>
19#include <linux/vmalloc.h>
20#include <linux/mm.h>
21
22/*
23 * close() vm_op implementation for relay file mapping.
24 */
25static void relay_file_mmap_close(struct vm_area_struct *vma)
26{
27 struct rchan_buf *buf = vma->vm_private_data;
28 buf->chan->cb->buf_unmapped(buf, vma->vm_file);
29}
30
31/*
32 * nopage() vm_op implementation for relay file mapping.
33 */
34static struct page *relay_buf_nopage(struct vm_area_struct *vma,
35 unsigned long address,
36 int *type)
37{
38 struct page *page;
39 struct rchan_buf *buf = vma->vm_private_data;
40 unsigned long offset = address - vma->vm_start;
41
42 if (address > vma->vm_end)
43 return NOPAGE_SIGBUS; /* Disallow mremap */
44 if (!buf)
45 return NOPAGE_OOM;
46
47 page = vmalloc_to_page(buf->start + offset);
48 if (!page)
49 return NOPAGE_OOM;
50 get_page(page);
51
52 if (type)
53 *type = VM_FAULT_MINOR;
54
55 return page;
56}
57
58/*
59 * vm_ops for relay file mappings.
60 */
61static struct vm_operations_struct relay_file_mmap_ops = {
62 .nopage = relay_buf_nopage,
63 .close = relay_file_mmap_close,
64};
65
66/**
67 * relay_mmap_buf: - mmap channel buffer to process address space
68 * @buf: relay channel buffer
69 * @vma: vm_area_struct describing memory to be mapped
70 *
71 * Returns 0 if ok, negative on error
72 *
73 * Caller should already have grabbed mmap_sem.
74 */
75int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
76{
77 unsigned long length = vma->vm_end - vma->vm_start;
78 struct file *filp = vma->vm_file;
79
80 if (!buf)
81 return -EBADF;
82
83 if (length != (unsigned long)buf->chan->alloc_size)
84 return -EINVAL;
85
86 vma->vm_ops = &relay_file_mmap_ops;
87 vma->vm_private_data = buf;
88 buf->chan->cb->buf_mapped(buf, filp);
89
90 return 0;
91}
92
93/**
94 * relay_alloc_buf - allocate a channel buffer
95 * @buf: the buffer struct
96 * @size: total size of the buffer
97 *
98 * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
99 * passed-in size will be page-aligned if it isn't already.
100 */
101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
102{
103 void *mem;
104 unsigned int i, j, n_pages;
105
106 *size = PAGE_ALIGN(*size);
107 n_pages = *size >> PAGE_SHIFT;
108
109 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
110 if (!buf->page_array)
111 return NULL;
112
113 for (i = 0; i < n_pages; i++) {
114 buf->page_array[i] = alloc_page(GFP_KERNEL);
115 if (unlikely(!buf->page_array[i]))
116 goto depopulate;
117 }
118 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
119 if (!mem)
120 goto depopulate;
121
122 memset(mem, 0, *size);
123 buf->page_count = n_pages;
124 return mem;
125
126depopulate:
127 for (j = 0; j < i; j++)
128 __free_page(buf->page_array[j]);
129 kfree(buf->page_array);
130 return NULL;
131}
132
133/**
134 * relay_create_buf - allocate and initialize a channel buffer
135 * @chan: the relay channel
137 *
138 * Returns channel buffer if successful, NULL otherwise
139 */
140struct rchan_buf *relay_create_buf(struct rchan *chan)
141{
142 struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
143 if (!buf)
144 return NULL;
145
146	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t), GFP_KERNEL);
147 if (!buf->padding)
148 goto free_buf;
149
150 buf->start = relay_alloc_buf(buf, &chan->alloc_size);
151 if (!buf->start)
152 goto free_buf;
153
154 buf->chan = chan;
155 kref_get(&buf->chan->kref);
156 return buf;
157
158free_buf:
159 kfree(buf->padding);
160 kfree(buf);
161 return NULL;
162}
163
164/**
165 * relay_destroy_channel - free the channel struct
166 *
167 * Should only be called from kref_put().
168 */
169void relay_destroy_channel(struct kref *kref)
170{
171 struct rchan *chan = container_of(kref, struct rchan, kref);
172 kfree(chan);
173}
174
175/**
176 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
177 * @buf: the buffer struct
178 */
179void relay_destroy_buf(struct rchan_buf *buf)
180{
181 struct rchan *chan = buf->chan;
182 unsigned int i;
183
184 if (likely(buf->start)) {
185 vunmap(buf->start);
186 for (i = 0; i < buf->page_count; i++)
187 __free_page(buf->page_array[i]);
188 kfree(buf->page_array);
189 }
190 kfree(buf->padding);
191 kfree(buf);
192 kref_put(&chan->kref, relay_destroy_channel);
193}
194
195/**
196 * relay_remove_buf - remove a channel buffer
197 *
198 * Removes the file from the filesystem, which also frees the
199 * rchan_buf struct and the channel buffer. Should only be called from
200 * kref_put().
201 */
202void relay_remove_buf(struct kref *kref)
203{
204 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
205 buf->chan->cb->remove_buf_file(buf->dentry);
206 relay_destroy_buf(buf);
207}
208
209/**
210 * relay_buf_empty - boolean, is the channel buffer empty?
211 * @buf: channel buffer
212 *
213 * Returns 1 if the buffer is empty, 0 otherwise.
214 */
215int relay_buf_empty(struct rchan_buf *buf)
216{
217 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
218}
219EXPORT_SYMBOL_GPL(relay_buf_empty);
220
221/**
222 * relay_buf_full - boolean, is the channel buffer full?
223 * @buf: channel buffer
224 *
225 * Returns 1 if the buffer is full, 0 otherwise.
226 */
227int relay_buf_full(struct rchan_buf *buf)
228{
229 size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
230 return (ready >= buf->chan->n_subbufs) ? 1 : 0;
231}
232EXPORT_SYMBOL_GPL(relay_buf_full);
233
234/*
235 * High-level relay kernel API and associated functions.
236 */
237
238/*
239 * rchan_callback implementations defining default channel behavior. Used
240 * in place of corresponding NULL values in client callback struct.
241 */
242
243/*
244 * subbuf_start() default callback. Does nothing.
245 */
246static int subbuf_start_default_callback (struct rchan_buf *buf,
247 void *subbuf,
248 void *prev_subbuf,
249 size_t prev_padding)
250{
251 if (relay_buf_full(buf))
252 return 0;
253
254 return 1;
255}
256
257/*
258 * buf_mapped() default callback. Does nothing.
259 */
260static void buf_mapped_default_callback(struct rchan_buf *buf,
261 struct file *filp)
262{
263}
264
265/*
266 * buf_unmapped() default callback. Does nothing.
267 */
268static void buf_unmapped_default_callback(struct rchan_buf *buf,
269 struct file *filp)
270{
271}
272
273/*
274 * create_buf_file() default callback.  Does nothing.
275 */
276static struct dentry *create_buf_file_default_callback(const char *filename,
277 struct dentry *parent,
278 int mode,
279 struct rchan_buf *buf,
280 int *is_global)
281{
282 return NULL;
283}
284
285/*
286 * remove_buf_file() default callback. Does nothing.
287 */
288static int remove_buf_file_default_callback(struct dentry *dentry)
289{
290 return -EINVAL;
291}
292
293/* relay channel default callbacks */
294static struct rchan_callbacks default_channel_callbacks = {
295 .subbuf_start = subbuf_start_default_callback,
296 .buf_mapped = buf_mapped_default_callback,
297 .buf_unmapped = buf_unmapped_default_callback,
298 .create_buf_file = create_buf_file_default_callback,
299 .remove_buf_file = remove_buf_file_default_callback,
300};
301
302/**
303 * wakeup_readers - wake up readers waiting on a channel
304 * @private: the channel buffer
305 *
306 * This is the work function used to defer reader waking. The
307 * reason waking is deferred is that calling directly from write
308 * causes problems if you're writing from, say, the scheduler.
309 */
310static void wakeup_readers(void *private)
311{
312 struct rchan_buf *buf = private;
313 wake_up_interruptible(&buf->read_wait);
314}
315
316/**
317 * __relay_reset - reset a channel buffer
318 * @buf: the channel buffer
319 * @init: 1 if this is a first-time initialization
320 *
321 * See relay_reset for description of effect.
322 */
323static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
324{
325 size_t i;
326
327 if (init) {
328 init_waitqueue_head(&buf->read_wait);
329 kref_init(&buf->kref);
330 INIT_WORK(&buf->wake_readers, NULL, NULL);
331 } else {
332 cancel_delayed_work(&buf->wake_readers);
333 flush_scheduled_work();
334 }
335
336 buf->subbufs_produced = 0;
337 buf->subbufs_consumed = 0;
338 buf->bytes_consumed = 0;
339 buf->finalized = 0;
340 buf->data = buf->start;
341 buf->offset = 0;
342
343 for (i = 0; i < buf->chan->n_subbufs; i++)
344 buf->padding[i] = 0;
345
346 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
347}
348
349/**
350 * relay_reset - reset the channel
351 * @chan: the channel
352 *
353 * This has the effect of erasing all data from all channel buffers
354 * and restarting the channel in its initial state. The buffers
355 * are not freed, so any mappings are still in effect.
356 *
357 * NOTE: Care should be taken that the channel isn't actually
358 * being used by anything when this call is made.
359 */
360void relay_reset(struct rchan *chan)
361{
362 unsigned int i;
363 struct rchan_buf *prev = NULL;
364
365 if (!chan)
366 return;
367
368 for (i = 0; i < NR_CPUS; i++) {
369 if (!chan->buf[i] || chan->buf[i] == prev)
370 break;
371 __relay_reset(chan->buf[i], 0);
372 prev = chan->buf[i];
373 }
374}
375EXPORT_SYMBOL_GPL(relay_reset);
376
377/**
378 * relay_open_buf - create a new relay channel buffer
379 *
380 * Internal - used by relay_open().
381 */
382static struct rchan_buf *relay_open_buf(struct rchan *chan,
383 const char *filename,
384 struct dentry *parent,
385 int *is_global)
386{
387 struct rchan_buf *buf;
388 struct dentry *dentry;
389
390 if (*is_global)
391 return chan->buf[0];
392
393 buf = relay_create_buf(chan);
394 if (!buf)
395 return NULL;
396
397 /* Create file in fs */
398 dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
399 buf, is_global);
400 if (!dentry) {
401 relay_destroy_buf(buf);
402 return NULL;
403 }
404
405 buf->dentry = dentry;
406 __relay_reset(buf, 1);
407
408 return buf;
409}
410
411/**
412 * relay_close_buf - close a channel buffer
413 * @buf: channel buffer
414 *
415 * Marks the buffer finalized and restores the default callbacks.
416 * The channel buffer and channel buffer data structure are then freed
417 * automatically when the last reference is given up.
418 */
419static inline void relay_close_buf(struct rchan_buf *buf)
420{
421 buf->finalized = 1;
422 cancel_delayed_work(&buf->wake_readers);
423 flush_scheduled_work();
424 kref_put(&buf->kref, relay_remove_buf);
425}
426
427static inline void setup_callbacks(struct rchan *chan,
428 struct rchan_callbacks *cb)
429{
430 if (!cb) {
431 chan->cb = &default_channel_callbacks;
432 return;
433 }
434
435 if (!cb->subbuf_start)
436 cb->subbuf_start = subbuf_start_default_callback;
437 if (!cb->buf_mapped)
438 cb->buf_mapped = buf_mapped_default_callback;
439 if (!cb->buf_unmapped)
440 cb->buf_unmapped = buf_unmapped_default_callback;
441 if (!cb->create_buf_file)
442 cb->create_buf_file = create_buf_file_default_callback;
443 if (!cb->remove_buf_file)
444 cb->remove_buf_file = remove_buf_file_default_callback;
445 chan->cb = cb;
446}
447
448/**
449 * relay_open - create a new relay channel
450 * @base_filename: base name of files to create
451 * @parent: dentry of parent directory, NULL for root directory
452 * @subbuf_size: size of sub-buffers
453 * @n_subbufs: number of sub-buffers
454 * @cb: client callback functions
455 *
456 * Returns channel pointer if successful, NULL otherwise.
457 *
458 * Creates a channel buffer for each cpu using the sizes and
459 * attributes specified. The created channel buffer files
460 * will be named base_filename0...base_filenameN-1. File
461 * permissions will be S_IRUSR.
462 */
463struct rchan *relay_open(const char *base_filename,
464 struct dentry *parent,
465 size_t subbuf_size,
466 size_t n_subbufs,
467 struct rchan_callbacks *cb)
468{
469 unsigned int i;
470 struct rchan *chan;
471 char *tmpname;
472 int is_global = 0;
473
474 if (!base_filename)
475 return NULL;
476
477 if (!(subbuf_size && n_subbufs))
478 return NULL;
479
480 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
481 if (!chan)
482 return NULL;
483
484 chan->version = RELAYFS_CHANNEL_VERSION;
485 chan->n_subbufs = n_subbufs;
486 chan->subbuf_size = subbuf_size;
487 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
488 setup_callbacks(chan, cb);
489 kref_init(&chan->kref);
490
491 tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
492 if (!tmpname)
493 goto free_chan;
494
495 for_each_online_cpu(i) {
496 sprintf(tmpname, "%s%d", base_filename, i);
497 chan->buf[i] = relay_open_buf(chan, tmpname, parent,
498 &is_global);
499 if (!chan->buf[i])
500 goto free_bufs;
501
502 chan->buf[i]->cpu = i;
503 }
504
505 kfree(tmpname);
506 return chan;
507
508free_bufs:
509 for (i = 0; i < NR_CPUS; i++) {
510 if (!chan->buf[i])
511 break;
512 relay_close_buf(chan->buf[i]);
513 if (is_global)
514 break;
515 }
516 kfree(tmpname);
517
518free_chan:
519 kref_put(&chan->kref, relay_destroy_channel);
520 return NULL;
521}
522EXPORT_SYMBOL_GPL(relay_open);
523
524/**
525 * relay_switch_subbuf - switch to a new sub-buffer
526 * @buf: channel buffer
527 * @length: size of current event
528 *
529 * Returns either the length passed in or 0 if full.
530 *
531 * Performs sub-buffer-switch tasks such as invoking callbacks,
532 * updating padding counts, waking up readers, etc.
533 */
534size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
535{
536 void *old, *new;
537 size_t old_subbuf, new_subbuf;
538
539 if (unlikely(length > buf->chan->subbuf_size))
540 goto toobig;
541
542 if (buf->offset != buf->chan->subbuf_size + 1) {
543 buf->prev_padding = buf->chan->subbuf_size - buf->offset;
544 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
545 buf->padding[old_subbuf] = buf->prev_padding;
546 buf->subbufs_produced++;
547 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
548 buf->padding[old_subbuf];
549 smp_mb();
550 if (waitqueue_active(&buf->read_wait)) {
551 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
552 schedule_delayed_work(&buf->wake_readers, 1);
553 }
554 }
555
556 old = buf->data;
557 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
558 new = buf->start + new_subbuf * buf->chan->subbuf_size;
559 buf->offset = 0;
560 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
561 buf->offset = buf->chan->subbuf_size + 1;
562 return 0;
563 }
564 buf->data = new;
565 buf->padding[new_subbuf] = 0;
566
567 if (unlikely(length + buf->offset > buf->chan->subbuf_size))
568 goto toobig;
569
570 return length;
571
572toobig:
573 buf->chan->last_toobig = length;
574 return 0;
575}
576EXPORT_SYMBOL_GPL(relay_switch_subbuf);
577
578/**
579 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
580 * @chan: the channel
581 * @cpu: the cpu associated with the channel buffer to update
582 * @subbufs_consumed: number of sub-buffers to add to current buf's count
583 *
584 * Adds to the channel buffer's consumed sub-buffer count.
585 * subbufs_consumed should be the number of sub-buffers newly consumed,
586 * not the total consumed.
587 *
588 * NOTE: kernel clients don't need to call this function if the channel
589 * mode is 'overwrite'.
590 */
591void relay_subbufs_consumed(struct rchan *chan,
592 unsigned int cpu,
593 size_t subbufs_consumed)
594{
595 struct rchan_buf *buf;
596
597 if (!chan)
598 return;
599
600 if (cpu >= NR_CPUS || !chan->buf[cpu])
601 return;
602
603 buf = chan->buf[cpu];
604 buf->subbufs_consumed += subbufs_consumed;
605 if (buf->subbufs_consumed > buf->subbufs_produced)
606 buf->subbufs_consumed = buf->subbufs_produced;
607}
608EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
609
610/**
611 * relay_close - close the channel
612 * @chan: the channel
613 *
614 * Closes all channel buffers and frees the channel.
615 */
616void relay_close(struct rchan *chan)
617{
618 unsigned int i;
619 struct rchan_buf *prev = NULL;
620
621 if (!chan)
622 return;
623
624 for (i = 0; i < NR_CPUS; i++) {
625 if (!chan->buf[i] || chan->buf[i] == prev)
626 break;
627 relay_close_buf(chan->buf[i]);
628 prev = chan->buf[i];
629 }
630
631 if (chan->last_toobig)
632 printk(KERN_WARNING "relay: one or more items not logged "
633 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
634 chan->last_toobig, chan->subbuf_size);
635
636 kref_put(&chan->kref, relay_destroy_channel);
637}
638EXPORT_SYMBOL_GPL(relay_close);
639
640/**
641 * relay_flush - flush the channel
642 * @chan: the channel
643 *
644 * Flushes all channel buffers, i.e. forces a buffer switch.
645 */
646void relay_flush(struct rchan *chan)
647{
648 unsigned int i;
649 struct rchan_buf *prev = NULL;
650
651 if (!chan)
652 return;
653
654 for (i = 0; i < NR_CPUS; i++) {
655 if (!chan->buf[i] || chan->buf[i] == prev)
656 break;
657 relay_switch_subbuf(chan->buf[i], 0);
658 prev = chan->buf[i];
659 }
660}
661EXPORT_SYMBOL_GPL(relay_flush);
662
663/**
664 * relay_file_open - open file op for relay files
665 * @inode: the inode
666 * @filp: the file
667 *
668 * Increments the channel buffer refcount.
669 */
670static int relay_file_open(struct inode *inode, struct file *filp)
671{
672 struct rchan_buf *buf = inode->u.generic_ip;
673 kref_get(&buf->kref);
674 filp->private_data = buf;
675
676 return 0;
677}
678
679/**
680 * relay_file_mmap - mmap file op for relay files
681 * @filp: the file
682 * @vma: the vma describing what to map
683 *
684 * Calls upon relay_mmap_buf to map the file into user space.
685 */
686static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
687{
688 struct rchan_buf *buf = filp->private_data;
689 return relay_mmap_buf(buf, vma);
690}
691
692/**
693 * relay_file_poll - poll file op for relay files
694 * @filp: the file
695 * @wait: poll table
696 *
697 * Poll implementation.
698 */
699static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
700{
701 unsigned int mask = 0;
702 struct rchan_buf *buf = filp->private_data;
703
704 if (buf->finalized)
705 return POLLERR;
706
707 if (filp->f_mode & FMODE_READ) {
708 poll_wait(filp, &buf->read_wait, wait);
709 if (!relay_buf_empty(buf))
710 mask |= POLLIN | POLLRDNORM;
711 }
712
713 return mask;
714}
715
716/**
717 * relay_file_release - release file op for relay files
718 * @inode: the inode
719 * @filp: the file
720 *
721 * Decrements the channel refcount, as the filesystem is
722 * no longer using it.
723 */
724static int relay_file_release(struct inode *inode, struct file *filp)
725{
726 struct rchan_buf *buf = filp->private_data;
727 kref_put(&buf->kref, relay_remove_buf);
728
729 return 0;
730}
731
732/**
733 * relay_file_read_consume - update the consumed count for the buffer
734 */
735static void relay_file_read_consume(struct rchan_buf *buf,
736 size_t read_pos,
737 size_t bytes_consumed)
738{
739 size_t subbuf_size = buf->chan->subbuf_size;
740 size_t n_subbufs = buf->chan->n_subbufs;
741 size_t read_subbuf;
742
743 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
744 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
745 buf->bytes_consumed = 0;
746 }
747
748 buf->bytes_consumed += bytes_consumed;
749 read_subbuf = read_pos / buf->chan->subbuf_size;
750 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
751 if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
752 (buf->offset == subbuf_size))
753 return;
754 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
755 buf->bytes_consumed = 0;
756 }
757}
758
759/**
760 * relay_file_read_avail - boolean, are there unconsumed bytes available?
761 */
762static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
763{
764 size_t subbuf_size = buf->chan->subbuf_size;
765 size_t n_subbufs = buf->chan->n_subbufs;
766 size_t produced = buf->subbufs_produced;
767 size_t consumed = buf->subbufs_consumed;
768
769 relay_file_read_consume(buf, read_pos, 0);
770
771 if (unlikely(buf->offset > subbuf_size)) {
772 if (produced == consumed)
773 return 0;
774 return 1;
775 }
776
777 if (unlikely(produced - consumed >= n_subbufs)) {
778 consumed = (produced / n_subbufs) * n_subbufs;
779 buf->subbufs_consumed = consumed;
780 }
781
782 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
783 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
784
785 if (consumed > produced)
786 produced += n_subbufs * subbuf_size;
787
788 if (consumed == produced)
789 return 0;
790
791 return 1;
792}
793
794/**
795 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
796 */
797static size_t relay_file_read_subbuf_avail(size_t read_pos,
798 struct rchan_buf *buf)
799{
800 size_t padding, avail = 0;
801 size_t read_subbuf, read_offset, write_subbuf, write_offset;
802 size_t subbuf_size = buf->chan->subbuf_size;
803
804 write_subbuf = (buf->data - buf->start) / subbuf_size;
805 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
806 read_subbuf = read_pos / subbuf_size;
807 read_offset = read_pos % subbuf_size;
808 padding = buf->padding[read_subbuf];
809
810 if (read_subbuf == write_subbuf) {
811 if (read_offset + padding < write_offset)
812 avail = write_offset - (read_offset + padding);
813 } else
814 avail = (subbuf_size - padding) - read_offset;
815
816 return avail;
817}
818
819/**
820 * relay_file_read_start_pos - find the first available byte to read
821 *
822 * If the read_pos is in the middle of padding, return the
823 * position of the first actually available byte, otherwise
824 * return the original value.
825 */
826static size_t relay_file_read_start_pos(size_t read_pos,
827 struct rchan_buf *buf)
828{
829 size_t read_subbuf, padding, padding_start, padding_end;
830 size_t subbuf_size = buf->chan->subbuf_size;
831 size_t n_subbufs = buf->chan->n_subbufs;
832
833 read_subbuf = read_pos / subbuf_size;
834 padding = buf->padding[read_subbuf];
835 padding_start = (read_subbuf + 1) * subbuf_size - padding;
836 padding_end = (read_subbuf + 1) * subbuf_size;
837 if (read_pos >= padding_start && read_pos < padding_end) {
838 read_subbuf = (read_subbuf + 1) % n_subbufs;
839 read_pos = read_subbuf * subbuf_size;
840 }
841
842 return read_pos;
843}
844
845/**
846 * relay_file_read_end_pos - return the new read position
847 */
848static size_t relay_file_read_end_pos(struct rchan_buf *buf,
849 size_t read_pos,
850 size_t count)
851{
852 size_t read_subbuf, padding, end_pos;
853 size_t subbuf_size = buf->chan->subbuf_size;
854 size_t n_subbufs = buf->chan->n_subbufs;
855
856 read_subbuf = read_pos / subbuf_size;
857 padding = buf->padding[read_subbuf];
858 if (read_pos % subbuf_size + count + padding == subbuf_size)
859 end_pos = (read_subbuf + 1) * subbuf_size;
860 else
861 end_pos = read_pos + count;
862 if (end_pos >= subbuf_size * n_subbufs)
863 end_pos = 0;
864
865 return end_pos;
866}
867
868/**
869 * subbuf_read_actor - read up to one subbuf's worth of data
870 */
871static int subbuf_read_actor(size_t read_start,
872 struct rchan_buf *buf,
873 size_t avail,
874 read_descriptor_t *desc,
875 read_actor_t actor)
876{
877 void *from;
878 int ret = 0;
879
880 from = buf->start + read_start;
881 ret = avail;
882 if (copy_to_user(desc->arg.data, from, avail)) {
883 desc->error = -EFAULT;
884 ret = 0;
885 }
886 desc->arg.data += ret;
887 desc->written += ret;
888 desc->count -= ret;
889
890 return ret;
891}
892
893/**
894 * subbuf_send_actor - send up to one subbuf's worth of data
895 */
896static int subbuf_send_actor(size_t read_start,
897 struct rchan_buf *buf,
898 size_t avail,
899 read_descriptor_t *desc,
900 read_actor_t actor)
901{
902 unsigned long pidx, poff;
903 unsigned int subbuf_pages;
904 int ret = 0;
905
906 subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
907 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
908 poff = read_start & ~PAGE_MASK;
909 while (avail) {
910 struct page *p = buf->page_array[pidx];
911 unsigned int len;
912
913 len = PAGE_SIZE - poff;
914 if (len > avail)
915 len = avail;
916
917 len = actor(desc, p, poff, len);
918 if (desc->error)
919 break;
920
921 avail -= len;
922 ret += len;
923 poff = 0;
924 pidx = (pidx + 1) % subbuf_pages;
925 }
926
927 return ret;
928}
929
930typedef int (*subbuf_actor_t) (size_t read_start,
931 struct rchan_buf *buf,
932 size_t avail,
933 read_descriptor_t *desc,
934 read_actor_t actor);
935
936/**
937 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
938 */
939static inline ssize_t relay_file_read_subbufs(struct file *filp,
940 loff_t *ppos,
941 size_t count,
942 subbuf_actor_t subbuf_actor,
943 read_actor_t actor,
944 void *target)
945{
946 struct rchan_buf *buf = filp->private_data;
947 size_t read_start, avail;
948 read_descriptor_t desc;
949 int ret;
950
951 if (!count)
952 return 0;
953
954 desc.written = 0;
955 desc.count = count;
956 desc.arg.data = target;
957 desc.error = 0;
958
959 mutex_lock(&filp->f_dentry->d_inode->i_mutex);
960 do {
961 if (!relay_file_read_avail(buf, *ppos))
962 break;
963
964 read_start = relay_file_read_start_pos(*ppos, buf);
965 avail = relay_file_read_subbuf_avail(read_start, buf);
966 if (!avail)
967 break;
968
969 avail = min(desc.count, avail);
970 ret = subbuf_actor(read_start, buf, avail, &desc, actor);
971 if (desc.error < 0)
972 break;
973
974 if (ret) {
975 relay_file_read_consume(buf, read_start, ret);
976 *ppos = relay_file_read_end_pos(buf, read_start, ret);
977 }
978 } while (desc.count && ret);
979 mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
980
981 return desc.written;
982}
983
984static ssize_t relay_file_read(struct file *filp,
985 char __user *buffer,
986 size_t count,
987 loff_t *ppos)
988{
989 return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
990 NULL, buffer);
991}
992
993static ssize_t relay_file_sendfile(struct file *filp,
994 loff_t *ppos,
995 size_t count,
996 read_actor_t actor,
997 void *target)
998{
999 return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
1000 actor, target);
1001}
1002
1003struct file_operations relay_file_operations = {
1004 .open = relay_file_open,
1005 .poll = relay_file_poll,
1006 .mmap = relay_file_mmap,
1007 .read = relay_file_read,
1008 .llseek = no_llseek,
1009 .release = relay_file_release,
1010 .sendfile = relay_file_sendfile,
1011};
1012EXPORT_SYMBOL_GPL(relay_file_operations);
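The defaults above leave the subbuf_start policy to the client. A hedged sketch of a client callback that keeps the no-overwrite behavior and also reserves a per-sub-buffer padding header, written against this file's API (the header layout is hypothetical, not something relay mandates):

/* hypothetical client callback, pairing relay_buf_full() with
 * subbuf_start_reserve() from include/linux/relay.h */
static int example_subbuf_start(struct rchan_buf *buf,
				void *subbuf,
				void *prev_subbuf,
				size_t prev_padding)
{
	/* finalize the previous sub-buffer: record its padding count */
	if (prev_subbuf)
		*(size_t *)prev_subbuf = prev_padding;

	/* no-overwrite mode: refuse the switch while every sub-buffer
	 * is produced but not yet consumed */
	if (relay_buf_full(buf))
		return 0;

	/* leave room at the head of the new sub-buffer for the count */
	subbuf_start_reserve(buf, sizeof(size_t));
	return 1;
}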
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa7..d0ea1eec6a9a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/blktrace_api.h>
29#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
30 31
31static mempool_t *page_pool, *isa_page_pool; 32static mempool_t *page_pool, *isa_page_pool;
@@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
483 pool = isa_page_pool; 484 pool = isa_page_pool;
484 } 485 }
485 486
487 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
488
486 /* 489 /*
487 * slow path 490 * slow path
488 */ 491 */