aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArnaldo Carvalho de Melo <acme@redhat.com>2009-01-23 09:06:27 -0500
committerIngo Molnar <mingo@elte.hu>2009-01-26 08:40:53 -0500
commitc71a896154119f4ca9e89d6078f5f63ad60ef199 (patch)
tree0018ea80af12d6385fd0aa5295dddefdb284eebe
parent9011262a37cb438f0fa9394b5e83840db8f9680a (diff)
blktrace: add ftrace plugin
Impact: New way of using the blktrace infrastructure This drops the requirement of userspace utilities to use the blktrace facility. Configuration is done through sysfs, adding a "trace" directory to the partition directory where blktrace can be enabled for the associated request_queue. The same filters present in the IOCTL interface are present as sysfs device attributes. The /sys/block/sdX/sdXN/trace/enable file allows tracing without any filters. The other files in this directory: pid, act_mask, start_lba and end_lba can be used with the same meaning as with the IOCTL interface. Using the sysfs interface will only set up the request_queue->blk_trace fields, tracing will only take place when the "blk" tracer is selected via the ftrace interface, as in the following example: To see the trace, one can use the /d/tracing/trace file or the /d/tracing/trace_pipe file, with semantics defined in the ftrace documentation in Documentation/ftrace.txt. [root@f10-1 ~]# cat /t/trace kjournald-305 [000] 3046.491224: 8,1 A WBS 6367 + 8 <- (8,1) 6304 kjournald-305 [000] 3046.491227: 8,1 Q R 6367 + 8 [kjournald] kjournald-305 [000] 3046.491236: 8,1 G RB 6367 + 8 [kjournald] kjournald-305 [000] 3046.491239: 8,1 P NS [kjournald] kjournald-305 [000] 3046.491242: 8,1 I RBS 6367 + 8 [kjournald] kjournald-305 [000] 3046.491251: 8,1 D WB 6367 + 8 [kjournald] kjournald-305 [000] 3046.491610: 8,1 U WS [kjournald] 1 <idle>-0 [000] 3046.511914: 8,1 C RS 6367 + 8 [6367] [root@f10-1 ~]# The default line context (prefix) format is the one described in the ftrace documentation, with the blktrace specific bits using its existing format, described in blkparse(8). 
If one wants to have the classic blktrace formatting, this is possible by using: [root@f10-1 ~]# echo blk_classic > /t/trace_options [root@f10-1 ~]# cat /t/trace 8,1 0 3046.491224 305 A WBS 6367 + 8 <- (8,1) 6304 8,1 0 3046.491227 305 Q R 6367 + 8 [kjournald] 8,1 0 3046.491236 305 G RB 6367 + 8 [kjournald] 8,1 0 3046.491239 305 P NS [kjournald] 8,1 0 3046.491242 305 I RBS 6367 + 8 [kjournald] 8,1 0 3046.491251 305 D WB 6367 + 8 [kjournald] 8,1 0 3046.491610 305 U WS [kjournald] 1 8,1 0 3046.511914 0 C RS 6367 + 8 [6367] [root@f10-1 ~]# Using the ftrace standard format allows more flexibility, such as the ability of asking for backtraces via trace_options: [root@f10-1 ~]# echo noblk_classic > /t/trace_options [root@f10-1 ~]# echo stacktrace > /t/trace_options [root@f10-1 ~]# cat /t/trace kjournald-305 [000] 3318.826779: 8,1 A WBS 6375 + 8 <- (8,1) 6312 kjournald-305 [000] 3318.826782: <= submit_bio <= submit_bh <= sync_dirty_buffer <= journal_commit_transaction <= kjournald <= kthread <= child_rip kjournald-305 [000] 3318.826836: 8,1 Q R 6375 + 8 [kjournald] kjournald-305 [000] 3318.826837: <= generic_make_request <= submit_bio <= submit_bh <= sync_dirty_buffer <= journal_commit_transaction <= kjournald <= kthread Please read the ftrace documentation to use additional, standardized tracing filters such as /d/tracing/trace_cpumask, etc. See also /d/tracing/trace_mark to add comments in the trace stream, that is equivalent to the /d/block/sdaN/msg interface. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--block/blktrace.c651
-rw-r--r--fs/partitions/check.c7
-rw-r--r--kernel/trace/trace.h1
3 files changed, 654 insertions, 5 deletions
diff --git a/block/blktrace.c b/block/blktrace.c
index b0a2cae886db..630f167f8240 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -25,9 +25,27 @@
25#include <linux/time.h> 25#include <linux/time.h>
26#include <trace/block.h> 26#include <trace/block.h>
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28#include <../kernel/trace/trace_output.h>
28 29
29static unsigned int blktrace_seq __read_mostly = 1; 30static unsigned int blktrace_seq __read_mostly = 1;
30 31
32static struct trace_array *blk_tr;
33static int __read_mostly blk_tracer_enabled;
34
35/* Select an alternative, minimalistic output than the original one */
36#define TRACE_BLK_OPT_CLASSIC 0x1
37
/*
 * Option list for the "blk" tracer.  Its single named entry ties the
 * "blk_classic" option to the TRACE_BLK_OPT_CLASSIC bit; the list is
 * referenced from blk_tracer_flags.opts.  The trailing empty entry
 * presumably terminates the list -- confirm against the tracer core.
 */
38static struct tracer_opt blk_tracer_opts[] = {
39 /* Default disable the minimalistic output */
40 { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC ) },
41 { }
42};
43
/*
 * Current option state of the "blk" tracer.  .val starts at 0, i.e. with
 * TRACE_BLK_OPT_CLASSIC cleared; the print routines in this file test
 * .val to decide between the classic and the ftrace line format.
 */
44static struct tracer_flags blk_tracer_flags = {
45 .val = 0,
46 .opts = blk_tracer_opts,
47};
48
31/* Global reference count of probes */ 49/* Global reference count of probes */
32static DEFINE_MUTEX(blk_probe_mutex); 50static DEFINE_MUTEX(blk_probe_mutex);
33static atomic_t blk_probes_ref = ATOMIC_INIT(0); 51static atomic_t blk_probes_ref = ATOMIC_INIT(0);
@@ -43,6 +61,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
43{ 61{
44 struct blk_io_trace *t; 62 struct blk_io_trace *t;
45 63
64 if (!bt->rchan)
65 return;
66
46 t = relay_reserve(bt->rchan, sizeof(*t) + len); 67 t = relay_reserve(bt->rchan, sizeof(*t) + len);
47 if (t) { 68 if (t) {
48 const int cpu = smp_processor_id(); 69 const int cpu = smp_processor_id();
@@ -90,6 +111,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
90 unsigned long flags; 111 unsigned long flags;
91 char *buf; 112 char *buf;
92 113
114 if (blk_tr) {
115 va_start(args, fmt);
116 ftrace_vprintk(fmt, args);
117 va_end(args);
118 return;
119 }
120
121 if (!bt->msg_data)
122 return;
123
93 local_irq_save(flags); 124 local_irq_save(flags);
94 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 125 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
95 va_start(args, fmt); 126 va_start(args, fmt);
@@ -131,13 +162,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
131 int rw, u32 what, int error, int pdu_len, void *pdu_data) 162 int rw, u32 what, int error, int pdu_len, void *pdu_data)
132{ 163{
133 struct task_struct *tsk = current; 164 struct task_struct *tsk = current;
165 struct ring_buffer_event *event = NULL;
134 struct blk_io_trace *t; 166 struct blk_io_trace *t;
135 unsigned long flags; 167 unsigned long flags;
136 unsigned long *sequence; 168 unsigned long *sequence;
137 pid_t pid; 169 pid_t pid;
138 int cpu; 170 int cpu, pc = 0;
139 171
140 if (unlikely(bt->trace_state != Blktrace_running)) 172 if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled))
141 return; 173 return;
142 174
143 what |= ddir_act[rw & WRITE]; 175 what |= ddir_act[rw & WRITE];
@@ -150,6 +182,24 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
150 pid = tsk->pid; 182 pid = tsk->pid;
151 if (unlikely(act_log_check(bt, what, sector, pid))) 183 if (unlikely(act_log_check(bt, what, sector, pid)))
152 return; 184 return;
185 cpu = raw_smp_processor_id();
186
187 if (blk_tr) {
188 struct trace_entry *ent;
189 tracing_record_cmdline(current);
190
191 event = ring_buffer_lock_reserve(blk_tr->buffer,
192 sizeof(*t) + pdu_len, &flags);
193 if (!event)
194 return;
195
196 ent = ring_buffer_event_data(event);
197 t = (struct blk_io_trace *)ent;
198 pc = preempt_count();
199 tracing_generic_entry_update(ent, 0, pc);
200 ent->type = TRACE_BLK;
201 goto record_it;
202 }
153 203
154 /* 204 /*
155 * A word about the locking here - we disable interrupts to reserve 205 * A word about the locking here - we disable interrupts to reserve
@@ -163,23 +213,33 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
163 213
164 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); 214 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
165 if (t) { 215 if (t) {
166 cpu = smp_processor_id();
167 sequence = per_cpu_ptr(bt->sequence, cpu); 216 sequence = per_cpu_ptr(bt->sequence, cpu);
168 217
169 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; 218 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
170 t->sequence = ++(*sequence); 219 t->sequence = ++(*sequence);
171 t->time = ktime_to_ns(ktime_get()); 220 t->time = ktime_to_ns(ktime_get());
221 t->cpu = cpu;
222 t->pid = pid;
223record_it:
172 t->sector = sector; 224 t->sector = sector;
173 t->bytes = bytes; 225 t->bytes = bytes;
174 t->action = what; 226 t->action = what;
175 t->pid = pid;
176 t->device = bt->dev; 227 t->device = bt->dev;
177 t->cpu = cpu;
178 t->error = error; 228 t->error = error;
179 t->pdu_len = pdu_len; 229 t->pdu_len = pdu_len;
180 230
181 if (pdu_len) 231 if (pdu_len)
182 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 232 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
233
234 if (blk_tr) {
235 ring_buffer_unlock_commit(blk_tr->buffer, event, flags);
236 if (pid != 0 &&
237 (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) == 0 &&
238 (trace_flags & TRACE_ITER_STACKTRACE) != 0)
239 __trace_stack(blk_tr, NULL, flags, 5, pc);
240 trace_wake_up();
241 return;
242 }
183 } 243 }
184 244
185 local_irq_restore(flags); 245 local_irq_restore(flags);
@@ -888,3 +948,584 @@ static void blk_unregister_tracepoints(void)
888 948
889 tracepoint_synchronize_unregister(); 949 tracepoint_synchronize_unregister();
890} 950}
951
952/*
953 * struct blk_io_tracer formatting routines
954 */
955
956static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
957{
958 int i = 0;
959
960 if (t->action & BLK_TC_DISCARD) rwbs[i++] = 'D';
961 else if (t->action & BLK_TC_WRITE) rwbs[i++] = 'W';
962 else if (t->bytes) rwbs[i++] = 'R';
963 else rwbs[i++] = 'N';
964
965 if (t->action & BLK_TC_AHEAD) rwbs[i++] = 'A';
966 if (t->action & BLK_TC_BARRIER) rwbs[i++] = 'B';
967 if (t->action & BLK_TC_SYNC) rwbs[i++] = 'S';
968 if (t->action & BLK_TC_META) rwbs[i++] = 'M';
969
970 rwbs[i] = '\0';
971}
972
/* View a generic trace entry as the blk_io_trace record stored in it. */
static inline
const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
{
	const void *raw = ent;

	return raw;
}
978
979static inline const void *pdu_start(const struct trace_entry *ent)
980{
981 return te_blk_io_trace(ent) + 1;
982}
983
984static inline u32 t_sec(const struct trace_entry *ent)
985{
986 return te_blk_io_trace(ent)->bytes >> 9;
987}
988
989static inline unsigned long long t_sector(const struct trace_entry *ent)
990{
991 return te_blk_io_trace(ent)->sector;
992}
993
994static inline __u16 t_error(const struct trace_entry *ent)
995{
996 return te_blk_io_trace(ent)->sector;
997}
998
999static __u64 get_pdu_int(const struct trace_entry *ent)
1000{
1001 const __u64 *val = pdu_start(ent);
1002 return be64_to_cpu(*val);
1003}
1004
1005static void get_pdu_remap(const struct trace_entry *ent,
1006 struct blk_io_trace_remap *r)
1007{
1008 const struct blk_io_trace_remap *__r = pdu_start(ent);
1009 __u64 sector = __r->sector;
1010
1011 r->device = be32_to_cpu(__r->device);
1012 r->device_from = be32_to_cpu(__r->device_from);
1013 r->sector = be64_to_cpu(sector);
1014}
1015
1016static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
1017{
1018 char rwbs[6];
1019 unsigned long long ts = ns2usecs(iter->ts);
1020 unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
1021 unsigned secs = (unsigned long)ts;
1022 const struct trace_entry *ent = iter->ent;
1023 const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
1024
1025 fill_rwbs(rwbs, t);
1026
1027 return trace_seq_printf(&iter->seq,
1028 "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
1029 MAJOR(t->device), MINOR(t->device), iter->cpu,
1030 secs, usec_rem, ent->pid, act, rwbs);
1031}
1032
1033static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
1034 const char *act)
1035{
1036 char rwbs[6];
1037 fill_rwbs(rwbs, t);
1038 return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
1039 MAJOR(t->device), MINOR(t->device), act, rwbs);
1040}
1041
1042static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1043{
1044 const char *cmd = trace_find_cmdline(ent->pid);
1045
1046 if (t_sec(ent))
1047 return trace_seq_printf(s, "%llu + %u [%s]\n",
1048 t_sector(ent), t_sec(ent), cmd);
1049 return trace_seq_printf(s, "[%s]\n", cmd);
1050}
1051
/*
 * Line tail for actions that report a status: "sector + count [error]",
 * or "sector [error]" when the record carries no data.
 */
static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent)
{
	if (!t_sec(ent))
		return trace_seq_printf(s, "%llu [%d]\n",
					t_sector(ent), t_error(ent));

	return trace_seq_printf(s, "%llu + %u [%d]\n",
				t_sector(ent), t_sec(ent), t_error(ent));
}
1059
1060static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1061{
1062 struct blk_io_trace_remap r = { .device = 0, };
1063
1064 get_pdu_remap(ent, &r);
1065 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1066 t_sector(ent),
1067 t_sec(ent), MAJOR(r.device), MINOR(r.device),
1068 (unsigned long long)r.sector);
1069}
1070
1071static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1072{
1073 return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid));
1074}
1075
1076static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1077{
1078 return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid),
1079 get_pdu_int(ent));
1080}
1081
1082static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1083{
1084 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1085 get_pdu_int(ent), trace_find_cmdline(ent->pid));
1086}
1087
1088/*
1089 * struct tracer operations
1090 */
1091
1092static void blk_tracer_print_header(struct seq_file *m)
1093{
1094 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1095 return;
1096 seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
1097 "# | | | | | |\n");
1098}
1099
1100static void blk_tracer_start(struct trace_array *tr)
1101{
1102 int cpu;
1103
1104 tr->time_start = ftrace_now(tr->cpu);
1105
1106 for_each_online_cpu(cpu)
1107 tracing_reset(tr, cpu);
1108
1109 mutex_lock(&blk_probe_mutex);
1110 if (atomic_add_return(1, &blk_probes_ref) == 1)
1111 if (blk_register_tracepoints())
1112 atomic_dec(&blk_probes_ref);
1113 mutex_unlock(&blk_probe_mutex);
1114}
1115
1116static int blk_tracer_init(struct trace_array *tr)
1117{
1118 blk_tr = tr;
1119 blk_tracer_start(tr);
1120 mutex_lock(&blk_probe_mutex);
1121 blk_tracer_enabled++;
1122 mutex_unlock(&blk_probe_mutex);
1123 return 0;
1124}
1125
1126static void blk_tracer_stop(struct trace_array *tr)
1127{
1128 mutex_lock(&blk_probe_mutex);
1129 if (atomic_dec_and_test(&blk_probes_ref))
1130 blk_unregister_tracepoints();
1131 mutex_unlock(&blk_probe_mutex);
1132}
1133
1134static void blk_tracer_reset(struct trace_array *tr)
1135{
1136 if (!atomic_read(&blk_probes_ref))
1137 return;
1138
1139 mutex_lock(&blk_probe_mutex);
1140 blk_tracer_enabled--;
1141 WARN_ON(blk_tracer_enabled < 0);
1142 mutex_unlock(&blk_probe_mutex);
1143
1144 blk_tracer_stop(tr);
1145}
1146
/*
 * Per-action output table, indexed by the __BLK_TA_* action number that
 * the print routines extract from the low BLK_TC_SHIFT bits of
 * blk_io_trace.action.
 *
 *  act[0] - short action name
 *  act[1] - long action name, selected when TRACE_ITER_VERBOSE is set
 *  print  - formats the action-specific tail of the line
 *
 * Index 0 has no initializer here; the users of this table reject a
 * zero action before indexing it.
 */
1147static struct {
1148 const char *act[2];
1149 int (*print)(struct trace_seq *s, const struct trace_entry *ent);
1150} what2act[] __read_mostly = {
1151 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1152 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
1153 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
1154 [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
1155 [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
1156 [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
1157 [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
1158 [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
1159 [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
1160 [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
1161 [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
1162 [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
1163 [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
1164 [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
1165 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
1166};
1167
1168static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent,
1169 int flags)
1170{
1171 const struct blk_io_trace *t = (struct blk_io_trace *)ent;
1172 const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1173 int ret;
1174
1175 if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
1176 ret = trace_seq_printf(s, "Bad pc action %x\n", what);
1177 else {
1178 const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1179 ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
1180 if (ret)
1181 ret = what2act[what].print(s, ent);
1182 }
1183
1184 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1185}
1186
1187static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1188{
1189 const struct blk_io_trace *t;
1190 u16 what;
1191 int ret;
1192
1193 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1194 return TRACE_TYPE_UNHANDLED;
1195
1196 t = (const struct blk_io_trace *)iter->ent;
1197 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1198
1199 if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
1200 ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
1201 else {
1202 const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1203 ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
1204 if (ret)
1205 ret = what2act[what].print(&iter->seq, iter->ent);
1206 }
1207
1208 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1209}
1210
/*
 * The "blk" tracer as handed to register_tracer() in init_blk_tracer():
 * wires the init/reset/start/stop and print hooks defined in this file
 * and exposes the blk_classic option through blk_tracer_flags.
 */
1211static struct tracer blk_tracer __read_mostly = {
1212 .name = "blk",
1213 .init = blk_tracer_init,
1214 .reset = blk_tracer_reset,
1215 .start = blk_tracer_start,
1216 .stop = blk_tracer_stop,
1217 .print_header = blk_tracer_print_header,
1218 .print_line = blk_tracer_print_line,
1219 .flags = &blk_tracer_flags,
1220};
1221
/*
 * Output callbacks for TRACE_BLK entries, registered through
 * register_ftrace_event() in init_blk_tracer().  The normal and latency
 * formats share blk_trace_event_print(); the raw, hex and binary formats
 * are routed to trace_nop_print.
 */
1222static struct trace_event trace_blk_event = {
1223 .type = TRACE_BLK,
1224 .trace = blk_trace_event_print,
1225 .latency_trace = blk_trace_event_print,
1226 .raw = trace_nop_print,
1227 .hex = trace_nop_print,
1228 .binary = trace_nop_print,
1229};
1230
1231static int __init init_blk_tracer(void)
1232{
1233 if (!register_ftrace_event(&trace_blk_event)) {
1234 pr_warning("Warning: could not register block events\n");
1235 return 1;
1236 }
1237
1238 if (register_tracer(&blk_tracer) != 0) {
1239 pr_warning("Warning: could not register the block tracer\n");
1240 unregister_ftrace_event(&trace_blk_event);
1241 return 1;
1242 }
1243
1244 return 0;
1245}
1246
1247device_initcall(init_blk_tracer);
1248
1249static int blk_trace_remove_queue(struct request_queue *q)
1250{
1251 struct blk_trace *bt;
1252
1253 bt = xchg(&q->blk_trace, NULL);
1254 if (bt == NULL)
1255 return -EINVAL;
1256
1257 kfree(bt);
1258 return 0;
1259}
1260
1261/*
1262 * Setup everything required to start tracing
1263 */
1264static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1265{
1266 struct blk_trace *old_bt, *bt = NULL;
1267 int ret;
1268
1269 ret = -ENOMEM;
1270 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1271 if (!bt)
1272 goto err;
1273
1274 bt->dev = dev;
1275 bt->act_mask = (u16)-1;
1276 bt->end_lba = -1ULL;
1277 bt->trace_state = Blktrace_running;
1278
1279 old_bt = xchg(&q->blk_trace, bt);
1280 if (old_bt != NULL) {
1281 (void)xchg(&q->blk_trace, old_bt);
1282 kfree(bt);
1283 ret = -EBUSY;
1284 }
1285 return 0;
1286err:
1287 return ret;
1288}
1289
1290/*
1291 * sysfs interface to enable and configure tracing
1292 */
1293
1294static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
1295 struct device_attribute *attr,
1296 char *buf)
1297{
1298 struct hd_struct *p = dev_to_part(dev);
1299 struct block_device *bdev;
1300 ssize_t ret = -ENXIO;
1301
1302 lock_kernel();
1303 bdev = bdget(part_devt(p));
1304 if (bdev != NULL) {
1305 struct request_queue *q = bdev_get_queue(bdev);
1306
1307 if (q != NULL) {
1308 mutex_lock(&bdev->bd_mutex);
1309 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1310 mutex_unlock(&bdev->bd_mutex);
1311 }
1312
1313 bdput(bdev);
1314 }
1315
1316 unlock_kernel();
1317 return ret;
1318}
1319
1320static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
1321 struct device_attribute *attr,
1322 const char *buf, size_t count)
1323{
1324 struct block_device *bdev;
1325 struct request_queue *q;
1326 struct hd_struct *p;
1327 int value;
1328 ssize_t ret = -ENXIO;
1329
1330 if (count == 0 || sscanf(buf, "%d", &value) != 1)
1331 goto out;
1332
1333 lock_kernel();
1334 p = dev_to_part(dev);
1335 bdev = bdget(part_devt(p));
1336 if (bdev == NULL)
1337 goto out_unlock_kernel;
1338
1339 q = bdev_get_queue(bdev);
1340 if (q == NULL)
1341 goto out_bdput;
1342
1343 mutex_lock(&bdev->bd_mutex);
1344 if (value)
1345 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1346 else
1347 ret = blk_trace_remove_queue(q);
1348 mutex_unlock(&bdev->bd_mutex);
1349
1350 if (ret == 0)
1351 ret = count;
1352out_bdput:
1353 bdput(bdev);
1354out_unlock_kernel:
1355 unlock_kernel();
1356out:
1357 return ret;
1358}
1359
/*
 * The generic show/store handlers are declared ahead of their definitions
 * because their bodies compare @attr against the dev_attr_* objects that
 * the DEVICE_ATTR() uses below create.
 */
1360static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1361 struct device_attribute *attr,
1362 char *buf);
1363static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1364 struct device_attribute *attr,
1365 const char *buf, size_t count);
/* Declare a filter attribute served by the generic handlers above. */
1366#define BLK_TRACE_DEVICE_ATTR(_name) \
1367 DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1368 sysfs_blk_trace_attr_show, \
1369 sysfs_blk_trace_attr_store)
1370
/* "enable" has dedicated handlers; the four filters share the generic ones. */
1371static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
1372 sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
1373static BLK_TRACE_DEVICE_ATTR(act_mask);
1374static BLK_TRACE_DEVICE_ATTR(pid);
1375static BLK_TRACE_DEVICE_ATTR(start_lba);
1376static BLK_TRACE_DEVICE_ATTR(end_lba);
1377
/* NULL-terminated list of the attributes placed in blk_trace_attr_group. */
1378static struct attribute *blk_trace_attrs[] = {
1379 &dev_attr_enable.attr,
1380 &dev_attr_act_mask.attr,
1381 &dev_attr_pid.attr,
1382 &dev_attr_start_lba.attr,
1383 &dev_attr_end_lba.attr,
1384 NULL
1385};
1386
/*
 * Attribute group named "trace".  It is not static: fs/partitions/check.c
 * declares it extern and adds it to part_attr_groups[] when
 * CONFIG_BLK_DEV_IO_TRACE is set.
 */
1387struct attribute_group blk_trace_attr_group = {
1388 .name = "trace",
1389 .attrs = blk_trace_attrs,
1390};
1391
1392static int blk_str2act_mask(const char *str)
1393{
1394 int mask = 0;
1395 char *copy = kstrdup(str, GFP_KERNEL), *s;
1396
1397 if (copy == NULL)
1398 return -ENOMEM;
1399
1400 s = strstrip(copy);
1401
1402 while (1) {
1403 char *sep = strchr(s, ',');
1404
1405 if (sep != NULL)
1406 *sep = '\0';
1407
1408 if (strcasecmp(s, "barrier") == 0)
1409 mask |= BLK_TC_BARRIER;
1410 else if (strcasecmp(s, "complete") == 0)
1411 mask |= BLK_TC_COMPLETE;
1412 else if (strcasecmp(s, "fs") == 0)
1413 mask |= BLK_TC_FS;
1414 else if (strcasecmp(s, "issue") == 0)
1415 mask |= BLK_TC_ISSUE;
1416 else if (strcasecmp(s, "pc") == 0)
1417 mask |= BLK_TC_PC;
1418 else if (strcasecmp(s, "queue") == 0)
1419 mask |= BLK_TC_QUEUE;
1420 else if (strcasecmp(s, "read") == 0)
1421 mask |= BLK_TC_READ;
1422 else if (strcasecmp(s, "requeue") == 0)
1423 mask |= BLK_TC_REQUEUE;
1424 else if (strcasecmp(s, "sync") == 0)
1425 mask |= BLK_TC_SYNC;
1426 else if (strcasecmp(s, "write") == 0)
1427 mask |= BLK_TC_WRITE;
1428
1429 if (sep == NULL)
1430 break;
1431
1432 s = sep + 1;
1433 }
1434 kfree(copy);
1435
1436 return mask;
1437}
1438
1439static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1440 struct device_attribute *attr,
1441 char *buf)
1442{
1443 struct hd_struct *p = dev_to_part(dev);
1444 struct request_queue *q;
1445 struct block_device *bdev;
1446 ssize_t ret = -ENXIO;
1447
1448 lock_kernel();
1449 bdev = bdget(part_devt(p));
1450 if (bdev == NULL)
1451 goto out_unlock_kernel;
1452
1453 q = bdev_get_queue(bdev);
1454 if (q == NULL)
1455 goto out_bdput;
1456 mutex_lock(&bdev->bd_mutex);
1457 if (q->blk_trace == NULL)
1458 ret = sprintf(buf, "disabled\n");
1459 else if (attr == &dev_attr_act_mask)
1460 ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
1461 else if (attr == &dev_attr_pid)
1462 ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1463 else if (attr == &dev_attr_start_lba)
1464 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1465 else if (attr == &dev_attr_end_lba)
1466 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1467 mutex_unlock(&bdev->bd_mutex);
1468out_bdput:
1469 bdput(bdev);
1470out_unlock_kernel:
1471 unlock_kernel();
1472 return ret;
1473}
1474
1475static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1476 struct device_attribute *attr,
1477 const char *buf, size_t count)
1478{
1479 struct block_device *bdev;
1480 struct request_queue *q;
1481 struct hd_struct *p;
1482 u64 value;
1483 ssize_t ret = -ENXIO;
1484
1485 if (count == 0)
1486 goto out;
1487
1488 if (attr == &dev_attr_act_mask) {
1489 if (sscanf(buf, "%llx", &value) != 1) {
1490 /* Assume it is a list of trace category names */
1491 value = blk_str2act_mask(buf);
1492 if (value < 0)
1493 goto out;
1494 }
1495 } else if (sscanf(buf, "%llu", &value) != 1)
1496 goto out;
1497
1498 lock_kernel();
1499 p = dev_to_part(dev);
1500 bdev = bdget(part_devt(p));
1501 if (bdev == NULL)
1502 goto out_unlock_kernel;
1503
1504 q = bdev_get_queue(bdev);
1505 if (q == NULL)
1506 goto out_bdput;
1507
1508 mutex_lock(&bdev->bd_mutex);
1509 ret = 0;
1510 if (q->blk_trace == NULL)
1511 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1512
1513 if (ret == 0) {
1514 if (attr == &dev_attr_act_mask)
1515 q->blk_trace->act_mask = value;
1516 else if (attr == &dev_attr_pid)
1517 q->blk_trace->pid = value;
1518 else if (attr == &dev_attr_start_lba)
1519 q->blk_trace->start_lba = value;
1520 else if (attr == &dev_attr_end_lba)
1521 q->blk_trace->end_lba = value;
1522 ret = count;
1523 }
1524 mutex_unlock(&bdev->bd_mutex);
1525out_bdput:
1526 bdput(bdev);
1527out_unlock_kernel:
1528 unlock_kernel();
1529out:
1530 return ret;
1531}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d720243f5f4..01714efdc65a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -268,6 +268,10 @@ ssize_t part_fail_store(struct device *dev,
268} 268}
269#endif 269#endif
270 270
271#ifdef CONFIG_BLK_DEV_IO_TRACE
272extern struct attribute_group blk_trace_attr_group;
273#endif
274
271static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); 275static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
272static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 276static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
273static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 277static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
@@ -294,6 +298,9 @@ static struct attribute_group part_attr_group = {
294 298
295static struct attribute_group *part_attr_groups[] = { 299static struct attribute_group *part_attr_groups[] = {
296 &part_attr_group, 300 &part_attr_group,
301#ifdef CONFIG_BLK_DEV_IO_TRACE
302 &blk_trace_attr_group,
303#endif
297 NULL 304 NULL
298}; 305};
299 306
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b96037d970df..e603a291134b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -32,6 +32,7 @@ enum trace_type {
32 TRACE_KMEM_ALLOC, 32 TRACE_KMEM_ALLOC,
33 TRACE_KMEM_FREE, 33 TRACE_KMEM_FREE,
34 TRACE_POWER, 34 TRACE_POWER,
35 TRACE_BLK,
35 36
36 __TRACE_LAST_TYPE, 37 __TRACE_LAST_TYPE,
37}; 38};