diff options
author | Jeff Garzik <jeff@garzik.org> | 2006-08-29 17:20:55 -0400 |
---|---|---|
committer | Jeff Garzik <jeff@garzik.org> | 2006-08-29 17:20:55 -0400 |
commit | a422142cfdf90d889d8d3e2affb8311a381530b7 (patch) | |
tree | bde7e2c7a3ee8bca649aecd877a9ee1593f4223e /Documentation | |
parent | 6fc47e31c0e802d205d67e644f654532e5d365d5 (diff) | |
parent | 60d4684068ff1eec78f55b5888d0bd2d4cca1520 (diff) |
Merge branch 'master' into upstream
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/connector/ucon.c | 206 | ||||
-rw-r--r-- | Documentation/cpusets.txt | 6 | ||||
-rw-r--r-- | Documentation/filesystems/00-INDEX | 4 | ||||
-rw-r--r-- | Documentation/filesystems/relay.txt | 479 | ||||
-rw-r--r-- | Documentation/filesystems/relayfs.txt | 442 | ||||
-rw-r--r-- | Documentation/input/joystick.txt | 1 | ||||
-rw-r--r-- | Documentation/scsi/ChangeLog.megaraid | 123 | ||||
-rw-r--r-- | Documentation/sysctl/fs.txt | 20 | ||||
-rw-r--r-- | Documentation/sysctl/kernel.txt | 20 |
9 files changed, 836 insertions, 465 deletions
diff --git a/Documentation/connector/ucon.c b/Documentation/connector/ucon.c new file mode 100644 index 000000000000..d738cde2a8d5 --- /dev/null +++ b/Documentation/connector/ucon.c | |||
@@ -0,0 +1,206 @@ | |||
1 | /* | ||
2 | * ucon.c | ||
3 | * | ||
4 | * Copyright (c) 2004+ Evgeniy Polyakov <johnpol@2ka.mipt.ru> | ||
5 | * | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
20 | */ | ||
21 | |||
22 | #include <asm/types.h> | ||
23 | |||
24 | #include <sys/types.h> | ||
25 | #include <sys/socket.h> | ||
26 | #include <sys/poll.h> | ||
27 | |||
28 | #include <linux/netlink.h> | ||
29 | #include <linux/rtnetlink.h> | ||
30 | |||
31 | #include <arpa/inet.h> | ||
32 | |||
33 | #include <stdio.h> | ||
34 | #include <stdlib.h> | ||
35 | #include <unistd.h> | ||
36 | #include <string.h> | ||
37 | #include <errno.h> | ||
38 | #include <time.h> | ||
39 | |||
40 | #include <linux/connector.h> | ||
41 | |||
42 | #define DEBUG | ||
43 | #define NETLINK_CONNECTOR 11 | ||
44 | |||
45 | #ifdef DEBUG | ||
46 | #define ulog(f, a...) fprintf(stdout, f, ##a) | ||
47 | #else | ||
48 | #define ulog(f, a...) do {} while (0) | ||
49 | #endif | ||
50 | |||
51 | static int need_exit; | ||
52 | static __u32 seq; | ||
53 | |||
54 | static int netlink_send(int s, struct cn_msg *msg) | ||
55 | { | ||
56 | struct nlmsghdr *nlh; | ||
57 | unsigned int size; | ||
58 | int err; | ||
59 | char buf[128]; | ||
60 | struct cn_msg *m; | ||
61 | |||
62 | size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len); | ||
63 | |||
64 | nlh = (struct nlmsghdr *)buf; | ||
65 | nlh->nlmsg_seq = seq++; | ||
66 | nlh->nlmsg_pid = getpid(); | ||
67 | nlh->nlmsg_type = NLMSG_DONE; | ||
68 | nlh->nlmsg_len = NLMSG_LENGTH(size - sizeof(*nlh)); | ||
69 | nlh->nlmsg_flags = 0; | ||
70 | |||
71 | m = NLMSG_DATA(nlh); | ||
72 | #if 0 | ||
73 | ulog("%s: [%08x.%08x] len=%u, seq=%u, ack=%u.\n", | ||
74 | __func__, msg->id.idx, msg->id.val, msg->len, msg->seq, msg->ack); | ||
75 | #endif | ||
76 | memcpy(m, msg, sizeof(*m) + msg->len); | ||
77 | |||
78 | err = send(s, nlh, size, 0); | ||
79 | if (err == -1) | ||
80 | ulog("Failed to send: %s [%d].\n", | ||
81 | strerror(errno), errno); | ||
82 | |||
83 | return err; | ||
84 | } | ||
85 | |||
86 | int main(int argc, char *argv[]) | ||
87 | { | ||
88 | int s; | ||
89 | char buf[1024]; | ||
90 | int len; | ||
91 | struct nlmsghdr *reply; | ||
92 | struct sockaddr_nl l_local; | ||
93 | struct cn_msg *data; | ||
94 | FILE *out; | ||
95 | time_t tm; | ||
96 | struct pollfd pfd; | ||
97 | |||
98 | if (argc < 2) | ||
99 | out = stdout; | ||
100 | else { | ||
101 | out = fopen(argv[1], "a+"); | ||
102 | if (!out) { | ||
103 | ulog("Unable to open %s for writing: %s\n", | ||
104 | argv[1], strerror(errno)); | ||
105 | out = stdout; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | memset(buf, 0, sizeof(buf)); | ||
110 | |||
111 | s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); | ||
112 | if (s == -1) { | ||
113 | perror("socket"); | ||
114 | return -1; | ||
115 | } | ||
116 | |||
117 | l_local.nl_family = AF_NETLINK; | ||
118 | l_local.nl_groups = 0x123; /* bitmask of requested groups */ | ||
119 | l_local.nl_pid = 0; | ||
120 | |||
121 | if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { | ||
122 | perror("bind"); | ||
123 | close(s); | ||
124 | return -1; | ||
125 | } | ||
126 | |||
127 | #if 0 | ||
128 | { | ||
129 | int on = 0x57; /* Additional group number */ | ||
130 | setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); | ||
131 | } | ||
132 | #endif | ||
133 | if (0) { | ||
134 | int i, j; | ||
135 | |||
136 | memset(buf, 0, sizeof(buf)); | ||
137 | |||
138 | data = (struct cn_msg *)buf; | ||
139 | |||
140 | data->id.idx = 0x123; | ||
141 | data->id.val = 0x456; | ||
142 | data->seq = seq++; | ||
143 | data->ack = 0; | ||
144 | data->len = 0; | ||
145 | |||
146 | for (j=0; j<10; ++j) { | ||
147 | for (i=0; i<1000; ++i) { | ||
148 | len = netlink_send(s, data); | ||
149 | } | ||
150 | |||
151 | ulog("%d messages have been sent to %08x.%08x.\n", i, data->id.idx, data->id.val); | ||
152 | } | ||
153 | |||
154 | return 0; | ||
155 | } | ||
156 | |||
157 | |||
158 | pfd.fd = s; | ||
159 | |||
160 | while (!need_exit) { | ||
161 | pfd.events = POLLIN; | ||
162 | pfd.revents = 0; | ||
163 | switch (poll(&pfd, 1, -1)) { | ||
164 | case 0: | ||
165 | need_exit = 1; | ||
166 | break; | ||
167 | case -1: | ||
168 | if (errno != EINTR) { | ||
169 | need_exit = 1; | ||
170 | break; | ||
171 | } | ||
172 | continue; | ||
173 | } | ||
174 | if (need_exit) | ||
175 | break; | ||
176 | |||
177 | memset(buf, 0, sizeof(buf)); | ||
178 | len = recv(s, buf, sizeof(buf), 0); | ||
179 | if (len == -1) { | ||
180 | perror("recv buf"); | ||
181 | close(s); | ||
182 | return -1; | ||
183 | } | ||
184 | reply = (struct nlmsghdr *)buf; | ||
185 | |||
186 | switch (reply->nlmsg_type) { | ||
187 | case NLMSG_ERROR: | ||
188 | fprintf(out, "Error message received.\n"); | ||
189 | fflush(out); | ||
190 | break; | ||
191 | case NLMSG_DONE: | ||
192 | data = (struct cn_msg *)NLMSG_DATA(reply); | ||
193 | |||
194 | time(&tm); | ||
195 | fprintf(out, "%.24s : [%x.%x] [%08u.%08u].\n", | ||
196 | ctime(&tm), data->id.idx, data->id.val, data->seq, data->ack); | ||
197 | fflush(out); | ||
198 | break; | ||
199 | default: | ||
200 | break; | ||
201 | } | ||
202 | } | ||
203 | |||
204 | close(s); | ||
205 | return 0; | ||
206 | } | ||
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index 159e2a0c3e80..76b44290c154 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -217,6 +217,12 @@ exclusive cpuset. Also, the use of a Linux virtual file system (vfs) | |||
217 | to represent the cpuset hierarchy provides for a familiar permission | 217 | to represent the cpuset hierarchy provides for a familiar permission |
218 | and name space for cpusets, with a minimum of additional kernel code. | 218 | and name space for cpusets, with a minimum of additional kernel code. |
219 | 219 | ||
220 | The cpus file in the root (top_cpuset) cpuset is read-only. | ||
221 | It automatically tracks the value of cpu_online_map, using a CPU | ||
222 | hotplug notifier. If and when memory nodes can be hotplugged, | ||
223 | we expect to make the mems file in the root cpuset read-only | ||
224 | as well, and have it track the value of node_online_map. | ||
225 | |||
220 | 226 | ||
221 | 1.4 What are exclusive cpusets ? | 227 | 1.4 What are exclusive cpusets ? |
222 | -------------------------------- | 228 | -------------------------------- |
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 66fdc0744fe0..16dec61d7671 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX | |||
@@ -62,8 +62,8 @@ ramfs-rootfs-initramfs.txt | |||
62 | - info on the 'in memory' filesystems ramfs, rootfs and initramfs. | 62 | - info on the 'in memory' filesystems ramfs, rootfs and initramfs. |
63 | reiser4.txt | 63 | reiser4.txt |
64 | - info on the Reiser4 filesystem based on dancing tree algorithms. | 64 | - info on the Reiser4 filesystem based on dancing tree algorithms. |
65 | relayfs.txt | 65 | relay.txt |
66 | - info on relayfs, for efficient streaming from kernel to user space. | 66 | - info on relay, for efficient streaming from kernel to user space. |
67 | romfs.txt | 67 | romfs.txt |
68 | - description of the ROMFS filesystem. | 68 | - description of the ROMFS filesystem. |
69 | smbfs.txt | 69 | smbfs.txt |
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt new file mode 100644 index 000000000000..d6788dae0349 --- /dev/null +++ b/Documentation/filesystems/relay.txt | |||
@@ -0,0 +1,479 @@ | |||
1 | relay interface (formerly relayfs) | ||
2 | ================================== | ||
3 | |||
4 | The relay interface provides a means for kernel applications to | ||
5 | efficiently log and transfer large quantities of data from the kernel | ||
6 | to userspace via user-defined 'relay channels'. | ||
7 | |||
8 | A 'relay channel' is a kernel->user data relay mechanism implemented | ||
9 | as a set of per-cpu kernel buffers ('channel buffers'), each | ||
10 | represented as a regular file ('relay file') in user space. Kernel | ||
11 | clients write into the channel buffers using efficient write | ||
12 | functions; these automatically log into the current cpu's channel | ||
13 | buffer. User space applications mmap() or read() from the relay files | ||
14 | and retrieve the data as it becomes available. The relay files | ||
15 | themselves are files created in a host filesystem, e.g. debugfs, and | ||
16 | are associated with the channel buffers using the API described below. | ||
17 | |||
18 | The format of the data logged into the channel buffers is completely | ||
19 | up to the kernel client; the relay interface does however provide | ||
20 | hooks which allow kernel clients to impose some structure on the | ||
21 | buffer data. The relay interface doesn't implement any form of data | ||
22 | filtering - this also is left to the kernel client. The purpose is to | ||
23 | keep things as simple as possible. | ||
24 | |||
25 | This document provides an overview of the relay interface API. The | ||
26 | details of the function parameters are documented along with the | ||
27 | functions in the relay interface code - please see that for details. | ||
28 | |||
29 | Semantics | ||
30 | ========= | ||
31 | |||
32 | Each relay channel has one buffer per CPU, each buffer has one or more | ||
33 | sub-buffers. Messages are written to the first sub-buffer until it is | ||
34 | too full to contain a new message, in which case it is written to | ||
35 | the next (if available). Messages are never split across sub-buffers. | ||
36 | At this point, userspace can be notified so it empties the first | ||
37 | sub-buffer, while the kernel continues writing to the next. | ||
38 | |||
39 | When notified that a sub-buffer is full, the kernel knows how many | ||
40 | bytes of it are padding i.e. unused space occurring because a complete | ||
41 | message couldn't fit into a sub-buffer. Userspace can use this | ||
42 | knowledge to copy only valid data. | ||
43 | |||
44 | After copying it, userspace can notify the kernel that a sub-buffer | ||
45 | has been consumed. | ||
46 | |||
47 | A relay channel can operate in a mode where it will overwrite data not | ||
48 | yet collected by userspace, and not wait for it to be consumed. | ||
49 | |||
50 | The relay channel itself does not provide for communication of such | ||
51 | data between userspace and kernel, allowing the kernel side to remain | ||
52 | simple and not impose a single interface on userspace. It does | ||
53 | provide a set of examples and a separate helper though, described | ||
54 | below. | ||
55 | |||
56 | The read() interface both removes padding and internally consumes the | ||
57 | read sub-buffers; thus in cases where read(2) is being used to drain | ||
58 | the channel buffers, special-purpose communication between kernel and | ||
59 | user isn't necessary for basic operation. | ||
60 | |||
61 | One of the major goals of the relay interface is to provide a low | ||
62 | overhead mechanism for conveying kernel data to userspace. While the | ||
63 | read() interface is easy to use, it's not as efficient as the mmap() | ||
64 | approach; the example code attempts to make the tradeoff between the | ||
65 | two approaches as small as possible. | ||
66 | |||
67 | klog and relay-apps example code | ||
68 | ================================ | ||
69 | |||
70 | The relay interface itself is ready to use, but to make things easier, | ||
71 | a couple of simple utility functions and a set of examples are provided. | ||
72 | |||
73 | The relay-apps example tarball, available on the relay sourceforge | ||
74 | site, contains a set of self-contained examples, each consisting of a | ||
75 | pair of .c files containing boilerplate code for each of the user and | ||
76 | kernel sides of a relay application. When combined these two sets of | ||
77 | boilerplate code provide glue to easily stream data to disk, without | ||
78 | having to bother with mundane housekeeping chores. | ||
79 | |||
80 | The 'klog debugging functions' patch (klog.patch in the relay-apps | ||
81 | tarball) provides a couple of high-level logging functions to the | ||
82 | kernel which allow writing formatted text or raw data to a channel, | ||
83 | regardless of whether a channel to write into exists or not, or even | ||
84 | whether the relay interface is compiled into the kernel or not. These | ||
85 | functions allow you to put unconditional 'trace' statements anywhere | ||
86 | in the kernel or kernel modules; only when there is a 'klog handler' | ||
87 | registered will data actually be logged (see the klog and kleak | ||
88 | examples for details). | ||
89 | |||
90 | It is of course possible to use the relay interface from scratch, | ||
91 | i.e. without using any of the relay-apps example code or klog, but | ||
92 | you'll have to implement communication between userspace and kernel, | ||
93 | allowing both to convey the state of buffers (full, empty, amount of | ||
94 | padding). The read() interface both removes padding and internally | ||
95 | consumes the read sub-buffers; thus in cases where read(2) is being | ||
96 | used to drain the channel buffers, special-purpose communication | ||
97 | between kernel and user isn't necessary for basic operation. Things | ||
98 | such as buffer-full conditions would still need to be communicated via | ||
99 | some channel though. | ||
100 | |||
101 | klog and the relay-apps examples can be found in the relay-apps | ||
102 | tarball on http://relayfs.sourceforge.net | ||
103 | |||
104 | The relay interface user space API | ||
105 | ================================== | ||
106 | |||
107 | The relay interface implements basic file operations for user space | ||
108 | access to relay channel buffer data. Here are the file operations | ||
109 | that are available and some comments regarding their behavior: | ||
110 | |||
111 | open() enables user to open an _existing_ channel buffer. | ||
112 | |||
113 | mmap() results in channel buffer being mapped into the caller's | ||
114 | memory space. Note that you can't do a partial mmap - you | ||
115 | must map the entire file, which is NRBUF * SUBBUFSIZE. | ||
116 | |||
117 | read() read the contents of a channel buffer. The bytes read are | ||
118 | 'consumed' by the reader, i.e. they won't be available | ||
119 | again to subsequent reads. If the channel is being used | ||
120 | in no-overwrite mode (the default), it can be read at any | ||
121 | time even if there's an active kernel writer. If the | ||
122 | channel is being used in overwrite mode and there are | ||
123 | active channel writers, results may be unpredictable - | ||
124 | users should make sure that all logging to the channel has | ||
125 | ended before using read() with overwrite mode. Sub-buffer | ||
126 | padding is automatically removed and will not be seen by | ||
127 | the reader. | ||
128 | |||
129 | sendfile() transfer data from a channel buffer to an output file | ||
130 | descriptor. Sub-buffer padding is automatically removed | ||
131 | and will not be seen by the reader. | ||
132 | |||
133 | poll() POLLIN/POLLRDNORM/POLLERR supported. User applications are | ||
134 | notified when sub-buffer boundaries are crossed. | ||
135 | |||
136 | close() decrements the channel buffer's refcount. When the refcount | ||
137 | reaches 0, i.e. when no process or kernel client has the | ||
138 | buffer open, the channel buffer is freed. | ||
139 | |||
140 | In order for a user application to make use of relay files, the | ||
141 | host filesystem must be mounted. For example, | ||
142 | |||
143 | mount -t debugfs debugfs /debug | ||
144 | |||
145 | NOTE: the host filesystem doesn't need to be mounted for kernel | ||
146 | clients to create or use channels - it only needs to be | ||
147 | mounted when user space applications need access to the buffer | ||
148 | data. | ||
149 | |||
150 | |||
151 | The relay interface kernel API | ||
152 | ============================== | ||
153 | |||
154 | Here's a summary of the API the relay interface provides to in-kernel clients: | ||
155 | |||
156 | TBD(curr. line MT:/API/) | ||
157 | channel management functions: | ||
158 | |||
159 | relay_open(base_filename, parent, subbuf_size, n_subbufs, | ||
160 | callbacks) | ||
161 | relay_close(chan) | ||
162 | relay_flush(chan) | ||
163 | relay_reset(chan) | ||
164 | |||
165 | channel management typically called on instigation of userspace: | ||
166 | |||
167 | relay_subbufs_consumed(chan, cpu, subbufs_consumed) | ||
168 | |||
169 | write functions: | ||
170 | |||
171 | relay_write(chan, data, length) | ||
172 | __relay_write(chan, data, length) | ||
173 | relay_reserve(chan, length) | ||
174 | |||
175 | callbacks: | ||
176 | |||
177 | subbuf_start(buf, subbuf, prev_subbuf, prev_padding) | ||
178 | buf_mapped(buf, filp) | ||
179 | buf_unmapped(buf, filp) | ||
180 | create_buf_file(filename, parent, mode, buf, is_global) | ||
181 | remove_buf_file(dentry) | ||
182 | |||
183 | helper functions: | ||
184 | |||
185 | relay_buf_full(buf) | ||
186 | subbuf_start_reserve(buf, length) | ||
187 | |||
188 | |||
189 | Creating a channel | ||
190 | ------------------ | ||
191 | |||
192 | relay_open() is used to create a channel, along with its per-cpu | ||
193 | channel buffers. Each channel buffer will have an associated file | ||
194 | created for it in the host filesystem, which can be mmapped or | ||
195 | read from in user space. The files are named basename0...basenameN-1 | ||
196 | where N is the number of online cpus, and by default will be created | ||
197 | in the root of the filesystem (if the parent param is NULL). If you | ||
198 | want a directory structure to contain your relay files, you should | ||
199 | create it using the host filesystem's directory creation function, | ||
200 | e.g. debugfs_create_dir(), and pass the parent directory to | ||
201 | relay_open(). Users are responsible for cleaning up any directory | ||
202 | structure they create, when the channel is closed - again the host | ||
203 | filesystem's directory removal functions should be used for that, | ||
204 | e.g. debugfs_remove(). | ||
205 | |||
206 | In order for a channel to be created and the host filesystem's files | ||
207 | associated with its channel buffers, the user must provide definitions | ||
208 | for two callback functions, create_buf_file() and remove_buf_file(). | ||
209 | create_buf_file() is called once for each per-cpu buffer from | ||
210 | relay_open() and allows the user to create the file which will be used | ||
211 | to represent the corresponding channel buffer. The callback should | ||
212 | return the dentry of the file created to represent the channel buffer. | ||
213 | remove_buf_file() must also be defined; it's responsible for deleting | ||
214 | the file(s) created in create_buf_file() and is called during | ||
215 | relay_close(). | ||
216 | |||
217 | Here are some typical definitions for these callbacks, in this case | ||
218 | using debugfs: | ||
219 | |||
220 | /* | ||
221 | * create_buf_file() callback. Creates relay file in debugfs. | ||
222 | */ | ||
223 | static struct dentry *create_buf_file_handler(const char *filename, | ||
224 | struct dentry *parent, | ||
225 | int mode, | ||
226 | struct rchan_buf *buf, | ||
227 | int *is_global) | ||
228 | { | ||
229 | return debugfs_create_file(filename, mode, parent, buf, | ||
230 | &relay_file_operations); | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * remove_buf_file() callback. Removes relay file from debugfs. | ||
235 | */ | ||
236 | static int remove_buf_file_handler(struct dentry *dentry) | ||
237 | { | ||
238 | debugfs_remove(dentry); | ||
239 | |||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * relay interface callbacks | ||
245 | */ | ||
246 | static struct rchan_callbacks relay_callbacks = | ||
247 | { | ||
248 | .create_buf_file = create_buf_file_handler, | ||
249 | .remove_buf_file = remove_buf_file_handler, | ||
250 | }; | ||
251 | |||
252 | And an example relay_open() invocation using them: | ||
253 | |||
254 | chan = relay_open("cpu", NULL, SUBBUF_SIZE, N_SUBBUFS, &relay_callbacks); | ||
255 | |||
256 | If the create_buf_file() callback fails, or isn't defined, channel | ||
257 | creation and thus relay_open() will fail. | ||
258 | |||
259 | The total size of each per-cpu buffer is calculated by multiplying the | ||
260 | number of sub-buffers by the sub-buffer size passed into relay_open(). | ||
261 | The idea behind sub-buffers is that they're basically an extension of | ||
262 | double-buffering to N buffers, and they also allow applications to | ||
263 | easily implement random-access-on-buffer-boundary schemes, which can | ||
264 | be important for some high-volume applications. The number and size | ||
265 | of sub-buffers is completely dependent on the application and even for | ||
266 | the same application, different conditions will warrant different | ||
267 | values for these parameters at different times. Typically, the right | ||
268 | values to use are best decided after some experimentation; in general, | ||
269 | though, it's safe to assume that having only 1 sub-buffer is a bad | ||
270 | idea - you're guaranteed to either overwrite data or lose events | ||
271 | depending on the channel mode being used. | ||
272 | |||
273 | The create_buf_file() implementation can also be defined in such a way | ||
274 | as to allow the creation of a single 'global' buffer instead of the | ||
275 | default per-cpu set. This can be useful for applications interested | ||
276 | mainly in seeing the relative ordering of system-wide events without | ||
277 | the need to bother with saving explicit timestamps for the purpose of | ||
278 | merging/sorting per-cpu files in a postprocessing step. | ||
279 | |||
280 | To have relay_open() create a global buffer, the create_buf_file() | ||
281 | implementation should set the value of the is_global outparam to a | ||
282 | non-zero value in addition to creating the file that will be used to | ||
283 | represent the single buffer. In the case of a global buffer, | ||
284 | create_buf_file() and remove_buf_file() will be called only once. The | ||
285 | normal channel-writing functions, e.g. relay_write(), can still be | ||
286 | used - writes from any cpu will transparently end up in the global | ||
287 | buffer - but since it is a global buffer, callers should make sure | ||
288 | they use the proper locking for such a buffer, either by wrapping | ||
289 | writes in a spinlock, or by copying a write function from relay.h and | ||
290 | creating a local version that internally does the proper locking. | ||
291 | |||
292 | Channel 'modes' | ||
293 | --------------- | ||
294 | |||
295 | relay channels can be used in either of two modes - 'overwrite' or | ||
296 | 'no-overwrite'. The mode is entirely determined by the implementation | ||
297 | of the subbuf_start() callback, as described below. The default if no | ||
298 | subbuf_start() callback is defined is 'no-overwrite' mode. If the | ||
299 | default mode suits your needs, and you plan to use the read() | ||
300 | interface to retrieve channel data, you can ignore the details of this | ||
301 | section, as it pertains mainly to mmap() implementations. | ||
302 | |||
303 | In 'overwrite' mode, also known as 'flight recorder' mode, writes | ||
304 | continuously cycle around the buffer and will never fail, but will | ||
305 | unconditionally overwrite old data regardless of whether it's actually | ||
306 | been consumed. In no-overwrite mode, writes will fail, i.e. data will | ||
307 | be lost, if the number of unconsumed sub-buffers equals the total | ||
308 | number of sub-buffers in the channel. It should be clear that if | ||
309 | there is no consumer or if the consumer can't consume sub-buffers fast | ||
310 | enough, data will be lost in either case; the only difference is | ||
311 | whether data is lost from the beginning or the end of a buffer. | ||
312 | |||
313 | As explained above, a relay channel is made up of one or more | ||
314 | per-cpu channel buffers, each implemented as a circular buffer | ||
315 | subdivided into one or more sub-buffers. Messages are written into | ||
316 | the current sub-buffer of the channel's current per-cpu buffer via the | ||
317 | write functions described below. Whenever a message can't fit into | ||
318 | the current sub-buffer, because there's no room left for it, the | ||
319 | client is notified via the subbuf_start() callback that a switch to a | ||
320 | new sub-buffer is about to occur. The client uses this callback to 1) | ||
321 | initialize the next sub-buffer if appropriate 2) finalize the previous | ||
322 | sub-buffer if appropriate and 3) return a boolean value indicating | ||
323 | whether or not to actually move on to the next sub-buffer. | ||
324 | |||
325 | To implement 'no-overwrite' mode, the kernel client would provide | ||
326 | an implementation of the subbuf_start() callback something like the | ||
327 | following: | ||
328 | |||
329 | static int subbuf_start(struct rchan_buf *buf, | ||
330 | void *subbuf, | ||
331 | void *prev_subbuf, | ||
332 | unsigned int prev_padding) | ||
333 | { | ||
334 | if (prev_subbuf) | ||
335 | *((unsigned *)prev_subbuf) = prev_padding; | ||
336 | |||
337 | if (relay_buf_full(buf)) | ||
338 | return 0; | ||
339 | |||
340 | subbuf_start_reserve(buf, sizeof(unsigned int)); | ||
341 | |||
342 | return 1; | ||
343 | } | ||
344 | |||
345 | If the current buffer is full, i.e. all sub-buffers remain unconsumed, | ||
346 | the callback returns 0 to indicate that the buffer switch should not | ||
347 | occur yet, i.e. until the consumer has had a chance to read the | ||
348 | current set of ready sub-buffers. For the relay_buf_full() function | ||
349 | to make sense, the consumer is responsible for notifying the relay | ||
350 | interface when sub-buffers have been consumed via | ||
351 | relay_subbufs_consumed(). Any subsequent attempts to write into the | ||
352 | buffer will again invoke the subbuf_start() callback with the same | ||
353 | parameters; only when the consumer has consumed one or more of the | ||
354 | ready sub-buffers will relay_buf_full() return 0, in which case the | ||
355 | buffer switch can continue. | ||
356 | |||
357 | The implementation of the subbuf_start() callback for 'overwrite' mode | ||
358 | would be very similar: | ||
359 | |||
360 | static int subbuf_start(struct rchan_buf *buf, | ||
361 | void *subbuf, | ||
362 | void *prev_subbuf, | ||
363 | unsigned int prev_padding) | ||
364 | { | ||
365 | if (prev_subbuf) | ||
366 | *((unsigned *)prev_subbuf) = prev_padding; | ||
367 | |||
368 | subbuf_start_reserve(buf, sizeof(unsigned int)); | ||
369 | |||
370 | return 1; | ||
371 | } | ||
372 | |||
373 | In this case, the relay_buf_full() check is meaningless and the | ||
374 | callback always returns 1, causing the buffer switch to occur | ||
375 | unconditionally. It's also meaningless for the client to use the | ||
376 | relay_subbufs_consumed() function in this mode, as it's never | ||
377 | consulted. | ||
378 | |||
379 | The default subbuf_start() implementation, used if the client doesn't | ||
380 | define any callbacks, or doesn't define the subbuf_start() callback, | ||
381 | implements the simplest possible 'no-overwrite' mode, i.e. it does | ||
382 | nothing but return 0. | ||
383 | |||
384 | Header information can be reserved at the beginning of each sub-buffer | ||
385 | by calling the subbuf_start_reserve() helper function from within the | ||
386 | subbuf_start() callback. This reserved area can be used to store | ||
387 | whatever information the client wants. In the example above, room is | ||
388 | reserved in each sub-buffer to store the padding count for that | ||
389 | sub-buffer. This is filled in for the previous sub-buffer in the | ||
390 | subbuf_start() implementation; the padding value for the previous | ||
391 | sub-buffer is passed into the subbuf_start() callback along with a | ||
392 | pointer to the previous sub-buffer, since the padding value isn't | ||
393 | known until a sub-buffer is filled. The subbuf_start() callback is | ||
394 | also called for the first sub-buffer when the channel is opened, to | ||
395 | give the client a chance to reserve space in it. In this case the | ||
396 | previous sub-buffer pointer passed into the callback will be NULL, so | ||
397 | the client should check the value of the prev_subbuf pointer before | ||
398 | writing into the previous sub-buffer. | ||
399 | |||
400 | Writing to a channel | ||
401 | -------------------- | ||
402 | |||
403 | Kernel clients write data into the current cpu's channel buffer using | ||
404 | relay_write() or __relay_write(). relay_write() is the main logging | ||
405 | function - it uses local_irq_save() to protect the buffer and should be | ||
406 | used if you might be logging from interrupt context. If you know | ||
407 | you'll never be logging from interrupt context, you can use | ||
408 | __relay_write(), which only disables preemption. These functions | ||
409 | don't return a value, so you can't determine whether or not they | ||
410 | failed - the assumption is that you wouldn't want to check a return | ||
411 | value in the fast logging path anyway, and that they'll always succeed | ||
412 | unless the buffer is full and no-overwrite mode is being used, in | ||
413 | which case you can detect a failed write in the subbuf_start() | ||
414 | callback by calling the relay_buf_full() helper function. | ||
415 | |||
416 | relay_reserve() is used to reserve a slot in a channel buffer which | ||
417 | can be written to later. This would typically be used in applications | ||
418 | that need to write directly into a channel buffer without having to | ||
419 | stage data in a temporary buffer beforehand. Because the actual write | ||
420 | may not happen immediately after the slot is reserved, applications | ||
421 | using relay_reserve() can keep a count of the number of bytes actually | ||
422 | written, either in space reserved in the sub-buffers themselves or as | ||
423 | a separate array. See the 'reserve' example in the relay-apps tarball | ||
424 | at http://relayfs.sourceforge.net for an example of how this can be | ||
425 | done. Because the write is under control of the client and is | ||
426 | separated from the reserve, relay_reserve() doesn't protect the buffer | ||
427 | at all - it's up to the client to provide the appropriate | ||
428 | synchronization when using relay_reserve(). | ||
429 | |||
430 | Closing a channel | ||
431 | ----------------- | ||
432 | |||
433 | The client calls relay_close() when it's finished using the channel. | ||
434 | The channel and its associated buffers are destroyed when there are no | ||
435 | longer any references to any of the channel buffers. relay_flush() | ||
436 | forces a sub-buffer switch on all the channel buffers, and can be used | ||
437 | to finalize and process the last sub-buffers before the channel is | ||
438 | closed. | ||
439 | |||
440 | Misc | ||
441 | ---- | ||
442 | |||
443 | Some applications may want to keep a channel around and re-use it | ||
444 | rather than open and close a new channel for each use. relay_reset() | ||
445 | can be used for this purpose - it resets a channel to its initial | ||
446 | state without reallocating channel buffer memory or destroying | ||
447 | existing mappings. It should however only be called when it's safe to | ||
448 | do so, i.e. when the channel isn't currently being written to. | ||
449 | |||
450 | Finally, there are a couple of utility callbacks that can be used for | ||
451 | different purposes. buf_mapped() is called whenever a channel buffer | ||
452 | is mmapped from user space and buf_unmapped() is called when it's | ||
453 | unmapped. The client can use this notification to trigger actions | ||
454 | within the kernel application, such as enabling/disabling logging to | ||
455 | the channel. | ||
456 | |||
457 | |||
458 | Resources | ||
459 | ========= | ||
460 | |||
461 | For news, example code, mailing list, etc. see the relay interface homepage: | ||
462 | |||
463 | http://relayfs.sourceforge.net | ||
464 | |||
465 | |||
466 | Credits | ||
467 | ======= | ||
468 | |||
469 | The ideas and specs for the relay interface came about as a result of | ||
470 | discussions on tracing involving the following: | ||
471 | |||
472 | Michel Dagenais <michel.dagenais@polymtl.ca> | ||
473 | Richard Moore <richardj_moore@uk.ibm.com> | ||
474 | Bob Wisniewski <bob@watson.ibm.com> | ||
475 | Karim Yaghmour <karim@opersys.com> | ||
476 | Tom Zanussi <zanussi@us.ibm.com> | ||
477 | |||
478 | Also thanks to Hubertus Franke for a lot of useful suggestions and bug | ||
479 | reports. | ||
diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt deleted file mode 100644 index 5832377b7340..000000000000 --- a/Documentation/filesystems/relayfs.txt +++ /dev/null | |||
@@ -1,442 +0,0 @@ | |||
1 | |||
2 | relayfs - a high-speed data relay filesystem | ||
3 | ============================================ | ||
4 | |||
5 | relayfs is a filesystem designed to provide an efficient mechanism for | ||
6 | tools and facilities to relay large and potentially sustained streams | ||
7 | of data from kernel space to user space. | ||
8 | |||
9 | The main abstraction of relayfs is the 'channel'. A channel consists | ||
10 | of a set of per-cpu kernel buffers each represented by a file in the | ||
11 | relayfs filesystem. Kernel clients write into a channel using | ||
12 | efficient write functions which automatically log to the current cpu's | ||
13 | channel buffer. User space applications mmap() the per-cpu files and | ||
14 | retrieve the data as it becomes available. | ||
15 | |||
16 | The format of the data logged into the channel buffers is completely | ||
17 | up to the relayfs client; relayfs does however provide hooks which | ||
18 | allow clients to impose some structure on the buffer data. Nor does | ||
19 | relayfs implement any form of data filtering - this also is left to | ||
20 | the client. The purpose is to keep relayfs as simple as possible. | ||
21 | |||
22 | This document provides an overview of the relayfs API. The details of | ||
23 | the function parameters are documented along with the functions in the | ||
24 | filesystem code - please see that for details. | ||
25 | |||
26 | Semantics | ||
27 | ========= | ||
28 | |||
29 | Each relayfs channel has one buffer per CPU, each buffer has one or | ||
30 | more sub-buffers. Messages are written to the first sub-buffer until | ||
31 | it is too full to contain a new message, in which case it is | ||
32 | written to the next (if available). Messages are never split across | ||
33 | sub-buffers. At this point, userspace can be notified so it empties | ||
34 | the first sub-buffer, while the kernel continues writing to the next. | ||
35 | |||
36 | When notified that a sub-buffer is full, the kernel knows how many | ||
37 | bytes of it are padding i.e. unused. Userspace can use this knowledge | ||
38 | to copy only valid data. | ||
39 | |||
40 | After copying it, userspace can notify the kernel that a sub-buffer | ||
41 | has been consumed. | ||
42 | |||
43 | relayfs can operate in a mode where it will overwrite data not yet | ||
44 | collected by userspace, and not wait for it to consume it. | ||
45 | |||
46 | relayfs itself does not provide for communication of such data between | ||
47 | userspace and kernel, allowing the kernel side to remain simple and | ||
48 | not impose a single interface on userspace. It does provide a set of | ||
49 | examples and a separate helper though, described below. | ||
50 | |||
51 | klog and relay-apps example code | ||
52 | ================================ | ||
53 | |||
54 | relayfs itself is ready to use, but to make things easier, a couple | ||
55 | simple utility functions and a set of examples are provided. | ||
56 | |||
57 | The relay-apps example tarball, available on the relayfs sourceforge | ||
58 | site, contains a set of self-contained examples, each consisting of a | ||
59 | pair of .c files containing boilerplate code for each of the user and | ||
60 | kernel sides of a relayfs application; combined these two sets of | ||
61 | boilerplate code provide glue to easily stream data to disk, without | ||
62 | having to bother with mundane housekeeping chores. | ||
63 | |||
64 | The 'klog debugging functions' patch (klog.patch in the relay-apps | ||
65 | tarball) provides a couple of high-level logging functions to the | ||
66 | kernel which allow writing formatted text or raw data to a channel, | ||
67 | regardless of whether a channel to write into exists or not, or | ||
68 | whether relayfs is compiled into the kernel or is configured as a | ||
69 | module. These functions allow you to put unconditional 'trace' | ||
70 | statements anywhere in the kernel or kernel modules; only when there | ||
71 | is a 'klog handler' registered will data actually be logged (see the | ||
72 | klog and kleak examples for details). | ||
73 | |||
74 | It is of course possible to use relayfs from scratch i.e. without | ||
75 | using any of the relay-apps example code or klog, but you'll have to | ||
76 | implement communication between userspace and kernel, allowing both to | ||
77 | convey the state of buffers (full, empty, amount of padding). | ||
78 | |||
79 | klog and the relay-apps examples can be found in the relay-apps | ||
80 | tarball on http://relayfs.sourceforge.net | ||
81 | |||
82 | |||
83 | The relayfs user space API | ||
84 | ========================== | ||
85 | |||
86 | relayfs implements basic file operations for user space access to | ||
87 | relayfs channel buffer data. Here are the file operations that are | ||
88 | available and some comments regarding their behavior: | ||
89 | |||
90 | open() enables user to open an _existing_ buffer. | ||
91 | |||
92 | mmap() results in channel buffer being mapped into the caller's | ||
93 | memory space. Note that you can't do a partial mmap - you must | ||
94 | map the entire file, which is NRBUF * SUBBUFSIZE. | ||
95 | |||
96 | read() read the contents of a channel buffer. The bytes read are | ||
97 | 'consumed' by the reader i.e. they won't be available again | ||
98 | to subsequent reads. If the channel is being used in | ||
99 | no-overwrite mode (the default), it can be read at any time | ||
100 | even if there's an active kernel writer. If the channel is | ||
101 | being used in overwrite mode and there are active channel | ||
102 | writers, results may be unpredictable - users should make | ||
103 | sure that all logging to the channel has ended before using | ||
104 | read() with overwrite mode. | ||
105 | |||
106 | poll() POLLIN/POLLRDNORM/POLLERR supported. User applications are | ||
107 | notified when sub-buffer boundaries are crossed. | ||
108 | |||
109 | close() decrements the channel buffer's refcount. When the refcount | ||
110 | reaches 0 i.e. when no process or kernel client has the buffer | ||
111 | open, the channel buffer is freed. | ||
112 | |||
113 | |||
114 | In order for a user application to make use of relayfs files, the | ||
115 | relayfs filesystem must be mounted. For example, | ||
116 | |||
117 | mount -t relayfs relayfs /mnt/relay | ||
118 | |||
119 | NOTE: relayfs doesn't need to be mounted for kernel clients to create | ||
120 | or use channels - it only needs to be mounted when user space | ||
121 | applications need access to the buffer data. | ||
122 | |||
123 | |||
124 | The relayfs kernel API | ||
125 | ====================== | ||
126 | |||
127 | Here's a summary of the API relayfs provides to in-kernel clients: | ||
128 | |||
129 | |||
130 | channel management functions: | ||
131 | |||
132 | relay_open(base_filename, parent, subbuf_size, n_subbufs, | ||
133 | callbacks) | ||
134 | relay_close(chan) | ||
135 | relay_flush(chan) | ||
136 | relay_reset(chan) | ||
137 | relayfs_create_dir(name, parent) | ||
138 | relayfs_remove_dir(dentry) | ||
139 | relayfs_create_file(name, parent, mode, fops, data) | ||
140 | relayfs_remove_file(dentry) | ||
141 | |||
142 | channel management typically called on instigation of userspace: | ||
143 | |||
144 | relay_subbufs_consumed(chan, cpu, subbufs_consumed) | ||
145 | |||
146 | write functions: | ||
147 | |||
148 | relay_write(chan, data, length) | ||
149 | __relay_write(chan, data, length) | ||
150 | relay_reserve(chan, length) | ||
151 | |||
152 | callbacks: | ||
153 | |||
154 | subbuf_start(buf, subbuf, prev_subbuf, prev_padding) | ||
155 | buf_mapped(buf, filp) | ||
156 | buf_unmapped(buf, filp) | ||
157 | create_buf_file(filename, parent, mode, buf, is_global) | ||
158 | remove_buf_file(dentry) | ||
159 | |||
160 | helper functions: | ||
161 | |||
162 | relay_buf_full(buf) | ||
163 | subbuf_start_reserve(buf, length) | ||
164 | |||
165 | |||
166 | Creating a channel | ||
167 | ------------------ | ||
168 | |||
169 | relay_open() is used to create a channel, along with its per-cpu | ||
170 | channel buffers. Each channel buffer will have an associated file | ||
171 | created for it in the relayfs filesystem, which can be opened and | ||
172 | mmapped from user space if desired. The files are named | ||
173 | basename0...basenameN-1 where N is the number of online cpus, and by | ||
174 | default will be created in the root of the filesystem. If you want a | ||
175 | directory structure to contain your relayfs files, you can create it | ||
176 | with relayfs_create_dir() and pass the parent directory to | ||
177 | relay_open(). Clients are responsible for cleaning up any directory | ||
178 | structure they create when the channel is closed - use | ||
179 | relayfs_remove_dir() for that. | ||
180 | |||
181 | The total size of each per-cpu buffer is calculated by multiplying the | ||
182 | number of sub-buffers by the sub-buffer size passed into relay_open(). | ||
183 | The idea behind sub-buffers is that they're basically an extension of | ||
184 | double-buffering to N buffers, and they also allow applications to | ||
185 | easily implement random-access-on-buffer-boundary schemes, which can | ||
186 | be important for some high-volume applications. The number and size | ||
187 | of sub-buffers is completely dependent on the application and even for | ||
188 | the same application, different conditions will warrant different | ||
189 | values for these parameters at different times. Typically, the right | ||
190 | values to use are best decided after some experimentation; in general, | ||
191 | though, it's safe to assume that having only 1 sub-buffer is a bad | ||
192 | idea - you're guaranteed to either overwrite data or lose events | ||
193 | depending on the channel mode being used. | ||
194 | |||
195 | Channel 'modes' | ||
196 | --------------- | ||
197 | |||
198 | relayfs channels can be used in either of two modes - 'overwrite' or | ||
199 | 'no-overwrite'. The mode is entirely determined by the implementation | ||
200 | of the subbuf_start() callback, as described below. In 'overwrite' | ||
201 | mode, also known as 'flight recorder' mode, writes continuously cycle | ||
202 | around the buffer and will never fail, but will unconditionally | ||
203 | overwrite old data regardless of whether it's actually been consumed. | ||
204 | In no-overwrite mode, writes will fail i.e. data will be lost, if the | ||
205 | number of unconsumed sub-buffers equals the total number of | ||
206 | sub-buffers in the channel. It should be clear that if there is no | ||
207 | consumer or if the consumer can't consume sub-buffers fast enough, | ||
208 | data will be lost in either case; the only difference is whether data | ||
209 | is lost from the beginning or the end of a buffer. | ||
210 | |||
211 | As explained above, a relayfs channel is made up of one or more | ||
212 | per-cpu channel buffers, each implemented as a circular buffer | ||
213 | subdivided into one or more sub-buffers. Messages are written into | ||
214 | the current sub-buffer of the channel's current per-cpu buffer via the | ||
215 | write functions described below. Whenever a message can't fit into | ||
216 | the current sub-buffer, because there's no room left for it, the | ||
217 | client is notified via the subbuf_start() callback that a switch to a | ||
218 | new sub-buffer is about to occur. The client uses this callback to 1) | ||
219 | initialize the next sub-buffer if appropriate 2) finalize the previous | ||
220 | sub-buffer if appropriate and 3) return a boolean value indicating | ||
221 | whether or not to actually go ahead with the sub-buffer switch. | ||
222 | |||
223 | To implement 'no-overwrite' mode, the userspace client would provide | ||
224 | an implementation of the subbuf_start() callback something like the | ||
225 | following: | ||
226 | |||
227 | static int subbuf_start(struct rchan_buf *buf, | ||
228 | void *subbuf, | ||
229 | void *prev_subbuf, | ||
230 | unsigned int prev_padding) | ||
231 | { | ||
232 | if (prev_subbuf) | ||
233 | *((unsigned *)prev_subbuf) = prev_padding; | ||
234 | |||
235 | if (relay_buf_full(buf)) | ||
236 | return 0; | ||
237 | |||
238 | subbuf_start_reserve(buf, sizeof(unsigned int)); | ||
239 | |||
240 | return 1; | ||
241 | } | ||
242 | |||
243 | If the current buffer is full i.e. all sub-buffers remain unconsumed, | ||
244 | the callback returns 0 to indicate that the buffer switch should not | ||
245 | occur yet i.e. until the consumer has had a chance to read the current | ||
246 | set of ready sub-buffers. For the relay_buf_full() function to make | ||
247 | sense, the consumer is responsible for notifying relayfs when | ||
248 | sub-buffers have been consumed via relay_subbufs_consumed(). Any | ||
249 | subsequent attempts to write into the buffer will again invoke the | ||
250 | subbuf_start() callback with the same parameters; only when the | ||
251 | consumer has consumed one or more of the ready sub-buffers will | ||
252 | relay_buf_full() return 0, in which case the buffer switch can | ||
253 | continue. | ||
254 | |||
255 | The implementation of the subbuf_start() callback for 'overwrite' mode | ||
256 | would be very similar: | ||
257 | |||
258 | static int subbuf_start(struct rchan_buf *buf, | ||
259 | void *subbuf, | ||
260 | void *prev_subbuf, | ||
261 | unsigned int prev_padding) | ||
262 | { | ||
263 | if (prev_subbuf) | ||
264 | *((unsigned *)prev_subbuf) = prev_padding; | ||
265 | |||
266 | subbuf_start_reserve(buf, sizeof(unsigned int)); | ||
267 | |||
268 | return 1; | ||
269 | } | ||
270 | |||
271 | In this case, the relay_buf_full() check is meaningless and the | ||
272 | callback always returns 1, causing the buffer switch to occur | ||
273 | unconditionally. It's also meaningless for the client to use the | ||
274 | relay_subbufs_consumed() function in this mode, as it's never | ||
275 | consulted. | ||
276 | |||
277 | The default subbuf_start() implementation, used if the client doesn't | ||
278 | define any callbacks, or doesn't define the subbuf_start() callback, | ||
279 | implements the simplest possible 'no-overwrite' mode i.e. it does | ||
280 | nothing but return 0. | ||
281 | |||
282 | Header information can be reserved at the beginning of each sub-buffer | ||
283 | by calling the subbuf_start_reserve() helper function from within the | ||
284 | subbuf_start() callback. This reserved area can be used to store | ||
285 | whatever information the client wants. In the example above, room is | ||
286 | reserved in each sub-buffer to store the padding count for that | ||
287 | sub-buffer. This is filled in for the previous sub-buffer in the | ||
288 | subbuf_start() implementation; the padding value for the previous | ||
289 | sub-buffer is passed into the subbuf_start() callback along with a | ||
290 | pointer to the previous sub-buffer, since the padding value isn't | ||
291 | known until a sub-buffer is filled. The subbuf_start() callback is | ||
292 | also called for the first sub-buffer when the channel is opened, to | ||
293 | give the client a chance to reserve space in it. In this case the | ||
294 | previous sub-buffer pointer passed into the callback will be NULL, so | ||
295 | the client should check the value of the prev_subbuf pointer before | ||
296 | writing into the previous sub-buffer. | ||
297 | |||
298 | Writing to a channel | ||
299 | -------------------- | ||
300 | |||
301 | kernel clients write data into the current cpu's channel buffer using | ||
302 | relay_write() or __relay_write(). relay_write() is the main logging | ||
303 | function - it uses local_irqsave() to protect the buffer and should be | ||
304 | used if you might be logging from interrupt context. If you know | ||
305 | you'll never be logging from interrupt context, you can use | ||
306 | __relay_write(), which only disables preemption. These functions | ||
307 | don't return a value, so you can't determine whether or not they | ||
308 | failed - the assumption is that you wouldn't want to check a return | ||
309 | value in the fast logging path anyway, and that they'll always succeed | ||
310 | unless the buffer is full and no-overwrite mode is being used, in | ||
311 | which case you can detect a failed write in the subbuf_start() | ||
312 | callback by calling the relay_buf_full() helper function. | ||
313 | |||
314 | relay_reserve() is used to reserve a slot in a channel buffer which | ||
315 | can be written to later. This would typically be used in applications | ||
316 | that need to write directly into a channel buffer without having to | ||
317 | stage data in a temporary buffer beforehand. Because the actual write | ||
318 | may not happen immediately after the slot is reserved, applications | ||
319 | using relay_reserve() can keep a count of the number of bytes actually | ||
320 | written, either in space reserved in the sub-buffers themselves or as | ||
321 | a separate array. See the 'reserve' example in the relay-apps tarball | ||
322 | at http://relayfs.sourceforge.net for an example of how this can be | ||
323 | done. Because the write is under control of the client and is | ||
324 | separated from the reserve, relay_reserve() doesn't protect the buffer | ||
325 | at all - it's up to the client to provide the appropriate | ||
326 | synchronization when using relay_reserve(). | ||
327 | |||
328 | Closing a channel | ||
329 | ----------------- | ||
330 | |||
331 | The client calls relay_close() when it's finished using the channel. | ||
332 | The channel and its associated buffers are destroyed when there are no | ||
333 | longer any references to any of the channel buffers. relay_flush() | ||
334 | forces a sub-buffer switch on all the channel buffers, and can be used | ||
335 | to finalize and process the last sub-buffers before the channel is | ||
336 | closed. | ||
337 | |||
338 | Creating non-relay files | ||
339 | ------------------------ | ||
340 | |||
341 | relay_open() automatically creates files in the relayfs filesystem to | ||
342 | represent the per-cpu kernel buffers; it's often useful for | ||
343 | applications to be able to create their own files alongside the relay | ||
344 | files in the relayfs filesystem as well e.g. 'control' files much like | ||
345 | those created in /proc or debugfs for similar purposes, used to | ||
346 | communicate control information between the kernel and user sides of a | ||
347 | relayfs application. For this purpose the relayfs_create_file() and | ||
348 | relayfs_remove_file() API functions exist. For relayfs_create_file(), | ||
349 | the caller passes in a set of user-defined file operations to be used | ||
350 | for the file and an optional void * to a user-specified data item, | ||
351 | which will be accessible via inode->u.generic_ip (see the relay-apps | ||
352 | tarball for examples). The file_operations are a required parameter | ||
353 | to relayfs_create_file() and thus the semantics of these files are | ||
354 | completely defined by the caller. | ||
355 | |||
356 | See the relay-apps tarball at http://relayfs.sourceforge.net for | ||
357 | examples of how these non-relay files are meant to be used. | ||
358 | |||
359 | Creating relay files in other filesystems | ||
360 | ----------------------------------------- | ||
361 | |||
362 | By default of course, relay_open() creates relay files in the relayfs | ||
363 | filesystem. Because relay_file_operations is exported, however, it's | ||
364 | also possible to create and use relay files in other pseudo-filesystems | ||
365 | such as debugfs. | ||
366 | |||
367 | For this purpose, two callback functions are provided, | ||
368 | create_buf_file() and remove_buf_file(). create_buf_file() is called | ||
369 | once for each per-cpu buffer from relay_open() to allow the client to | ||
370 | create a file to be used to represent the corresponding buffer; if | ||
371 | this callback is not defined, the default implementation will create | ||
372 | and return a file in the relayfs filesystem to represent the buffer. | ||
373 | The callback should return the dentry of the file created to represent | ||
374 | the relay buffer. Note that the parent directory passed to | ||
375 | relay_open() (and passed along to the callback), if specified, must | ||
376 | exist in the same filesystem the new relay file is created in. If | ||
377 | create_buf_file() is defined, remove_buf_file() must also be defined; | ||
378 | it's responsible for deleting the file(s) created in create_buf_file() | ||
379 | and is called during relay_close(). | ||
380 | |||
381 | The create_buf_file() implementation can also be defined in such a way | ||
382 | as to allow the creation of a single 'global' buffer instead of the | ||
383 | default per-cpu set. This can be useful for applications interested | ||
384 | mainly in seeing the relative ordering of system-wide events without | ||
385 | the need to bother with saving explicit timestamps for the purpose of | ||
386 | merging/sorting per-cpu files in a postprocessing step. | ||
387 | |||
388 | To have relay_open() create a global buffer, the create_buf_file() | ||
389 | implementation should set the value of the is_global outparam to a | ||
390 | non-zero value in addition to creating the file that will be used to | ||
391 | represent the single buffer. In the case of a global buffer, | ||
392 | create_buf_file() and remove_buf_file() will be called only once. The | ||
393 | normal channel-writing functions e.g. relay_write() can still be used | ||
394 | - writes from any cpu will transparently end up in the global buffer - | ||
395 | but since it is a global buffer, callers should make sure they use the | ||
396 | proper locking for such a buffer, either by wrapping writes in a | ||
397 | spinlock, or by copying a write function from relayfs_fs.h and | ||
398 | creating a local version that internally does the proper locking. | ||
399 | |||
400 | See the 'exported-relayfile' examples in the relay-apps tarball for | ||
401 | examples of creating and using relay files in debugfs. | ||
402 | |||
403 | Misc | ||
404 | ---- | ||
405 | |||
406 | Some applications may want to keep a channel around and re-use it | ||
407 | rather than open and close a new channel for each use. relay_reset() | ||
408 | can be used for this purpose - it resets a channel to its initial | ||
409 | state without reallocating channel buffer memory or destroying | ||
410 | existing mappings. It should however only be called when it's safe to | ||
411 | do so i.e. when the channel isn't currently being written to. | ||
412 | |||
413 | Finally, there are a couple of utility callbacks that can be used for | ||
414 | different purposes. buf_mapped() is called whenever a channel buffer | ||
415 | is mmapped from user space and buf_unmapped() is called when it's | ||
416 | unmapped. The client can use this notification to trigger actions | ||
417 | within the kernel application, such as enabling/disabling logging to | ||
418 | the channel. | ||
419 | |||
420 | |||
421 | Resources | ||
422 | ========= | ||
423 | |||
424 | For news, example code, mailing list, etc. see the relayfs homepage: | ||
425 | |||
426 | http://relayfs.sourceforge.net | ||
427 | |||
428 | |||
429 | Credits | ||
430 | ======= | ||
431 | |||
432 | The ideas and specs for relayfs came about as a result of discussions | ||
433 | on tracing involving the following: | ||
434 | |||
435 | Michel Dagenais <michel.dagenais@polymtl.ca> | ||
436 | Richard Moore <richardj_moore@uk.ibm.com> | ||
437 | Bob Wisniewski <bob@watson.ibm.com> | ||
438 | Karim Yaghmour <karim@opersys.com> | ||
439 | Tom Zanussi <zanussi@us.ibm.com> | ||
440 | |||
441 | Also thanks to Hubertus Franke for a lot of useful suggestions and bug | ||
442 | reports. | ||
diff --git a/Documentation/input/joystick.txt b/Documentation/input/joystick.txt index d53b857a3710..841c353297e6 100644 --- a/Documentation/input/joystick.txt +++ b/Documentation/input/joystick.txt | |||
@@ -39,7 +39,6 @@ them. Bug reports and success stories are also welcome. | |||
39 | 39 | ||
40 | The input project website is at: | 40 | The input project website is at: |
41 | 41 | ||
42 | http://www.suse.cz/development/input/ | ||
43 | http://atrey.karlin.mff.cuni.cz/~vojtech/input/ | 42 | http://atrey.karlin.mff.cuni.cz/~vojtech/input/ |
44 | 43 | ||
45 | There is also a mailing list for the driver at: | 44 | There is also a mailing list for the driver at: |
diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid index c173806c91fa..a056bbe67c7e 100644 --- a/Documentation/scsi/ChangeLog.megaraid +++ b/Documentation/scsi/ChangeLog.megaraid | |||
@@ -1,3 +1,126 @@ | |||
1 | Release Date : Fri May 19 09:31:45 EST 2006 - Seokmann Ju <sju@lsil.com> | ||
2 | Current Version : 2.20.4.9 (scsi module), 2.20.2.6 (cmm module) | ||
3 | Older Version : 2.20.4.8 (scsi module), 2.20.2.6 (cmm module) | ||
4 | |||
5 | 1. Fixed a bug in megaraid_init_mbox(). | ||
6 | Customer reported "garbage in file on x86_64 platform". | ||
7 | Root Cause: the driver registered controllers as 64-bit DMA capable | ||
8 | for those which do not support it. | ||
9 | Fix: Made change in the function inserting identification mechanism | ||
10 | identifying 64-bit DMA capable controllers. | ||
11 | |||
12 | > -----Original Message----- | ||
13 | > From: Vasily Averin [mailto:vvs@sw.ru] | ||
14 | > Sent: Thursday, May 04, 2006 2:49 PM | ||
15 | > To: linux-scsi@vger.kernel.org; Kolli, Neela; Mukker, Atul; | ||
16 | > Ju, Seokmann; Bagalkote, Sreenivas; | ||
17 | > James.Bottomley@SteelEye.com; devel@openvz.org | ||
18 | > Subject: megaraid_mbox: garbage in file | ||
19 | > | ||
20 | > Hello all, | ||
21 | > | ||
22 | > I've investigated customers claim on the unstable work of | ||
23 | > their node and found a | ||
24 | > strange effect: reading from some files leads to the | ||
25 | > "attempt to access beyond end of device" messages. | ||
26 | > | ||
27 | > I've checked filesystem, memory on the node, motherboard BIOS | ||
28 | > version, but it | ||
29 | > does not help and issue still has been reproduced by simple | ||
30 | > file reading. | ||
31 | > | ||
32 | > Reproducer is simple: | ||
33 | > | ||
34 | > echo 0xffffffff >/proc/sys/dev/scsi/logging_level ; | ||
35 | > cat /vz/private/101/root/etc/ld.so.cache >/tmp/ttt ; | ||
36 | > echo 0 >/proc/sys/dev/scsi/logging | ||
37 | > | ||
38 | > It leads to the following messages in dmesg | ||
39 | > | ||
40 | > sd_init_command: disk=sda, block=871769260, count=26 | ||
41 | > sda : block=871769260 | ||
42 | > sda : reading 26/26 512 byte blocks. | ||
43 | > scsi_add_timer: scmd: f79ed980, time: 7500, (c02b1420) | ||
44 | > sd 0:1:0:0: send 0xf79ed980 sd 0:1:0:0: | ||
45 | > command: Read (10): 28 00 33 f6 24 ac 00 00 1a 00 | ||
46 | > buffer = 0xf7cfb540, bufflen = 13312, done = 0xc0366b40, | ||
47 | > queuecommand 0xc0344010 | ||
48 | > leaving scsi_dispatch_cmnd() | ||
49 | > scsi_delete_timer: scmd: f79ed980, rtn: 1 | ||
50 | > sd 0:1:0:0: done 0xf79ed980 SUCCESS 0 sd 0:1:0:0: | ||
51 | > command: Read (10): 28 00 33 f6 24 ac 00 00 1a 00 | ||
52 | > scsi host busy 1 failed 0 | ||
53 | > sd 0:1:0:0: Notifying upper driver of completion (result 0) | ||
54 | > sd_rw_intr: sda: res=0x0 | ||
55 | > 26 sectors total, 13312 bytes done. | ||
56 | > use_sg is 4 | ||
57 | > attempt to access beyond end of device | ||
58 | > sda6: rw=0, want=1044134458, limit=951401367 | ||
59 | > Buffer I/O error on device sda6, logical block 522067228 | ||
60 | > attempt to access beyond end of device | ||
61 | |||
62 | 2. When INQUIRY with EVPD bit set issued to the MegaRAID controller, | ||
63 | system memory gets corrupted. | ||
64 | Root Cause: MegaRAID F/W handles the INQUIRY with EVPD bit set | ||
65 | incorrectly. | ||
66 | Fix: MegaRAID F/W has fixed the problem and is in the process of being | ||
67 | released soon. Meanwhile, the driver will filter out the request. | ||
68 | |||
69 | 3. A member of one of the driver's data structures causes an unaligned | ||
70 | access issue on 64-bit platforms. | ||
71 | Customer reported a "kernel unaligned access address" issue when | ||
72 | application communicates with MegaRAID HBA driver. | ||
73 | Root Cause: in the uioc_t structure, one of the members was misaligned, | ||
74 | which led the system to display the error message. | ||
75 | Fix: A patch was submitted to the community by the following person. | ||
76 | |||
77 | > -----Original Message----- | ||
78 | > From: linux-scsi-owner@vger.kernel.org | ||
79 | > [mailto:linux-scsi-owner@vger.kernel.org] On Behalf Of Sakurai Hiroomi | ||
80 | > Sent: Wednesday, July 12, 2006 4:20 AM | ||
81 | > To: linux-scsi@vger.kernel.org; linux-kernel@vger.kernel.org | ||
82 | > Subject: Re: Help: strange messages from kernel on IA64 platform | ||
83 | > | ||
84 | > Hi, | ||
85 | > | ||
86 | > I saw same message. | ||
87 | > | ||
88 | > When GAM(Global Array Manager) is started, The following | ||
89 | > message output. | ||
90 | > kernel: kernel unaligned access to 0xe0000001fe1080d4, | ||
91 | > ip=0xa000000200053371 | ||
92 | > | ||
93 | > The uioc structure used by ioctl is defined by packed, | ||
94 | > the allignment of each member are disturbed. | ||
95 | > In a 64 bit structure, the allignment of member doesn't fit 64 bit | ||
96 | > boundary. this causes this messages. | ||
97 | > In a 32 bit structure, we don't see the message because the allinment | ||
98 | > of member fit 32 bit boundary even if packed is specified. | ||
99 | > | ||
100 | > patch | ||
101 | > I Add 32 bit dummy member to fit 64 bit boundary. I tested. | ||
102 | > We confirmed this patch fix the problem by IA64 server. | ||
103 | > | ||
104 | > ************************************************************** | ||
105 | > **************** | ||
106 | > --- linux-2.6.9/drivers/scsi/megaraid/megaraid_ioctl.h.orig | ||
107 | > 2006-04-03 17:13:03.000000000 +0900 | ||
108 | > +++ linux-2.6.9/drivers/scsi/megaraid/megaraid_ioctl.h | ||
109 | > 2006-04-03 17:14:09.000000000 +0900 | ||
110 | > @@ -132,6 +132,10 @@ | ||
111 | > /* Driver Data: */ | ||
112 | > void __user * user_data; | ||
113 | > uint32_t user_data_len; | ||
114 | > + | ||
115 | > + /* 64bit alignment */ | ||
116 | > + uint32_t pad_0xBC; | ||
117 | > + | ||
118 | > mraid_passthru_t __user *user_pthru; | ||
119 | > | ||
120 | > mraid_passthru_t *pthru32; | ||
121 | > ************************************************************** | ||
122 | > **************** | ||
123 | |||
1 | Release Date : Mon Apr 11 12:27:22 EST 2006 - Seokmann Ju <sju@lsil.com> | 124 | Release Date : Mon Apr 11 12:27:22 EST 2006 - Seokmann Ju <sju@lsil.com> |
2 | Current Version : 2.20.4.8 (scsi module), 2.20.2.6 (cmm module) | 125 | Current Version : 2.20.4.8 (scsi module), 2.20.2.6 (cmm module) |
3 | Older Version : 2.20.4.7 (scsi module), 2.20.2.6 (cmm module) | 126 | Older Version : 2.20.4.7 (scsi module), 2.20.2.6 (cmm module) |
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 0b62c62142cf..5c3a51905969 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt | |||
@@ -25,6 +25,7 @@ Currently, these files are in /proc/sys/fs: | |||
25 | - inode-state | 25 | - inode-state |
26 | - overflowuid | 26 | - overflowuid |
27 | - overflowgid | 27 | - overflowgid |
28 | - suid_dumpable | ||
28 | - super-max | 29 | - super-max |
29 | - super-nr | 30 | - super-nr |
30 | 31 | ||
@@ -131,6 +132,25 @@ The default is 65534. | |||
131 | 132 | ||
132 | ============================================================== | 133 | ============================================================== |
133 | 134 | ||
135 | suid_dumpable: | ||
136 | |||
137 | This value can be used to query and set the core dump mode for setuid | ||
138 | or otherwise protected/tainted binaries. The modes are | ||
139 | |||
140 | 0 - (default) - traditional behaviour. Any process which has changed | ||
141 | privilege levels or is execute only will not be dumped | ||
142 | 1 - (debug) - all processes dump core when possible. The core dump is | ||
143 | owned by the current user and no security is applied. This is | ||
144 | intended for system debugging situations only. Ptrace is unchecked. | ||
145 | 2 - (suidsafe) - any binary which normally would not be dumped is dumped | ||
146 | readable by root only. This allows the end user to remove | ||
147 | such a dump but not access it directly. For security reasons | ||
148 | core dumps in this mode will not overwrite one another or | ||
149 | other files. This mode is appropriate when administrators are | ||
150 | attempting to debug problems in a normal environment. | ||
151 | |||
152 | ============================================================== | ||
153 | |||
134 | super-max & super-nr: | 154 | super-max & super-nr: |
135 | 155 | ||
136 | These numbers control the maximum number of superblocks, and | 156 | These numbers control the maximum number of superblocks, and |
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 7345c338080a..89bf8c20a586 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -50,7 +50,6 @@ show up in /proc/sys/kernel: | |||
50 | - shmmax [ sysv ipc ] | 50 | - shmmax [ sysv ipc ] |
51 | - shmmni | 51 | - shmmni |
52 | - stop-a [ SPARC only ] | 52 | - stop-a [ SPARC only ] |
53 | - suid_dumpable | ||
54 | - sysrq ==> Documentation/sysrq.txt | 53 | - sysrq ==> Documentation/sysrq.txt |
55 | - tainted | 54 | - tainted |
56 | - threads-max | 55 | - threads-max |
@@ -310,25 +309,6 @@ kernel. This value defaults to SHMMAX. | |||
310 | 309 | ||
311 | ============================================================== | 310 | ============================================================== |
312 | 311 | ||
313 | suid_dumpable: | ||
314 | |||
315 | This value can be used to query and set the core dump mode for setuid | ||
316 | or otherwise protected/tainted binaries. The modes are | ||
317 | |||
318 | 0 - (default) - traditional behaviour. Any process which has changed | ||
319 | privilege levels or is execute only will not be dumped | ||
320 | 1 - (debug) - all processes dump core when possible. The core dump is | ||
321 | owned by the current user and no security is applied. This is | ||
322 | intended for system debugging situations only. Ptrace is unchecked. | ||
323 | 2 - (suidsafe) - any binary which normally would not be dumped is dumped | ||
324 | readable by root only. This allows the end user to remove | ||
325 | such a dump but not access it directly. For security reasons | ||
326 | core dumps in this mode will not overwrite one another or | ||
327 | other files. This mode is appropriate when adminstrators are | ||
328 | attempting to debug problems in a normal environment. | ||
329 | |||
330 | ============================================================== | ||
331 | |||
332 | tainted: | 312 | tainted: |
333 | 313 | ||
334 | Non-zero if the kernel has been tainted. Numeric values, which | 314 | Non-zero if the kernel has been tainted. Numeric values, which |