aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/DocBook
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 18:20:36 -0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 18:20:36 -0400
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /Documentation/DocBook
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'Documentation/DocBook')
-rw-r--r--Documentation/DocBook/Makefile195
-rw-r--r--Documentation/DocBook/deviceiobook.tmpl341
-rw-r--r--Documentation/DocBook/gadget.tmpl752
-rw-r--r--Documentation/DocBook/journal-api.tmpl333
-rw-r--r--Documentation/DocBook/kernel-api.tmpl342
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl1349
-rw-r--r--Documentation/DocBook/kernel-locking.tmpl2088
-rw-r--r--Documentation/DocBook/libata.tmpl282
-rw-r--r--Documentation/DocBook/librs.tmpl289
-rw-r--r--Documentation/DocBook/lsm.tmpl265
-rw-r--r--Documentation/DocBook/man/Makefile3
-rw-r--r--Documentation/DocBook/mcabook.tmpl107
-rw-r--r--Documentation/DocBook/mtdnand.tmpl1320
-rw-r--r--Documentation/DocBook/procfs-guide.tmpl591
-rw-r--r--Documentation/DocBook/procfs_example.c224
-rw-r--r--Documentation/DocBook/scsidrivers.tmpl193
-rw-r--r--Documentation/DocBook/sis900.tmpl585
-rw-r--r--Documentation/DocBook/tulip-user.tmpl327
-rw-r--r--Documentation/DocBook/usb.tmpl979
-rw-r--r--Documentation/DocBook/via-audio.tmpl597
-rw-r--r--Documentation/DocBook/videobook.tmpl1663
-rw-r--r--Documentation/DocBook/wanbook.tmpl99
-rw-r--r--Documentation/DocBook/writing_usb_driver.tmpl419
-rw-r--r--Documentation/DocBook/z8530book.tmpl385
24 files changed, 13728 insertions, 0 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
new file mode 100644
index 000000000000..a221039ee4c9
--- /dev/null
+++ b/Documentation/DocBook/Makefile
@@ -0,0 +1,195 @@
1###
2# This makefile is used to generate the kernel documentation,
3# primarily based on in-line comments in various source files.
4# See Documentation/kernel-doc-nano-HOWTO.txt for instructions on how
5# to document the SRC - and how to read it.
6# To add a new book the only step required is to add the book to the
7# list of DOCBOOKS.
8
9DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \
10 kernel-hacking.xml kernel-locking.xml via-audio.xml \
11 deviceiobook.xml procfs-guide.xml tulip-user.xml \
12 writing_usb_driver.xml scsidrivers.xml sis900.xml \
13 kernel-api.xml journal-api.xml lsm.xml usb.xml \
14 gadget.xml libata.xml mtdnand.xml librs.xml
15
16###
17# The build process is as follows (targets):
18# (xmldocs)
19# file.tmpl --> file.xml +--> file.ps (psdocs)
20# +--> file.pdf (pdfdocs)
21# +--> DIR=file (htmldocs)
22# +--> man/ (mandocs)
23
24###
25# The targets that may be used.
26.PHONY: xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs
27
28BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
29xmldocs: $(BOOKS)
30sgmldocs: xmldocs
31
32PS := $(patsubst %.xml, %.ps, $(BOOKS))
33psdocs: $(PS)
34
35PDF := $(patsubst %.xml, %.pdf, $(BOOKS))
36pdfdocs: $(PDF)
37
38HTML := $(patsubst %.xml, %.html, $(BOOKS))
39htmldocs: $(HTML)
40
41MAN := $(patsubst %.xml, %.9, $(BOOKS))
42mandocs: $(MAN)
43
44installmandocs: mandocs
45 $(MAKEMAN) install Documentation/DocBook/man
46
47###
48#External programs used
49KERNELDOC = scripts/kernel-doc
50DOCPROC = scripts/basic/docproc
51SPLITMAN = $(PERL) $(srctree)/scripts/split-man
52MAKEMAN = $(PERL) $(srctree)/scripts/makeman
53
54###
55# DOCPROC is used for two purposes:
56# 1) To generate a dependency list for a .tmpl file
57# 2) To preprocess a .tmpl file and call kernel-doc with
58# appropriate parameters.
59# The following rules are used to generate the .xml documentation
60# required to generate the final targets (ps, pdf, html).
61quiet_cmd_docproc = DOCPROC $@
62 cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@
63define rule_docproc
64 set -e; \
65 $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \
66 $(cmd_$(1)); \
67 ( \
68 echo 'cmd_$@ := $(cmd_$(1))'; \
69 echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \
70 ) > $(dir $@).$(notdir $@).cmd
71endef
72
73%.xml: %.tmpl FORCE
74 $(call if_changed_rule,docproc)
75
76###
77#Read in all saved dependency files
78cmd_files := $(wildcard $(foreach f,$(BOOKS),$(dir $(f)).$(notdir $(f)).cmd))
79
80ifneq ($(cmd_files),)
81 include $(cmd_files)
82endif
83
84###
85# Changes in kernel-doc force a rebuild of all documentation
86$(BOOKS): $(KERNELDOC)
87
88###
89# procfs guide uses a .c file as example code.
90# This requires an explicit dependency
91C-procfs-example = procfs_example.xml
92C-procfs-example2 = $(addprefix $(obj)/,$(C-procfs-example))
93$(obj)/procfs-guide.xml: $(C-procfs-example2)
94
95###
96# Rules to generate postscript, PDF and HTML
97# db2html creates a directory. Generate a html file used for timestamp
98
99quiet_cmd_db2ps = DB2PS $@
100 cmd_db2ps = db2ps -o $(dir $@) $<
101%.ps : %.xml
102 @(which db2ps > /dev/null 2>&1) || \
103 (echo "*** You need to install DocBook stylesheets ***"; \
104 exit 1)
105 $(call cmd,db2ps)
106
107quiet_cmd_db2pdf = DB2PDF $@
108 cmd_db2pdf = db2pdf -o $(dir $@) $<
109%.pdf : %.xml
110 @(which db2pdf > /dev/null 2>&1) || \
111 (echo "*** You need to install DocBook stylesheets ***"; \
112 exit 1)
113 $(call cmd,db2pdf)
114
115quiet_cmd_db2html = DB2HTML $@
116 cmd_db2html = db2html -o $(patsubst %.html,%,$@) $< && \
117 echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/book1.html"> \
118 Goto $(patsubst %.html,%,$(notdir $@))</a><p>' > $@
119
120%.html: %.xml
121 @(which db2html > /dev/null 2>&1) || \
122 (echo "*** You need to install DocBook stylesheets ***"; \
123 exit 1)
124 @rm -rf $@ $(patsubst %.html,%,$@)
125 $(call cmd,db2html)
126 @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \
127 cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
128
129###
130# Rule to generate man files - output is placed in the man subdirectory
131
132%.9: %.xml
133ifneq ($(KBUILD_SRC),)
134 $(Q)mkdir -p $(objtree)/Documentation/DocBook/man
135endif
136 $(SPLITMAN) $< $(objtree)/Documentation/DocBook/man "$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)"
137 $(MAKEMAN) convert $(objtree)/Documentation/DocBook/man $<
138
139###
140# Rules to generate postscripts and PNG images from .fig format files
141quiet_cmd_fig2eps = FIG2EPS $@
142 cmd_fig2eps = fig2dev -Leps $< $@
143
144%.eps: %.fig
145 @(which fig2dev > /dev/null 2>&1) || \
146 (echo "*** You need to install transfig ***"; \
147 exit 1)
148 $(call cmd,fig2eps)
149
150quiet_cmd_fig2png = FIG2PNG $@
151 cmd_fig2png = fig2dev -Lpng $< $@
152
153%.png: %.fig
154 @(which fig2dev > /dev/null 2>&1) || \
155 (echo "*** You need to install transfig ***"; \
156 exit 1)
157 $(call cmd,fig2png)
158
159###
160# Rule to convert a .c file to inline XML documentation
161%.xml: %.c
162 @echo ' GEN $@'
163 @( \
164 echo "<programlisting>"; \
165 expand --tabs=8 < $< | \
166 sed -e "s/&/\\&amp;/g" \
167 -e "s/</\\&lt;/g" \
168 -e "s/>/\\&gt;/g"; \
169 echo "</programlisting>") > $@
170
171###
172# Help targets as used by the top-level makefile
173dochelp:
174 @echo ' Linux kernel internal documentation in different formats:'
175 @echo ' xmldocs (XML DocBook), psdocs (Postscript), pdfdocs (PDF)'
176 @echo ' htmldocs (HTML), mandocs (man pages, use installmandocs to install)'
177
178###
179# Temporary files left by various tools
180clean-files := $(DOCBOOKS) \
181 $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \
182 $(patsubst %.xml, %.aux, $(DOCBOOKS)) \
183 $(patsubst %.xml, %.tex, $(DOCBOOKS)) \
184 $(patsubst %.xml, %.log, $(DOCBOOKS)) \
185 $(patsubst %.xml, %.out, $(DOCBOOKS)) \
186 $(patsubst %.xml, %.ps, $(DOCBOOKS)) \
187 $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \
188 $(patsubst %.xml, %.html, $(DOCBOOKS)) \
189 $(patsubst %.xml, %.9, $(DOCBOOKS)) \
190 $(C-procfs-example)
191
192clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS))
193
194#man pages are put in the man subdir - traverse down
195subdir- := man/
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl
new file mode 100644
index 000000000000..6f41f2f5c6f6
--- /dev/null
+++ b/Documentation/DocBook/deviceiobook.tmpl
@@ -0,0 +1,341 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="DoingIO">
6 <bookinfo>
7 <title>Bus-Independent Device Accesses</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Matthew</firstname>
12 <surname>Wilcox</surname>
13 <affiliation>
14 <address>
15 <email>matthew@wil.cx</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <authorgroup>
22 <author>
23 <firstname>Alan</firstname>
24 <surname>Cox</surname>
25 <affiliation>
26 <address>
27 <email>alan@redhat.com</email>
28 </address>
29 </affiliation>
30 </author>
31 </authorgroup>
32
33 <copyright>
34 <year>2001</year>
35 <holder>Matthew Wilcox</holder>
36 </copyright>
37
38 <legalnotice>
39 <para>
40 This documentation is free software; you can redistribute
41 it and/or modify it under the terms of the GNU General Public
42 License as published by the Free Software Foundation; either
43 version 2 of the License, or (at your option) any later
44 version.
45 </para>
46
47 <para>
48 This program is distributed in the hope that it will be
49 useful, but WITHOUT ANY WARRANTY; without even the implied
50 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
51 See the GNU General Public License for more details.
52 </para>
53
54 <para>
55 You should have received a copy of the GNU General Public
56 License along with this program; if not, write to the Free
57 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
58 MA 02111-1307 USA
59 </para>
60
61 <para>
62 For more details see the file COPYING in the source
63 distribution of Linux.
64 </para>
65 </legalnotice>
66 </bookinfo>
67
68<toc></toc>
69
70 <chapter id="intro">
71 <title>Introduction</title>
72 <para>
73 Linux provides an API which abstracts performing IO across all busses
74 and devices, allowing device drivers to be written independently of
75 bus type.
76 </para>
77 </chapter>
78
79 <chapter id="bugs">
80 <title>Known Bugs And Assumptions</title>
81 <para>
82 None.
83 </para>
84 </chapter>
85
86 <chapter id="mmio">
87 <title>Memory Mapped IO</title>
88 <sect1>
89 <title>Getting Access to the Device</title>
90 <para>
91 The most widely supported form of IO is memory mapped IO.
92 That is, a part of the CPU's address space is interpreted
93 not as accesses to memory, but as accesses to a device. Some
94 architectures define devices to be at a fixed address, but most
95 have some method of discovering devices. The PCI bus walk is a
96 good example of such a scheme. This document does not cover how
97 to receive such an address, but assumes you are starting with one.
98 Physical addresses are of type unsigned long.
99 </para>
100
101 <para>
102 This address should not be used directly. Instead, to get an
103 address suitable for passing to the accessor functions described
104 below, you should call <function>ioremap</function>.
105 An address suitable for accessing the device will be returned to you.
106 </para>
107
108 <para>
109 After you've finished using the device (say, in your module's
110 exit routine), call <function>iounmap</function> in order to return
111 the address space to the kernel. Most architectures allocate new
112 address space each time you call <function>ioremap</function>, and
113 they can run out unless you call <function>iounmap</function>.
114 </para>
115 </sect1>
116
117 <sect1>
118 <title>Accessing the device</title>
119 <para>
120 The part of the interface most used by drivers is reading and
121 writing memory-mapped registers on the device. Linux provides
122 interfaces to read and write 8-bit, 16-bit, 32-bit and 64-bit
123 quantities. Due to a historical accident, these are named byte,
124 word, long and quad accesses. Both read and write accesses are
125 supported; there is no prefetch support at this time.
126 </para>
127
128 <para>
129 The functions are named <function>readb</function>,
130 <function>readw</function>, <function>readl</function>,
131 <function>readq</function>, <function>readb_relaxed</function>,
132 <function>readw_relaxed</function>, <function>readl_relaxed</function>,
133 <function>readq_relaxed</function>, <function>writeb</function>,
134 <function>writew</function>, <function>writel</function> and
135 <function>writeq</function>.
136 </para>
137
138 <para>
139 Some devices (such as framebuffers) would like to use larger
140 transfers than 8 bytes at a time. For these devices, the
141 <function>memcpy_toio</function>, <function>memcpy_fromio</function>
142 and <function>memset_io</function> functions are provided.
143 Do not use memset or memcpy on IO addresses; they
144 are not guaranteed to copy data in order.
145 </para>
146
147 <para>
148 The read and write functions are defined to be ordered. That is, the
149 compiler is not permitted to reorder the I/O sequence. When the
150 ordering can be compiler optimised, you can use <function>
151 __readb</function> and friends to indicate the relaxed ordering. Use
152 this with care.
153 </para>
154
155 <para>
156 While the basic functions are defined to be synchronous with respect
157 to each other and ordered with respect to each other, the busses the
158 devices sit on may themselves have asynchronicity. In particular many
159 authors are burned by the fact that PCI bus writes are posted
160 asynchronously. A driver author must issue a read from the same
161 device to ensure that writes have occurred in the specific cases the
162 author cares about. This kind of property cannot be hidden from driver
163 writers in the API. In some cases, the read used to flush the device
164 may be expected to fail (if the card is resetting, for example). In
165 that case, the read should be done from config space, which is
166 guaranteed to soft-fail if the card doesn't respond.
167 </para>
168
169 <para>
170 The following is an example of flushing a write to a device when
171 the driver would like to ensure the write's effects are visible prior
172 to continuing execution.
173 </para>
174
175<programlisting>
176static inline void
177qla1280_disable_intrs(struct scsi_qla_host *ha)
178{
179 struct device_reg *reg;
180
181 reg = ha->iobase;
182 /* disable risc and host interrupts */
183 WRT_REG_WORD(&amp;reg->ictrl, 0);
184 /*
185 * The following read will ensure that the above write
186 * has been received by the device before we return from this
187 * function.
188 */
189 RD_REG_WORD(&amp;reg->ictrl);
190 ha->flags.ints_enabled = 0;
191}
192</programlisting>
193
194 <para>
195 In addition to write posting, on some large multiprocessing systems
196 (e.g. SGI Challenge, Origin and Altix machines) posted writes won't
197 be strongly ordered coming from different CPUs. Thus it's important
198 to properly protect parts of your driver that do memory-mapped writes
199 with locks and use <function>mmiowb</function> to make sure they
200 arrive in the order intended. Issuing a regular <function>readX
201 </function> will also ensure write ordering, but should only be used
202 when the driver has to be sure that the write has actually arrived
203 at the device (not that it's simply ordered with respect to other
204 writes), since a full <function>readX</function> is a relatively
205 expensive operation.
206 </para>
207
208 <para>
209 Generally, one should use <function>mmiowb</function> prior to
210 releasing a spinlock that protects regions using <function>writeb
211 </function> or similar functions that aren't surrounded by <function>
212 readb</function> calls, which will ensure ordering and flushing. The
213 following pseudocode illustrates what might occur if write ordering
214 isn't guaranteed via <function>mmiowb</function> or one of the
215 <function>readX</function> functions.
216 </para>
217
218<programlisting>
219CPU A: spin_lock_irqsave(&amp;dev_lock, flags)
220CPU A: ...
221CPU A: writel(newval, ring_ptr);
222CPU A: spin_unlock_irqrestore(&amp;dev_lock, flags)
223 ...
224CPU B: spin_lock_irqsave(&amp;dev_lock, flags)
225CPU B: writel(newval2, ring_ptr);
226CPU B: ...
227CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
228</programlisting>
229
230 <para>
231 In the case above, newval2 could be written to ring_ptr before
232 newval. Fixing it is easy though:
233 </para>
234
235<programlisting>
236CPU A: spin_lock_irqsave(&amp;dev_lock, flags)
237CPU A: ...
238CPU A: writel(newval, ring_ptr);
239CPU A: mmiowb(); /* ensure no other writes beat us to the device */
240CPU A: spin_unlock_irqrestore(&amp;dev_lock, flags)
241 ...
242CPU B: spin_lock_irqsave(&amp;dev_lock, flags)
243CPU B: writel(newval2, ring_ptr);
244CPU B: ...
245CPU B: mmiowb();
246CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
247</programlisting>
248
249 <para>
250 See tg3.c for a real world example of how to use <function>mmiowb
251 </function>.
252 </para>
253
254 <para>
255 PCI ordering rules also guarantee that PIO read responses arrive
256 after any outstanding DMA writes from that bus, since for some devices
257 the result of a <function>readb</function> call may signal to the
258 driver that a DMA transaction is complete. In many cases, however,
259 the driver may want to indicate that the next
260 <function>readb</function> call has no relation to any previous DMA
261 writes performed by the device. The driver can use
262 <function>readb_relaxed</function> for these cases, although only
263 some platforms will honor the relaxed semantics. Using the relaxed
264 read functions will provide significant performance benefits on
265 platforms that support it. The qla2xxx driver provides examples
266 of how to use <function>readX_relaxed</function>. In many cases,
267 a majority of the driver's <function>readX</function> calls can
268 safely be converted to <function>readX_relaxed</function> calls, since
269 only a few will indicate or depend on DMA completion.
270 </para>
271 </sect1>
272
273 <sect1>
274 <title>ISA legacy functions</title>
275 <para>
276 On older kernels (2.2 and earlier) the ISA bus could be read or
277 written with these functions and without ioremap being used. This is
278 no longer true in Linux 2.4. A set of equivalent functions exists for
279 easy legacy driver porting. The functions available are prefixed
280 with 'isa_' and are <function>isa_readb</function>,
281 <function>isa_writeb</function>, <function>isa_readw</function>,
282 <function>isa_writew</function>, <function>isa_readl</function>,
283 <function>isa_writel</function>, <function>isa_memcpy_fromio</function>
284 and <function>isa_memcpy_toio</function>.
285 </para>
286 <para>
287 These functions should not be used in new drivers, and will
288 eventually be going away.
289 </para>
290 </sect1>
291
292 </chapter>
293
294 <chapter>
295 <title>Port Space Accesses</title>
296 <sect1>
297 <title>Port Space Explained</title>
298
299 <para>
300 Another form of IO commonly supported is Port Space. This is a
301 range of addresses separate to the normal memory address space.
302 Access to these addresses is generally not as fast as accesses
303 to the memory mapped addresses, and it also has a potentially
304 smaller address space.
305 </para>
306
307 <para>
308 Unlike memory mapped IO, no preparation is required
309 to access port space.
310 </para>
311
312 </sect1>
313 <sect1>
314 <title>Accessing Port Space</title>
315 <para>
316 Accesses to this space are provided through a set of functions
317 which allow 8-bit, 16-bit and 32-bit accesses; also
318 known as byte, word and long. These functions are
319 <function>inb</function>, <function>inw</function>,
320 <function>inl</function>, <function>outb</function>,
321 <function>outw</function> and <function>outl</function>.
322 </para>
323
324 <para>
325 Some variants are provided for these functions. Some devices
326 require that accesses to their ports are slowed down. This
327 functionality is provided by appending a <function>_p</function>
328 to the end of the function. There are also equivalents to memcpy.
329 The <function>ins</function> and <function>outs</function>
330 functions copy bytes, words or longs to the given port.
331 </para>
332 </sect1>
333
334 </chapter>
335
336 <chapter id="pubfunctions">
337 <title>Public Functions Provided</title>
338!Einclude/asm-i386/io.h
339 </chapter>
340
341</book>
diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl
new file mode 100644
index 000000000000..a34442436128
--- /dev/null
+++ b/Documentation/DocBook/gadget.tmpl
@@ -0,0 +1,752 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="USB-Gadget-API">
6 <bookinfo>
7 <title>USB Gadget API for Linux</title>
8 <date>20 August 2004</date>
9 <edition>20 August 2004</edition>
10
11 <legalnotice>
12 <para>
13 This documentation is free software; you can redistribute
14 it and/or modify it under the terms of the GNU General Public
15 License as published by the Free Software Foundation; either
16 version 2 of the License, or (at your option) any later
17 version.
18 </para>
19
20 <para>
21 This program is distributed in the hope that it will be
22 useful, but WITHOUT ANY WARRANTY; without even the implied
23 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
24 See the GNU General Public License for more details.
25 </para>
26
27 <para>
28 You should have received a copy of the GNU General Public
29 License along with this program; if not, write to the Free
30 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
31 MA 02111-1307 USA
32 </para>
33
34 <para>
35 For more details see the file COPYING in the source
36 distribution of Linux.
37 </para>
38 </legalnotice>
39 <copyright>
40 <year>2003-2004</year>
41 <holder>David Brownell</holder>
42 </copyright>
43
44 <author>
45 <firstname>David</firstname>
46 <surname>Brownell</surname>
47 <affiliation>
48 <address><email>dbrownell@users.sourceforge.net</email></address>
49 </affiliation>
50 </author>
51 </bookinfo>
52
53<toc></toc>
54
55<chapter><title>Introduction</title>
56
57<para>This document presents a Linux-USB "Gadget"
58kernel mode
59API, for use within peripherals and other USB devices
60that embed Linux.
61It provides an overview of the API structure,
62and shows how that fits into a system development project.
63This is the first such API released on Linux to address
64a number of important problems, including: </para>
65
66<itemizedlist>
67 <listitem><para>Supports USB 2.0, for high speed devices which
68 can stream data at several dozen megabytes per second.
69 </para></listitem>
70 <listitem><para>Handles devices with dozens of endpoints just as
71 well as ones with just two fixed-function ones. Gadget drivers
72 can be written so they're easy to port to new hardware.
73 </para></listitem>
74 <listitem><para>Flexible enough to expose more complex USB device
75 capabilities such as multiple configurations, multiple interfaces,
76 composite devices,
77 and alternate interface settings.
78 </para></listitem>
79 <listitem><para>USB "On-The-Go" (OTG) support, in conjunction
80 with updates to the Linux-USB host side.
81 </para></listitem>
82 <listitem><para>Sharing data structures and API models with the
83 Linux-USB host side API. This helps the OTG support, and
84 looks forward to more-symmetric frameworks (where the same
85 I/O model is used by both host and device side drivers).
86 </para></listitem>
87 <listitem><para>Minimalist, so it's easier to support new device
88 controller hardware. I/O processing doesn't imply large
89 demands for memory or CPU resources.
90 </para></listitem>
91</itemizedlist>
92
93
94<para>Most Linux developers will not be able to use this API, since they
95have USB "host" hardware in a PC, workstation, or server.
96Linux users with embedded systems are more likely to
97have USB peripheral hardware.
98To distinguish drivers running inside such hardware from the
99more familiar Linux "USB device drivers",
100which are host side proxies for the real USB devices,
101a different term is used:
102the drivers inside the peripherals are "USB gadget drivers".
103In USB protocol interactions, the device driver is the master
104(or "client driver")
105and the gadget driver is the slave (or "function driver").
106</para>
107
108<para>The gadget API resembles the host side Linux-USB API in that both
109use queues of request objects to package I/O buffers, and those requests
110may be submitted or canceled.
111They share common definitions for the standard USB
112<emphasis>Chapter 9</emphasis> messages, structures, and constants.
113Also, both APIs bind and unbind drivers to devices.
114The APIs differ in detail, since the host side's current
115URB framework exposes a number of implementation details
116and assumptions that are inappropriate for a gadget API.
117While the model for control transfers and configuration
118management is necessarily different (one side is a hardware-neutral master,
119the other is a hardware-aware slave), the endpoint I/O API used here
120should also be usable for an overhead-reduced host side API.
121</para>
122
123</chapter>
124
125<chapter id="structure"><title>Structure of Gadget Drivers</title>
126
127<para>A system running inside a USB peripheral
128normally has at least three layers inside the kernel to handle
129USB protocol processing, and may have additional layers in
130user space code.
131The "gadget" API is used by the middle layer to interact
132with the lowest level (which directly handles hardware).
133</para>
134
135<para>In Linux, from the bottom up, these layers are:
136</para>
137
138<variablelist>
139
140 <varlistentry>
141 <term><emphasis>USB Controller Driver</emphasis></term>
142
143 <listitem>
144 <para>This is the lowest software level.
145 It is the only layer that talks to hardware,
146 through registers, fifos, dma, irqs, and the like.
147 The <filename>&lt;linux/usb_gadget.h&gt;</filename> API abstracts
148 the peripheral controller endpoint hardware.
149 That hardware is exposed through endpoint objects, which accept
150 streams of IN/OUT buffers, and through callbacks that interact
151 with gadget drivers.
152 Since normal USB devices only have one upstream
153 port, they only have one of these drivers.
154 The controller driver can support any number of different
155 gadget drivers, but only one of them can be used at a time.
156 </para>
157
158 <para>Examples of such controller hardware include
159 the PCI-based NetChip 2280 USB 2.0 high speed controller,
160 the SA-11x0 or PXA-25x UDC (found within many PDAs),
161 and a variety of other products.
162 </para>
163
164 </listitem></varlistentry>
165
166 <varlistentry>
167 <term><emphasis>Gadget Driver</emphasis></term>
168
169 <listitem>
170 <para>The lower boundary of this driver implements hardware-neutral
171 USB functions, using calls to the controller driver.
172 Because such hardware varies widely in capabilities and restrictions,
173 and is used in embedded environments where space is at a premium,
174 the gadget driver is often configured at compile time
175 to work with endpoints supported by one particular controller.
176 Gadget drivers may be portable to several different controllers,
177 using conditional compilation.
178 (Recent kernels substantially simplify the work involved in
179 supporting new hardware, by <emphasis>autoconfiguring</emphasis>
180 endpoints automatically for many bulk-oriented drivers.)
181 Gadget driver responsibilities include:
182 </para>
183 <itemizedlist>
184 <listitem><para>handling setup requests (ep0 protocol responses)
185 possibly including class-specific functionality
186 </para></listitem>
187 <listitem><para>returning configuration and string descriptors
188 </para></listitem>
189 <listitem><para>(re)setting configurations and interface
190 altsettings, including enabling and configuring endpoints
191 </para></listitem>
192 <listitem><para>handling life cycle events, such as managing
193 bindings to hardware,
194 USB suspend/resume, remote wakeup,
195 and disconnection from the USB host.
196 </para></listitem>
197 <listitem><para>managing IN and OUT transfers on all currently
198 enabled endpoints
199 </para></listitem>
200 </itemizedlist>
201
202 <para>
203 Such drivers may be modules of proprietary code, although
204 that approach is discouraged in the Linux community.
205 </para>
206 </listitem></varlistentry>
207
208 <varlistentry>
209 <term><emphasis>Upper Level</emphasis></term>
210
211 <listitem>
212 <para>Most gadget drivers have an upper boundary that connects
213 to some driver or framework in Linux.
214 Through that boundary flows the data which the gadget driver
215 produces and/or consumes through protocol transfers over USB.
216 Examples include:
217 </para>
218 <itemizedlist>
219 <listitem><para>user mode code, using generic (gadgetfs)
220 or application specific files in
221 <filename>/dev</filename>
222 </para></listitem>
223 <listitem><para>networking subsystem (for network gadgets,
224 like the CDC Ethernet Model gadget driver)
225 </para></listitem>
226 <listitem><para>data capture drivers, perhaps video4Linux or
227 a scanner driver; or test and measurement hardware.
228 </para></listitem>
229 <listitem><para>input subsystem (for HID gadgets)
230 </para></listitem>
231 <listitem><para>sound subsystem (for audio gadgets)
232 </para></listitem>
233 <listitem><para>file system (for PTP gadgets)
234 </para></listitem>
235 <listitem><para>block i/o subsystem (for usb-storage gadgets)
236 </para></listitem>
237 <listitem><para>... and more </para></listitem>
238 </itemizedlist>
239 </listitem></varlistentry>
240
241 <varlistentry>
242 <term><emphasis>Additional Layers</emphasis></term>
243
244 <listitem>
245 <para>Other layers may exist.
246 These could include kernel layers, such as network protocol stacks,
247 as well as user mode applications building on standard POSIX
248 system call APIs such as
249 <emphasis>open()</emphasis>, <emphasis>close()</emphasis>,
250 <emphasis>read()</emphasis> and <emphasis>write()</emphasis>.
251 On newer systems, POSIX Async I/O calls may be an option.
252 Such user mode code will not necessarily be subject to
253 the GNU General Public License (GPL).
254 </para>
255 </listitem></varlistentry>
256
257
258</variablelist>
259
260<para>OTG-capable systems will also need to include a standard Linux-USB
261host side stack,
262with <emphasis>usbcore</emphasis>,
263one or more <emphasis>Host Controller Drivers</emphasis> (HCDs),
264<emphasis>USB Device Drivers</emphasis> to support
265the OTG "Targeted Peripheral List",
266and so forth.
267There will also be an <emphasis>OTG Controller Driver</emphasis>,
268which is visible to gadget and device driver developers only indirectly.
269That helps the host and device side USB controllers implement the
270two new OTG protocols (HNP and SRP).
271Roles switch (host to peripheral, or vice versa) using HNP
272during USB suspend processing, and SRP can be viewed as a
273more battery-friendly kind of device wakeup protocol.
274</para>
275
276<para>Over time, reusable utilities are evolving to help make some
277gadget driver tasks simpler.
278For example, building configuration descriptors from vectors of
279descriptors for the configuration's interfaces and endpoints is
280now automated, and many drivers now use autoconfiguration to
281choose hardware endpoints and initialize their descriptors.
282
283A potential example of particular interest
284is code implementing standard USB-IF protocols for
285HID, networking, storage, or audio classes.
286Some developers are interested in KDB or KGDB hooks, to let
287target hardware be remotely debugged.
288Most such USB protocol code doesn't need to be hardware-specific,
289any more than network protocols like X11, HTTP, or NFS are.
290Such gadget-side interface drivers should eventually be combined,
291to implement composite devices.
292</para>
293
294</chapter>
295
296
297<chapter id="api"><title>Kernel Mode Gadget API</title>
298
299<para>Gadget drivers declare themselves through a
300<emphasis>struct usb_gadget_driver</emphasis>, which is responsible for
301most parts of enumeration for a <emphasis>struct usb_gadget</emphasis>.
302The response to a set_configuration usually involves
303enabling one or more of the <emphasis>struct usb_ep</emphasis> objects
304exposed by the gadget, and submitting one or more
305<emphasis>struct usb_request</emphasis> buffers to transfer data.
306Understand those four data types, and their operations, and
307you will understand how this API works.
308</para>
309
310<note><title>Incomplete Data Type Descriptions</title>
311
312<para>This documentation was prepared using the standard Linux
313kernel <filename>docproc</filename> tool, which turns text
314and in-code comments into SGML DocBook and then into usable
315formats such as HTML or PDF.
316Other than the "Chapter 9" data types, most of the significant
317data types and functions are described here.
318</para>
319
320<para>However, docproc does not understand all the C constructs
321that are used, so some relevant information is likely omitted from
322what you are reading.
323One example of such information is endpoint autoconfiguration.
324You'll have to read the header file, and use example source
325code (such as that for "Gadget Zero"), to fully understand the API.
326</para>
327
328<para>The part of the API implementing some basic
329driver capabilities is specific to the version of the
330Linux kernel that's in use.
331The 2.6 kernel includes a <emphasis>driver model</emphasis>
332framework that has no analogue on earlier kernels;
333so those parts of the gadget API are not fully portable.
334(They are implemented on 2.4 kernels, but in a different way.)
335The driver model state is another part of this API that is
336ignored by the kerneldoc tools.
337</para>
338</note>
339
340<para>The core API does not expose
341every possible hardware feature, only the most widely available ones.
342There are significant hardware features, such as device-to-device DMA
343(without temporary storage in a memory buffer)
344that would be added using hardware-specific APIs.
345</para>
346
347<para>This API allows drivers to use conditional compilation to handle
348endpoint capabilities of different hardware, but doesn't require that.
349Hardware tends to have arbitrary restrictions, relating to
350transfer types, addressing, packet sizes, buffering, and availability.
351As a rule, such differences only matter for "endpoint zero" logic
352that handles device configuration and management.
353The API supports limited run-time
354detection of capabilities, through naming conventions for endpoints.
355Many drivers will be able to at least partially autoconfigure
356themselves.
357In particular, driver init sections will often have endpoint
358autoconfiguration logic that scans the hardware's list of endpoints
359to find ones matching the driver requirements
360(relying on those conventions), to eliminate some of the most
361common reasons for conditional compilation.
362</para>
363
364<para>Like the Linux-USB host side API, this API exposes
365the "chunky" nature of USB messages: I/O requests are in terms
366of one or more "packets", and packet boundaries are visible to drivers.
367Compared to RS-232 serial protocols, USB resembles
368synchronous protocols like HDLC
369(N bytes per frame, multipoint addressing, host as the primary
370station and devices as secondary stations)
371more than asynchronous ones
372(tty style: 8 data bits per frame, no parity, one stop bit).
373So for example the controller drivers won't buffer
374two single byte writes into a single two-byte USB IN packet,
375although gadget drivers may do so when they implement
376protocols where packet boundaries (and "short packets")
377are not significant.
378</para>
379
380<sect1 id="lifecycle"><title>Driver Life Cycle</title>
381
382<para>Gadget drivers make endpoint I/O requests to hardware without
383needing to know many details of the hardware, but driver
384setup/configuration code needs to handle some differences.
385Use the API like this:
386</para>
387
388<orderedlist numeration='arabic'>
389
390<listitem><para>Register a driver for the particular device side
391usb controller hardware,
392such as the net2280 on PCI (USB 2.0),
393sa11x0 or pxa25x as found in Linux PDAs,
394and so on.
395At this point the device is logically in the USB ch9 initial state
396("attached"), drawing no power and not usable
397(since it does not yet support enumeration).
398Any host should not see the device, since it's not
399activated the data line pullup used by the host to
400detect a device, even if VBUS power is available.
401</para></listitem>
402
403<listitem><para>Register a gadget driver that implements some higher level
404device function. That will then bind() to a usb_gadget, which
405activates the data line pullup sometime after detecting VBUS.
406</para></listitem>
407
408<listitem><para>The hardware driver can now start enumerating.
409The steps it handles are to accept USB power and set_address requests.
410Other steps are handled by the gadget driver.
411If the gadget driver module is unloaded before the host starts to
412enumerate, steps before step 7 are skipped.
413</para></listitem>
414
415<listitem><para>The gadget driver's setup() call returns usb descriptors,
416based both on what the bus interface hardware provides and on the
417functionality being implemented.
418That can involve alternate settings or configurations,
419unless the hardware prevents such operation.
420For OTG devices, each configuration descriptor includes
421an OTG descriptor.
422</para></listitem>
423
424<listitem><para>The gadget driver handles the last step of enumeration,
425when the USB host issues a set_configuration call.
426It enables all endpoints used in that configuration,
427with all interfaces in their default settings.
428That involves using a list of the hardware's endpoints, enabling each
429endpoint according to its descriptor.
430It may also involve using <function>usb_gadget_vbus_draw</function>
431to let more power be drawn from VBUS, as allowed by that configuration.
432For OTG devices, setting a configuration may also involve reporting
433HNP capabilities through a user interface.
434</para></listitem>
435
436<listitem><para>Do real work and perform data transfers, possibly involving
437changes to interface settings or switching to new configurations, until the
438device is disconnect()ed from the host.
439Queue any number of transfer requests to each endpoint.
440It may be suspended and resumed several times before being disconnected.
441On disconnect, the drivers go back to step 3 (above).
442</para></listitem>
443
444<listitem><para>When the gadget driver module is being unloaded,
445the driver unbind() callback is issued. That lets the controller
446driver be unloaded.
447</para></listitem>
448
449</orderedlist>
450
451<para>Drivers will normally be arranged so that just loading the
452gadget driver module (or statically linking it into a Linux kernel)
453allows the peripheral device to be enumerated, but some drivers
454will defer enumeration until some higher level component (like
455a user mode daemon) enables it.
456Note that at this lowest level there are no policies about how
457ep0 configuration logic is implemented,
458except that it should obey USB specifications.
459Such issues are in the domain of gadget drivers,
460including knowing about implementation constraints
461imposed by some USB controllers
462or understanding that composite devices might happen to
463be built by integrating reusable components.
464</para>
465
466<para>Note that the lifecycle above can be slightly different
467for OTG devices.
468Other than providing an additional OTG descriptor in each
469configuration, only the HNP-related differences are particularly
470visible to driver code.
471They involve reporting requirements during the SET_CONFIGURATION
472request, and the option to invoke HNP during some suspend callbacks.
473Also, SRP changes the semantics of
474<function>usb_gadget_wakeup</function>
475slightly.
476</para>
477
478</sect1>
479
480<sect1 id="ch9"><title>USB 2.0 Chapter 9 Types and Constants</title>
481
482<para>Gadget drivers
483rely on common USB structures and constants
484defined in the
485<filename>&lt;linux/usb_ch9.h&gt;</filename>
486header file, which is standard in Linux 2.6 kernels.
487These are the same types and constants used by host
488side drivers (and usbcore).
489</para>
490
491!Iinclude/linux/usb_ch9.h
492</sect1>
493
494<sect1 id="core"><title>Core Objects and Methods</title>
495
496<para>These are declared in
497<filename>&lt;linux/usb_gadget.h&gt;</filename>,
498and are used by gadget drivers to interact with
499USB peripheral controller drivers.
500</para>
501
502 <!-- yeech, this is ugly in nsgmls PDF output.
503
504 the PDF bookmark and refentry output nesting is wrong,
505 and the member/argument documentation indents ugly.
506
507 plus something (docproc?) adds whitespace before the
508 descriptive paragraph text, so it can't line up right
509 unless the explanations are trivial.
510 -->
511
512!Iinclude/linux/usb_gadget.h
513</sect1>
514
515<sect1 id="utils"><title>Optional Utilities</title>
516
517<para>The core API is sufficient for writing a USB Gadget Driver,
518but some optional utilities are provided to simplify common tasks.
519These utilities include endpoint autoconfiguration.
520</para>
521
522!Edrivers/usb/gadget/usbstring.c
523!Edrivers/usb/gadget/config.c
524<!-- !Edrivers/usb/gadget/epautoconf.c -->
525</sect1>
526
527</chapter>
528
529<chapter id="controllers"><title>Peripheral Controller Drivers</title>
530
531<para>The first hardware supporting this API was the NetChip 2280
532controller, which supports USB 2.0 high speed and is based on PCI.
533This is the <filename>net2280</filename> driver module.
534The driver supports Linux kernel versions 2.4 and 2.6;
535contact NetChip Technologies for development boards and product
536information.
537</para>
538
539<para>Other hardware working in the "gadget" framework includes:
540Intel's PXA 25x and IXP42x series processors
541(<filename>pxa2xx_udc</filename>),
542Toshiba TC86c001 "Goku-S" (<filename>goku_udc</filename>),
543Renesas SH7705/7727 (<filename>sh_udc</filename>),
544MediaQ 11xx (<filename>mq11xx_udc</filename>),
545Hynix HMS30C7202 (<filename>h7202_udc</filename>),
546National 9303/4 (<filename>n9604_udc</filename>),
547Texas Instruments OMAP (<filename>omap_udc</filename>),
548Sharp LH7A40x (<filename>lh7a40x_udc</filename>),
549and more.
550Most of those are full speed controllers.
551</para>
552
553<para>At this writing, there are people at work on drivers in
554this framework for several other USB device controllers,
555with plans to make many of them be widely available.
556</para>
557
558<!-- !Edrivers/usb/gadget/net2280.c -->
559
560<para>A partial USB simulator,
561the <filename>dummy_hcd</filename> driver, is available.
562It can act like a net2280, a pxa25x, or an sa11x0 in terms
563of available endpoints and device speeds; and it simulates
564control, bulk, and to some extent interrupt transfers.
565That lets you develop some parts of a gadget driver on a normal PC,
566without any special hardware, and perhaps with the assistance
567of tools such as GDB running with User Mode Linux.
568At least one person has expressed interest in adapting that
569approach, hooking it up to a simulator for a microcontroller.
570Such simulators can help debug subsystems where the runtime hardware
571is unfriendly to software development, or is not yet available.
572</para>
573
574<para>Support for other controllers is expected to be developed
575and contributed
576over time, as this driver framework evolves.
577</para>
578
579</chapter>
580
581<chapter id="gadget"><title>Gadget Drivers</title>
582
583<para>In addition to <emphasis>Gadget Zero</emphasis>
584(used primarily for testing and development with drivers
585for usb controller hardware), other gadget drivers exist.
586</para>
587
588<para>There's an <emphasis>ethernet</emphasis> gadget
589driver, which implements one of the most useful
590<emphasis>Communications Device Class</emphasis> (CDC) models.
591One of the standards for cable modem interoperability even
592specifies the use of this ethernet model as one of two
593mandatory options.
594Gadgets using this code look to a USB host as if they're
595an Ethernet adapter.
596It provides access to a network where the gadget's CPU is one host,
597which could easily be bridging, routing, or firewalling
598access to other networks.
599Since some hardware can't fully implement the CDC Ethernet
600requirements, this driver also implements a "good parts only"
601subset of CDC Ethernet.
602(That subset doesn't advertise itself as CDC Ethernet,
603to avoid creating problems.)
604</para>
605
606<para>Support for Microsoft's <emphasis>RNDIS</emphasis>
607protocol has been contributed by Pengutronix and Auerswald GmbH.
608This is like CDC Ethernet, but it runs on slightly more USB hardware
609(but less than the CDC subset).
610However, its main claim to fame is being able to connect directly to
611recent versions of Windows, using drivers that Microsoft bundles
612and supports, making it much simpler to network with Windows.
613</para>
614
615<para>There is also support for user mode gadget drivers,
616using <emphasis>gadgetfs</emphasis>.
617This provides a <emphasis>User Mode API</emphasis> that presents
618each endpoint as a single file descriptor. I/O is done using
619normal <emphasis>read()</emphasis> and <emphasis>write()</emphasis> calls.
620Familiar tools like GDB and pthreads can be used to
621develop and debug user mode drivers, so that once a robust
622controller driver is available many applications for it
623won't require new kernel mode software.
624Linux 2.6 <emphasis>Async I/O (AIO)</emphasis>
625support is available, so that user mode software
626can stream data with only slightly more overhead
627than a kernel driver.
628</para>
629
630<para>There's a USB Mass Storage class driver, which provides
631a different solution for interoperability with systems such
632as MS-Windows and MacOS.
633That <emphasis>File-backed Storage</emphasis> driver uses a
634file or block device as backing store for a drive,
635like the <filename>loop</filename> driver.
636The USB host uses the BBB, CB, or CBI versions of the mass
637storage class specification, using transparent SCSI commands
638to access the data from the backing store.
639</para>
640
641<para>There's a "serial line" driver, useful for TTY style
642operation over USB.
643The latest version of that driver supports CDC ACM style
644operation, like a USB modem, and so on most hardware it can
645interoperate easily with MS-Windows.
646One interesting use of that driver is in boot firmware (like a BIOS),
647which can sometimes use that model with very small systems without
648real serial lines.
649</para>
650
651<para>Support for other kinds of gadget is expected to
652be developed and contributed
653over time, as this driver framework evolves.
654</para>
655
656</chapter>
657
658<chapter id="otg"><title>USB On-The-Go (OTG)</title>
659
660<para>USB OTG support on Linux 2.6 was initially developed
661by Texas Instruments for
662<ulink url="http://www.omap.com">OMAP</ulink> 16xx and 17xx
663series processors.
664Other OTG systems should work in similar ways, but the
665hardware level details could be very different.
666</para>
667
668<para>Systems need specialized hardware support to implement OTG,
669notably including a special <emphasis>Mini-AB</emphasis> jack
670and associated transceiver to support <emphasis>Dual-Role</emphasis>
671operation:
672they can act either as a host, using the standard
673Linux-USB host side driver stack,
674or as a peripheral, using this "gadget" framework.
675To do that, the system software relies on small additions
676to those programming interfaces,
677and on a new internal component (here called an "OTG Controller")
678affecting which driver stack connects to the OTG port.
679In each role, the system can re-use the existing pool of
680hardware-neutral drivers, layered on top of the controller
681driver interfaces (<emphasis>usb_bus</emphasis> or
682<emphasis>usb_gadget</emphasis>).
683Such drivers need at most minor changes, and most of the calls
684added to support OTG can also benefit non-OTG products.
685</para>
686
687<itemizedlist>
688 <listitem><para>Gadget drivers test the <emphasis>is_otg</emphasis>
689 flag, and use it to determine whether or not to include
690 an OTG descriptor in each of their configurations.
691 </para></listitem>
692 <listitem><para>Gadget drivers may need changes to support the
693 two new OTG protocols, exposed in new gadget attributes
694 such as the <emphasis>b_hnp_enable</emphasis> flag.
695 HNP support should be reported through a user interface
696 (two LEDs could suffice), and is triggered in some cases
697 when the host suspends the peripheral.
698 SRP support can be user-initiated just like remote wakeup,
699 probably by pressing the same button.
700 </para></listitem>
701 <listitem><para>On the host side, USB device drivers need
702 to be taught to trigger HNP at appropriate moments, using
703 <function>usb_suspend_device()</function>.
704 That also conserves battery power, which is useful even
705 for non-OTG configurations.
706 </para></listitem>
707 <listitem><para>Also on the host side, a driver must support the
708 OTG "Targeted Peripheral List". That's just a whitelist,
709 used to reject peripherals not supported with a given
710 Linux OTG host.
711 <emphasis>This whitelist is product-specific;
712 each product must modify <filename>otg_whitelist.h</filename>
713 to match its interoperability specification.
714 </emphasis>
715 </para>
716 <para>Non-OTG Linux hosts, like PCs and workstations,
717 normally have some solution for adding drivers, so that
718 peripherals that aren't recognized can eventually be supported.
719 That approach is unreasonable for consumer products that may
720 never have their firmware upgraded, and where it's usually
721 unrealistic to expect traditional PC/workstation/server kinds
722 of support model to work.
723 For example, it's often impractical to change device firmware
724 once the product has been distributed, so driver bugs can't
725 normally be fixed if they're found after shipment.
726 </para></listitem>
727</itemizedlist>
728
729<para>
730Additional changes are needed below those hardware-neutral
731<emphasis>usb_bus</emphasis> and <emphasis>usb_gadget</emphasis>
732driver interfaces; those aren't discussed here in any detail.
733Those affect the hardware-specific code for each USB Host or Peripheral
734controller, and how the HCD initializes (since OTG can be active only
735on a single port).
736They also involve what may be called an <emphasis>OTG Controller
737Driver</emphasis>, managing the OTG transceiver and the OTG state
738machine logic as well as much of the root hub behavior for the
739OTG port.
740The OTG controller driver needs to activate and deactivate USB
741controllers depending on the relevant device role.
742Some related changes were needed inside usbcore, so that it
743can identify OTG-capable devices and respond appropriately
744to HNP or SRP protocols.
745</para>
746
747</chapter>
748
749</book>
750<!--
751 vim:syntax=sgml:sw=4
752-->
diff --git a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl
new file mode 100644
index 000000000000..1ef6f43c6d8f
--- /dev/null
+++ b/Documentation/DocBook/journal-api.tmpl
@@ -0,0 +1,333 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="LinuxJBDAPI">
6 <bookinfo>
7 <title>The Linux Journalling API</title>
8 <authorgroup>
9 <author>
10 <firstname>Roger</firstname>
11 <surname>Gammans</surname>
12 <affiliation>
13 <address>
14 <email>rgammans@computer-surgery.co.uk</email>
15 </address>
16 </affiliation>
17 </author>
18 </authorgroup>
19
20 <authorgroup>
21 <author>
22 <firstname>Stephen</firstname>
23 <surname>Tweedie</surname>
24 <affiliation>
25 <address>
26 <email>sct@redhat.com</email>
27 </address>
28 </affiliation>
29 </author>
30 </authorgroup>
31
32 <copyright>
33 <year>2002</year>
34 <holder>Roger Gammans</holder>
35 </copyright>
36
37<legalnotice>
38 <para>
39 This documentation is free software; you can redistribute
40 it and/or modify it under the terms of the GNU General Public
41 License as published by the Free Software Foundation; either
42 version 2 of the License, or (at your option) any later
43 version.
44 </para>
45
46 <para>
47 This program is distributed in the hope that it will be
48 useful, but WITHOUT ANY WARRANTY; without even the implied
49 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
50 See the GNU General Public License for more details.
51 </para>
52
53 <para>
54 You should have received a copy of the GNU General Public
55 License along with this program; if not, write to the Free
56 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
57 MA 02111-1307 USA
58 </para>
59
60 <para>
61 For more details see the file COPYING in the source
62 distribution of Linux.
63 </para>
64 </legalnotice>
65 </bookinfo>
66
67<toc></toc>
68
69 <chapter id="Overview">
70 <title>Overview</title>
71 <sect1>
72 <title>Details</title>
73<para>
74The journalling layer is easy to use. You need to
75first of all create a journal_t data structure. There are
76two calls to do this dependent on how you decide to allocate the physical
77media on which the journal resides. The journal_init_inode() call
78is for journals stored in filesystem inodes, or the journal_init_dev()
79call can be used for journals stored on a raw device (in a contiguous range
80of blocks). A journal_t is a typedef for a struct pointer, so when
81you are finally finished make sure you call journal_destroy() on it
82to free up any used kernel memory.
83</para>
84
85<para>
86Once you have got your journal_t object you need to 'mount' or load the journal
87file, unless of course you haven't initialised it yet - in which case you
88need to call journal_create().
89</para>
90
91<para>
92Most of the time however your journal file will already have been created, but
93before you load it you must call journal_wipe() to empty the journal file.
94Hang on, you say, what if the filesystem wasn't cleanly umount()'d? Well, it is the
95job of the client file system to detect this and skip the call to journal_wipe().
96</para>
97
98<para>
99In either case the next call should be to journal_load() which prepares the
100journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery()
101for you if it detects any outstanding transactions in the journal and similarly
102journal_load() will call journal_recover() if necessary.
103I would advise reading fs/ext3/super.c for examples on this stage.
104[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly
105complicate the API? Or isn't it a good idea for the journal layer to hide
106dirty mounts from the client fs?]
107</para>
108
109<para>
110Now you can go ahead and start modifying the underlying
111filesystem. Almost.
112</para>
113
114
115<para>
116
117You still need to actually journal your filesystem changes, this
118is done by wrapping them into transactions. Additionally you
119also need to wrap the modification of each of the buffers
120with calls to the journal layer, so it knows what the modifications
121you are actually making are. To do this use journal_start() which
122returns a transaction handle.
123</para>
124
125<para>
126journal_start()
127and its counterpart journal_stop(), which indicates the end of a transaction
128are nestable calls, so you can reenter a transaction if necessary,
129but remember you must call journal_stop() the same number of times as
130journal_start() before the transaction is completed (or more accurately
131leaves the update phase). Ext3/VFS makes use of this feature to simplify
132quota support.
133</para>
134
135<para>
136Inside each transaction you need to wrap the modifications to the
137individual buffers (blocks). Before you start to modify a buffer you
138need to call journal_get_{create,write,undo}_access() as appropriate,
139this allows the journalling layer to copy the unmodified data if it
140needs to. After all the buffer may be part of a previously uncommitted
141transaction.
142At this point you are at last ready to modify a buffer, and once
143you have done so you need to call journal_dirty_{meta,}data().
144Or if you've asked for access to a buffer you now know is no longer
145required to be pushed back on the device you can call journal_forget()
146in much the same way as you might have used bforget() in the past.
147</para>
148
149<para>
150A journal_flush() may be called at any time to commit and checkpoint
151all your transactions.
152</para>
153
154<para>
155Then at umount time, in your put_super() (2.4) or write_super() (2.5)
156you can then call journal_destroy() to clean up your in-core journal object.
157</para>
158
159
160<para>
161Unfortunately there are a couple of ways the journal layer can cause a deadlock.
162The first thing to note is that each task can only have
163a single outstanding transaction at any one time, remember nothing
164commits until the outermost journal_stop(). This means
165you must complete the transaction at the end of each file/inode/address
166etc. operation you perform, so that the journalling system isn't re-entered
167on another journal. This matters because transactions can't be nested/batched
168across differing journals, and another filesystem other than
169yours (say ext3) may be modified in a later syscall.
170</para>
171
172<para>
173The second case to bear in mind is that journal_start() can
174block if there isn't enough space in the journal for your transaction
175(based on the passed nblocks param) - when it blocks it merely(!) needs to
176wait for transactions to complete and be committed from other tasks,
177so essentially we are waiting for journal_stop(). So to avoid
178deadlocks you must treat journal_start/stop() as if they
179were semaphores and include them in your semaphore ordering rules to prevent
180deadlocks. Note that journal_extend() has similar blocking behaviour to
181journal_start() so you can deadlock here just as easily as on journal_start().
182</para>
183
184<para>
185Try to reserve the right number of blocks the first time. ;-). This will
186be the maximum number of blocks you are going to touch in this transaction.
187I advise having a look at at least ext3_jbd.h to see the basis on which
188ext3 makes these decisions.
189</para>
190
191<para>
192Another wriggle to watch out for is your on-disk block allocation strategy.
193Why? Because, if you undo a delete, you need to ensure you haven't reused any
194of the freed blocks in a later transaction. One simple way of doing this
195is make sure any blocks you allocate only have checkpointed transactions
196listed against them. Ext3 does this in ext3_test_allocatable().
197</para>
198
199<para>
200Locking is also provided through journal_{un,}lock_updates(),
201ext3 uses this when it wants a window with a clean and stable fs for a moment.
202eg.
203</para>
204
205<programlisting>
206
207 journal_lock_updates() //stop new stuff happening..
208 journal_flush() // checkpoint everything.
209 ..do stuff on stable fs
210 journal_unlock_updates() // carry on with filesystem use.
211</programlisting>
212
213<para>
214The opportunities for abuse and DOS attacks with this should be obvious,
215if you allow unprivileged userspace to trigger codepaths containing these
216calls.
217</para>
218
219<para>
220A new feature of jbd since 2.5.25 is commit callbacks. With the new
221journal_callback_set() function you can now ask the journalling layer
222to call you back when the transaction is finally committed to disk, so that
223you can do some of your own management. The key to this is the journal_callback
224struct, this maintains the internal callback information but you can
225extend it like this:-
226</para>
227<programlisting>
228 struct myfs_callback_s {
229 //Data structure element required by jbd..
230 struct journal_callback for_jbd;
231 // Stuff for myfs allocated together.
232 myfs_inode* i_commited;
233
234 }
235</programlisting>
236
237<para>
238this would be useful if you needed to know when data was committed to a
239particular inode.
240</para>
241
242</sect1>
243
244<sect1>
245<title>Summary</title>
246<para>
247Using the journal is a matter of wrapping the different context changes,
248being each mount, each modification (transaction) and each changed buffer
249to tell the journalling layer about them.
250</para>
251
252<para>
253Here is some pseudo code to give you an idea of how it works, as
254an example.
255</para>
256
257<programlisting>
258 journal_t* my_jnrl = journal_create();
259 journal_init_{dev,inode}(my_jnrl,...)
260 if (clean) journal_wipe();
261 journal_load();
262
263 foreach(transaction) { /*transactions must be
264 completed before
265 a syscall returns to
266 userspace*/
267
268 handle_t * xct=journal_start(my_jnrl);
269 foreach(bh) {
270 journal_get_{create,write,undo}_access(xct,bh);
271 if ( myfs_modify(bh) ) { /* returns true
272 if makes changes */
273 journal_dirty_{meta,}data(xct,bh);
274 } else {
275 journal_forget(bh);
276 }
277 }
278 journal_stop(xct);
279 }
280 journal_destroy(my_jnrl);
281</programlisting>
282</sect1>
283
284</chapter>
285
286 <chapter id="adt">
287 <title>Data Types</title>
288 <para>
289 The journalling layer uses typedefs to 'hide' the concrete definitions
290 of the structures used. As a client of the JBD layer you can
291 just rely on using the pointer as a magic cookie of some sort.
292
293 Obviously the hiding is not enforced as this is 'C'.
294 </para>
295 <sect1><title>Structures</title>
296!Iinclude/linux/jbd.h
297 </sect1>
298</chapter>
299
300 <chapter id="calls">
301 <title>Functions</title>
302 <para>
303 The functions here are split into two groups: those that
304 affect a journal as a whole, and those which are used to
305 manage transactions.
306</para>
307 <sect1><title>Journal Level</title>
308!Efs/jbd/journal.c
309!Efs/jbd/recovery.c
310 </sect1>
311 <sect1><title>Transaction Level</title>
312!Efs/jbd/transaction.c
313 </sect1>
314</chapter>
315<chapter>
316 <title>See also</title>
317 <para>
318 <citation>
319 <ulink url="ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz">
320 Journaling the Linux ext2fs Filesystem,LinuxExpo 98, Stephen Tweedie
321 </ulink>
322 </citation>
323 </para>
324 <para>
325 <citation>
326 <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html">
327 Ext3 Journalling FileSystem , OLS 2000, Dr. Stephen Tweedie
328 </ulink>
329 </citation>
330 </para>
331</chapter>
332
333</book>
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
new file mode 100644
index 000000000000..1bd20c860285
--- /dev/null
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -0,0 +1,342 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="LinuxKernelAPI">
6 <bookinfo>
7 <title>The Linux Kernel API</title>
8
9 <legalnotice>
10 <para>
11 This documentation is free software; you can redistribute
12 it and/or modify it under the terms of the GNU General Public
13 License as published by the Free Software Foundation; either
14 version 2 of the License, or (at your option) any later
15 version.
16 </para>
17
18 <para>
19 This program is distributed in the hope that it will be
20 useful, but WITHOUT ANY WARRANTY; without even the implied
21 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 See the GNU General Public License for more details.
23 </para>
24
25 <para>
26 You should have received a copy of the GNU General Public
27 License along with this program; if not, write to the Free
28 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
29 MA 02111-1307 USA
30 </para>
31
32 <para>
33 For more details see the file COPYING in the source
34 distribution of Linux.
35 </para>
36 </legalnotice>
37 </bookinfo>
38
39<toc></toc>
40
41 <chapter id="Basics">
42 <title>Driver Basics</title>
43 <sect1><title>Driver Entry and Exit points</title>
44!Iinclude/linux/init.h
45 </sect1>
46
47 <sect1><title>Atomic and pointer manipulation</title>
48!Iinclude/asm-i386/atomic.h
49!Iinclude/asm-i386/unaligned.h
50 </sect1>
51
52<!-- FIXME:
53 kernel/sched.c has no docs, which stuffs up the sgml. Comment
54 out until somebody adds docs. KAO
55 <sect1><title>Delaying, scheduling, and timer routines</title>
56X!Ekernel/sched.c
57 </sect1>
58KAO -->
59 </chapter>
60
61 <chapter id="adt">
62 <title>Data Types</title>
63 <sect1><title>Doubly Linked Lists</title>
64!Iinclude/linux/list.h
65 </sect1>
66 </chapter>
67
68 <chapter id="libc">
69 <title>Basic C Library Functions</title>
70
71 <para>
72 When writing drivers, you cannot in general use routines which are
73 from the C Library. Some of the functions have been found generally
74 useful and they are listed below. The behaviour of these functions
75 may vary slightly from those defined by ANSI, and these deviations
76 are noted in the text.
77 </para>
78
79 <sect1><title>String Conversions</title>
80!Ilib/vsprintf.c
81!Elib/vsprintf.c
82 </sect1>
83 <sect1><title>String Manipulation</title>
84!Ilib/string.c
85!Elib/string.c
86 </sect1>
87 <sect1><title>Bit Operations</title>
88!Iinclude/asm-i386/bitops.h
89 </sect1>
90 </chapter>
91
92 <chapter id="mm">
93 <title>Memory Management in Linux</title>
94 <sect1><title>The Slab Cache</title>
95!Emm/slab.c
96 </sect1>
97 <sect1><title>User Space Memory Access</title>
98!Iinclude/asm-i386/uaccess.h
99!Iarch/i386/lib/usercopy.c
100 </sect1>
101 </chapter>
102
103 <chapter id="kfifo">
104 <title>FIFO Buffer</title>
105 <sect1><title>kfifo interface</title>
106!Iinclude/linux/kfifo.h
107!Ekernel/kfifo.c
108 </sect1>
109 </chapter>
110
111 <chapter id="proc">
112 <title>The proc filesystem</title>
113
114 <sect1><title>sysctl interface</title>
115!Ekernel/sysctl.c
116 </sect1>
117 </chapter>
118
119 <chapter id="debugfs">
120 <title>The debugfs filesystem</title>
121
122 <sect1><title>debugfs interface</title>
123!Efs/debugfs/inode.c
124!Efs/debugfs/file.c
125 </sect1>
126 </chapter>
127
128 <chapter id="vfs">
129 <title>The Linux VFS</title>
130 <sect1><title>The Directory Cache</title>
131!Efs/dcache.c
132!Iinclude/linux/dcache.h
133 </sect1>
134 <sect1><title>Inode Handling</title>
135!Efs/inode.c
136!Efs/bad_inode.c
137 </sect1>
138 <sect1><title>Registration and Superblocks</title>
139!Efs/super.c
140 </sect1>
141 <sect1><title>File Locks</title>
142!Efs/locks.c
143!Ifs/locks.c
144 </sect1>
145 </chapter>
146
147 <chapter id="netcore">
148 <title>Linux Networking</title>
149 <sect1><title>Socket Buffer Functions</title>
150!Iinclude/linux/skbuff.h
151!Enet/core/skbuff.c
152 </sect1>
153 <sect1><title>Socket Filter</title>
154!Enet/core/filter.c
155 </sect1>
156 <sect1><title>Generic Network Statistics</title>
157!Iinclude/linux/gen_stats.h
158!Enet/core/gen_stats.c
159!Enet/core/gen_estimator.c
160 </sect1>
161 </chapter>
162
163 <chapter id="netdev">
164 <title>Network device support</title>
165 <sect1><title>Driver Support</title>
166!Enet/core/dev.c
167 </sect1>
168 <sect1><title>8390 Based Network Cards</title>
169!Edrivers/net/8390.c
170 </sect1>
171 <sect1><title>Synchronous PPP</title>
172!Edrivers/net/wan/syncppp.c
173 </sect1>
174 </chapter>
175
176 <chapter id="modload">
177 <title>Module Support</title>
178 <sect1><title>Module Loading</title>
179!Ekernel/kmod.c
180 </sect1>
181 <sect1><title>Inter Module support</title>
182 <para>
183 Refer to the file kernel/module.c for more information.
184 </para>
185<!-- FIXME: Removed for now since no structured comments in source
186X!Ekernel/module.c
187-->
188 </sect1>
189 </chapter>
190
191 <chapter id="hardware">
192 <title>Hardware Interfaces</title>
193 <sect1><title>Interrupt Handling</title>
194!Iarch/i386/kernel/irq.c
195 </sect1>
196
197 <sect1><title>MTRR Handling</title>
198!Earch/i386/kernel/cpu/mtrr/main.c
199 </sect1>
200 <sect1><title>PCI Support Library</title>
201!Edrivers/pci/pci.c
202 </sect1>
203 <sect1><title>PCI Hotplug Support Library</title>
204!Edrivers/pci/hotplug/pci_hotplug_core.c
205 </sect1>
206 <sect1><title>MCA Architecture</title>
207 <sect2><title>MCA Device Functions</title>
208 <para>
209 Refer to the file arch/i386/kernel/mca.c for more information.
210 </para>
211<!-- FIXME: Removed for now since no structured comments in source
212X!Earch/i386/kernel/mca.c
213-->
214 </sect2>
215 <sect2><title>MCA Bus DMA</title>
216!Iinclude/asm-i386/mca_dma.h
217 </sect2>
218 </sect1>
219 </chapter>
220
221 <chapter id="devfs">
222 <title>The Device File System</title>
223!Efs/devfs/base.c
224 </chapter>
225
226 <chapter id="security">
227 <title>Security Framework</title>
228!Esecurity/security.c
229 </chapter>
230
231 <chapter id="pmfuncs">
232 <title>Power Management</title>
233!Ekernel/power/pm.c
234 </chapter>
235
236 <chapter id="blkdev">
237 <title>Block Devices</title>
238!Edrivers/block/ll_rw_blk.c
239 </chapter>
240
241 <chapter id="miscdev">
242 <title>Miscellaneous Devices</title>
243!Edrivers/char/misc.c
244 </chapter>
245
246 <chapter id="viddev">
247 <title>Video4Linux</title>
248!Edrivers/media/video/videodev.c
249 </chapter>
250
251 <chapter id="snddev">
252 <title>Sound Devices</title>
253!Esound/sound_core.c
254<!-- FIXME: Removed for now since no structured comments in source
255X!Isound/sound_firmware.c
256-->
257 </chapter>
258
259 <chapter id="uart16x50">
260 <title>16x50 UART Driver</title>
261!Edrivers/serial/serial_core.c
262!Edrivers/serial/8250.c
263 </chapter>
264
265 <chapter id="z85230">
266 <title>Z85230 Support Library</title>
267!Edrivers/net/wan/z85230.c
268 </chapter>
269
270 <chapter id="fbdev">
271 <title>Frame Buffer Library</title>
272
273 <para>
274 The frame buffer drivers depend heavily on four data structures.
275 These structures are declared in include/linux/fb.h. They are
276 fb_info, fb_var_screeninfo, fb_fix_screeninfo and fb_monospecs.
277 The last three can be made available to and from userland.
278 </para>
279
280 <para>
281 fb_info defines the current state of a particular video card.
282 Inside fb_info, there exists a fb_ops structure which is a
283 collection of needed functions to make fbdev and fbcon work.
284 fb_info is only visible to the kernel.
285 </para>
286
287 <para>
288 fb_var_screeninfo is used to describe the features of a video card
289 that are user defined. With fb_var_screeninfo, things such as
290 depth and the resolution may be defined.
291 </para>
292
293 <para>
294 The next structure is fb_fix_screeninfo. This defines the
295 properties of a card that are created when a mode is set and can't
296 be changed otherwise. A good example of this is the start of the
297 frame buffer memory. This "locks" the address of the frame buffer
298 memory, so that it cannot be changed or moved.
299 </para>
300
301 <para>
302 The last structure is fb_monospecs. In the old API, there was
303 little importance for fb_monospecs. This allowed for forbidden things
304 such as setting a mode of 800x600 on a fixed frequency monitor. With
305 the new API, fb_monospecs prevents such things, and if used
306 correctly, can prevent a monitor from being cooked. fb_monospecs
307 will not be useful until kernels 2.5.x.
308 </para>
309
310 <sect1><title>Frame Buffer Memory</title>
311!Edrivers/video/fbmem.c
312 </sect1>
313 <sect1><title>Frame Buffer Console</title>
314!Edrivers/video/console/fbcon.c
315 </sect1>
316 <sect1><title>Frame Buffer Colormap</title>
317!Edrivers/video/fbcmap.c
318 </sect1>
319<!-- FIXME:
320 drivers/video/fbgen.c has no docs, which stuffs up the sgml. Comment
321 out until somebody adds docs. KAO
322 <sect1><title>Frame Buffer Generic Functions</title>
323X!Idrivers/video/fbgen.c
324 </sect1>
325KAO -->
326 <sect1><title>Frame Buffer Video Mode Database</title>
327!Idrivers/video/modedb.c
328!Edrivers/video/modedb.c
329 </sect1>
330 <sect1><title>Frame Buffer Macintosh Video Mode Database</title>
331!Idrivers/video/macmodes.c
332 </sect1>
333 <sect1><title>Frame Buffer Fonts</title>
334 <para>
335 Refer to the file drivers/video/console/fonts.c for more information.
336 </para>
337<!-- FIXME: Removed for now since no structured comments in source
338X!Idrivers/video/console/fonts.c
339-->
340 </sect1>
341 </chapter>
342</book>
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
new file mode 100644
index 000000000000..49a9ef82d575
--- /dev/null
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -0,0 +1,1349 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="lk-hacking-guide">
6 <bookinfo>
7 <title>Unreliable Guide To Hacking The Linux Kernel</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Paul</firstname>
12 <othername>Rusty</othername>
13 <surname>Russell</surname>
14 <affiliation>
15 <address>
16 <email>rusty@rustcorp.com.au</email>
17 </address>
18 </affiliation>
19 </author>
20 </authorgroup>
21
22 <copyright>
23 <year>2001</year>
24 <holder>Rusty Russell</holder>
25 </copyright>
26
27 <legalnotice>
28 <para>
29 This documentation is free software; you can redistribute
30 it and/or modify it under the terms of the GNU General Public
31 License as published by the Free Software Foundation; either
32 version 2 of the License, or (at your option) any later
33 version.
34 </para>
35
36 <para>
37 This program is distributed in the hope that it will be
38 useful, but WITHOUT ANY WARRANTY; without even the implied
39 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
40 See the GNU General Public License for more details.
41 </para>
42
43 <para>
44 You should have received a copy of the GNU General Public
45 License along with this program; if not, write to the Free
46 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
47 MA 02111-1307 USA
48 </para>
49
50 <para>
51 For more details see the file COPYING in the source
52 distribution of Linux.
53 </para>
54 </legalnotice>
55
56 <releaseinfo>
57 This is the first release of this document as part of the kernel tarball.
58 </releaseinfo>
59
60 </bookinfo>
61
62 <toc></toc>
63
64 <chapter id="introduction">
65 <title>Introduction</title>
66 <para>
67 Welcome, gentle reader, to Rusty's Unreliable Guide to Linux
68 Kernel Hacking. This document describes the common routines and
69 general requirements for kernel code: its goal is to serve as a
70 primer for Linux kernel development for experienced C
71 programmers. I avoid implementation details: that's what the
72 code is for, and I ignore whole tracts of useful routines.
73 </para>
74 <para>
75 Before you read this, please understand that I never wanted to
76 write this document, being grossly under-qualified, but I always
77 wanted to read it, and this was the only way. I hope it will
78 grow into a compendium of best practice, common starting points
79 and random information.
80 </para>
81 </chapter>
82
83 <chapter id="basic-players">
84 <title>The Players</title>
85
86 <para>
87 At any time each of the CPUs in a system can be:
88 </para>
89
90 <itemizedlist>
91 <listitem>
92 <para>
93 not associated with any process, serving a hardware interrupt;
94 </para>
95 </listitem>
96
97 <listitem>
98 <para>
99 not associated with any process, serving a softirq, tasklet or bh;
100 </para>
101 </listitem>
102
103 <listitem>
104 <para>
105 running in kernel space, associated with a process;
106 </para>
107 </listitem>
108
109 <listitem>
110 <para>
111 running a process in user space.
112 </para>
113 </listitem>
114 </itemizedlist>
115
116 <para>
117 There is a strict ordering between these: other than the last
118 category (userspace) each can only be pre-empted by those above.
119 For example, while a softirq is running on a CPU, no other
120 softirq will pre-empt it, but a hardware interrupt can. However,
121 any other CPUs in the system execute independently.
122 </para>
123
124 <para>
125 We'll see a number of ways that the user context can block
126 interrupts, to become truly non-preemptable.
127 </para>
128
129 <sect1 id="basics-usercontext">
130 <title>User Context</title>
131
132 <para>
133 User context is when you are coming in from a system call or
134 other trap: you can sleep, and you own the CPU (except for
135 interrupts) until you call <function>schedule()</function>.
136 In other words, user context (unlike userspace) is not pre-emptable.
137 </para>
138
139 <note>
140 <para>
141 You are always in user context on module load and unload,
142 and on operations on the block device layer.
143 </para>
144 </note>
145
146 <para>
147 In user context, the <varname>current</varname> pointer (indicating
148 the task we are currently executing) is valid, and
149 <function>in_interrupt()</function>
150 (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false
151 </returnvalue>.
152 </para>
153
154 <caution>
155 <para>
156 Beware that if you have interrupts or bottom halves disabled
157 (see below), <function>in_interrupt()</function> will return a
158 false positive.
159 </para>
160 </caution>
161 </sect1>
162
163 <sect1 id="basics-hardirqs">
164 <title>Hardware Interrupts (Hard IRQs)</title>
165
166 <para>
167 Timer ticks, <hardware>network cards</hardware> and
168 <hardware>keyboard</hardware> are examples of real
169 hardware which produce interrupts at any time. The kernel runs
170 interrupt handlers, which service the hardware. The kernel
171 guarantees that this handler is never re-entered: if another
172 interrupt arrives, it is queued (or dropped). Because it
173 disables interrupts, this handler has to be fast: frequently it
174 simply acknowledges the interrupt, marks a `software interrupt'
175 for execution and exits.
176 </para>
177
178 <para>
179 You can tell you are in a hardware interrupt, because
180 <function>in_irq()</function> returns <returnvalue>true</returnvalue>.
181 </para>
182 <caution>
183 <para>
184 Beware that this will return a false positive if interrupts are disabled
185 (see below).
186 </para>
187 </caution>
188 </sect1>
189
190 <sect1 id="basics-softirqs">
191 <title>Software Interrupt Context: Bottom Halves, Tasklets, softirqs</title>
192
193 <para>
194 Whenever a system call is about to return to userspace, or a
195 hardware interrupt handler exits, any `software interrupts'
196 which are marked pending (usually by hardware interrupts) are
197 run (<filename>kernel/softirq.c</filename>).
198 </para>
199
200 <para>
201 Much of the real interrupt handling work is done here. Early in
202 the transition to <acronym>SMP</acronym>, there were only `bottom
203 halves' (BHs), which didn't take advantage of multiple CPUs. Shortly
204 after we switched from wind-up computers made of match-sticks and snot,
205 we abandoned this limitation.
206 </para>
207
208 <para>
209 <filename class="headerfile">include/linux/interrupt.h</filename> lists the
210 different BHs. No matter how many CPUs you have, no two BHs will run at
211 the same time. This made the transition to SMP simpler, but sucks hard for
212 scalable performance. A very important bottom half is the timer
213 BH (<filename class="headerfile">include/linux/timer.h</filename>): you
214 can register to have it call functions for you in a given length of time.
215 </para>
216
217 <para>
218 2.3.43 introduced softirqs, and re-implemented the (now
219 deprecated) BHs underneath them. Softirqs are fully-SMP
220 versions of BHs: they can run on as many CPUs at once as
221 required. This means they need to deal with any races in shared
222 data using their own locks. A bitmask is used to keep track of
223 which are enabled, so the 32 available softirqs should not be
224 used up lightly. (<emphasis>Yes</emphasis>, people will
225 notice).
226 </para>
227
228 <para>
229 Tasklets (<filename class="headerfile">include/linux/interrupt.h</filename>)
230 are like softirqs, except they are dynamically-registrable (meaning you
231 can have as many as you want), and they also guarantee that any tasklet
232 will only run on one CPU at any time, although different tasklets can
233 run simultaneously (unlike different BHs).
234 </para>
235 <caution>
236 <para>
237 The name `tasklet' is misleading: they have nothing to do with `tasks',
238 and probably more to do with some bad vodka Alexey Kuznetsov had at the
239 time.
240 </para>
241 </caution>
242
243 <para>
244 You can tell you are in a softirq (or bottom half, or tasklet)
245 using the <function>in_softirq()</function> macro
246 (<filename class="headerfile">include/linux/interrupt.h</filename>).
247 </para>
248 <caution>
249 <para>
250 Beware that this will return a false positive if a bh lock (see below)
251 is held.
252 </para>
253 </caution>
254 </sect1>
255 </chapter>
256
257 <chapter id="basic-rules">
258 <title>Some Basic Rules</title>
259
260 <variablelist>
261 <varlistentry>
262 <term>No memory protection</term>
263 <listitem>
264 <para>
265 If you corrupt memory, whether in user context or
266 interrupt context, the whole machine will crash. Are you
267 sure you can't do what you want in userspace?
268 </para>
269 </listitem>
270 </varlistentry>
271
272 <varlistentry>
273 <term>No floating point or <acronym>MMX</acronym></term>
274 <listitem>
275 <para>
276 The <acronym>FPU</acronym> context is not saved; even in user
277 context the <acronym>FPU</acronym> state probably won't
278 correspond with the current process: you would mess with some
279 user process' <acronym>FPU</acronym> state. If you really want
280 to do this, you would have to explicitly save/restore the full
281 <acronym>FPU</acronym> state (and avoid context switches). It
282 is generally a bad idea; use fixed point arithmetic first.
283 </para>
284 </listitem>
285 </varlistentry>
286
287 <varlistentry>
288 <term>A rigid stack limit</term>
289 <listitem>
290 <para>
291 The kernel stack is about 6K in 2.2 (for most
292 architectures: it's about 14K on the Alpha), and shared
293 with interrupts so you can't use it all. Avoid deep
294 recursion and huge local arrays on the stack (allocate
295 them dynamically instead).
296 </para>
297 </listitem>
298 </varlistentry>
299
300 <varlistentry>
301 <term>The Linux kernel is portable</term>
302 <listitem>
303 <para>
304 Let's keep it that way. Your code should be 64-bit clean,
305 and endian-independent. You should also minimize CPU
306 specific stuff, e.g. inline assembly should be cleanly
307 encapsulated and minimized to ease porting. Generally it
308 should be restricted to the architecture-dependent part of
309 the kernel tree.
310 </para>
311 </listitem>
312 </varlistentry>
313 </variablelist>
314 </chapter>
315
316 <chapter id="ioctls">
317 <title>ioctls: Not writing a new system call</title>
318
319 <para>
320 A system call generally looks like this:
321 </para>
322
323 <programlisting>
324asmlinkage long sys_mycall(int arg)
325{
326 return 0;
327}
328 </programlisting>
329
330 <para>
331 First, in most cases you don't want to create a new system call.
332 You create a character device and implement an appropriate ioctl
333 for it. This is much more flexible than system calls, doesn't have
334 to be entered in every architecture's
335 <filename class="headerfile">include/asm/unistd.h</filename> and
336 <filename>arch/kernel/entry.S</filename> file, and is much more
337 likely to be accepted by Linus.
338 </para>
339
340 <para>
341 If all your routine does is read or write some parameter, consider
342 implementing a <function>sysctl</function> interface instead.
343 </para>
344
345 <para>
346 Inside the ioctl you're in user context to a process. When an
347 error occurs you return a negated errno (see
348 <filename class="headerfile">include/linux/errno.h</filename>),
349 otherwise you return <returnvalue>0</returnvalue>.
350 </para>
351
352 <para>
353 After you have slept you should check if a signal occurred: the
354 Unix/Linux way of handling signals is to temporarily exit the
355 system call with the <constant>-ERESTARTSYS</constant> error. The
356 system call entry code will switch back to user context, process
357 the signal handler and then your system call will be restarted
358 (unless the user disabled that). So you should be prepared to
359 process the restart, e.g. if you're in the middle of manipulating
360 some data structure.
361 </para>
362
363 <programlisting>
364if (signal_pending(current))
365 return -ERESTARTSYS;
366 </programlisting>
367
368 <para>
369 If you're doing longer computations: first think userspace. If you
370 <emphasis>really</emphasis> want to do it in kernel you should
371 regularly check if you need to give up the CPU (remember there is
372 cooperative multitasking per CPU). Idiom:
373 </para>
374
375 <programlisting>
376cond_resched(); /* Will sleep */
377 </programlisting>
378
379 <para>
380 A short note on interface design: the UNIX system call motto is
381 "Provide mechanism not policy".
382 </para>
383 </chapter>
384
385 <chapter id="deadlock-recipes">
386 <title>Recipes for Deadlock</title>
387
388 <para>
389 You cannot call any routines which may sleep, unless:
390 </para>
391 <itemizedlist>
392 <listitem>
393 <para>
394 You are in user context.
395 </para>
396 </listitem>
397
398 <listitem>
399 <para>
400 You do not own any spinlocks.
401 </para>
402 </listitem>
403
404 <listitem>
405 <para>
406 You have interrupts enabled (actually, Andi Kleen says
407 that the scheduling code will enable them for you, but
408 that's probably not what you wanted).
409 </para>
410 </listitem>
411 </itemizedlist>
412
413 <para>
414 Note that some functions may sleep implicitly: common ones are
415 the user space access functions (*_user) and memory allocation
416 functions without <symbol>GFP_ATOMIC</symbol>.
417 </para>
418
419 <para>
420 You will eventually lock up your box if you break these rules.
421 </para>
422
423 <para>
424 Really.
425 </para>
426 </chapter>
427
428 <chapter id="common-routines">
429 <title>Common Routines</title>
430
431 <sect1 id="routines-printk">
432 <title>
433 <function>printk()</function>
434 <filename class="headerfile">include/linux/kernel.h</filename>
435 </title>
436
437 <para>
438 <function>printk()</function> feeds kernel messages to the
439 console, dmesg, and the syslog daemon. It is useful for debugging
440 and reporting errors, and can be used inside interrupt context,
441 but use with caution: a machine which has its console flooded with
442 printk messages is unusable. It uses a format string mostly
443 compatible with ANSI C printf, and C string concatenation to give
444 it a first "priority" argument:
445 </para>
446
447 <programlisting>
448printk(KERN_INFO "i = %u\n", i);
449 </programlisting>
450
451 <para>
452 See <filename class="headerfile">include/linux/kernel.h</filename>
453 for other KERN_ values; these are interpreted by syslog as the
454 level. Special case: for printing an IP address use
455 </para>
456
457 <programlisting>
458__u32 ipaddress;
459printk(KERN_INFO "my ip: %d.%d.%d.%d\n", NIPQUAD(ipaddress));
460 </programlisting>
461
462 <para>
463 <function>printk()</function> internally uses a 1K buffer and does
464 not catch overruns. Make sure that will be enough.
465 </para>
466
467 <note>
468 <para>
469 You will know when you are a real kernel hacker
470 when you start typoing printf as printk in your user programs :)
471 </para>
472 </note>
473
474 <!--- From the Lions book reader department -->
475
476 <note>
477 <para>
478 Another sidenote: the original Unix Version 6 sources had a
479 comment on top of its printf function: "Printf should not be
480 used for chit-chat". You should follow that advice.
481 </para>
482 </note>
483 </sect1>
484
485 <sect1 id="routines-copy">
486 <title>
487 <function>copy_[to/from]_user()</function>
488 /
489 <function>get_user()</function>
490 /
491 <function>put_user()</function>
492 <filename class="headerfile">include/asm/uaccess.h</filename>
493 </title>
494
495 <para>
496 <emphasis>[SLEEPS]</emphasis>
497 </para>
498
499 <para>
500 <function>put_user()</function> and <function>get_user()</function>
501 are used to get and put single values (such as an int, char, or
502 long) from and to userspace. A pointer into userspace should
503 never be simply dereferenced: data should be copied using these
504 routines. Both return <constant>-EFAULT</constant> or 0.
505 </para>
506 <para>
507 <function>copy_to_user()</function> and
508 <function>copy_from_user()</function> are more general: they copy
509 an arbitrary amount of data to and from userspace.
510 <caution>
511 <para>
512 Unlike <function>put_user()</function> and
513 <function>get_user()</function>, they return the amount of
514 uncopied data (i.e. <returnvalue>0</returnvalue> still means
515 success).
516 </para>
517 </caution>
518 [Yes, this moronic interface makes me cringe. Please submit a
519 patch and become my hero --RR.]
520 </para>
521 <para>
522 These functions may sleep implicitly. They should never be called
523 outside user context (it makes no sense), with interrupts
524 disabled, or a spinlock held.
525 </para>
526 </sect1>
527
528 <sect1 id="routines-kmalloc">
529 <title><function>kmalloc()</function>/<function>kfree()</function>
530 <filename class="headerfile">include/linux/slab.h</filename></title>
531
532 <para>
533 <emphasis>[MAY SLEEP: SEE BELOW]</emphasis>
534 </para>
535
536 <para>
537 These routines are used to dynamically request pointer-aligned
538 chunks of memory, like malloc and free do in userspace, but
539 <function>kmalloc()</function> takes an extra flag word.
540 Important values:
541 </para>
542
543 <variablelist>
544 <varlistentry>
545 <term>
546 <constant>
547 GFP_KERNEL
548 </constant>
549 </term>
550 <listitem>
551 <para>
552 May sleep and swap to free memory. Only allowed in user
553 context, but is the most reliable way to allocate memory.
554 </para>
555 </listitem>
556 </varlistentry>
557
558 <varlistentry>
559 <term>
560 <constant>
561 GFP_ATOMIC
562 </constant>
563 </term>
564 <listitem>
565 <para>
566 Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>,
567 but may be called from interrupt context. You should
568 <emphasis>really</emphasis> have a good out-of-memory
569 error-handling strategy.
570 </para>
571 </listitem>
572 </varlistentry>
573
574 <varlistentry>
575 <term>
576 <constant>
577 GFP_DMA
578 </constant>
579 </term>
580 <listitem>
581 <para>
582 Allocate ISA DMA lower than 16MB. If you don't know what that
583 is you don't need it. Very unreliable.
584 </para>
585 </listitem>
586 </varlistentry>
587 </variablelist>
588
589 <para>
590 If you see a <errorname>kmem_grow: Called nonatomically from int
591 </errorname> warning message you called a memory allocation function
592 from interrupt context without <constant>GFP_ATOMIC</constant>.
593 You should really fix that. Run, don't walk.
594 </para>
595
596 <para>
597 If you are allocating at least <constant>PAGE_SIZE</constant>
598 (<filename class="headerfile">include/asm/page.h</filename>) bytes,
599 consider using <function>__get_free_pages()</function>
600
601 (<filename class="headerfile">include/linux/mm.h</filename>). It
602 takes an order argument (0 for page sized, 1 for double page, 2
603 for four pages etc.) and the same memory priority flag word as
604 above.
605 </para>
606
607 <para>
608 If you are allocating more than a page worth of bytes you can use
609 <function>vmalloc()</function>. It'll allocate virtual memory in
610 the kernel map. This block is not contiguous in physical memory,
611 but the <acronym>MMU</acronym> makes it look like it is for you
612 (so it'll only look contiguous to the CPUs, not to external device
613 drivers). If you really need large physically contiguous memory
614 for some weird device, you have a problem: it is poorly supported
615 in Linux because after some time memory fragmentation in a running
616 kernel makes it hard. The best way is to allocate the block early
617 in the boot process via the <function>alloc_bootmem()</function>
618 routine.
619 </para>
620
621 <para>
622 Before inventing your own cache of often-used objects consider
623 using a slab cache in
624 <filename class="headerfile">include/linux/slab.h</filename>.
625 </para>
626 </sect1>
627
628 <sect1 id="routines-current">
629 <title><function>current</function>
630 <filename class="headerfile">include/asm/current.h</filename></title>
631
632 <para>
633 This global variable (really a macro) contains a pointer to
634 the current task structure, so is only valid in user context.
635 For example, when a process makes a system call, this will
636 point to the task structure of the calling process. It is
637 <emphasis>not NULL</emphasis> in interrupt context.
638 </para>
639 </sect1>
640
641 <sect1 id="routines-udelay">
642 <title><function>udelay()</function>/<function>mdelay()</function>
643 <filename class="headerfile">include/asm/delay.h</filename>
644 <filename class="headerfile">include/linux/delay.h</filename>
645 </title>
646
647 <para>
648 The <function>udelay()</function> function can be used for small pauses.
649 Do not use large values with <function>udelay()</function> as you risk
650 overflow - the helper function <function>mdelay()</function> is useful
651 here, or even consider <function>schedule_timeout()</function>.
652 </para>
653 </sect1>
654
655 <sect1 id="routines-endian">
656 <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function>
657 <filename class="headerfile">include/asm/byteorder.h</filename>
658 </title>
659
660 <para>
661 The <function>cpu_to_be32()</function> family (where the "32" can
662 be replaced by 64 or 16, and the "be" can be replaced by "le") are
663 the general way to do endian conversions in the kernel: they
664 return the converted value. All variations supply the reverse as
665 well: <function>be32_to_cpu()</function>, etc.
666 </para>
667
668 <para>
669 There are two major variations of these functions: the pointer
670 variation, such as <function>cpu_to_be32p()</function>, which take
671 a pointer to the given type, and return the converted value. The
672 other variation is the "in-situ" family, such as
673 <function>cpu_to_be32s()</function>, which convert the value referred
674 to by the pointer, and return void.
675 </para>
676 </sect1>
677
678 <sect1 id="routines-local-irqs">
679 <title><function>local_irq_save()</function>/<function>local_irq_restore()</function>
680 <filename class="headerfile">include/asm/system.h</filename>
681 </title>
682
683 <para>
684 These routines disable hard interrupts on the local CPU, and
685 restore them. They are reentrant, saving the previous state in
686 their one <varname>unsigned long flags</varname> argument. If you
687 know that interrupts are enabled, you can simply use
688 <function>local_irq_disable()</function> and
689 <function>local_irq_enable()</function>.
690 </para>
691 </sect1>
692
693 <sect1 id="routines-softirqs">
694 <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function>
695 <filename class="headerfile">include/linux/interrupt.h</filename></title>
696
697 <para>
698 These routines disable soft interrupts on the local CPU, and
699 restore them. They are reentrant; if soft interrupts were
700 disabled before, they will still be disabled after this pair
701 of functions has been called. They prevent softirqs, tasklets
702 and bottom halves from running on the current CPU.
703 </para>
704 </sect1>
705
706 <sect1 id="routines-processorids">
707 <title><function>smp_processor_id()</function>
708 <filename class="headerfile">include/asm/smp.h</filename></title>
709
710 <para>
711 <function>smp_processor_id()</function> returns the current
712 processor number, between 0 and <symbol>NR_CPUS</symbol> (the
713 maximum number of CPUs supported by Linux, currently 32). These
714 values are not necessarily contiguous.
715 </para>
716 </sect1>
717
718 <sect1 id="routines-init">
719 <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type>
720 <filename class="headerfile">include/linux/init.h</filename></title>
721
722 <para>
723 After boot, the kernel frees up a special section; functions
724 marked with <type>__init</type> and data structures marked with
725 <type>__initdata</type> are dropped after boot is complete (within
726 modules this directive is currently ignored). <type>__exit</type>
727 is used to declare a function which is only required on exit: the
728 function will be dropped if this file is not compiled as a module.
729 See the header file for use. Note that it makes no sense for a function
730 marked with <type>__init</type> to be exported to modules with
731 <function>EXPORT_SYMBOL()</function> - this will break.
732 </para>
733 <para>
734 Static data structures marked as <type>__initdata</type> must be initialised
735 (as opposed to ordinary static data which is zeroed BSS) and cannot be
736 <type>const</type>.
737 </para>
738
739 </sect1>
740
741 <sect1 id="routines-init-again">
742 <title><function>__initcall()</function>/<function>module_init()</function>
743 <filename class="headerfile">include/linux/init.h</filename></title>
744 <para>
745 Many parts of the kernel are well served as a module
746 (dynamically-loadable parts of the kernel). Using the
747 <function>module_init()</function> and
748 <function>module_exit()</function> macros it is easy to write code
749 without #ifdefs which can operate either as a module or built into
750 the kernel.
751 </para>
752
753 <para>
754 The <function>module_init()</function> macro defines which
755 function is to be called at module insertion time (if the file is
756 compiled as a module), or at boot time: if the file is not
757 compiled as a module the <function>module_init()</function> macro
758 becomes equivalent to <function>__initcall()</function>, which
759 through linker magic ensures that the function is called on boot.
760 </para>
761
762 <para>
763 The function can return a negative error number to cause
764 module loading to fail (unfortunately, this has no effect if
765 the module is compiled into the kernel). For modules, this is
766 called in user context, with interrupts enabled, and the
767 kernel lock held, so it can sleep.
768 </para>
769 </sect1>
770
771 <sect1 id="routines-moduleexit">
772 <title> <function>module_exit()</function>
773 <filename class="headerfile">include/linux/init.h</filename> </title>
774
775 <para>
776 This macro defines the function to be called at module removal
777 time (or never, in the case of the file compiled into the
778 kernel). It will only be called if the module usage count has
779 reached zero. This function can also sleep, but cannot fail:
780 everything must be cleaned up by the time it returns.
781 </para>
782 </sect1>
783
784 <!-- add info on new-style module refcounting here -->
785 </chapter>
786
787 <chapter id="queues">
788 <title>Wait Queues
789 <filename class="headerfile">include/linux/wait.h</filename>
790 </title>
791 <para>
792 <emphasis>[SLEEPS]</emphasis>
793 </para>
794
795 <para>
796 A wait queue is used to wait for someone to wake you up when a
797 certain condition is true. They must be used carefully to ensure
798 there is no race condition. You declare a
799 <type>wait_queue_head_t</type>, and then processes which want to
800 wait for that condition declare a <type>wait_queue_t</type>
801 referring to themselves, and place that in the queue.
802 </para>
803
804 <sect1 id="queue-declaring">
805 <title>Declaring</title>
806
807 <para>
808 You declare a <type>wait_queue_head_t</type> using the
809 <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the
810 <function>init_waitqueue_head()</function> routine in your
811 initialization code.
812 </para>
813 </sect1>
814
815 <sect1 id="queue-waitqueue">
816 <title>Queuing</title>
817
818 <para>
819 Placing yourself in the waitqueue is fairly complex, because you
820 must put yourself in the queue before checking the condition.
821 There is a macro to do this:
822 <function>wait_event_interruptible()</function>
823
824 (<filename class="headerfile">include/linux/sched.h</filename>). The
825 first argument is the wait queue head, and the second is an
826 expression which is evaluated; the macro returns
827 <returnvalue>0</returnvalue> when this expression is true, or
828 <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received.
829 The <function>wait_event()</function> version ignores signals.
830 </para>
831 <para>
832 Do not use the <function>sleep_on()</function> function family -
833 it is very easy to accidentally introduce races; almost certainly
834 one of the <function>wait_event()</function> family will do, or a
835 loop around <function>schedule_timeout()</function>. If you choose
836 to loop around <function>schedule_timeout()</function> remember
837 you must set the task state (with
838 <function>set_current_state()</function>) on each iteration to avoid
839 busy-looping.
840 </para>
841
842 </sect1>
843
844 <sect1 id="queue-waking">
845 <title>Waking Up Queued Tasks</title>
846
847 <para>
848 Call <function>wake_up()</function>
849
850 (<filename class="headerfile">include/linux/sched.h</filename>),
851 which will wake up every process in the queue. The exception is
852 if one has <constant>TASK_EXCLUSIVE</constant> set, in which case
853 the remainder of the queue will not be woken.
854 </para>
855 </sect1>
856 </chapter>
857
858 <chapter id="atomic-ops">
859 <title>Atomic Operations</title>
860
861 <para>
862 Certain operations are guaranteed atomic on all platforms. The
863 first class of operations works on <type>atomic_t</type>
864
865 <filename class="headerfile">include/asm/atomic.h</filename>; this
866 contains a signed integer (at least 24 bits long), and you must use
867 these functions to manipulate or read atomic_t variables.
868 <function>atomic_read()</function> and
869 <function>atomic_set()</function> get and set the counter,
870 <function>atomic_add()</function>,
871 <function>atomic_sub()</function>,
872 <function>atomic_inc()</function>,
873 <function>atomic_dec()</function>, and
874 <function>atomic_dec_and_test()</function> (returns
875 <returnvalue>true</returnvalue> if it was decremented to zero).
876 </para>
877
878 <para>
879 Yes. It returns <returnvalue>true</returnvalue> (i.e. != 0) if the
880 atomic variable is zero.
881 </para>
882
883 <para>
884 Note that these functions are slower than normal arithmetic, and
885 so should not be used unnecessarily. On some platforms they
886 are much slower, like 32-bit Sparc where they use a spinlock.
887 </para>
888
889 <para>
890 The second class of atomic operations is atomic bit operations on a
891 <type>long</type>, defined in
892
893 <filename class="headerfile">include/linux/bitops.h</filename>. These
894 operations generally take a pointer to the bit pattern, and a bit
895 number: 0 is the least significant bit.
896 <function>set_bit()</function>, <function>clear_bit()</function>
897 and <function>change_bit()</function> set, clear, and flip the
898 given bit. <function>test_and_set_bit()</function>,
899 <function>test_and_clear_bit()</function> and
900 <function>test_and_change_bit()</function> do the same thing,
901 except return true if the bit was previously set; these are
902 particularly useful for very simple locking.
903 </para>
904
905 <para>
906 It is possible to call these operations with bit indices greater
907 than BITS_PER_LONG. The resulting behavior is strange on big-endian
908 platforms though so it is a good idea not to do this.
909 </para>
910
911 <para>
912 Note that the order of bits depends on the architecture, and in
913 particular, the bitfield passed to these operations must be at
914 least as large as a <type>long</type>.
915 </para>
916 </chapter>
917
918 <chapter id="symbols">
919 <title>Symbols</title>
920
921 <para>
922 Within the kernel proper, the normal linking rules apply
923 (ie. unless a symbol is declared to be file scope with the
924 <type>static</type> keyword, it can be used anywhere in the
925 kernel). However, for modules, a special exported symbol table is
926 kept which limits the entry points to the kernel proper. Modules
927 can also export symbols.
928 </para>
929
930 <sect1 id="sym-exportsymbols">
931 <title><function>EXPORT_SYMBOL()</function>
932 <filename class="headerfile">include/linux/module.h</filename></title>
933
934 <para>
935 This is the classic method of exporting a symbol, and it works
936 for both modules and non-modules. In the kernel all these
937 declarations are often bundled into a single file to help
938 genksyms (which searches source files for these declarations).
939 See the comment on genksyms and Makefiles below.
940 </para>
941 </sect1>
942
943 <sect1 id="sym-exportsymbols-gpl">
944 <title><function>EXPORT_SYMBOL_GPL()</function>
945 <filename class="headerfile">include/linux/module.h</filename></title>
946
947 <para>
948 Similar to <function>EXPORT_SYMBOL()</function> except that the
949 symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can
950 only be seen by modules with a
951 <function>MODULE_LICENSE()</function> that specifies a GPL
952 compatible license.
953 </para>
954 </sect1>
955 </chapter>
956
957 <chapter id="conventions">
958 <title>Routines and Conventions</title>
959
960 <sect1 id="conventions-doublelinkedlist">
961 <title>Double-linked lists
962 <filename class="headerfile">include/linux/list.h</filename></title>
963
964 <para>
965 There are three sets of linked-list routines in the kernel
966 headers, but this one seems to be winning out (and Linus has
967 used it). If you don't have some particular pressing need for
968 a singly-linked list, it's a good choice. In fact, I don't care
969 whether it's a good choice or not, just use it so we can get
970 rid of the others.
971 </para>
972 </sect1>
973
974 <sect1 id="convention-returns">
975 <title>Return Conventions</title>
976
977 <para>
978 For code called in user context, it's very common to defy C
979 convention, and return <returnvalue>0</returnvalue> for success,
980 and a negative error number
981 (eg. <returnvalue>-EFAULT</returnvalue>) for failure. This can be
982 unintuitive at first, but it's fairly widespread in the networking
983 code, for example.
984 </para>
985
986 <para>
987 The filesystem code uses <function>ERR_PTR()</function>
988
989 (<filename class="headerfile">include/linux/fs.h</filename>) to
990 encode a negative error number into a pointer, and
991 <function>IS_ERR()</function> and <function>PTR_ERR()</function>
992 to get it back out again: this avoids a separate pointer parameter for
993 the error number. Icky, but in a good way.
994 </para>
995 </sect1>
996
997 <sect1 id="conventions-borkedcompile">
998 <title>Breaking Compilation</title>
999
1000 <para>
1001 Linus and the other developers sometimes change function or
1002 structure names in development kernels; this is not done just to
1003 keep everyone on their toes: it reflects a fundamental change
1004 (eg. can no longer be called with interrupts on, or does extra
1005 checks, or doesn't do checks which were caught before). Usually
1006 this is accompanied by a fairly complete note to the linux-kernel
1007 mailing list; search the archive. Simply doing a global replace
1008 on the file usually makes things <emphasis>worse</emphasis>.
1009 </para>
1010 </sect1>
1011
1012 <sect1 id="conventions-initialising">
1013 <title>Initializing structure members</title>
1014
1015 <para>
1016 The preferred method of initializing structures is to use
1017 designated initialisers, as defined by ISO C99, eg:
1018 </para>
1019 <programlisting>
1020static struct block_device_operations opt_fops = {
1021 .open = opt_open,
1022 .release = opt_release,
1023 .ioctl = opt_ioctl,
1024 .check_media_change = opt_media_change,
1025};
1026 </programlisting>
1027 <para>
1028 This makes it easy to grep for, and makes it clear which
1029 structure fields are set. You should do this because it looks
1030 cool.
1031 </para>
1032 </sect1>
1033
1034 <sect1 id="conventions-gnu-extns">
1035 <title>GNU Extensions</title>
1036
1037 <para>
1038 GNU Extensions are explicitly allowed in the Linux kernel.
1039 Note that some of the more complex ones are not very well
1040 supported, due to lack of general use, but the following are
1041 considered standard (see the GCC info page section "C
1042 Extensions" for more details - Yes, really the info page, the
1043 man page is only a short summary of the stuff in info):
1044 </para>
1045 <itemizedlist>
1046 <listitem>
1047 <para>
1048 Inline functions
1049 </para>
1050 </listitem>
1051 <listitem>
1052 <para>
1053 Statement expressions (ie. the ({ and }) constructs).
1054 </para>
1055 </listitem>
1056 <listitem>
1057 <para>
1058 Declaring attributes of a function / variable / type
1059 (__attribute__)
1060 </para>
1061 </listitem>
1062 <listitem>
1063 <para>
1064 typeof
1065 </para>
1066 </listitem>
1067 <listitem>
1068 <para>
1069 Zero length arrays
1070 </para>
1071 </listitem>
1072 <listitem>
1073 <para>
1074 Macro varargs
1075 </para>
1076 </listitem>
1077 <listitem>
1078 <para>
1079 Arithmetic on void pointers
1080 </para>
1081 </listitem>
1082 <listitem>
1083 <para>
1084 Non-Constant initializers
1085 </para>
1086 </listitem>
1087 <listitem>
1088 <para>
1089 Assembler Instructions (not outside arch/ and include/asm/)
1090 </para>
1091 </listitem>
1092 <listitem>
1093 <para>
1094 Function names as strings (__FUNCTION__)
1095 </para>
1096 </listitem>
1097 <listitem>
1098 <para>
1099 __builtin_constant_p()
1100 </para>
1101 </listitem>
1102 </itemizedlist>
1103
1104 <para>
1105 Be wary when using long long in the kernel: the code gcc generates for
1106 it is horrible and, worse, division and multiplication do not work
1107 on i386 because the GCC runtime functions for it are missing from
1108 the kernel environment.
1109 </para>
1110
1111 <!-- FIXME: add a note about ANSI aliasing cleanness -->
1112 </sect1>
1113
1114 <sect1 id="conventions-cplusplus">
1115 <title>C++</title>
1116
1117 <para>
1118 Using C++ in the kernel is usually a bad idea, because the
1119 kernel does not provide the necessary runtime environment
1120 and the include files are not tested for it. It is still
1121 possible, but not recommended. If you really want to do
1122 this, forget about exceptions at least.
1123 </para>
1124 </sect1>
1125
1126 <sect1 id="conventions-ifdef">
1127 <title>&num;if</title>
1128
1129 <para>
1130 It is generally considered cleaner to use macros in header files
1131 (or at the top of .c files) to abstract away functions rather than
1132 using `#if' pre-processor statements throughout the source code.
1133 </para>
1134 </sect1>
1135 </chapter>
1136
1137 <chapter id="submitting">
1138 <title>Putting Your Stuff in the Kernel</title>
1139
1140 <para>
1141 In order to get your stuff into shape for official inclusion, or
1142 even to make a neat patch, there's administrative work to be
1143 done:
1144 </para>
1145 <itemizedlist>
1146 <listitem>
1147 <para>
1148 Figure out whose pond you've been pissing in. Look at the top of
1149 the source files, inside the <filename>MAINTAINERS</filename>
1150 file, and last of all in the <filename>CREDITS</filename> file.
1151 You should coordinate with this person to make sure you're not
1152 duplicating effort, or trying something that's already been
1153 rejected.
1154 </para>
1155
1156 <para>
1157 Make sure you put your name and EMail address at the top of
1158 any files you create or mangle significantly. This is the
1159 first place people will look when they find a bug, or when
1160 <emphasis>they</emphasis> want to make a change.
1161 </para>
1162 </listitem>
1163
1164 <listitem>
1165 <para>
1166 Usually you want a configuration option for your kernel hack.
1167 Edit <filename>Config.in</filename> in the appropriate directory
1168 (but under <filename>arch/</filename> it's called
1169 <filename>config.in</filename>). The Config Language used is not
1170 bash, even though it looks like bash; the safe way is to use only
1171 the constructs that you already see in
1172 <filename>Config.in</filename> files (see
1173 <filename>Documentation/kbuild/kconfig-language.txt</filename>).
1174 It's good to run "make xconfig" at least once to test (because
1175 it's the only one with a static parser).
1176 </para>
1177
1178 <para>
1179 Variables which can be Y or N use <type>bool</type> followed by a
1180 tagline and the config define name (which must start with
1181 CONFIG_). The <type>tristate</type> function is the same, but
1182 allows the answer M (which defines
1183 <symbol>CONFIG_FOO_MODULE</symbol> in your source, instead of
1184 <symbol>CONFIG_FOO</symbol>) if <symbol>CONFIG_MODULES</symbol>
1185 is enabled.
1186 </para>
1187
1188 <para>
1189 You may well want to make your CONFIG option only visible if
1190 <symbol>CONFIG_EXPERIMENTAL</symbol> is enabled: this serves as a
1191 warning to users. There are many other fancy things you can do: see
1192 the various <filename>Config.in</filename> files for ideas.
1193 </para>
1194 </listitem>
1195
1196 <listitem>
1197 <para>
1198 Edit the <filename>Makefile</filename>: the CONFIG variables are
1199 exported here so you can conditionalize compilation with `ifeq'.
1200 If your file exports symbols then add the names to
1201 <varname>export-objs</varname> so that genksyms will find them.
1202 <caution>
1203 <para>
1204 There is a restriction on the kernel build system that objects
1205 which export symbols must have globally unique names.
1206 If your object does not have a globally unique name then the
1207 standard fix is to move the
1208 <function>EXPORT_SYMBOL()</function> statements to their own
1209 object with a unique name.
1210 This is why several systems have separate exporting objects,
1211 usually suffixed with ksyms.
1212 </para>
1213 </caution>
1214 </para>
1215 </listitem>
1216
1217 <listitem>
1218 <para>
1219 Document your option in Documentation/Configure.help. Mention
1220 incompatibilities and issues here. <emphasis> Definitely
1221 </emphasis> end your description with <quote> if in doubt, say N
1222 </quote> (or, occasionally, `Y'); this is for people who have no
1223 idea what you are talking about.
1224 </para>
1225 </listitem>
1226
1227 <listitem>
1228 <para>
1229 Put yourself in <filename>CREDITS</filename> if you've done
1230 something noteworthy, usually beyond a single file (your name
1231 should be at the top of the source files anyway).
1232 <filename>MAINTAINERS</filename> means you want to be consulted
1233 when changes are made to a subsystem, and hear about bugs; it
1234 implies a more-than-passing commitment to some part of the code.
1235 </para>
1236 </listitem>
1237
1238 <listitem>
1239 <para>
1240 Finally, don't forget to read <filename>Documentation/SubmittingPatches</filename>
1241 and possibly <filename>Documentation/SubmittingDrivers</filename>.
1242 </para>
1243 </listitem>
1244 </itemizedlist>
1245 </chapter>
1246
1247 <chapter id="cantrips">
1248 <title>Kernel Cantrips</title>
1249
1250 <para>
1251 Some favorites from browsing the source. Feel free to add to this
1252 list.
1253 </para>
1254
1255 <para>
1256 <filename>include/linux/brlock.h:</filename>
1257 </para>
1258 <programlisting>
1259extern inline void br_read_lock (enum brlock_indices idx)
1260{
1261 /*
1262 * This causes a link-time bug message if an
1263 * invalid index is used:
1264 */
1265 if (idx >= __BR_END)
1266 __br_lock_usage_bug();
1267
1268 read_lock(&amp;__brlock_array[smp_processor_id()][idx]);
1269}
1270 </programlisting>
1271
1272 <para>
1273 <filename>include/linux/fs.h</filename>:
1274 </para>
1275 <programlisting>
1276/*
1277 * Kernel pointers have redundant information, so we can use a
1278 * scheme where we can return either an error code or a dentry
1279 * pointer with the same return value.
1280 *
1281 * This should be a per-architecture thing, to allow different
1282 * error and pointer decisions.
1283 */
1284 #define ERR_PTR(err) ((void *)((long)(err)))
1285 #define PTR_ERR(ptr) ((long)(ptr))
1286 #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000))
1287</programlisting>
1288
1289 <para>
1290 <filename>include/asm-i386/uaccess.h:</filename>
1291 </para>
1292
1293 <programlisting>
1294#define copy_to_user(to,from,n) \
1295 (__builtin_constant_p(n) ? \
1296 __constant_copy_to_user((to),(from),(n)) : \
1297 __generic_copy_to_user((to),(from),(n)))
1298 </programlisting>
1299
1300 <para>
1301 <filename>arch/sparc/kernel/head.S:</filename>
1302 </para>
1303
1304 <programlisting>
1305/*
1306 * Sun people can't spell worth damn. "compatability" indeed.
1307 * At least we *know* we can't spell, and use a spell-checker.
1308 */
1309
1310/* Uh, actually Linus it is I who cannot spell. Too much murky
1311 * Sparc assembly will do this to ya.
1312 */
1313C_LABEL(cputypvar):
1314 .asciz "compatability"
1315
1316/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */
1317 .align 4
1318C_LABEL(cputypvar_sun4m):
1319 .asciz "compatible"
1320 </programlisting>
1321
1322 <para>
1323 <filename>arch/sparc/lib/checksum.S:</filename>
1324 </para>
1325
1326 <programlisting>
1327 /* Sun, you just can't beat me, you just can't. Stop trying,
1328 * give up. I'm serious, I am going to kick the living shit
1329 * out of you, game over, lights out.
1330 */
1331 </programlisting>
1332 </chapter>
1333
1334 <chapter id="credits">
1335 <title>Thanks</title>
1336
1337 <para>
1338 Thanks to Andi Kleen for the idea, answering my questions, fixing
1339 my mistakes, filling content, etc. Philipp Rumpf for more spelling
1340 and clarity fixes, and some excellent non-obvious points. Werner
1341 Almesberger for giving me a great summary of
1342 <function>disable_irq()</function>, and Jes Sorensen and Andrea
1343 Arcangeli added caveats. Michael Elizabeth Chastain for checking
1344 and adding to the Configure section. <!-- Rusty insisted on this
1345 bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook.
1346 </para>
1347 </chapter>
1348</book>
1349
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
new file mode 100644
index 000000000000..90dc2de8e0af
--- /dev/null
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -0,0 +1,2088 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="LKLockingGuide">
6 <bookinfo>
7 <title>Unreliable Guide To Locking</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Rusty</firstname>
12 <surname>Russell</surname>
13 <affiliation>
14 <address>
15 <email>rusty@rustcorp.com.au</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2003</year>
23 <holder>Rusty Russell</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License as published by the Free Software Foundation; either
31 version 2 of the License, or (at your option) any later
32 version.
33 </para>
34
35 <para>
36 This program is distributed in the hope that it will be
37 useful, but WITHOUT ANY WARRANTY; without even the implied
38 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
39 See the GNU General Public License for more details.
40 </para>
41
42 <para>
43 You should have received a copy of the GNU General Public
44 License along with this program; if not, write to the Free
45 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
46 MA 02111-1307 USA
47 </para>
48
49 <para>
50 For more details see the file COPYING in the source
51 distribution of Linux.
52 </para>
53 </legalnotice>
54 </bookinfo>
55
56 <toc></toc>
57 <chapter id="intro">
58 <title>Introduction</title>
59 <para>
60 Welcome to Rusty's Remarkably Unreliable Guide to Kernel
61 Locking issues. This document describes the locking systems in
62 the Linux Kernel in 2.6.
63 </para>
64 <para>
65 With the wide availability of HyperThreading, and <firstterm
66 linkend="gloss-preemption">preemption </firstterm> in the Linux
67 Kernel, everyone hacking on the kernel needs to know the
68 fundamentals of concurrency and locking for
69 <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>.
70 </para>
71 </chapter>
72
73 <chapter id="races">
74 <title>The Problem With Concurrency</title>
75 <para>
76 (Skip this if you know what a Race Condition is).
77 </para>
78 <para>
79 In a normal program, you can increment a counter like so:
80 </para>
81 <programlisting>
82 very_important_count++;
83 </programlisting>
84
85 <para>
86 This is what you would expect to happen:
87 </para>
88
89 <table>
90 <title>Expected Results</title>
91
92 <tgroup cols="2" align="left">
93
94 <thead>
95 <row>
96 <entry>Instance 1</entry>
97 <entry>Instance 2</entry>
98 </row>
99 </thead>
100
101 <tbody>
102 <row>
103 <entry>read very_important_count (5)</entry>
104 <entry></entry>
105 </row>
106 <row>
107 <entry>add 1 (6)</entry>
108 <entry></entry>
109 </row>
110 <row>
111 <entry>write very_important_count (6)</entry>
112 <entry></entry>
113 </row>
114 <row>
115 <entry></entry>
116 <entry>read very_important_count (6)</entry>
117 </row>
118 <row>
119 <entry></entry>
120 <entry>add 1 (7)</entry>
121 </row>
122 <row>
123 <entry></entry>
124 <entry>write very_important_count (7)</entry>
125 </row>
126 </tbody>
127
128 </tgroup>
129 </table>
130
131 <para>
132 This is what might happen:
133 </para>
134
135 <table>
136 <title>Possible Results</title>
137
138 <tgroup cols="2" align="left">
139 <thead>
140 <row>
141 <entry>Instance 1</entry>
142 <entry>Instance 2</entry>
143 </row>
144 </thead>
145
146 <tbody>
147 <row>
148 <entry>read very_important_count (5)</entry>
149 <entry></entry>
150 </row>
151 <row>
152 <entry></entry>
153 <entry>read very_important_count (5)</entry>
154 </row>
155 <row>
156 <entry>add 1 (6)</entry>
157 <entry></entry>
158 </row>
159 <row>
160 <entry></entry>
161 <entry>add 1 (6)</entry>
162 </row>
163 <row>
164 <entry>write very_important_count (6)</entry>
165 <entry></entry>
166 </row>
167 <row>
168 <entry></entry>
169 <entry>write very_important_count (6)</entry>
170 </row>
171 </tbody>
172 </tgroup>
173 </table>
174
175 <sect1 id="race-condition">
176 <title>Race Conditions and Critical Regions</title>
177 <para>
178 This overlap, where the result depends on the
179 relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>.
180 The piece of code containing the concurrency issue is called a
181 <firstterm>critical region</firstterm>. And especially since Linux started running
182 on SMP machines, they became one of the major issues in kernel
183 design and implementation.
184 </para>
185 <para>
186 Preemption can have the same effect, even if there is only one
187 CPU: by preempting one task during the critical region, we have
188 exactly the same race condition. In this case the thread which
189 preempts might run the critical region itself.
190 </para>
191 <para>
192 The solution is to recognize when these simultaneous accesses
193 occur, and use locks to make sure that only one instance can
194 enter the critical region at any time. There are many
195 friendly primitives in the Linux kernel to help you do this.
196 And then there are the unfriendly primitives, but I'll pretend
197 they don't exist.
198 </para>
199 </sect1>
200 </chapter>
201
202 <chapter id="locks">
203 <title>Locking in the Linux Kernel</title>
204
205 <para>
206 If I could give you one piece of advice: never sleep with anyone
207 crazier than yourself. But if I had to give you advice on
208 locking: <emphasis>keep it simple</emphasis>.
209 </para>
210
211 <para>
212 Be reluctant to introduce new locks.
213 </para>
214
215 <para>
216 Strangely enough, this last one is the exact reverse of my advice when
217 you <emphasis>have</emphasis> slept with someone crazier than yourself.
218 And you should think about getting a big dog.
219 </para>
220
221 <sect1 id="lock-intro">
222 <title>Two Main Types of Kernel Locks: Spinlocks and Semaphores</title>
223
224 <para>
225 There are two main types of kernel locks. The fundamental type
226 is the spinlock
227 (<filename class="headerfile">include/asm/spinlock.h</filename>),
228 which is a very simple single-holder lock: if you can't get the
229 spinlock, you keep trying (spinning) until you can. Spinlocks are
230 very small and fast, and can be used anywhere.
231 </para>
232 <para>
233 The second type is a semaphore
234 (<filename class="headerfile">include/asm/semaphore.h</filename>): it
235 can have more than one holder at any time (the number decided at
236 initialization time), although it is most commonly used as a
237 single-holder lock (a mutex). If you can't get a semaphore,
238 your task will put itself on the queue, and be woken up when the
239 semaphore is released. This means the CPU will do something
240 else while you are waiting, but there are many cases when you
241 simply can't sleep (see <xref linkend="sleeping-things"/>), and so
242 have to use a spinlock instead.
243 </para>
244 <para>
245 Neither type of lock is recursive: see
246 <xref linkend="deadlock"/>.
247 </para>
248 </sect1>
249
250 <sect1 id="uniprocessor">
251 <title>Locks and Uniprocessor Kernels</title>
252
253 <para>
254 For kernels compiled without <symbol>CONFIG_SMP</symbol>, and
255 without <symbol>CONFIG_PREEMPT</symbol>, spinlocks do not exist at
256 all. This is an excellent design decision: when no-one else can
257 run at the same time, there is no reason to have a lock.
258 </para>
259
260 <para>
261 If the kernel is compiled without <symbol>CONFIG_SMP</symbol>,
262 but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks
263 simply disable preemption, which is sufficient to prevent any
264 races. For most purposes, we can think of preemption as
265 equivalent to SMP, and not worry about it separately.
266 </para>
267
268 <para>
269 You should always test your locking code with <symbol>CONFIG_SMP</symbol>
270 and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it
271 will still catch some kinds of locking bugs.
272 </para>
273
274 <para>
275 Semaphores still exist, because they are required for
276 synchronization between <firstterm linkend="gloss-usercontext">user
277 contexts</firstterm>, as we will see below.
278 </para>
279 </sect1>
280
281 <sect1 id="usercontextlocking">
282 <title>Locking Only In User Context</title>
283
284 <para>
285 If you have a data structure which is only ever accessed from
286 user context, then you can use a simple semaphore
287 (<filename class="headerfile">include/asm/semaphore.h</filename>) to protect it. This
288 is the most trivial case: you initialize the semaphore to the number
289 of resources available (usually 1), and call
290 <function>down_interruptible()</function> to grab the semaphore, and
291 <function>up()</function> to release it. There is also a
292 <function>down()</function>, which should be avoided, because it
293 will not return if a signal is received.
294 </para>
295
296 <para>
297 Example: <filename>linux/net/core/netfilter.c</filename> allows
298 registration of new <function>setsockopt()</function> and
299 <function>getsockopt()</function> calls, with
300 <function>nf_register_sockopt()</function>. Registration and
301 de-registration are only done on module load and unload (and boot
302 time, where there is no concurrency), and the list of registrations
303 is only consulted for an unknown <function>setsockopt()</function>
304 or <function>getsockopt()</function> system call. The
305 <varname>nf_sockopt_mutex</varname> is perfect to protect this,
306 especially since the setsockopt and getsockopt calls may well
307 sleep.
308 </para>
309 </sect1>
310
311 <sect1 id="lock-user-bh">
312 <title>Locking Between User Context and Softirqs</title>
313
314 <para>
315 If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares
316 data with user context, you have two problems. Firstly, the current
317 user context can be interrupted by a softirq, and secondly, the
318 critical region could be entered from another CPU. This is where
319 <function>spin_lock_bh()</function>
320 (<filename class="headerfile">include/linux/spinlock.h</filename>) is
321 used. It disables softirqs on that CPU, then grabs the lock.
322 <function>spin_unlock_bh()</function> does the reverse. (The
323 '_bh' suffix is a historical reference to "Bottom Halves", the
324 old name for software interrupts. It should really be
325 called 'spin_lock_softirq()' in a perfect world).
326 </para>
327
328 <para>
329 Note that you can also use <function>spin_lock_irq()</function>
330 or <function>spin_lock_irqsave()</function> here, which stop
331 hardware interrupts as well: see <xref linkend="hardirq-context"/>.
332 </para>
333
334 <para>
335 This works perfectly for <firstterm linkend="gloss-up"><acronym>UP
336 </acronym></firstterm> as well: the spin lock vanishes, and this macro
337 simply becomes <function>local_bh_disable()</function>
338 (<filename class="headerfile">include/linux/interrupt.h</filename>), which
339 protects you from the softirq being run.
340 </para>
341 </sect1>
342
343 <sect1 id="lock-user-tasklet">
344 <title>Locking Between User Context and Tasklets</title>
345
346 <para>
347 This is exactly the same as above, because <firstterm
348 linkend="gloss-tasklet">tasklets</firstterm> are actually run
349 from a softirq.
350 </para>
351 </sect1>
352
353 <sect1 id="lock-user-timers">
354 <title>Locking Between User Context and Timers</title>
355
356 <para>
357 This, too, is exactly the same as above, because <firstterm
358 linkend="gloss-timers">timers</firstterm> are actually run from
359 a softirq. From a locking point of view, tasklets and timers
360 are identical.
361 </para>
362 </sect1>
363
364 <sect1 id="lock-tasklets">
365 <title>Locking Between Tasklets/Timers</title>
366
367 <para>
368 Sometimes a tasklet or timer might want to share data with
369 another tasklet or timer.
370 </para>
371
372 <sect2 id="lock-tasklets-same">
373 <title>The Same Tasklet/Timer</title>
374 <para>
375 Since a tasklet is never run on two CPUs at once, you don't
376 need to worry about your tasklet being reentrant (running
377 twice at once), even on SMP.
378 </para>
379 </sect2>
380
381 <sect2 id="lock-tasklets-different">
382 <title>Different Tasklets/Timers</title>
383 <para>
384 If another tasklet/timer wants
385 to share data with your tasklet or timer, you will both need to use
386 <function>spin_lock()</function> and
387 <function>spin_unlock()</function> calls.
388 <function>spin_lock_bh()</function> is
389 unnecessary here, as you are already in a tasklet, and
390 none will be run on the same CPU.
391 </para>
392 </sect2>
393 </sect1>
394
395 <sect1 id="lock-softirqs">
396 <title>Locking Between Softirqs</title>
397
398 <para>
399 Often a softirq might
400 want to share data with itself or a tasklet/timer.
401 </para>
402
403 <sect2 id="lock-softirqs-same">
404 <title>The Same Softirq</title>
405
406 <para>
407 The same softirq can run on the other CPUs: you can use a
408 per-CPU array (see <xref linkend="per-cpu"/>) for better
409 performance. If you're going so far as to use a softirq,
410 you probably care about scalable performance enough
411 to justify the extra complexity.
412 </para>
413
414 <para>
415 You'll need to use <function>spin_lock()</function> and
416 <function>spin_unlock()</function> for shared data.
417 </para>
418 </sect2>
419
420 <sect2 id="lock-softirqs-different">
421 <title>Different Softirqs</title>
422
423 <para>
424 You'll need to use <function>spin_lock()</function> and
425 <function>spin_unlock()</function> for shared data, whether it
426 be a timer, tasklet, different softirq or the same or another
427 softirq: any of them could be running on a different CPU.
428 </para>
429 </sect2>
430 </sect1>
431 </chapter>
432
433 <chapter id="hardirq-context">
434 <title>Hard IRQ Context</title>
435
436 <para>
437 Hardware interrupts usually communicate with a
438 tasklet or softirq. Frequently this involves putting work in a
439 queue, which the softirq will take out.
440 </para>
441
442 <sect1 id="hardirq-softirq">
443 <title>Locking Between Hard IRQ and Softirqs/Tasklets</title>
444
445 <para>
446 If a hardware irq handler shares data with a softirq, you have
447 two concerns. Firstly, the softirq processing can be
448 interrupted by a hardware interrupt, and secondly, the
449 critical region could be entered by a hardware interrupt on
450 another CPU. This is where <function>spin_lock_irq()</function> is
451 used. It is defined to disable interrupts on that cpu, then grab
452 the lock. <function>spin_unlock_irq()</function> does the reverse.
453 </para>
454
455 <para>
456 The irq handler does not need to use
457 <function>spin_lock_irq()</function>, because the softirq cannot
458 run while the irq handler is running: it can use
459 <function>spin_lock()</function>, which is slightly faster. The
460 only exception would be if a different hardware irq handler uses
461 the same lock: <function>spin_lock_irq()</function> will stop
462 that from interrupting us.
463 </para>
464
465 <para>
466 This works perfectly for UP as well: the spin lock vanishes,
467 and this macro simply becomes <function>local_irq_disable()</function>
468 (<filename class="headerfile">include/asm/smp.h</filename>), which
469 protects you from the softirq/tasklet/BH being run.
470 </para>
471
472 <para>
473 <function>spin_lock_irqsave()</function>
474 (<filename>include/linux/spinlock.h</filename>) is a variant
475 which saves whether interrupts were on or off in a flags word,
476 which is passed to <function>spin_unlock_irqrestore()</function>. This
477 means that the same code can be used inside a hard irq handler (where
478 interrupts are already off) and in softirqs (where the irq
479 disabling is required).
480 </para>
481
482 <para>
483 Note that softirqs (and hence tasklets and timers) are run on
484 return from hardware interrupts, so
485 <function>spin_lock_irq()</function> also stops these. In that
486 sense, <function>spin_lock_irqsave()</function> is the most
487 general and powerful locking function.
488 </para>
489
490 </sect1>
491 <sect1 id="hardirq-hardirq">
492 <title>Locking Between Two Hard IRQ Handlers</title>
493 <para>
494 It is rare to have to share data between two IRQ handlers, but
495 if you do, <function>spin_lock_irqsave()</function> should be
496 used: it is architecture-specific whether all interrupts are
497 disabled inside irq handlers themselves.
498 </para>
499 </sect1>
500
501 </chapter>
502
503 <chapter id="cheatsheet">
504 <title>Cheat Sheet For Locking</title>
505 <para>
506 Pete Zaitcev gives the following summary:
507 </para>
508 <itemizedlist>
509 <listitem>
510 <para>
511 If you are in a process context (any syscall) and want to
512 lock other processes out, use a semaphore. You can take a semaphore
513 and sleep (<function>copy_from_user*()</function> or
514 <function>kmalloc(x,GFP_KERNEL)</function>).
515 </para>
516 </listitem>
517 <listitem>
518 <para>
519 Otherwise (== data can be touched in an interrupt), use
520 <function>spin_lock_irqsave()</function> and
521 <function>spin_unlock_irqrestore()</function>.
522 </para>
523 </listitem>
524 <listitem>
525 <para>
526 Avoid holding a spinlock for more than 5 lines of code and
527 across any function call (except accessors like
528 <function>readb</function>).
529 </para>
530 </listitem>
531 </itemizedlist>
532
533 <sect1 id="minimum-lock-reqirements">
534 <title>Table of Minimum Requirements</title>
535
536 <para> The following table lists the <emphasis>minimum</emphasis>
537 locking requirements between various contexts. In some cases,
538 the same context can only be running on one CPU at a time, so
539 no locking is required for that context (eg. a particular
540 thread can only run on one CPU at a time, but if it needs
541 to share data with another thread, locking is required).
542 </para>
543 <para>
544 Remember the advice above: you can always use
545 <function>spin_lock_irqsave()</function>, which is a superset
546 of all other spinlock primitives.
547 </para>
548 <table>
549<title>Table of Locking Requirements</title>
550<tgroup cols="11">
551<tbody>
552<row>
553<entry></entry>
554<entry>IRQ Handler A</entry>
555<entry>IRQ Handler B</entry>
556<entry>Softirq A</entry>
557<entry>Softirq B</entry>
558<entry>Tasklet A</entry>
559<entry>Tasklet B</entry>
560<entry>Timer A</entry>
561<entry>Timer B</entry>
562<entry>User Context A</entry>
563<entry>User Context B</entry>
564</row>
565
566<row>
567<entry>IRQ Handler A</entry>
568<entry>None</entry>
569</row>
570
571<row>
572<entry>IRQ Handler B</entry>
573<entry>spin_lock_irqsave</entry>
574<entry>None</entry>
575</row>
576
577<row>
578<entry>Softirq A</entry>
579<entry>spin_lock_irq</entry>
580<entry>spin_lock_irq</entry>
581<entry>spin_lock</entry>
582</row>
583
584<row>
585<entry>Softirq B</entry>
586<entry>spin_lock_irq</entry>
587<entry>spin_lock_irq</entry>
588<entry>spin_lock</entry>
589<entry>spin_lock</entry>
590</row>
591
592<row>
593<entry>Tasklet A</entry>
594<entry>spin_lock_irq</entry>
595<entry>spin_lock_irq</entry>
596<entry>spin_lock</entry>
597<entry>spin_lock</entry>
598<entry>None</entry>
599</row>
600
601<row>
602<entry>Tasklet B</entry>
603<entry>spin_lock_irq</entry>
604<entry>spin_lock_irq</entry>
605<entry>spin_lock</entry>
606<entry>spin_lock</entry>
607<entry>spin_lock</entry>
608<entry>None</entry>
609</row>
610
611<row>
612<entry>Timer A</entry>
613<entry>spin_lock_irq</entry>
614<entry>spin_lock_irq</entry>
615<entry>spin_lock</entry>
616<entry>spin_lock</entry>
617<entry>spin_lock</entry>
618<entry>spin_lock</entry>
619<entry>None</entry>
620</row>
621
622<row>
623<entry>Timer B</entry>
624<entry>spin_lock_irq</entry>
625<entry>spin_lock_irq</entry>
626<entry>spin_lock</entry>
627<entry>spin_lock</entry>
628<entry>spin_lock</entry>
629<entry>spin_lock</entry>
630<entry>spin_lock</entry>
631<entry>None</entry>
632</row>
633
634<row>
635<entry>User Context A</entry>
636<entry>spin_lock_irq</entry>
637<entry>spin_lock_irq</entry>
638<entry>spin_lock_bh</entry>
639<entry>spin_lock_bh</entry>
640<entry>spin_lock_bh</entry>
641<entry>spin_lock_bh</entry>
642<entry>spin_lock_bh</entry>
643<entry>spin_lock_bh</entry>
644<entry>None</entry>
645</row>
646
647<row>
648<entry>User Context B</entry>
649<entry>spin_lock_irq</entry>
650<entry>spin_lock_irq</entry>
651<entry>spin_lock_bh</entry>
652<entry>spin_lock_bh</entry>
653<entry>spin_lock_bh</entry>
654<entry>spin_lock_bh</entry>
655<entry>spin_lock_bh</entry>
656<entry>spin_lock_bh</entry>
657<entry>down_interruptible</entry>
658<entry>None</entry>
659</row>
660
661</tbody>
662</tgroup>
663</table>
664</sect1>
665</chapter>
666
667 <chapter id="Examples">
668 <title>Common Examples</title>
669 <para>
670Let's step through a simple example: a cache of number to name
671mappings. The cache keeps a count of how often each of the objects is
672used, and when it gets full, throws out the least used one.
673
674 </para>
675
676 <sect1 id="examples-usercontext">
677 <title>All In User Context</title>
678 <para>
679For our first example, we assume that all operations are in user
680context (ie. from system calls), so we can sleep. This means we can
681use a semaphore to protect the cache and all the objects within
682it. Here's the code:
683 </para>
684
685 <programlisting>
686#include &lt;linux/list.h&gt;
687#include &lt;linux/slab.h&gt;
688#include &lt;linux/string.h&gt;
689#include &lt;asm/semaphore.h&gt;
690#include &lt;asm/errno.h&gt;
691
692struct object
693{
694 struct list_head list;
695 int id;
696 char name[32];
697 int popularity;
698};
699
700/* Protects the cache, cache_num, and the objects within it */
701static DECLARE_MUTEX(cache_lock);
702static LIST_HEAD(cache);
703static unsigned int cache_num = 0;
704#define MAX_CACHE_SIZE 10
705
706/* Must be holding cache_lock */
707static struct object *__cache_find(int id)
708{
709 struct object *i;
710
711 list_for_each_entry(i, &amp;cache, list)
712 if (i-&gt;id == id) {
713 i-&gt;popularity++;
714 return i;
715 }
716 return NULL;
717}
718
719/* Must be holding cache_lock */
720static void __cache_delete(struct object *obj)
721{
722 BUG_ON(!obj);
723 list_del(&amp;obj-&gt;list);
724 kfree(obj);
725 cache_num--;
726}
727
728/* Must be holding cache_lock */
729static void __cache_add(struct object *obj)
730{
731 list_add(&amp;obj-&gt;list, &amp;cache);
732 if (++cache_num > MAX_CACHE_SIZE) {
733 struct object *i, *outcast = NULL;
734 list_for_each_entry(i, &amp;cache, list) {
735 if (!outcast || i-&gt;popularity &lt; outcast-&gt;popularity)
736 outcast = i;
737 }
738 __cache_delete(outcast);
739 }
740}
741
742int cache_add(int id, const char *name)
743{
744 struct object *obj;
745
746 if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
747 return -ENOMEM;
748
749 strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
750 obj-&gt;id = id;
751 obj-&gt;popularity = 0;
752
753 down(&amp;cache_lock);
754 __cache_add(obj);
755 up(&amp;cache_lock);
756 return 0;
757}
758
759void cache_delete(int id)
760{
761 down(&amp;cache_lock);
762 __cache_delete(__cache_find(id));
763 up(&amp;cache_lock);
764}
765
766int cache_find(int id, char *name)
767{
768 struct object *obj;
769 int ret = -ENOENT;
770
771 down(&amp;cache_lock);
772 obj = __cache_find(id);
773 if (obj) {
774 ret = 0;
775 strcpy(name, obj-&gt;name);
776 }
777 up(&amp;cache_lock);
778 return ret;
779}
780</programlisting>
781
782 <para>
783Note that we always make sure we have the cache_lock when we add,
784delete, or look up the cache: both the cache infrastructure itself and
785the contents of the objects are protected by the lock. In this case
786it's easy, since we copy the data for the user, and never let them
787access the objects directly.
788 </para>
789 <para>
790There is a slight (and common) optimization here: in
791<function>cache_add</function> we set up the fields of the object
792before grabbing the lock. This is safe, as no-one else can access it
793until we put it in cache.
794 </para>
795 </sect1>
796
797 <sect1 id="examples-interrupt">
798 <title>Accessing From Interrupt Context</title>
799 <para>
800Now consider the case where <function>cache_find</function> can be
801called from interrupt context: either a hardware interrupt or a
802softirq. An example would be a timer which deletes objects from the
803cache.
804 </para>
805 <para>
806The change is shown below, in standard patch format: the
807<symbol>-</symbol> are lines which are taken away, and the
808<symbol>+</symbol> are lines which are added.
809 </para>
810<programlisting>
811--- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100
812+++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100
813@@ -12,7 +12,7 @@
814 int popularity;
815 };
816
817-static DECLARE_MUTEX(cache_lock);
818+static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
819 static LIST_HEAD(cache);
820 static unsigned int cache_num = 0;
821 #define MAX_CACHE_SIZE 10
822@@ -55,6 +55,7 @@
823 int cache_add(int id, const char *name)
824 {
825 struct object *obj;
826+ unsigned long flags;
827
828 if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
829 return -ENOMEM;
830@@ -63,30 +64,33 @@
831 obj-&gt;id = id;
832 obj-&gt;popularity = 0;
833
834- down(&amp;cache_lock);
835+ spin_lock_irqsave(&amp;cache_lock, flags);
836 __cache_add(obj);
837- up(&amp;cache_lock);
838+ spin_unlock_irqrestore(&amp;cache_lock, flags);
839 return 0;
840 }
841
842 void cache_delete(int id)
843 {
844- down(&amp;cache_lock);
845+ unsigned long flags;
846+
847+ spin_lock_irqsave(&amp;cache_lock, flags);
848 __cache_delete(__cache_find(id));
849- up(&amp;cache_lock);
850+ spin_unlock_irqrestore(&amp;cache_lock, flags);
851 }
852
853 int cache_find(int id, char *name)
854 {
855 struct object *obj;
856 int ret = -ENOENT;
857+ unsigned long flags;
858
859- down(&amp;cache_lock);
860+ spin_lock_irqsave(&amp;cache_lock, flags);
861 obj = __cache_find(id);
862 if (obj) {
863 ret = 0;
864 strcpy(name, obj-&gt;name);
865 }
866- up(&amp;cache_lock);
867+ spin_unlock_irqrestore(&amp;cache_lock, flags);
868 return ret;
869 }
870</programlisting>
871
872 <para>
873Note that the <function>spin_lock_irqsave</function> will turn off
874interrupts if they are on, otherwise does nothing (if we are already
875in an interrupt handler), hence these functions are safe to call from
876any context.
877 </para>
878 <para>
879Unfortunately, <function>cache_add</function> calls
880<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol>
881flag, which is only legal in user context. I have assumed that
882<function>cache_add</function> is still only called in user context,
883otherwise this should become a parameter to
884<function>cache_add</function>.
885 </para>
886 </sect1>
887 <sect1 id="examples-refcnt">
888 <title>Exposing Objects Outside This File</title>
889 <para>
890If our objects contained more information, it might not be sufficient
891to copy the information in and out: other parts of the code might want
892to keep pointers to these objects, for example, rather than looking up
893the id every time. This produces two problems.
894 </para>
895 <para>
896The first problem is that we use the <symbol>cache_lock</symbol> to
897protect objects: we'd need to make this non-static so the rest of the
898code can use it. This makes locking trickier, as it is no longer all
899in one place.
900 </para>
901 <para>
902The second problem is the lifetime problem: if another structure keeps
903a pointer to an object, it presumably expects that pointer to remain
904valid. Unfortunately, this is only guaranteed while you hold the
905lock, otherwise someone might call <function>cache_delete</function>
906and even worse, add another object, re-using the same address.
907 </para>
908 <para>
909As there is only one lock, you can't hold it forever: no-one else would
910get any work done.
911 </para>
912 <para>
913The solution to this problem is to use a reference count: everyone who
914has a pointer to the object increases it when they first get the
915object, and drops the reference count when they're finished with it.
916Whoever drops it to zero knows it is unused, and can actually delete it.
917 </para>
918 <para>
919Here is the code:
920 </para>
921
922<programlisting>
923--- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100
924+++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100
925@@ -7,6 +7,7 @@
926 struct object
927 {
928 struct list_head list;
929+ unsigned int refcnt;
930 int id;
931 char name[32];
932 int popularity;
933@@ -17,6 +18,35 @@
934 static unsigned int cache_num = 0;
935 #define MAX_CACHE_SIZE 10
936
937+static void __object_put(struct object *obj)
938+{
939+ if (--obj-&gt;refcnt == 0)
940+ kfree(obj);
941+}
942+
943+static void __object_get(struct object *obj)
944+{
945+ obj-&gt;refcnt++;
946+}
947+
948+void object_put(struct object *obj)
949+{
950+ unsigned long flags;
951+
952+ spin_lock_irqsave(&amp;cache_lock, flags);
953+ __object_put(obj);
954+ spin_unlock_irqrestore(&amp;cache_lock, flags);
955+}
956+
957+void object_get(struct object *obj)
958+{
959+ unsigned long flags;
960+
961+ spin_lock_irqsave(&amp;cache_lock, flags);
962+ __object_get(obj);
963+ spin_unlock_irqrestore(&amp;cache_lock, flags);
964+}
965+
966 /* Must be holding cache_lock */
967 static struct object *__cache_find(int id)
968 {
969@@ -35,6 +65,7 @@
970 {
971 BUG_ON(!obj);
972 list_del(&amp;obj-&gt;list);
973+ __object_put(obj);
974 cache_num--;
975 }
976
977@@ -63,6 +94,7 @@
978 strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
979 obj-&gt;id = id;
980 obj-&gt;popularity = 0;
981+ obj-&gt;refcnt = 1; /* The cache holds a reference */
982
983 spin_lock_irqsave(&amp;cache_lock, flags);
984 __cache_add(obj);
985@@ -79,18 +111,15 @@
986 spin_unlock_irqrestore(&amp;cache_lock, flags);
987 }
988
989-int cache_find(int id, char *name)
990+struct object *cache_find(int id)
991 {
992 struct object *obj;
993- int ret = -ENOENT;
994 unsigned long flags;
995
996 spin_lock_irqsave(&amp;cache_lock, flags);
997 obj = __cache_find(id);
998- if (obj) {
999- ret = 0;
1000- strcpy(name, obj-&gt;name);
1001- }
1002+ if (obj)
1003+ __object_get(obj);
1004 spin_unlock_irqrestore(&amp;cache_lock, flags);
1005- return ret;
1006+ return obj;
1007 }
1008</programlisting>
1009
1010<para>
1011We encapsulate the reference counting in the standard 'get' and 'put'
1012functions. Now we can return the object itself from
1013<function>cache_find</function> which has the advantage that the user
1014can now sleep holding the object (eg. to
1015<function>copy_to_user</function> the name to userspace).
1016</para>
1017<para>
1018The other point to note is that I said a reference should be held for
1019every pointer to the object: thus the reference count is 1 when first
1020inserted into the cache. In some versions the framework does not hold
1021a reference count, but they are more complicated.
1022</para>
1023
1024 <sect2 id="examples-refcnt-atomic">
1025 <title>Using Atomic Operations For The Reference Count</title>
1026<para>
1027In practice, <type>atomic_t</type> would usually be used for
1028<structfield>refcnt</structfield>. There are a number of atomic
1029operations defined in
1030
1031<filename class="headerfile">include/asm/atomic.h</filename>: these are
1032guaranteed to be seen atomically from all CPUs in the system, so no
1033lock is required. In this case, it is simpler than using spinlocks,
1034although for anything non-trivial using spinlocks is clearer. The
1035<function>atomic_inc</function> and
1036<function>atomic_dec_and_test</function> are used instead of the
1037standard increment and decrement operators, and the lock is no longer
1038used to protect the reference count itself.
1039</para>
1040
1041<programlisting>
1042--- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100
1043+++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100
1044@@ -7,7 +7,7 @@
1045 struct object
1046 {
1047 struct list_head list;
1048- unsigned int refcnt;
1049+ atomic_t refcnt;
1050 int id;
1051 char name[32];
1052 int popularity;
1053@@ -18,33 +18,15 @@
1054 static unsigned int cache_num = 0;
1055 #define MAX_CACHE_SIZE 10
1056
1057-static void __object_put(struct object *obj)
1058-{
1059- if (--obj-&gt;refcnt == 0)
1060- kfree(obj);
1061-}
1062-
1063-static void __object_get(struct object *obj)
1064-{
1065- obj-&gt;refcnt++;
1066-}
1067-
1068 void object_put(struct object *obj)
1069 {
1070- unsigned long flags;
1071-
1072- spin_lock_irqsave(&amp;cache_lock, flags);
1073- __object_put(obj);
1074- spin_unlock_irqrestore(&amp;cache_lock, flags);
1075+ if (atomic_dec_and_test(&amp;obj-&gt;refcnt))
1076+ kfree(obj);
1077 }
1078
1079 void object_get(struct object *obj)
1080 {
1081- unsigned long flags;
1082-
1083- spin_lock_irqsave(&amp;cache_lock, flags);
1084- __object_get(obj);
1085- spin_unlock_irqrestore(&amp;cache_lock, flags);
1086+ atomic_inc(&amp;obj-&gt;refcnt);
1087 }
1088
1089 /* Must be holding cache_lock */
1090@@ -65,7 +47,7 @@
1091 {
1092 BUG_ON(!obj);
1093 list_del(&amp;obj-&gt;list);
1094- __object_put(obj);
1095+ object_put(obj);
1096 cache_num--;
1097 }
1098
1099@@ -94,7 +76,7 @@
1100 strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
1101 obj-&gt;id = id;
1102 obj-&gt;popularity = 0;
1103- obj-&gt;refcnt = 1; /* The cache holds a reference */
1104+ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
1105
1106 spin_lock_irqsave(&amp;cache_lock, flags);
1107 __cache_add(obj);
1108@@ -119,7 +101,7 @@
1109 spin_lock_irqsave(&amp;cache_lock, flags);
1110 obj = __cache_find(id);
1111 if (obj)
1112- __object_get(obj);
1113+ object_get(obj);
1114 spin_unlock_irqrestore(&amp;cache_lock, flags);
1115 return obj;
1116 }
1117</programlisting>
1118</sect2>
1119</sect1>
1120
1121 <sect1 id="examples-lock-per-obj">
1122 <title>Protecting The Objects Themselves</title>
1123 <para>
1124In these examples, we assumed that the objects (except the reference
1125counts) never changed once they are created. If we wanted to allow
1126the name to change, there are three possibilities:
1127 </para>
1128 <itemizedlist>
1129 <listitem>
1130 <para>
1131You can make <symbol>cache_lock</symbol> non-static, and tell people
1132to grab that lock before changing the name in any object.
1133 </para>
1134 </listitem>
1135 <listitem>
1136 <para>
1137You can provide a <function>cache_obj_rename</function> which grabs
1138this lock and changes the name for the caller, and tell everyone to
1139use that function.
1140 </para>
1141 </listitem>
1142 <listitem>
1143 <para>
1144You can make the <symbol>cache_lock</symbol> protect only the cache
1145itself, and use another lock to protect the name.
1146 </para>
1147 </listitem>
1148 </itemizedlist>
1149
1150 <para>
1151Theoretically, you can make the locks as fine-grained as one lock for
1152every field, for every object. In practice, the most common variants
1153are:
1154</para>
1155 <itemizedlist>
1156 <listitem>
1157 <para>
1158One lock which protects the infrastructure (the <symbol>cache</symbol>
1159list in this example) and all the objects. This is what we have done
1160so far.
1161 </para>
1162 </listitem>
1163 <listitem>
1164 <para>
1165One lock which protects the infrastructure (including the list
1166pointers inside the objects), and one lock inside the object which
1167protects the rest of that object.
1168 </para>
1169 </listitem>
1170 <listitem>
1171 <para>
1172Multiple locks to protect the infrastructure (eg. one lock per hash
1173chain), possibly with a separate per-object lock.
1174 </para>
1175 </listitem>
1176 </itemizedlist>
1177
1178<para>
1179Here is the "lock-per-object" implementation:
1180</para>
1181<programlisting>
1182--- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100
1183+++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
1184@@ -6,11 +6,17 @@
1185
1186 struct object
1187 {
1188+ /* These two protected by cache_lock. */
1189 struct list_head list;
1190+ int popularity;
1191+
1192 atomic_t refcnt;
1193+
1194+ /* Doesn't change once created. */
1195 int id;
1196+
1197+ spinlock_t lock; /* Protects the name */
1198 char name[32];
1199- int popularity;
1200 };
1201
1202 static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
1203@@ -77,6 +84,7 @@
1204 obj-&gt;id = id;
1205 obj-&gt;popularity = 0;
1206 atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
1207+ spin_lock_init(&amp;obj-&gt;lock);
1208
1209 spin_lock_irqsave(&amp;cache_lock, flags);
1210 __cache_add(obj);
1211</programlisting>
1212
1213<para>
1214Note that I decided that the <structfield>popularity</structfield>
1215count should be protected by the <symbol>cache_lock</symbol> rather
1216than the per-object lock: this is because it (like the
1217<structname>struct list_head</structname> inside the object) is
1218logically part of the infrastructure. This way, I don't need to grab
1219the lock of every object in <function>__cache_add</function> when
1220seeking the least popular.
1221</para>
1222
1223<para>
1224I also decided that the <structfield>id</structfield> member is
1225unchangeable, so I don't need to grab each object lock in
1226<function>__cache_find()</function> to examine the
1227<structfield>id</structfield>: the object lock is only used by a
1228caller who wants to read or write the <structfield>name</structfield>
1229field.
1230</para>
1231
1232<para>
1233Note also that I added a comment describing what data was protected by
1234which locks. This is extremely important, as it describes the runtime
1235behavior of the code, and can be hard to gain from just reading. And
1236as Alan Cox says, <quote>Lock data, not code</quote>.
1237</para>
1238</sect1>
1239</chapter>
1240
1241 <chapter id="common-problems">
1242 <title>Common Problems</title>
1243 <sect1 id="deadlock">
1244 <title>Deadlock: Simple and Advanced</title>
1245
1246 <para>
1247 There is a coding bug where a piece of code tries to grab a
1248 spinlock twice: it will spin forever, waiting for the lock to
1249 be released (spinlocks, rwlocks and semaphores are not
1250 recursive in Linux). This is trivial to diagnose: not a
1251 stay-up-five-nights-talk-to-fluffy-code-bunnies kind of
1252 problem.
1253 </para>
1254
1255 <para>
1256 For a slightly more complex case, imagine you have a region
1257 shared by a softirq and user context. If you use a
1258 <function>spin_lock()</function> call to protect it, it is
1259 possible that the user context will be interrupted by the softirq
1260 while it holds the lock, and the softirq will then spin
1261 forever trying to get the same lock.
1262 </para>
1263
1264 <para>
1265 Both of these are called deadlock, and as shown above, it can
1266 occur even with a single CPU (although not on UP compiles,
1267 since spinlocks vanish on kernel compiles with
1268 <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption
1269 in the second example).
1270 </para>
1271
1272 <para>
1273 This complete lockup is easy to diagnose: on SMP boxes the
1274 watchdog timer or compiling with <symbol>DEBUG_SPINLOCKS</symbol> set
1275 (<filename>include/linux/spinlock.h</filename>) will show this up
1276 immediately when it happens.
1277 </para>
1278
1279 <para>
1280 A more complex problem is the so-called 'deadly embrace',
1281 involving two or more locks. Say you have a hash table: each
1282 entry in the table is a spinlock, and a chain of hashed
1283 objects. Inside a softirq handler, you sometimes want to
1284 alter an object from one place in the hash to another: you
1285 grab the spinlock of the old hash chain and the spinlock of
1286 the new hash chain, and delete the object from the old one,
1287 and insert it in the new one.
1288 </para>
1289
1290 <para>
1291 There are two problems here. First, if your code ever
1292 tries to move the object to the same chain, it will deadlock
1293 with itself as it tries to lock it twice. Secondly, if the
1294 same softirq on another CPU is trying to move another object
1295 in the reverse direction, the following could happen:
1296 </para>
1297
1298 <table>
1299 <title>Consequences</title>
1300
1301 <tgroup cols="2" align="left">
1302
1303 <thead>
1304 <row>
1305 <entry>CPU 1</entry>
1306 <entry>CPU 2</entry>
1307 </row>
1308 </thead>
1309
1310 <tbody>
1311 <row>
1312 <entry>Grab lock A -&gt; OK</entry>
1313 <entry>Grab lock B -&gt; OK</entry>
1314 </row>
1315 <row>
1316 <entry>Grab lock B -&gt; spin</entry>
1317 <entry>Grab lock A -&gt; spin</entry>
1318 </row>
1319 </tbody>
1320 </tgroup>
1321 </table>
1322
1323 <para>
1324 The two CPUs will spin forever, waiting for the other to give up
1325 their lock. It will look, smell, and feel like a crash.
1326 </para>
1327 </sect1>
1328
1329 <sect1 id="techs-deadlock-prevent">
1330 <title>Preventing Deadlock</title>
1331
1332 <para>
1333 Textbooks will tell you that if you always lock in the same
1334 order, you will never get this kind of deadlock. Practice
1335 will tell you that this approach doesn't scale: when I
1336 create a new lock, I don't understand enough of the kernel
1337 to figure out where in the 5000 lock hierarchy it will fit.
1338 </para>
1339
1340 <para>
1341 The best locks are encapsulated: they never get exposed in
1342 headers, and are never held around calls to non-trivial
1343 functions outside the same file. You can read through this
1344 code and see that it will never deadlock, because it never
1345 tries to grab another lock while it has that one. People
1346 using your code don't even need to know you are using a
1347 lock.
1348 </para>
1349
1350 <para>
1351 A classic problem here is when you provide callbacks or
1352 hooks: if you call these with the lock held, you risk simple
1353 deadlock, or a deadly embrace (who knows what the callback
1354 will do?). Remember, the other programmers are out to get
1355 you, so don't do this.
1356 </para>
1357
1358 <sect2 id="techs-deadlock-overprevent">
1359 <title>Overzealous Prevention Of Deadlocks</title>
1360
1361 <para>
1362 Deadlocks are problematic, but not as bad as data
1363 corruption. Code which grabs a read lock, searches a list,
1364 fails to find what it wants, drops the read lock, grabs a
1365 write lock and inserts the object has a race condition.
1366 </para>
1367
1368 <para>
1369 If you don't see why, please stay the fuck away from my code.
1370 </para>
1371 </sect2>
1372 </sect1>
1373
1374 <sect1 id="racing-timers">
1375 <title>Racing Timers: A Kernel Pastime</title>
1376
1377 <para>
1378 Timers can produce their own special problems with races.
1379 Consider a collection of objects (list, hash, etc) where each
1380 object has a timer which is due to destroy it.
1381 </para>
1382
1383 <para>
1384 If you want to destroy the entire collection (say on module
1385 removal), you might do the following:
1386 </para>
1387
1388 <programlisting>
1389 /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE
1390 HUNGARIAN NOTATION */
1391 spin_lock_bh(&amp;list_lock);
1392
1393 while (list) {
1394 struct foo *next = list-&gt;next;
1395 del_timer(&amp;list-&gt;timer);
1396 kfree(list);
1397 list = next;
1398 }
1399
1400 spin_unlock_bh(&amp;list_lock);
1401 </programlisting>
1402
1403 <para>
1404 Sooner or later, this will crash on SMP, because a timer can
1405 have just gone off before the <function>spin_lock_bh()</function>,
1406 and it will only get the lock after we
1407 <function>spin_unlock_bh()</function>, and then try to free
1408 the element (which has already been freed!).
1409 </para>
1410
1411 <para>
1412 This can be avoided by checking the result of
1413 <function>del_timer()</function>: if it returns
1414 <returnvalue>1</returnvalue>, the timer has been deleted.
1415 If <returnvalue>0</returnvalue>, it means (in this
1416 case) that it is currently running, so we can do:
1417 </para>
1418
1419 <programlisting>
1420 retry:
1421 spin_lock_bh(&amp;list_lock);
1422
1423 while (list) {
1424 struct foo *next = list-&gt;next;
1425 if (!del_timer(&amp;list-&gt;timer)) {
1426 /* Give timer a chance to delete this */
1427 spin_unlock_bh(&amp;list_lock);
1428 goto retry;
1429 }
1430 kfree(list);
1431 list = next;
1432 }
1433
1434 spin_unlock_bh(&amp;list_lock);
1435 </programlisting>
1436
1437 <para>
1438 Another common problem is deleting timers which restart
1439 themselves (by calling <function>add_timer()</function> at the end
1440 of their timer function). Because this is a fairly common case
1441 which is prone to races, you should use <function>del_timer_sync()</function>
1442 (<filename class="headerfile">include/linux/timer.h</filename>)
1443 to handle this case. It returns the number of times the timer
1444 had to be deleted before we finally stopped it from adding itself back
1445 in.
1446 </para>
1447 </sect1>
1448
1449 </chapter>
1450
1451 <chapter id="Efficiency">
1452 <title>Locking Speed</title>
1453
1454 <para>
1455There are three main things to worry about when considering speed of
1456some code which does locking. First is concurrency: how many things
1457are going to be waiting while someone else is holding a lock. Second
1458is the time taken to actually acquire and release an uncontended lock.
1459Third is using fewer, or smarter locks. I'm assuming that the lock is
1460used fairly often: otherwise, you wouldn't be concerned about
1461efficiency.
1462</para>
1463 <para>
1464Concurrency depends on how long the lock is usually held: you should
1465hold the lock for as long as needed, but no longer. In the cache
1466example, we always create the object without the lock held, and then
1467grab the lock only when we are ready to insert it in the list.
1468</para>
1469 <para>
1470Acquisition times depend on how much damage the lock operations do to
1471the pipeline (pipeline stalls) and how likely it is that this CPU was
1472the last one to grab the lock (ie. is the lock cache-hot for this
1473CPU): on a machine with more CPUs, this likelihood drops fast.
1474Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns,
1475an atomic increment takes about 58ns, a lock which is cache-hot on
1476this CPU takes 160ns, and a cacheline transfer from another CPU takes
1477an additional 170 to 360ns. (These figures from Paul McKenney's
1478<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux
1479Journal RCU article</ulink>).
1480</para>
1481 <para>
1482These two aims conflict: holding a lock for a short time might be done
1483by splitting locks into parts (such as in our final per-object-lock
1484example), but this increases the number of lock acquisitions, and the
1485results are often slower than having a single lock. This is another
1486reason to advocate locking simplicity.
1487</para>
1488 <para>
1489The third concern is addressed below: there are some methods to reduce
1490the amount of locking which needs to be done.
1491</para>
1492
1493 <sect1 id="efficiency-rwlocks">
1494 <title>Read/Write Lock Variants</title>
1495
1496 <para>
1497 Both spinlocks and semaphores have read/write variants:
1498 <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>.
1499 These divide users into two classes: the readers and the writers. If
1500 you are only reading the data, you can get a read lock, but to write to
1501 the data you need the write lock. Many people can hold a read lock,
1502 but a writer must be sole holder.
1503 </para>
1504
1505 <para>
1506 If your code divides neatly along reader/writer lines (as our
1507 cache code does), and the lock is held by readers for
1508 significant lengths of time, using these locks can help. They
1509 are slightly slower than the normal locks though, so in practice
1510 <type>rwlock_t</type> is not usually worthwhile.
1511 </para>
1512 </sect1>
1513
1514 <sect1 id="efficiency-read-copy-update">
1515 <title>Avoiding Locks: Read Copy Update</title>
1516
1517 <para>
1518 There is a special method of read/write locking called Read Copy
1519 Update. Using RCU, the readers can avoid taking a lock
1520 altogether: as we expect our cache to be read more often than
1521 updated (otherwise the cache is a waste of time), it is a
1522 candidate for this optimization.
1523 </para>
1524
1525 <para>
1526 How do we get rid of read locks? Getting rid of read locks
1527 means that writers may be changing the list underneath the
1528 readers. That is actually quite simple: we can read a linked
1529 list while an element is being added if the writer adds the
1530 element very carefully. For example, adding
1531 <symbol>new</symbol> to a single linked list called
1532 <symbol>list</symbol>:
1533 </para>
1534
1535 <programlisting>
1536 new-&gt;next = list-&gt;next;
1537 wmb();
1538 list-&gt;next = new;
1539 </programlisting>
1540
1541 <para>
1542 The <function>wmb()</function> is a write memory barrier. It
1543 ensures that the first operation (setting the new element's
1544 <symbol>next</symbol> pointer) is complete and will be seen by
1545 all CPUs, before the second operation is (putting the new
1546 element into the list). This is important, since modern
1547 compilers and modern CPUs can both reorder instructions unless
1548 told otherwise: we want a reader to either not see the new
1549 element at all, or see the new element with the
1550 <symbol>next</symbol> pointer correctly pointing at the rest of
1551 the list.
1552 </para>
1553 <para>
1554 Fortunately, there is a function to do this for standard
1555 <structname>struct list_head</structname> lists:
1556 <function>list_add_rcu()</function>
1557 (<filename>include/linux/list.h</filename>).
1558 </para>
1559 <para>
1560 Removing an element from the list is even simpler: we replace
1561 the pointer to the old element with a pointer to its successor,
1562 and readers will either see it, or skip over it.
1563 </para>
1564 <programlisting>
1565 list-&gt;next = old-&gt;next;
1566 </programlisting>
1567 <para>
1568 There is <function>list_del_rcu()</function>
1569 (<filename>include/linux/list.h</filename>) which does this (the
1570 normal version poisons the old object, which we don't want).
1571 </para>
1572 <para>
1573 The reader must also be careful: some CPUs can look through the
1574 <symbol>next</symbol> pointer to start reading the contents of
1575 the next element early, but don't realize that the pre-fetched
1576 contents is wrong when the <symbol>next</symbol> pointer changes
1577 underneath them. Once again, there is a
1578 <function>list_for_each_entry_rcu()</function>
1579 (<filename>include/linux/list.h</filename>) to help you. Of
1580 course, writers can just use
1581 <function>list_for_each_entry()</function>, since there cannot
1582 be two simultaneous writers.
1583 </para>
1584 <para>
1585 Our final dilemma is this: when can we actually destroy the
1586 removed element? Remember, a reader might be stepping through
1587 this element in the list right now: if we free this element and
1588 the <symbol>next</symbol> pointer changes, the reader will jump
1589 off into garbage and crash. We need to wait until we know that
1590 all the readers who were traversing the list when we deleted the
1591 element are finished. We use <function>call_rcu()</function> to
1592 register a callback which will actually destroy the object once
1593 the readers are finished.
1594 </para>
1595 <para>
1596 But how does Read Copy Update know when the readers are
1597 finished? The method is this: firstly, the readers always
1598 traverse the list inside
1599 <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function>
1600 pairs: these simply disable preemption so the reader won't go to
1601 sleep while reading the list.
1602 </para>
1603 <para>
1604 RCU then waits until every other CPU has slept at least once:
1605 since readers cannot sleep, we know that any readers which were
1606 traversing the list during the deletion are finished, and the
1607 callback is triggered. The real Read Copy Update code is a
1608 little more optimized than this, but this is the fundamental
1609 idea.
1610 </para>
1611
1612<programlisting>
1613--- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
1614+++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100
1615@@ -1,15 +1,18 @@
1616 #include &lt;linux/list.h&gt;
1617 #include &lt;linux/slab.h&gt;
1618 #include &lt;linux/string.h&gt;
1619+#include &lt;linux/rcupdate.h&gt;
1620 #include &lt;asm/semaphore.h&gt;
1621 #include &lt;asm/errno.h&gt;
1622
1623 struct object
1624 {
1625- /* These two protected by cache_lock. */
1626+ /* This is protected by RCU */
1627 struct list_head list;
1628 int popularity;
1629
1630+ struct rcu_head rcu;
1631+
1632 atomic_t refcnt;
1633
1634 /* Doesn't change once created. */
1635@@ -40,7 +43,7 @@
1636 {
1637 struct object *i;
1638
1639- list_for_each_entry(i, &amp;cache, list) {
1640+ list_for_each_entry_rcu(i, &amp;cache, list) {
1641 if (i-&gt;id == id) {
1642 i-&gt;popularity++;
1643 return i;
1644@@ -49,19 +52,25 @@
1645 return NULL;
1646 }
1647
1648+/* Final discard done once we know no readers are looking. */
1649+static void cache_delete_rcu(void *arg)
1650+{
1651+ object_put(arg);
1652+}
1653+
1654 /* Must be holding cache_lock */
1655 static void __cache_delete(struct object *obj)
1656 {
1657 BUG_ON(!obj);
1658- list_del(&amp;obj-&gt;list);
1659- object_put(obj);
1660+ list_del_rcu(&amp;obj-&gt;list);
1661 cache_num--;
1662+ call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj);
1663 }
1664
1665 /* Must be holding cache_lock */
1666 static void __cache_add(struct object *obj)
1667 {
1668- list_add(&amp;obj-&gt;list, &amp;cache);
1669+ list_add_rcu(&amp;obj-&gt;list, &amp;cache);
1670 if (++cache_num > MAX_CACHE_SIZE) {
1671 struct object *i, *outcast = NULL;
1672 list_for_each_entry(i, &amp;cache, list) {
1673@@ -85,6 +94,7 @@
1674 obj-&gt;popularity = 0;
1675 atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
1676 spin_lock_init(&amp;obj-&gt;lock);
1677+ INIT_RCU_HEAD(&amp;obj-&gt;rcu);
1678
1679 spin_lock_irqsave(&amp;cache_lock, flags);
1680 __cache_add(obj);
1681@@ -104,12 +114,11 @@
1682 struct object *cache_find(int id)
1683 {
1684 struct object *obj;
1685- unsigned long flags;
1686
1687- spin_lock_irqsave(&amp;cache_lock, flags);
1688+ rcu_read_lock();
1689 obj = __cache_find(id);
1690 if (obj)
1691 object_get(obj);
1692- spin_unlock_irqrestore(&amp;cache_lock, flags);
1693+ rcu_read_unlock();
1694 return obj;
1695 }
1696</programlisting>
1697
1698<para>
1699Note that the reader will alter the
1700<structfield>popularity</structfield> member in
1701<function>__cache_find()</function>, and now it doesn't hold a lock.
1702One solution would be to make it an <type>atomic_t</type>, but for
1703this usage, we don't really care about races: an approximate result is
1704good enough, so I didn't change it.
1705</para>
1706
1707<para>
1708The result is that <function>cache_find()</function> requires no
1709synchronization with any other functions, so is almost as fast on SMP
1710as it would be on UP.
1711</para>
1712
1713<para>
1714There is a further optimization possible here: remember our original
1715cache code, where there were no reference counts and the caller simply
1716held the lock whenever using the object? This is still possible: if
1717you hold the lock, no one can delete the object, so you don't need to
1718get and put the reference count.
1719</para>
1720
1721<para>
1722Now, because the 'read lock' in RCU is simply disabling preemption, a
1723caller which always has preemption disabled between calling
1724<function>cache_find()</function> and
1725<function>object_put()</function> does not need to actually get and
1726put the reference count: we could expose
1727<function>__cache_find()</function> by making it non-static, and
1728such callers could simply call that.
1729</para>
1730<para>
1731The benefit here is that the reference count is not written to: the
1732object is not altered in any way, which is much faster on SMP
1733machines due to caching.
1734</para>
1735 </sect1>
1736
1737 <sect1 id="per-cpu">
1738 <title>Per-CPU Data</title>
1739
1740 <para>
1741 Another technique for avoiding locking which is used fairly
1742 widely is to duplicate information for each CPU. For example,
1743 if you wanted to keep a count of a common condition, you could
1744 use a spin lock and a single counter. Nice and simple.
1745 </para>
1746
1747 <para>
1748 If that was too slow (it's usually not, but if you've got a
1749 really big machine to test on and can show that it is), you
1750 could instead use a counter for each CPU, then none of them need
1751 an exclusive lock. See <function>DEFINE_PER_CPU()</function>,
1752 <function>get_cpu_var()</function> and
1753 <function>put_cpu_var()</function>
1754 (<filename class="headerfile">include/linux/percpu.h</filename>).
1755 </para>
1756
1757 <para>
1758 Of particular use for simple per-cpu counters is the
1759 <type>local_t</type> type, and the
1760 <function>cpu_local_inc()</function> and related functions,
1761 which are more efficient than simple code on some architectures
1762 (<filename class="headerfile">include/asm/local.h</filename>).
1763 </para>
1764
1765 <para>
1766 Note that there is no simple, reliable way of getting an exact
1767 value of such a counter, without introducing more locks. This
1768 is not a problem for some uses.
1769 </para>
1770 </sect1>
1771
1772 <sect1 id="mostly-hardirq">
1773 <title>Data Which Is Mostly Used By An IRQ Handler</title>
1774
1775 <para>
1776 If data is always accessed from within the same IRQ handler, you
1777 don't need a lock at all: the kernel already guarantees that the
1778 irq handler will not run simultaneously on multiple CPUs.
1779 </para>
1780 <para>
1781 Manfred Spraul points out that you can still do this, even if
1782 the data is very occasionally accessed in user context or
1783 softirqs/tasklets. The irq handler doesn't use a lock, and
1784 all other accesses are done as so:
1785 </para>
1786
1787<programlisting>
1788 spin_lock(&amp;lock);
1789 disable_irq(irq);
1790 ...
1791 enable_irq(irq);
1792 spin_unlock(&amp;lock);
1793</programlisting>
1794 <para>
1795 The <function>disable_irq()</function> prevents the irq handler
1796 from running (and waits for it to finish if it's currently
1797 running on other CPUs). The spinlock prevents any other
1798 accesses happening at the same time. Naturally, this is slower
1799 than just a <function>spin_lock_irq()</function> call, so it
1800 only makes sense if this type of access happens extremely
1801 rarely.
1802 </para>
1803 </sect1>
1804 </chapter>
1805
1806 <chapter id="sleeping-things">
1807 <title>What Functions Are Safe To Call From Interrupts?</title>
1808
1809 <para>
1810 Many functions in the kernel sleep (ie. call schedule())
1811 directly or indirectly: you can never call them while holding a
1812 spinlock, or with preemption disabled. This also means you need
1813 to be in user context: calling them from an interrupt is illegal.
1814 </para>
1815
1816 <sect1 id="sleeping">
1817 <title>Some Functions Which Sleep</title>
1818
1819 <para>
1820 The most common ones are listed below, but you usually have to
1821 read the code to find out if other calls are safe. If everyone
1822 else who calls it can sleep, you probably need to be able to
1823 sleep, too. In particular, registration and deregistration
1824 functions usually expect to be called from user context, and can
1825 sleep.
1826 </para>
1827
1828 <itemizedlist>
1829 <listitem>
1830 <para>
1831 Accesses to
1832 <firstterm linkend="gloss-userspace">userspace</firstterm>:
1833 </para>
1834 <itemizedlist>
1835 <listitem>
1836 <para>
1837 <function>copy_from_user()</function>
1838 </para>
1839 </listitem>
1840 <listitem>
1841 <para>
1842 <function>copy_to_user()</function>
1843 </para>
1844 </listitem>
1845 <listitem>
1846 <para>
1847 <function>get_user()</function>
1848 </para>
1849 </listitem>
1850 <listitem>
1851 <para>
1852 <function>put_user()</function>
1853 </para>
1854 </listitem>
1855 </itemizedlist>
1856 </listitem>
1857
1858 <listitem>
1859 <para>
1860 <function>kmalloc(GFP_KERNEL)</function>
1861 </para>
1862 </listitem>
1863
1864 <listitem>
1865 <para>
1866 <function>down_interruptible()</function> and
1867 <function>down()</function>
1868 </para>
1869 <para>
1870 There is a <function>down_trylock()</function> which can be
1871 used inside interrupt context, as it will not sleep.
1872 <function>up()</function> will also never sleep.
1873 </para>
1874 </listitem>
1875 </itemizedlist>
1876 </sect1>
1877
1878 <sect1 id="dont-sleep">
1879 <title>Some Functions Which Don't Sleep</title>
1880
1881 <para>
1882 Some functions are safe to call from any context, or holding
1883 almost any lock.
1884 </para>
1885
1886 <itemizedlist>
1887 <listitem>
1888 <para>
1889 <function>printk()</function>
1890 </para>
1891 </listitem>
1892 <listitem>
1893 <para>
1894 <function>kfree()</function>
1895 </para>
1896 </listitem>
1897 <listitem>
1898 <para>
1899 <function>add_timer()</function> and <function>del_timer()</function>
1900 </para>
1901 </listitem>
1902 </itemizedlist>
1903 </sect1>
1904 </chapter>
1905
1906 <chapter id="references">
1907 <title>Further reading</title>
1908
1909 <itemizedlist>
1910 <listitem>
1911 <para>
1912 <filename>Documentation/spinlocks.txt</filename>:
1913 Linus Torvalds' spinlocking tutorial in the kernel sources.
1914 </para>
1915 </listitem>
1916
1917 <listitem>
1918 <para>
1919 Unix Systems for Modern Architectures: Symmetric
1920 Multiprocessing and Caching for Kernel Programmers:
1921 </para>
1922
1923 <para>
1924 Curt Schimmel's very good introduction to kernel level
1925 locking (not written for Linux, but nearly everything
1926 applies). The book is expensive, but really worth every
1927 penny to understand SMP locking. [ISBN: 0201633388]
1928 </para>
1929 </listitem>
1930 </itemizedlist>
1931 </chapter>
1932
1933 <chapter id="thanks">
1934 <title>Thanks</title>
1935
1936 <para>
1937 Thanks to Telsa Gwynne for DocBooking, neatening and adding
1938 style.
1939 </para>
1940
1941 <para>
1942 Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul
1943 Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim
1944 Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney,
1945 John Ashby for proofreading, correcting, flaming, commenting.
1946 </para>
1947
1948 <para>
1949 Thanks to the cabal for having no influence on this document.
1950 </para>
1951 </chapter>
1952
1953 <glossary id="glossary">
1954 <title>Glossary</title>
1955
1956 <glossentry id="gloss-preemption">
1957 <glossterm>preemption</glossterm>
1958 <glossdef>
1959 <para>
1960 Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is
1961 unset, processes in user context inside the kernel would not
1962 preempt each other (ie. you had that CPU until you gave it up,
1963 except for interrupts). With the addition of
1964 <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when
1965 in user context, higher priority tasks can "cut in": spinlocks
1966 were changed to disable preemption, even on UP.
1967 </para>
1968 </glossdef>
1969 </glossentry>
1970
1971 <glossentry id="gloss-bh">
1972 <glossterm>bh</glossterm>
1973 <glossdef>
1974 <para>
1975 Bottom Half: for historical reasons, functions with
1976 '_bh' in them often now refer to any software interrupt, e.g.
1977 <function>spin_lock_bh()</function> blocks any software interrupt
1978 on the current CPU. Bottom halves are deprecated, and will
1979 eventually be replaced by tasklets. Only one bottom half will be
1980 running at any time.
1981 </para>
1982 </glossdef>
1983 </glossentry>
1984
1985 <glossentry id="gloss-hwinterrupt">
1986 <glossterm>Hardware Interrupt / Hardware IRQ</glossterm>
1987 <glossdef>
1988 <para>
1989 Hardware interrupt request. <function>in_irq()</function> returns
1990 <returnvalue>true</returnvalue> in a hardware interrupt handler.
1991 </para>
1992 </glossdef>
1993 </glossentry>
1994
1995 <glossentry id="gloss-interruptcontext">
1996 <glossterm>Interrupt Context</glossterm>
1997 <glossdef>
1998 <para>
1999 Not user context: processing a hardware irq or software irq.
2000 Indicated by the <function>in_interrupt()</function> macro
2001 returning <returnvalue>true</returnvalue>.
2002 </para>
2003 </glossdef>
2004 </glossentry>
2005
2006 <glossentry id="gloss-smp">
2007 <glossterm><acronym>SMP</acronym></glossterm>
2008 <glossdef>
2009 <para>
2010 Symmetric Multi-Processor: kernels compiled for multiple-CPU
2011 machines. (CONFIG_SMP=y).
2012 </para>
2013 </glossdef>
2014 </glossentry>
2015
2016 <glossentry id="gloss-softirq">
2017 <glossterm>Software Interrupt / softirq</glossterm>
2018 <glossdef>
2019 <para>
2020 Software interrupt handler. <function>in_irq()</function> returns
2021 <returnvalue>false</returnvalue>; <function>in_softirq()</function>
2022 returns <returnvalue>true</returnvalue>. Tasklets and softirqs
2023 both fall into the category of 'software interrupts'.
2024 </para>
2025 <para>
2026 Strictly speaking a softirq is one of up to 32 enumerated software
2027 interrupts which can run on multiple CPUs at once.
2028 Sometimes used to refer to tasklets as
2029 well (ie. all software interrupts).
2030 </para>
2031 </glossdef>
2032 </glossentry>
2033
2034 <glossentry id="gloss-tasklet">
2035 <glossterm>tasklet</glossterm>
2036 <glossdef>
2037 <para>
2038 A dynamically-registrable software interrupt,
2039 which is guaranteed to only run on one CPU at a time.
2040 </para>
2041 </glossdef>
2042 </glossentry>
2043
2044 <glossentry id="gloss-timers">
2045 <glossterm>timer</glossterm>
2046 <glossdef>
2047 <para>
2048 A dynamically-registrable software interrupt, which is run at
2049 (or close to) a given time. When running, it is just like a
2050 tasklet (in fact, they are called from the TIMER_SOFTIRQ).
2051 </para>
2052 </glossdef>
2053 </glossentry>
2054
2055 <glossentry id="gloss-up">
2056 <glossterm><acronym>UP</acronym></glossterm>
2057 <glossdef>
2058 <para>
2059 Uni-Processor: Non-SMP. (CONFIG_SMP=n).
2060 </para>
2061 </glossdef>
2062 </glossentry>
2063
2064 <glossentry id="gloss-usercontext">
2065 <glossterm>User Context</glossterm>
2066 <glossdef>
2067 <para>
2068 The kernel executing on behalf of a particular process (ie. a
2069 system call or trap) or kernel thread. You can tell which
2070 process with the <symbol>current</symbol> macro. Not to
2071 be confused with userspace. Can be interrupted by software or
2072 hardware interrupts.
2073 </para>
2074 </glossdef>
2075 </glossentry>
2076
2077 <glossentry id="gloss-userspace">
2078 <glossterm>Userspace</glossterm>
2079 <glossdef>
2080 <para>
2081 A process executing its own code outside the kernel.
2082 </para>
2083 </glossdef>
2084 </glossentry>
2085
2086 </glossary>
2087</book>
2088
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
new file mode 100644
index 000000000000..cf2fce7707da
--- /dev/null
+++ b/Documentation/DocBook/libata.tmpl
@@ -0,0 +1,282 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="libataDevGuide">
6 <bookinfo>
7 <title>libATA Developer's Guide</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Jeff</firstname>
12 <surname>Garzik</surname>
13 </author>
14 </authorgroup>
15
16 <copyright>
17 <year>2003</year>
18 <holder>Jeff Garzik</holder>
19 </copyright>
20
21 <legalnotice>
22 <para>
23 The contents of this file are subject to the Open
24 Software License version 1.1 that can be found at
25 <ulink url="http://www.opensource.org/licenses/osl-1.1.txt">http://www.opensource.org/licenses/osl-1.1.txt</ulink> and is included herein
26 by reference.
27 </para>
28
29 <para>
30 Alternatively, the contents of this file may be used under the terms
31 of the GNU General Public License version 2 (the "GPL") as distributed
32 in the kernel source COPYING file, in which case the provisions of
33 the GPL are applicable instead of the above. If you wish to allow
34 the use of your version of this file only under the terms of the
35 GPL and not to allow others to use your version of this file under
36 the OSL, indicate your decision by deleting the provisions above and
37 replace them with the notice and other provisions required by the GPL.
38 If you do not delete the provisions above, a recipient may use your
39 version of this file under either the OSL or the GPL.
40 </para>
41
42 </legalnotice>
43 </bookinfo>
44
45<toc></toc>
46
47 <chapter id="libataThanks">
48 <title>Thanks</title>
49 <para>
50 The bulk of the ATA knowledge comes thanks to long conversations with
51 Andre Hedrick (www.linux-ide.org).
52 </para>
53 <para>
54 Thanks to Alan Cox for pointing out similarities
55 between SATA and SCSI, and in general for motivation to hack on
56 libata.
57 </para>
58 <para>
59 libata's device detection
60 method, ata_pio_devchk, and in general all the early probing was
61 based on extensive study of Hale Landis's probe/reset code in his
62 ATADRVR driver (www.ata-atapi.com).
63 </para>
64 </chapter>
65
66 <chapter id="libataDriverApi">
67 <title>libata Driver API</title>
68 <sect1>
69 <title>struct ata_port_operations</title>
70
71 <programlisting>
72void (*port_disable) (struct ata_port *);
73 </programlisting>
74
75 <para>
76 Called from ata_bus_probe() and ata_bus_reset() error paths,
77 as well as when unregistering from the SCSI module (rmmod, hot
78 unplug).
79 </para>
80
81 <programlisting>
82void (*dev_config) (struct ata_port *, struct ata_device *);
83 </programlisting>
84
85 <para>
86 Called after IDENTIFY [PACKET] DEVICE is issued to each device
87 found. Typically used to apply device-specific fixups prior to
88 issue of SET FEATURES - XFER MODE, and prior to operation.
89 </para>
90
91 <programlisting>
92void (*set_piomode) (struct ata_port *, struct ata_device *);
93void (*set_dmamode) (struct ata_port *, struct ata_device *);
94void (*post_set_mode) (struct ata_port *ap);
95 </programlisting>
96
97 <para>
98 Hooks called prior to the issue of SET FEATURES - XFER MODE
99 command. dev->pio_mode is guaranteed to be valid when
100 ->set_piomode() is called, and dev->dma_mode is guaranteed to be
101 valid when ->set_dmamode() is called. ->post_set_mode() is
102 called unconditionally, after the SET FEATURES - XFER MODE
103 command completes successfully.
104 </para>
105
106 <para>
107 ->set_piomode() is always called (if present), but
108 ->set_dmamode() is only called if DMA is possible.
109 </para>
110
111 <programlisting>
112void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
113void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
114 </programlisting>
115
116 <para>
117 ->tf_load() is called to load the given taskfile into hardware
118 registers / DMA buffers. ->tf_read() is called to read the
119 hardware registers / DMA buffers, to obtain the current set of
120 taskfile register values.
121 </para>
122
123 <programlisting>
124void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
125 </programlisting>
126
127 <para>
128 Causes an ATA command, previously loaded with
129 ->tf_load(), to be initiated in hardware.
130 </para>
131
132 <programlisting>
133u8 (*check_status)(struct ata_port *ap);
134void (*dev_select)(struct ata_port *ap, unsigned int device);
135 </programlisting>
136
137 <para>
138 Reads the Status ATA shadow register from hardware. On some
139 hardware, this has the side effect of clearing the interrupt
140 condition.
141 </para>
142
143 <programlisting>
144void (*dev_select)(struct ata_port *ap, unsigned int device);
145 </programlisting>
146
147 <para>
148 Issues the low-level hardware command(s) that causes one of N
149 hardware devices to be considered 'selected' (active and
150 available for use) on the ATA bus.
151 </para>
152
153 <programlisting>
154void (*phy_reset) (struct ata_port *ap);
155 </programlisting>
156
157 <para>
158 The very first step in the probe phase. Actions vary depending
159 on the bus type, typically. After waking up the device and probing
160 for device presence (PATA and SATA), typically a soft reset
161 (SRST) will be performed. Drivers typically use the helper
162 functions ata_bus_reset() or sata_phy_reset() for this hook.
163 </para>
164
165 <programlisting>
166void (*bmdma_setup) (struct ata_queued_cmd *qc);
167void (*bmdma_start) (struct ata_queued_cmd *qc);
168 </programlisting>
169
170 <para>
171 When setting up an IDE BMDMA transaction, these hooks arm
172 (->bmdma_setup) and fire (->bmdma_start) the hardware's DMA
173 engine.
174 </para>
175
176 <programlisting>
177void (*qc_prep) (struct ata_queued_cmd *qc);
178int (*qc_issue) (struct ata_queued_cmd *qc);
179 </programlisting>
180
181 <para>
182 Higher-level hooks, these two hooks can potentially supersede
183 several of the above taskfile/DMA engine hooks. ->qc_prep is
184 called after the buffers have been DMA-mapped, and is typically
185 used to populate the hardware's DMA scatter-gather table.
186 Most drivers use the standard ata_qc_prep() helper function, but
187 more advanced drivers roll their own.
188 </para>
189 <para>
190 ->qc_issue is used to make a command active, once the hardware
191 and S/G tables have been prepared. IDE BMDMA drivers use the
192 helper function ata_qc_issue_prot() for taskfile protocol-based
193 dispatch. More advanced drivers roll their own ->qc_issue
194 implementation, using this as the "issue new ATA command to
195 hardware" hook.
196 </para>
197
198 <programlisting>
199void (*eng_timeout) (struct ata_port *ap);
200 </programlisting>
201
202 <para>
203 This is a high level error handling function, called from the
204 error handling thread, when a command times out.
205 </para>
206
207 <programlisting>
208irqreturn_t (*irq_handler)(int, void *, struct pt_regs *);
209void (*irq_clear) (struct ata_port *);
210 </programlisting>
211
212 <para>
213 ->irq_handler is the interrupt handling routine registered with
214 the system, by libata. ->irq_clear is called during probe just
215 before the interrupt handler is registered, to be sure hardware
216 is quiet.
217 </para>
218
219 <programlisting>
220u32 (*scr_read) (struct ata_port *ap, unsigned int sc_reg);
221void (*scr_write) (struct ata_port *ap, unsigned int sc_reg,
222 u32 val);
223 </programlisting>
224
225 <para>
226 Read and write standard SATA phy registers. Currently only used
227 if ->phy_reset hook called the sata_phy_reset() helper function.
228 </para>
229
230 <programlisting>
231int (*port_start) (struct ata_port *ap);
232void (*port_stop) (struct ata_port *ap);
233void (*host_stop) (struct ata_host_set *host_set);
234 </programlisting>
235
236 <para>
237 ->port_start() is called just after the data structures for each
238 port are initialized. Typically this is used to alloc per-port
239 DMA buffers / tables / rings, enable DMA engines, and similar
240 tasks.
241 </para>
242 <para>
243 ->host_stop() is called when the rmmod or hot unplug process
244 begins. The hook must stop all hardware interrupts, DMA
245 engines, etc.
246 </para>
247 <para>
248 ->port_stop() is called after ->host_stop(). Its sole function
249 is to release DMA/memory resources, now that they are no longer
250 actively being used.
251 </para>
252
253 </sect1>
254 </chapter>
255
256 <chapter id="libataExt">
257 <title>libata Library</title>
258!Edrivers/scsi/libata-core.c
259 </chapter>
260
261 <chapter id="libataInt">
262 <title>libata Core Internals</title>
263!Idrivers/scsi/libata-core.c
264 </chapter>
265
266 <chapter id="libataScsiInt">
267 <title>libata SCSI translation/emulation</title>
268!Edrivers/scsi/libata-scsi.c
269!Idrivers/scsi/libata-scsi.c
270 </chapter>
271
272 <chapter id="PiixInt">
273 <title>ata_piix Internals</title>
274!Idrivers/scsi/ata_piix.c
275 </chapter>
276
277 <chapter id="SILInt">
278 <title>sata_sil Internals</title>
279!Idrivers/scsi/sata_sil.c
280 </chapter>
281
282</book>
diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl
new file mode 100644
index 000000000000..3ff39bafc00e
--- /dev/null
+++ b/Documentation/DocBook/librs.tmpl
@@ -0,0 +1,289 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="Reed-Solomon-Library-Guide">
6 <bookinfo>
7 <title>Reed-Solomon Library Programming Interface</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Thomas</firstname>
12 <surname>Gleixner</surname>
13 <affiliation>
14 <address>
15 <email>tglx@linutronix.de</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2004</year>
23 <holder>Thomas Gleixner</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License version 2 as published by the Free Software Foundation.
31 </para>
32
33 <para>
34 This program is distributed in the hope that it will be
35 useful, but WITHOUT ANY WARRANTY; without even the implied
36 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37 See the GNU General Public License for more details.
38 </para>
39
40 <para>
41 You should have received a copy of the GNU General Public
42 License along with this program; if not, write to the Free
43 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
44 MA 02111-1307 USA
45 </para>
46
47 <para>
48 For more details see the file COPYING in the source
49 distribution of Linux.
50 </para>
51 </legalnotice>
52 </bookinfo>
53
54<toc></toc>
55
56 <chapter id="intro">
57 <title>Introduction</title>
58 <para>
59 The generic Reed-Solomon Library provides encoding, decoding
60 and error correction functions.
61 </para>
62 <para>
63 Reed-Solomon codes are used in communication and storage
64 applications to ensure data integrity.
65 </para>
66 <para>
67 This documentation is provided for developers who want to utilize
68 the functions provided by the library.
69 </para>
70 </chapter>
71
72 <chapter id="bugs">
73 <title>Known Bugs And Assumptions</title>
74 <para>
75 None.
76 </para>
77 </chapter>
78
79 <chapter id="usage">
80 <title>Usage</title>
81 <para>
82 This chapter provides examples of how to use the library.
83 </para>
84 <sect1>
85 <title>Initializing</title>
86 <para>
87 The init function init_rs returns a pointer to a
88 rs decoder structure, which holds the necessary
89 information for encoding, decoding and error correction
90 with the given polynomial. It either uses an existing
91 matching decoder or creates a new one. On creation all
92 the lookup tables for fast en/decoding are created.
93 The function may take a while, so make sure not to
94 call it in critical code paths.
95 </para>
96 <programlisting>
97/* the Reed Solomon control structure */
98static struct rs_control *rs_decoder;
99
100/* Symbolsize is 10 (bits)
101 * Primitive polynomial is x^10+x^3+1
102 * first consecutive root is 0
103 * primitive element to generate roots = 1
104 * generator polynomial degree (number of roots) = 6
105 */
106rs_decoder = init_rs (10, 0x409, 0, 1, 6);
107 </programlisting>
108 </sect1>
109 <sect1>
110 <title>Encoding</title>
111 <para>
112 The encoder calculates the Reed-Solomon code over
113 the given data length and stores the result in
114 the parity buffer. Note that the parity buffer must
115 be initialized before calling the encoder.
116 </para>
117 <para>
118 The expanded data can be inverted on the fly by
119 providing a non zero inversion mask. The expanded data is
120 XOR'ed with the mask. This is used e.g. for FLASH
121 ECC, where the all 0xFF is inverted to an all 0x00.
122 The Reed-Solomon code for all 0x00 is all 0x00. The
123 code is inverted before storing to FLASH so it is 0xFF
124 too. This prevents reads from an erased FLASH from
125 resulting in ECC errors.
126 </para>
127 <para>
128 The databytes are expanded to the given symbol size
129 on the fly. There is no support for encoding continuous
130 bitstreams with a symbol size != 8 at the moment. If
131 it is necessary it should not be a big deal to implement
132 such functionality.
133 </para>
134 <programlisting>
135/* Parity buffer. Size = number of roots */
136uint16_t par[6];
137/* Initialize the parity buffer */
138memset(par, 0, sizeof(par));
139/* Encode 512 byte in data8. Store parity in buffer par */
140encode_rs8 (rs_decoder, data8, 512, par, 0);
141 </programlisting>
142 </sect1>
143 <sect1>
144 <title>Decoding</title>
145 <para>
146 The decoder calculates the syndrome over
147 the given data length and the received parity symbols
148 and corrects errors in the data.
149 </para>
150 <para>
151 If a syndrome is available from a hardware decoder
152 then the syndrome calculation is skipped.
153 </para>
154 <para>
155 The correction of the data buffer can be suppressed
156 by providing a correction pattern buffer and an error
157 location buffer to the decoder. The decoder stores the
158 calculated error location and the correction bitmask
159 in the given buffers. This is useful for hardware
160 decoders which use a weird bit ordering scheme.
161 </para>
162 <para>
163 The databytes are expanded to the given symbol size
164 on the fly. There is no support for decoding continuous
165 bitstreams with a symbolsize != 8 at the moment. If
166 it is necessary it should not be a big deal to implement
167 such functionality.
168 </para>
169
170 <sect2>
171 <title>
172 Decoding with syndrome calculation, direct data correction
173 </title>
174 <programlisting>
175/* Parity buffer. Size = number of roots */
176uint16_t par[6];
177uint8_t data8[512];
178int numerr;
179/* Receive data */
180.....
181/* Receive parity */
182.....
183/* Decode 512 byte in data8.*/
184numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
185 </programlisting>
186 </sect2>
187
188 <sect2>
189 <title>
190 Decoding with syndrome given by hardware decoder, direct data correction
191 </title>
192 <programlisting>
193/* Parity buffer. Size = number of roots */
194uint16_t par[6], syn[6];
195uint8_t data8[512];
196int numerr;
197/* Receive data */
198.....
199/* Receive parity */
200.....
201/* Get syndrome from hardware decoder */
202.....
203/* Decode 512 byte in data8.*/
204numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
205 </programlisting>
206 </sect2>
207
208 <sect2>
209 <title>
210 Decoding with syndrome given by hardware decoder, no direct data correction.
211 </title>
212 <para>
213 Note: It's not necessary to give data and received parity to the decoder.
214 </para>
215 <programlisting>
216/* Parity buffer. Size = number of roots */
217uint16_t par[6], syn[6], corr[8];
218uint8_t data[512];
219int numerr, errpos[8];
220/* Receive data */
221.....
222/* Receive parity */
223.....
224/* Get syndrome from hardware decoder */
225.....
226/* Decode 512 byte in data8.*/
227numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
228for (i = 0; i &lt; numerr; i++) {
229 do_error_correction_in_your_buffer(errpos[i], corr[i]);
230}
231 </programlisting>
232 </sect2>
233 </sect1>
234 <sect1>
235 <title>Cleanup</title>
236 <para>
237 The function free_rs frees the allocated resources,
238 if the caller is the last user of the decoder.
239 </para>
240 <programlisting>
241/* Release resources */
242free_rs(rs_decoder);
243 </programlisting>
244 </sect1>
245
246 </chapter>
247
248 <chapter id="structs">
249 <title>Structures</title>
250 <para>
251 This chapter contains the autogenerated documentation of the structures which are
252 used in the Reed-Solomon Library and are relevant for a developer.
253 </para>
254!Iinclude/linux/rslib.h
255 </chapter>
256
257 <chapter id="pubfunctions">
258 <title>Public Functions Provided</title>
259 <para>
260 This chapter contains the autogenerated documentation of the Reed-Solomon functions
261 which are exported.
262 </para>
263!Elib/reed_solomon/reed_solomon.c
264 </chapter>
265
266 <chapter id="credits">
267 <title>Credits</title>
268 <para>
269 The library code for encoding and decoding was written by Phil Karn.
270 </para>
271 <programlisting>
272 Copyright 2002, Phil Karn, KA9Q
273 May be used under the terms of the GNU General Public License (GPL)
274 </programlisting>
275 <para>
276 The wrapper functions and interfaces are written by Thomas Gleixner.
277 </para>
278 <para>
279 Many users have provided bugfixes, improvements and helping hands for testing.
280 Thanks a lot.
281 </para>
282 <para>
283 The following people have contributed to this document:
284 </para>
285 <para>
286 Thomas Gleixner<email>tglx@linutronix.de</email>
287 </para>
288 </chapter>
289</book>
diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl
new file mode 100644
index 000000000000..f63822195871
--- /dev/null
+++ b/Documentation/DocBook/lsm.tmpl
@@ -0,0 +1,265 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<article class="whitepaper" id="LinuxSecurityModule" lang="en">
6 <articleinfo>
7 <title>Linux Security Modules: General Security Hooks for Linux</title>
8 <authorgroup>
9 <author>
10 <firstname>Stephen</firstname>
11 <surname>Smalley</surname>
12 <affiliation>
13 <orgname>NAI Labs</orgname>
14 <address><email>ssmalley@nai.com</email></address>
15 </affiliation>
16 </author>
17 <author>
18 <firstname>Timothy</firstname>
19 <surname>Fraser</surname>
20 <affiliation>
21 <orgname>NAI Labs</orgname>
22 <address><email>tfraser@nai.com</email></address>
23 </affiliation>
24 </author>
25 <author>
26 <firstname>Chris</firstname>
27 <surname>Vance</surname>
28 <affiliation>
29 <orgname>NAI Labs</orgname>
30 <address><email>cvance@nai.com</email></address>
31 </affiliation>
32 </author>
33 </authorgroup>
34 </articleinfo>
35
36<sect1><title>Introduction</title>
37
38<para>
39In March 2001, the National Security Agency (NSA) gave a presentation
40about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel
41Summit. SELinux is an implementation of flexible and fine-grained
42nondiscretionary access controls in the Linux kernel, originally
43implemented as its own particular kernel patch. Several other
44security projects (e.g. RSBAC, Medusa) have also developed flexible
45access control architectures for the Linux kernel, and various
46projects have developed particular access control models for Linux
47(e.g. LIDS, DTE, SubDomain). Each project has developed and
48maintained its own kernel patch to support its security needs.
49</para>
50
51<para>
52In response to the NSA presentation, Linus Torvalds made a set of
53remarks that described a security framework he would be willing to
54consider for inclusion in the mainstream Linux kernel. He described a
55general framework that would provide a set of security hooks to
56control operations on kernel objects and a set of opaque security
57fields in kernel data structures for maintaining security attributes.
58This framework could then be used by loadable kernel modules to
59implement any desired model of security. Linus also suggested the
60possibility of migrating the Linux capabilities code into such a
61module.
62</para>
63
64<para>
65The Linux Security Modules (LSM) project was started by WireX to
66develop such a framework. LSM is a joint development effort by
67several security projects, including Immunix, SELinux, SGI and Janus,
68and several individuals, including Greg Kroah-Hartman and James
69Morris, to develop a Linux kernel patch that implements this
70framework. The patch is currently tracking the 2.4 series and is
71targeted for integration into the 2.5 development series. This
72technical report provides an overview of the framework and the example
73capabilities security module provided by the LSM kernel patch.
74</para>
75
76</sect1>
77
78<sect1 id="framework"><title>LSM Framework</title>
79
80<para>
81The LSM kernel patch provides a general kernel framework to support
82security modules. In particular, the LSM framework is primarily
83focused on supporting access control modules, although future
84development is likely to address other security needs such as
85auditing. By itself, the framework does not provide any additional
86security; it merely provides the infrastructure to support security
87modules. The LSM kernel patch also moves most of the capabilities
88logic into an optional security module, with the system defaulting
89to the traditional superuser logic. This capabilities module
90is discussed further in <xref linkend="cap"/>.
91</para>
92
93<para>
94The LSM kernel patch adds security fields to kernel data structures
95and inserts calls to hook functions at critical points in the kernel
96code to manage the security fields and to perform access control. It
97also adds functions for registering and unregistering security
98modules, and adds a general <function>security</function> system call
99to support new system calls for security-aware applications.
100</para>
101
102<para>
103The LSM security fields are simply <type>void*</type> pointers. For
104process and program execution security information, security fields
105were added to <structname>struct task_struct</structname> and
106<structname>struct linux_binprm</structname>. For filesystem security
107information, a security field was added to
108<structname>struct super_block</structname>. For pipe, file, and socket
109security information, security fields were added to
110<structname>struct inode</structname> and
111<structname>struct file</structname>. For packet and network device security
112information, security fields were added to
113<structname>struct sk_buff</structname> and
114<structname>struct net_device</structname>. For System V IPC security
115information, security fields were added to
116<structname>struct kern_ipc_perm</structname> and
117<structname>struct msg_msg</structname>; additionally, the definitions
118for <structname>struct msg_msg</structname>, <structname>struct
119msg_queue</structname>, and <structname>struct
120shmid_kernel</structname> were moved to header files
121(<filename>include/linux/msg.h</filename> and
122<filename>include/linux/shm.h</filename> as appropriate) to allow
123the security modules to use these definitions.
124</para>
125
126<para>
127Each LSM hook is a function pointer in a global table,
128security_ops. This table is a
129<structname>security_operations</structname> structure as defined by
130<filename>include/linux/security.h</filename>. Detailed documentation
131for each hook is included in this header file. At present, this
132structure consists of a collection of substructures that group related
133hooks based on the kernel object (e.g. task, inode, file, sk_buff,
134etc) as well as some top-level hook function pointers for system
135operations. This structure is likely to be flattened in the future
136for performance. The placement of the hook calls in the kernel code
137is described by the "called:" lines in the per-hook documentation in
138the header file. The hook calls can also be easily found in the
139kernel code by looking for the string "security_ops->".
140
141</para>
142
143<para>
144Linus mentioned per-process security hooks in his original remarks as a
145possible alternative to global security hooks. However, if LSM were
146to start from the perspective of per-process hooks, then the base
147framework would have to deal with how to handle operations that
148involve multiple processes (e.g. kill), since each process might have
149its own hook for controlling the operation. This would require a
150general mechanism for composing hooks in the base framework.
151Additionally, LSM would still need global hooks for operations that
152have no process context (e.g. network input operations).
153Consequently, LSM provides global security hooks, but a security
154module is free to implement per-process hooks (where that makes sense)
155by storing a security_ops table in each process' security field and
156then invoking these per-process hooks from the global hooks.
157The problem of composition is thus deferred to the module.
158</para>
159
160<para>
161The global security_ops table is initialized to a set of hook
162functions provided by a dummy security module that provides
163traditional superuser logic. A <function>register_security</function>
164function (in <filename>security/security.c</filename>) is provided to
165allow a security module to set security_ops to refer to its own hook
166functions, and an <function>unregister_security</function> function is
167provided to revert security_ops to the dummy module hooks. This
168mechanism is used to set the primary security module, which is
169responsible for making the final decision for each hook.
170</para>
171
172<para>
173LSM also provides a simple mechanism for stacking additional security
174modules with the primary security module. It defines
175<function>register_security</function> and
176<function>unregister_security</function> hooks in the
177<structname>security_operations</structname> structure and provides
178<function>mod_reg_security</function> and
179<function>mod_unreg_security</function> functions that invoke these
180hooks after performing some sanity checking. A security module can
181call these functions in order to stack with other modules. However,
182the actual details of how this stacking is handled are deferred to the
183module, which can implement these hooks in any way it wishes
184(including always returning an error if it does not wish to support
185stacking). In this manner, LSM again defers the problem of
186composition to the module.
187</para>
188
189<para>
190Although the LSM hooks are organized into substructures based on
191kernel object, all of the hooks can be viewed as falling into two
192major categories: hooks that are used to manage the security fields
193and hooks that are used to perform access control. Examples of the
194first category of hooks include the
195<function>alloc_security</function> and
196<function>free_security</function> hooks defined for each kernel data
197structure that has a security field. These hooks are used to allocate
198and free security structures for kernel objects. The first category
199of hooks also includes hooks that set information in the security
200field after allocation, such as the <function>post_lookup</function>
201hook in <structname>struct inode_security_ops</structname>. This hook
202is used to set security information for inodes after successful lookup
203operations. An example of the second category of hooks is the
204<function>permission</function> hook in
205<structname>struct inode_security_ops</structname>. This hook checks
206permission when accessing an inode.
207</para>
208
209</sect1>
210
211<sect1 id="cap"><title>LSM Capabilities Module</title>
212
213<para>
214The LSM kernel patch moves most of the existing POSIX.1e capabilities
215logic into an optional security module stored in the file
216<filename>security/capability.c</filename>. This change allows
217users who do not want to use capabilities to omit this code entirely
218from their kernel, instead using the dummy module for traditional
219superuser logic or any other module that they desire. This change
220also allows the developers of the capabilities logic to maintain and
221enhance their code more freely, without needing to integrate patches
222back into the base kernel.
223</para>
224
225<para>
226In addition to moving the capabilities logic, the LSM kernel patch
227could move the capability-related fields from the kernel data
228structures into the new security fields managed by the security
229modules. However, at present, the LSM kernel patch leaves the
230capability fields in the kernel data structures. In his original
231remarks, Linus suggested that this might be preferable so that other
232security modules can be easily stacked with the capabilities module
233without needing to chain multiple security structures on the security field.
234It also avoids imposing extra overhead on the capabilities module
235to manage the security fields. However, the LSM framework could
236certainly support such a move if it is determined to be desirable,
237with only a few additional changes described below.
238</para>
239
240<para>
241At present, the capabilities logic for computing process capabilities
242on <function>execve</function> and <function>set*uid</function>,
243checking capabilities for a particular process, saving and checking
244capabilities for netlink messages, and handling the
245<function>capget</function> and <function>capset</function> system
246calls have been moved into the capabilities module. There are still a
247few locations in the base kernel where capability-related fields are
248directly examined or modified, but the current version of the LSM
249patch does allow a security module to completely replace the
250assignment and testing of capabilities. These few locations would
251need to be changed if the capability-related fields were moved into
252the security field. The following is a list of known locations that
253still perform such direct examination or modification of
254capability-related fields:
255<itemizedlist>
256<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem>
257<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem>
258<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem>
259<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem>
260</itemizedlist>
261</para>
262
263</sect1>
264
265</article>
diff --git a/Documentation/DocBook/man/Makefile b/Documentation/DocBook/man/Makefile
new file mode 100644
index 000000000000..4fb7ea0f7ac8
--- /dev/null
+++ b/Documentation/DocBook/man/Makefile
@@ -0,0 +1,3 @@
1# Rules are put in Documentation/DocBook
2
3clean-files := *.9.gz *.sgml manpage.links manpage.refs
diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl
new file mode 100644
index 000000000000..4367f4642f3d
--- /dev/null
+++ b/Documentation/DocBook/mcabook.tmpl
@@ -0,0 +1,107 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="MCAGuide">
6 <bookinfo>
7 <title>MCA Driver Programming Interface</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Alan</firstname>
12 <surname>Cox</surname>
13 <affiliation>
14 <address>
15 <email>alan@redhat.com</email>
16 </address>
17 </affiliation>
18 </author>
19 <author>
20 <firstname>David</firstname>
21 <surname>Weinehall</surname>
22 </author>
23 <author>
24 <firstname>Chris</firstname>
25 <surname>Beauregard</surname>
26 </author>
27 </authorgroup>
28
29 <copyright>
30 <year>2000</year>
31 <holder>Alan Cox</holder>
32 <holder>David Weinehall</holder>
33 <holder>Chris Beauregard</holder>
34 </copyright>
35
36 <legalnotice>
37 <para>
38 This documentation is free software; you can redistribute
39 it and/or modify it under the terms of the GNU General Public
40 License as published by the Free Software Foundation; either
41 version 2 of the License, or (at your option) any later
42 version.
43 </para>
44
45 <para>
46 This program is distributed in the hope that it will be
47 useful, but WITHOUT ANY WARRANTY; without even the implied
48 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
49 See the GNU General Public License for more details.
50 </para>
51
52 <para>
53 You should have received a copy of the GNU General Public
54 License along with this program; if not, write to the Free
55 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
56 MA 02111-1307 USA
57 </para>
58
59 <para>
60 For more details see the file COPYING in the source
61 distribution of Linux.
62 </para>
63 </legalnotice>
64 </bookinfo>
65
66<toc></toc>
67
68 <chapter id="intro">
69 <title>Introduction</title>
70 <para>
71 The MCA bus functions provide a generalised interface to find MCA
72 bus cards, to claim them for a driver, and to read and manipulate POS
73 registers without being aware of the motherboard internals or
74 certain deep magic specific to onboard devices.
75 </para>
76 <para>
77 The basic interface to the MCA bus devices is the slot. Each slot
78 is numbered and virtual slot numbers are assigned to the internal
79 devices. Using a pci_dev as other busses do does not really make
80 sense in the MCA context as the MCA bus resources require card
81 specific interpretation.
82 </para>
83 <para>
84 Finally the MCA bus functions provide a parallel set of DMA
85 functions mimicking the ISA bus DMA functions as closely as possible,
86 although also supporting the additional DMA functionality on the
87 MCA bus controllers.
88 </para>
89 </chapter>
90 <chapter id="bugs">
91 <title>Known Bugs And Assumptions</title>
92 <para>
93 None.
94 </para>
95 </chapter>
96
97 <chapter id="pubfunctions">
98 <title>Public Functions Provided</title>
99!Earch/i386/kernel/mca.c
100 </chapter>
101
102 <chapter id="dmafunctions">
103 <title>DMA Functions Provided</title>
104!Iinclude/asm-i386/mca_dma.h
105 </chapter>
106
107</book>
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
new file mode 100644
index 000000000000..6e463d0db266
--- /dev/null
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -0,0 +1,1320 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="MTD-NAND-Guide">
6 <bookinfo>
7 <title>MTD NAND Driver Programming Interface</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Thomas</firstname>
12 <surname>Gleixner</surname>
13 <affiliation>
14 <address>
15 <email>tglx@linutronix.de</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2004</year>
23 <holder>Thomas Gleixner</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License version 2 as published by the Free Software Foundation.
31 </para>
32
33 <para>
34 This program is distributed in the hope that it will be
35 useful, but WITHOUT ANY WARRANTY; without even the implied
36 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37 See the GNU General Public License for more details.
38 </para>
39
40 <para>
41 You should have received a copy of the GNU General Public
42 License along with this program; if not, write to the Free
43 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
44 MA 02111-1307 USA
45 </para>
46
47 <para>
48 For more details see the file COPYING in the source
49 distribution of Linux.
50 </para>
51 </legalnotice>
52 </bookinfo>
53
54<toc></toc>
55
56 <chapter id="intro">
57 <title>Introduction</title>
58 <para>
59 The generic NAND driver supports almost all NAND and AG-AND based
60 chips and connects them to the Memory Technology Devices (MTD)
61 subsystem of the Linux Kernel.
62 </para>
63 <para>
64 This documentation is provided for developers who want to implement
65 board drivers or filesystem drivers suitable for NAND devices.
66 </para>
67 </chapter>
68
69 <chapter id="bugs">
70 <title>Known Bugs And Assumptions</title>
71 <para>
72 None.
73 </para>
74 </chapter>
75
76 <chapter id="dochints">
77 <title>Documentation hints</title>
78 <para>
79 The function and structure docs are autogenerated. Each function and
80 struct member has a short description which is marked with an [XXX] identifier.
81 The following chapters explain the meaning of those identifiers.
82 </para>
83 <sect1>
84 <title>Function identifiers [XXX]</title>
85 <para>
86 The functions are marked with [XXX] identifiers in the short
87 comment. The identifiers explain the usage and scope of the
88 functions. Following identifiers are used:
89 </para>
90 <itemizedlist>
91 <listitem><para>
92 [MTD Interface]</para><para>
93 These functions provide the interface to the MTD kernel API.
94 They are not replaceable and provide functionality
95 which is completely hardware independent.
96 </para></listitem>
97 <listitem><para>
98 [NAND Interface]</para><para>
99 These functions are exported and provide the interface to the NAND kernel API.
100 </para></listitem>
101 <listitem><para>
102 [GENERIC]</para><para>
103 Generic functions are not replaceable and provide functionality
104 which is completely hardware independent.
105 </para></listitem>
106 <listitem><para>
107 [DEFAULT]</para><para>
108 Default functions provide hardware related functionality which is suitable
109 for most of the implementations. These functions can be replaced by the
110 board driver if necessary. Those functions are called via pointers in the
111 NAND chip description structure. The board driver can set the functions which
112 should be replaced by board dependent functions before calling nand_scan().
113 If the function pointer is NULL on entry to nand_scan() then the pointer
114 is set to the default function which is suitable for the detected chip type.
115 </para></listitem>
116 </itemizedlist>
117 </sect1>
118 <sect1>
119 <title>Struct member identifiers [XXX]</title>
120 <para>
121 The struct members are marked with [XXX] identifiers in the
122 comment. The identifiers explain the usage and scope of the
123 members. Following identifiers are used:
124 </para>
125 <itemizedlist>
126 <listitem><para>
127 [INTERN]</para><para>
128 These members are for NAND driver internal use only and must not be
129 modified. Most of these values are calculated from the chip geometry
130 information which is evaluated during nand_scan().
131 </para></listitem>
132 <listitem><para>
133 [REPLACEABLE]</para><para>
134 Replaceable members hold hardware related functions which can be
135 provided by the board driver. The board driver can set the functions which
136 should be replaced by board dependent functions before calling nand_scan().
137 If the function pointer is NULL on entry to nand_scan() then the pointer
138 is set to the default function which is suitable for the detected chip type.
139 </para></listitem>
140 <listitem><para>
141 [BOARDSPECIFIC]</para><para>
142 Board specific members hold hardware related information which must
143 be provided by the board driver. The board driver must set the function
144 pointers and datafields before calling nand_scan().
145 </para></listitem>
146 <listitem><para>
147 [OPTIONAL]</para><para>
148 Optional members can hold information relevant for the board driver. The
149 generic NAND driver code does not use this information.
150 </para></listitem>
151 </itemizedlist>
152 </sect1>
153 </chapter>
154
155 <chapter id="basicboarddriver">
156 <title>Basic board driver</title>
157 <para>
158 For most boards it will be sufficient to provide just the
159 basic functions and fill out some really board dependent
160 members in the nand chip description structure.
161 See drivers/mtd/nand/skeleton for reference.
162 </para>
163 <sect1>
164 <title>Basic defines</title>
165 <para>
166 At least you have to provide a mtd structure and
167 a storage for the ioremap'ed chip address.
168 You can allocate the mtd structure using kmalloc
169 or you can allocate it statically.
170 In case of static allocation you have to allocate
171 a nand_chip structure too.
172 </para>
173 <para>
174 Kmalloc based example
175 </para>
176 <programlisting>
177static struct mtd_info *board_mtd;
178static unsigned long baseaddr;
179 </programlisting>
180 <para>
181 Static example
182 </para>
183 <programlisting>
184static struct mtd_info board_mtd;
185static struct nand_chip board_chip;
186static unsigned long baseaddr;
187 </programlisting>
188 </sect1>
189 <sect1>
190 <title>Partition defines</title>
191 <para>
192 If you want to divide your device into partitions, then
193 enable the configuration switch CONFIG_MTD_PARTITIONS and define
194 a partitioning scheme suitable to your board.
195 </para>
196 <programlisting>
197#define NUM_PARTITIONS 2
198static struct mtd_partition partition_info[] = {
199 { .name = "Flash partition 1",
200 .offset = 0,
201 .size = 8 * 1024 * 1024 },
202 { .name = "Flash partition 2",
203 .offset = MTDPART_OFS_NEXT,
204 .size = MTDPART_SIZ_FULL },
205};
206 </programlisting>
207 </sect1>
208 <sect1>
209 <title>Hardware control function</title>
210 <para>
211 The hardware control function provides access to the
212 control pins of the NAND chip(s).
213 The access can be done by GPIO pins or by address lines.
214 If you use address lines, make sure that the timing
215 requirements are met.
216 </para>
217 <para>
218 <emphasis>GPIO based example</emphasis>
219 </para>
220 <programlisting>
221static void board_hwcontrol(struct mtd_info *mtd, int cmd)
222{
223 switch(cmd){
224 case NAND_CTL_SETCLE: /* Set CLE pin high */ break;
225 case NAND_CTL_CLRCLE: /* Set CLE pin low */ break;
226 case NAND_CTL_SETALE: /* Set ALE pin high */ break;
227 case NAND_CTL_CLRALE: /* Set ALE pin low */ break;
228 case NAND_CTL_SETNCE: /* Set nCE pin low */ break;
229 case NAND_CTL_CLRNCE: /* Set nCE pin high */ break;
230 }
231}
232 </programlisting>
233 <para>
234 <emphasis>Address lines based example.</emphasis> It's assumed that the
235 nCE pin is driven by a chip select decoder.
236 </para>
237 <programlisting>
238static void board_hwcontrol(struct mtd_info *mtd, int cmd)
239{
240 struct nand_chip *this = (struct nand_chip *) mtd->priv;
241 switch(cmd){
242 case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break;
243 case NAND_CTL_CLRCLE: this->IO_ADDR_W &amp;= ~CLE_ADRR_BIT; break;
244 case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break;
245 case NAND_CTL_CLRALE: this->IO_ADDR_W &amp;= ~ALE_ADRR_BIT; break;
246 }
247}
248 </programlisting>
249 </sect1>
250 <sect1>
251 <title>Device ready function</title>
252 <para>
253 If the hardware interface has the ready busy pin of the NAND chip connected to a
254 GPIO or other accessible I/O pin, this function is used to read back the state of the
255 pin. The function has no arguments and should return 0, if the device is busy (R/B pin
256 is low) and 1, if the device is ready (R/B pin is high).
257 If the hardware interface does not give access to the ready busy pin, then
258 the function must not be defined and the function pointer this->dev_ready is set to NULL.
259 </para>
260 </sect1>
261 <sect1>
262 <title>Init function</title>
263 <para>
264 The init function allocates memory and sets up all the board
265 specific parameters and function pointers. When everything
266 is set up nand_scan() is called. This function tries to
267 detect and identify the chip. If a chip is found all the
268 internal data fields are initialized accordingly.
269 The structure(s) have to be zeroed out first and then filled with the necessary
270 information about the device.
271 </para>
272 <programlisting>
273int __init board_init (void)
274{
275 struct nand_chip *this;
276 int err = 0;
277
278 /* Allocate memory for MTD device structure and private data */
279 board_mtd = kmalloc (sizeof(struct mtd_info) + sizeof (struct nand_chip), GFP_KERNEL);
280 if (!board_mtd) {
281 printk ("Unable to allocate NAND MTD device structure.\n");
282 err = -ENOMEM;
283 goto out;
284 }
285
286 /* Initialize structures */
287 memset ((char *) board_mtd, 0, sizeof(struct mtd_info) + sizeof(struct nand_chip));
288
289 /* map physical address */
290 baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
291 if(!baseaddr){
292 printk("Ioremap to access NAND chip failed\n");
293 err = -EIO;
294 goto out_mtd;
295 }
296
297 /* Get pointer to private data */
298 this = (struct nand_chip *) (&amp;board_mtd[1]);
299 /* Link the private data with the MTD structure */
300 board_mtd->priv = this;
301
302 /* Set address of NAND IO lines */
303 this->IO_ADDR_R = baseaddr;
304 this->IO_ADDR_W = baseaddr;
305 /* Reference hardware control function */
306 this->hwcontrol = board_hwcontrol;
307 /* Set command delay time, see datasheet for correct value */
308 this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY;
309 /* Assign the device ready function, if available */
310 this->dev_ready = board_dev_ready;
311 this->eccmode = NAND_ECC_SOFT;
312
313 /* Scan to find existence of the device */
314 if (nand_scan (board_mtd, 1)) {
315 err = -ENXIO;
316 goto out_ior;
317 }
318
319 add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS);
320 goto out;
321
322out_ior:
323 iounmap((void *)baseaddr);
324out_mtd:
325 kfree (board_mtd);
326out:
327 return err;
328}
329module_init(board_init);
330 </programlisting>
331 </sect1>
332 <sect1>
333 <title>Exit function</title>
334 <para>
335 The exit function is only necessary if the driver is
336 compiled as a module. It releases all resources which
337 are held by the chip driver and unregisters the partitions
338 in the MTD layer.
339 </para>
340 <programlisting>
341#ifdef MODULE
342static void __exit board_cleanup (void)
343{
344 /* Release resources, unregister device */
345 nand_release (board_mtd);
346
347 /* unmap physical address */
348 iounmap((void *)baseaddr);
349
350 /* Free the MTD device structure */
351 kfree (board_mtd);
352}
353module_exit(board_cleanup);
354#endif
355 </programlisting>
356 </sect1>
357 </chapter>
358
359 <chapter id="boarddriversadvanced">
360 <title>Advanced board driver functions</title>
361 <para>
362 This chapter describes the advanced functionality of the NAND
363 driver. For a list of functions which can be overridden by the board
364 driver see the documentation of the nand_chip structure.
365 </para>
366 <sect1>
367 <title>Multiple chip control</title>
368 <para>
369 The nand driver can control chip arrays. Therefore the
370 board driver must provide its own select_chip function. This
371 function must (de)select the requested chip.
372 The function pointer in the nand_chip structure must
373 be set before calling nand_scan(). The maxchip parameter
374 of nand_scan() defines the maximum number of chips to
375 scan for. Make sure that the select_chip function can
376 handle the requested number of chips.
377 </para>
378 <para>
379 The nand driver concatenates the chips to one virtual
380 chip and provides this virtual chip to the MTD layer.
381 </para>
382 <para>
383 <emphasis>Note: The driver can only handle linear chip arrays
384 of equally sized chips. There is no support for
385 parallel arrays which extend the buswidth.</emphasis>
386 </para>
387 <para>
388 <emphasis>GPIO based example</emphasis>
389 </para>
390 <programlisting>
391static void board_select_chip (struct mtd_info *mtd, int chip)
392{
393 /* Deselect all chips, set all nCE pins high */
394 GPIO(BOARD_NAND_NCE) |= 0xff;
395 if (chip >= 0)
396 GPIO(BOARD_NAND_NCE) &amp;= ~ (1 &lt;&lt; chip);
397}
398 </programlisting>
399 <para>
400 <emphasis>Address lines based example.</emphasis>
401 It's assumed that the nCE pins are connected to an
402 address decoder.
403 </para>
404 <programlisting>
405static void board_select_chip (struct mtd_info *mtd, int chip)
406{
407 struct nand_chip *this = (struct nand_chip *) mtd->priv;
408
409 /* Deselect all chips */
410 this->IO_ADDR_R &amp;= ~BOARD_NAND_ADDR_MASK;
411 this->IO_ADDR_W &amp;= ~BOARD_NAND_ADDR_MASK;
412 switch (chip) {
413 case 0:
414 this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0;
415 this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0;
416 break;
417 ....
418 case n:
419 this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn;
420 this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn;
421 break;
422 }
423}
424 </programlisting>
425 </sect1>
426 <sect1>
427 <title>Hardware ECC support</title>
428 <sect2>
429 <title>Functions and constants</title>
430 <para>
431 The nand driver supports four different types of
432 hardware ECC.
433 <itemizedlist>
434 <listitem><para>NAND_ECC_HW3_256</para><para>
435 Hardware ECC generator providing 3 bytes ECC per
436 256 byte.
437 </para> </listitem>
438 <listitem><para>NAND_ECC_HW3_512</para><para>
439 Hardware ECC generator providing 3 bytes ECC per
440 512 byte.
441 </para> </listitem>
442 <listitem><para>NAND_ECC_HW6_512</para><para>
443 Hardware ECC generator providing 6 bytes ECC per
444 512 byte.
445 </para> </listitem>
446 <listitem><para>NAND_ECC_HW8_512</para><para>
447 Hardware ECC generator providing 8 bytes ECC per
448 512 byte.
449 </para> </listitem>
450 </itemizedlist>
451 If your hardware generator has a different functionality
452 add it at the appropriate place in nand_base.c
453 </para>
454 <para>
455 The board driver must provide following functions:
456 <itemizedlist>
457 <listitem><para>enable_hwecc</para><para>
458 This function is called before reading / writing to
459 the chip. Reset or initialize the hardware generator
460 in this function. The function is called with an
461 argument which lets you distinguish between read
462 and write operations.
463 </para> </listitem>
464 <listitem><para>calculate_ecc</para><para>
465 This function is called after read / write from / to
466 the chip. Transfer the ECC from the hardware to
467 the buffer. If the option NAND_HWECC_SYNDROME is set
468 then the function is only called on write. See below.
469 </para> </listitem>
470 <listitem><para>correct_data</para><para>
471 In case of an ECC error this function is called for
472 error detection and correction. Return 1 respectively 2
473 in case the error can be corrected. If the error is
474 not correctable return -1. If your hardware generator
475 matches the default algorithm of the nand_ecc software
476 generator then use the correction function provided
477 by nand_ecc instead of implementing duplicated code.
478 </para> </listitem>
479 </itemizedlist>
480 </para>
481 </sect2>
482 <sect2>
483 <title>Hardware ECC with syndrome calculation</title>
484 <para>
485 Many hardware ECC implementations provide Reed-Solomon
486 codes and calculate an error syndrome on read. The syndrome
487 must be converted to a standard Reed-Solomon syndrome
488 before calling the error correction code in the generic
489 Reed-Solomon library.
490 </para>
491 <para>
492 The ECC bytes must be placed immediately after the data
493 bytes in order to make the syndrome generator work. This
494 is contrary to the usual layout used by software ECC. The
495 separation of data and out of band area is no longer
496 possible. The nand driver code handles this layout and
497 the remaining free bytes in the oob area are managed by
498 the autoplacement code. Provide a matching oob-layout
499 in this case. See rts_from4.c and diskonchip.c for
500 implementation reference. In those cases we must also
501 use bad block tables on FLASH, because the ECC layout is
502 interfering with the bad block marker positions.
503 See bad block table support for details.
504 </para>
505 </sect2>
506 </sect1>
507 <sect1>
508 <title>Bad block table support</title>
509 <para>
510 Most NAND chips mark the bad blocks at a defined
511 position in the spare area. Those blocks must
512 not be erased under any circumstances as the bad
513 block information would be lost.
514 It is possible to check the bad block mark each
515 time when the blocks are accessed by reading the
516 spare area of the first page in the block. This
517 is time consuming so a bad block table is used.
518 </para>
519 <para>
520 The nand driver supports various types of bad block
521 tables.
522 <itemizedlist>
523 <listitem><para>Per device</para><para>
524 The bad block table contains all bad block information
525 of the device which can consist of multiple chips.
526 </para> </listitem>
527 <listitem><para>Per chip</para><para>
528 A bad block table is used per chip and contains the
529 bad block information for this particular chip.
530 </para> </listitem>
531 <listitem><para>Fixed offset</para><para>
532 The bad block table is located at a fixed offset
533 in the chip (device). This applies to various
534 DiskOnChip devices.
535 </para> </listitem>
536 <listitem><para>Automatic placed</para><para>
537 The bad block table is automatically placed and
538 detected either at the end or at the beginning
539 of a chip (device)
540 </para> </listitem>
541 <listitem><para>Mirrored tables</para><para>
542 The bad block table is mirrored on the chip (device) to
543 allow updates of the bad block table without data loss.
544 </para> </listitem>
545 </itemizedlist>
546 </para>
547 <para>
548 nand_scan() calls the function nand_default_bbt().
549 nand_default_bbt() selects appropriate default
550 bad block table descriptors depending on the chip information
551 which was retrieved by nand_scan().
552 </para>
553 <para>
554 The standard policy is scanning the device for bad
555 blocks and build a ram based bad block table which
556 allows faster access than always checking the
557 bad block information on the flash chip itself.
558 </para>
559 <sect2>
560 <title>Flash based tables</title>
561 <para>
562 It may be desired or necessary to keep a bad block table in FLASH.
563 For AG-AND chips this is mandatory, as they have no factory marked
564 bad blocks. They have factory marked good blocks. The marker pattern
565 is erased when the block is erased to be reused. So in case of
566 powerloss before writing the pattern back to the chip this block
567 would be lost and added to the bad blocks. Therefore we scan the
568 chip(s) when we detect them the first time for good blocks and
569 store this information in a bad block table before erasing any
570 of the blocks.
571 </para>
572 <para>
573 The blocks in which the tables are stored are protected against
574 accidental access by marking them bad in the memory bad block
575 table. The bad block table management functions are allowed
576 to circumvent this protection.
577 </para>
578 <para>
579 The simplest way to activate the FLASH based bad block table support
580 is to set the option NAND_USE_FLASH_BBT in the option field of
581 the nand chip structure before calling nand_scan(). For AG-AND
582 chips this is done by default.
583 This activates the default FLASH based bad block table functionality
584 of the NAND driver. The default bad block table options are
585 <itemizedlist>
586 <listitem><para>Store bad block table per chip</para></listitem>
587 <listitem><para>Use 2 bits per block</para></listitem>
588 <listitem><para>Automatic placement at the end of the chip</para></listitem>
589 <listitem><para>Use mirrored tables with version numbers</para></listitem>
590 <listitem><para>Reserve 4 blocks at the end of the chip</para></listitem>
591 </itemizedlist>
592 </para>
593 </sect2>
594 <sect2>
595 <title>User defined tables</title>
596 <para>
597 User defined tables are created by filling out a
598 nand_bbt_descr structure and storing the pointer in the
599 nand_chip structure member bbt_td before calling nand_scan().
600 If a mirror table is necessary a second structure must be
601 created and a pointer to this structure must be stored
602 in bbt_md inside the nand_chip structure. If the bbt_md
603 member is set to NULL then only the main table is used
604 and no scan for the mirrored table is performed.
605 </para>
606 <para>
607 The most important field in the nand_bbt_descr structure
608 is the options field. The options define most of the
609 table properties. Use the predefined constants from
610 nand.h to define the options.
611 <itemizedlist>
612 <listitem><para>Number of bits per block</para>
613 <para>The supported number of bits is 1, 2, 4, 8.</para></listitem>
614 <listitem><para>Table per chip</para>
615 <para>Setting the constant NAND_BBT_PERCHIP selects that
616 a bad block table is managed for each chip in a chip array.
617 If this option is not set then a per device bad block table
618 is used.</para></listitem>
619 <listitem><para>Table location is absolute</para>
620 <para>Use the option constant NAND_BBT_ABSPAGE and
621 define the absolute page number where the bad block
622 table starts in the field pages. If you have selected bad block
623 tables per chip and you have a multi chip array then the start page
624 must be given for each chip in the chip array. Note: there is no scan
625 for a table ident pattern performed, so the fields
626 pattern, veroffs, offs, len can be left uninitialized</para></listitem>
627 <listitem><para>Table location is automatically detected</para>
628 <para>The table can either be located in the first or the last good
629 blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place
630 the bad block table at the end of the chip (device). The
631 bad block tables are marked and identified by a pattern which
632 is stored in the spare area of the first page in the block which
633 holds the bad block table. Store a pointer to the pattern
634 in the pattern field. Further the length of the pattern has to be
635 stored in len and the offset in the spare area must be given
636 in the offs member of the nand_bbt_descr structure. For mirrored
637 bad block tables different patterns are mandatory.</para></listitem>
638 <listitem><para>Table creation</para>
639 <para>Set the option NAND_BBT_CREATE to enable the table creation
640 if no table can be found during the scan. Usually this is done only
641 once if a new chip is found. </para></listitem>
642 <listitem><para>Table write support</para>
643 <para>Set the option NAND_BBT_WRITE to enable the table write support.
644 This allows the update of the bad block table(s) in case a block has
645 to be marked bad due to wear. The MTD interface function block_markbad
646 is calling the update function of the bad block table. If the write
647 support is enabled then the table is updated on FLASH.</para>
648 <para>
649 Note: Write support should only be enabled for mirrored tables with
650 version control.
651 </para></listitem>
652 <listitem><para>Table version control</para>
653 <para>Set the option NAND_BBT_VERSION to enable the table version control.
654 It's highly recommended to enable this for mirrored tables with write
655 support. It makes sure that the risk of losing the bad block
656 table information is reduced to the loss of the information about the
657 one worn out block which should be marked bad. The version is stored in
658 4 consecutive bytes in the spare area of the device. The position of
659 the version number is defined by the member veroffs in the bad block table
660 descriptor.</para></listitem>
661 <listitem><para>Save block contents on write</para>
662 <para>
663 In case that the block which holds the bad block table does contain
664 other useful information, set the option NAND_BBT_SAVECONTENT. When
665 the bad block table is written then the whole block is read, the bad
666 block table is updated, and the block is erased and everything is
667 written back. If this option is not set only the bad block table
668 is written and everything else in the block is ignored and erased.
669 </para></listitem>
670 <listitem><para>Number of reserved blocks</para>
671 <para>
672 For automatic placement some blocks must be reserved for
673 bad block table storage. The number of reserved blocks is defined
674 in the maxblocks member of the bad block table description structure.
675 Reserving 4 blocks for mirrored tables should be a reasonable number.
676 This also limits the number of blocks which are scanned for the bad
677 block table ident pattern.
678 </para></listitem>
679 </itemizedlist>
680 </para>
681 </sect2>
682 </sect1>
683 <sect1>
684 <title>Spare area (auto)placement</title>
685 <para>
686 The nand driver implements different possibilities for
687 placement of filesystem data in the spare area,
688 <itemizedlist>
689 <listitem><para>Placement defined by fs driver</para></listitem>
690 <listitem><para>Automatic placement</para></listitem>
691 </itemizedlist>
692 The default placement function is automatic placement. The
693 nand driver has built in default placement schemes for the
694 various chiptypes. If due to hardware ECC functionality the
695 default placement does not fit then the board driver can
696 provide its own placement scheme.
697 </para>
698 <para>
699 File system drivers can provide their own placement scheme which
700 is used instead of the default placement scheme.
701 </para>
702 <para>
703 Placement schemes are defined by a nand_oobinfo structure
704 <programlisting>
705struct nand_oobinfo {
706 int useecc;
707 int eccbytes;
708 int eccpos[24];
709 int oobfree[8][2];
710};
711 </programlisting>
712 <itemizedlist>
713 <listitem><para>useecc</para><para>
714 The useecc member controls the ecc and placement function. The header
715 file include/mtd/mtd-abi.h contains constants to select ecc and
716 placement. MTD_NANDECC_OFF switches off the ecc completely. This is
717 not recommended and available for testing and diagnosis only.
718 MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE
719 selects automatic placement.
720 </para></listitem>
721 <listitem><para>eccbytes</para><para>
722 The eccbytes member defines the number of ecc bytes per page.
723 </para></listitem>
724 <listitem><para>eccpos</para><para>
725 The eccpos array holds the byte offsets in the spare area where
726 the ecc codes are placed.
727 </para></listitem>
728 <listitem><para>oobfree</para><para>
729 The oobfree array defines the areas in the spare area which can be
730 used for automatic placement. The information is given in the format
731 {offset, size}. offset defines the start of the usable area, size the
732 length in bytes. More than one area can be defined. The list is terminated
733 by a {0, 0} entry.
734 </para></listitem>
735 </itemizedlist>
736 </para>
737 <sect2>
738 <title>Placement defined by fs driver</title>
739 <para>
740 The calling function provides a pointer to a nand_oobinfo
741 structure which defines the ecc placement. For writes the
742 caller must provide a spare area buffer along with the
743 data buffer. The spare area buffer size is (number of pages) *
744 (size of spare area). For reads the buffer size is
745 (number of pages) * ((size of spare area) + (number of ecc
746 steps per page) * sizeof (int)). The driver stores the
747 result of the ecc check for each tuple in the spare buffer.
748 The storage sequence is
749 </para>
750 <para>
751 &lt;spare data page 0&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
752 </para>
753 <para>
754 ...
755 </para>
756 <para>
757 &lt;spare data page n&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
758 </para>
759 <para>
760 This is a legacy mode used by YAFFS1.
761 </para>
762 <para>
763 If the spare area buffer is NULL then only the ECC placement is
764 done according to the given scheme in the nand_oobinfo structure.
765 </para>
766 </sect2>
767 <sect2>
768 <title>Automatic placement</title>
769 <para>
770 Automatic placement uses the built in defaults to place the
771 ecc bytes in the spare area. If filesystem data have to be stored /
772 read into the spare area then the calling function must provide a
773 buffer. The buffer size per page is determined by the oobfree array in
774 the nand_oobinfo structure.
775 </para>
776 <para>
777 If the spare area buffer is NULL then only the ECC placement is
778 done according to the default builtin scheme.
779 </para>
780 </sect2>
781 <sect2>
782 <title>User space placement selection</title>
783 <para>
784 All non ecc functions like mtd->read and mtd->write use an internal
785 structure, which can be set by an ioctl. This structure is preset
786 to the autoplacement default.
787 <programlisting>
788 ioctl (fd, MEMSETOOBSEL, oobsel);
789 </programlisting>
790 oobsel is a pointer to a user supplied structure of type
791 nand_oobinfo. The contents of this structure must match the
792 criteria of the filesystem, which will be used. See an example in utils/nandwrite.c.
793 </para>
794 </sect2>
795 </sect1>
796 <sect1>
797 <title>Spare area autoplacement default schemes</title>
798 <sect2>
799 <title>256 byte pagesize</title>
800<informaltable><tgroup cols="3"><tbody>
801<row>
802<entry>Offset</entry>
803<entry>Content</entry>
804<entry>Comment</entry>
805</row>
806<row>
807<entry>0x00</entry>
808<entry>ECC byte 0</entry>
809<entry>Error correction code byte 0</entry>
810</row>
811<row>
812<entry>0x01</entry>
813<entry>ECC byte 1</entry>
814<entry>Error correction code byte 1</entry>
815</row>
816<row>
817<entry>0x02</entry>
818<entry>ECC byte 2</entry>
819<entry>Error correction code byte 2</entry>
820</row>
821<row>
822<entry>0x03</entry>
823<entry>Autoplace 0</entry>
824<entry></entry>
825</row>
826<row>
827<entry>0x04</entry>
828<entry>Autoplace 1</entry>
829<entry></entry>
830</row>
831<row>
832<entry>0x05</entry>
833<entry>Bad block marker</entry>
834<entry>If any bit in this byte is zero, then this block is bad.
835This applies only to the first page in a block. In the remaining
836pages this byte is reserved</entry>
837</row>
838<row>
839<entry>0x06</entry>
840<entry>Autoplace 2</entry>
841<entry></entry>
842</row>
843<row>
844<entry>0x07</entry>
845<entry>Autoplace 3</entry>
846<entry></entry>
847</row>
848</tbody></tgroup></informaltable>
849 </sect2>
850 <sect2>
851 <title>512 byte pagesize</title>
852<informaltable><tgroup cols="3"><tbody>
853<row>
854<entry>Offset</entry>
855<entry>Content</entry>
856<entry>Comment</entry>
857</row>
858<row>
859<entry>0x00</entry>
860<entry>ECC byte 0</entry>
861<entry>Error correction code byte 0 of the lower 256 Byte data in
862this page</entry>
863</row>
864<row>
865<entry>0x01</entry>
866<entry>ECC byte 1</entry>
867<entry>Error correction code byte 1 of the lower 256 Bytes of data
868in this page</entry>
869</row>
870<row>
871<entry>0x02</entry>
872<entry>ECC byte 2</entry>
873<entry>Error correction code byte 2 of the lower 256 Bytes of data
874in this page</entry>
875</row>
876<row>
877<entry>0x03</entry>
878<entry>ECC byte 3</entry>
879<entry>Error correction code byte 0 of the upper 256 Bytes of data
880in this page</entry>
881</row>
882<row>
883<entry>0x04</entry>
884<entry>reserved</entry>
885<entry>reserved</entry>
886</row>
887<row>
888<entry>0x05</entry>
889<entry>Bad block marker</entry>
890<entry>If any bit in this byte is zero, then this block is bad.
891This applies only to the first page in a block. In the remaining
892pages this byte is reserved</entry>
893</row>
894<row>
895<entry>0x06</entry>
896<entry>ECC byte 4</entry>
897<entry>Error correction code byte 1 of the upper 256 Bytes of data
898in this page</entry>
899</row>
900<row>
901<entry>0x07</entry>
902<entry>ECC byte 5</entry>
903<entry>Error correction code byte 2 of the upper 256 Bytes of data
904in this page</entry>
905</row>
906<row>
907<entry>0x08 - 0x0F</entry>
908<entry>Autoplace 0 - 7</entry>
909<entry></entry>
910</row>
911</tbody></tgroup></informaltable>
912 </sect2>
913 <sect2>
914 <title>2048 byte pagesize</title>
915<informaltable><tgroup cols="3"><tbody>
916<row>
917<entry>Offset</entry>
918<entry>Content</entry>
919<entry>Comment</entry>
920</row>
921<row>
922<entry>0x00</entry>
923<entry>Bad block marker</entry>
924<entry>If any bit in this byte is zero, then this block is bad.
925This applies only to the first page in a block. In the remaining
926pages this byte is reserved</entry>
927</row>
928<row>
929<entry>0x01</entry>
930<entry>Reserved</entry>
931<entry>Reserved</entry>
932</row>
933<row>
934<entry>0x02-0x27</entry>
935<entry>Autoplace 0 - 37</entry>
936<entry></entry>
937</row>
938<row>
939<entry>0x28</entry>
940<entry>ECC byte 0</entry>
941<entry>Error correction code byte 0 of the first 256 Byte data in
942this page</entry>
943</row>
944<row>
945<entry>0x29</entry>
946<entry>ECC byte 1</entry>
947<entry>Error correction code byte 1 of the first 256 Bytes of data
948in this page</entry>
949</row>
950<row>
951<entry>0x2A</entry>
952<entry>ECC byte 2</entry>
953<entry>Error correction code byte 2 of the first 256 Bytes data in
954this page</entry>
955</row>
956<row>
957<entry>0x2B</entry>
958<entry>ECC byte 3</entry>
959<entry>Error correction code byte 0 of the second 256 Bytes of data
960in this page</entry>
961</row>
962<row>
963<entry>0x2C</entry>
964<entry>ECC byte 4</entry>
965<entry>Error correction code byte 1 of the second 256 Bytes of data
966in this page</entry>
967</row>
968<row>
969<entry>0x2D</entry>
970<entry>ECC byte 5</entry>
971<entry>Error correction code byte 2 of the second 256 Bytes of data
972in this page</entry>
973</row>
974<row>
975<entry>0x2E</entry>
976<entry>ECC byte 6</entry>
977<entry>Error correction code byte 0 of the third 256 Bytes of data
978in this page</entry>
979</row>
980<row>
981<entry>0x2F</entry>
982<entry>ECC byte 7</entry>
983<entry>Error correction code byte 1 of the third 256 Bytes of data
984in this page</entry>
985</row>
986<row>
987<entry>0x30</entry>
988<entry>ECC byte 8</entry>
989<entry>Error correction code byte 2 of the third 256 Bytes of data
990in this page</entry>
991</row>
992<row>
993<entry>0x31</entry>
994<entry>ECC byte 9</entry>
995<entry>Error correction code byte 0 of the fourth 256 Bytes of data
996in this page</entry>
997</row>
998<row>
999<entry>0x32</entry>
1000<entry>ECC byte 10</entry>
1001<entry>Error correction code byte 1 of the fourth 256 Bytes of data
1002in this page</entry>
1003</row>
1004<row>
1005<entry>0x33</entry>
1006<entry>ECC byte 11</entry>
1007<entry>Error correction code byte 2 of the fourth 256 Bytes of data
1008in this page</entry>
1009</row>
1010<row>
1011<entry>0x34</entry>
1012<entry>ECC byte 12</entry>
1013<entry>Error correction code byte 0 of the fifth 256 Bytes of data
1014in this page</entry>
1015</row>
1016<row>
1017<entry>0x35</entry>
1018<entry>ECC byte 13</entry>
1019<entry>Error correction code byte 1 of the fifth 256 Bytes of data
1020in this page</entry>
1021</row>
1022<row>
1023<entry>0x36</entry>
1024<entry>ECC byte 14</entry>
1025<entry>Error correction code byte 2 of the fifth 256 Bytes of data
1026in this page</entry>
1027</row>
1028<row>
1029<entry>0x37</entry>
1030<entry>ECC byte 15</entry>
1031<entry>Error correction code byte 0 of the sixth 256 Bytes of data
1032in this page</entry>
1033</row>
1034<row>
1035<entry>0x38</entry>
1036<entry>ECC byte 16</entry>
1037<entry>Error correction code byte 1 of the sixth 256 Bytes of data
1038in this page</entry>
1039</row>
1040<row>
1041<entry>0x39</entry>
1042<entry>ECC byte 17</entry>
1043<entry>Error correction code byte 2 of the sixth 256 Bytes of data
1044in this page</entry>
1045</row>
1046<row>
1047<entry>0x3A</entry>
1048<entry>ECC byte 18</entry>
1049<entry>Error correction code byte 0 of the seventh 256 Bytes of
1050data in this page</entry>
1051</row>
1052<row>
1053<entry>0x3B</entry>
1054<entry>ECC byte 19</entry>
1055<entry>Error correction code byte 1 of the seventh 256 Bytes of
1056data in this page</entry>
1057</row>
1058<row>
1059<entry>0x3C</entry>
1060<entry>ECC byte 20</entry>
1061<entry>Error correction code byte 2 of the seventh 256 Bytes of
1062data in this page</entry>
1063</row>
1064<row>
1065<entry>0x3D</entry>
1066<entry>ECC byte 21</entry>
1067<entry>Error correction code byte 0 of the eighth 256 Bytes of data
1068in this page</entry>
1069</row>
1070<row>
1071<entry>0x3E</entry>
1072<entry>ECC byte 22</entry>
1073<entry>Error correction code byte 1 of the eighth 256 Bytes of data
1074in this page</entry>
1075</row>
1076<row>
1077<entry>0x3F</entry>
1078<entry>ECC byte 23</entry>
1079<entry>Error correction code byte 2 of the eighth 256 Bytes of data
1080in this page</entry>
1081</row>
1082</tbody></tgroup></informaltable>
1083 </sect2>
1084 </sect1>
1085 </chapter>
1086
1087 <chapter id="filesystems">
1088 <title>Filesystem support</title>
1089 <para>
1090 The NAND driver provides all neccecary functions for a
1091 filesystem via the MTD interface.
1092 </para>
1093 <para>
1094 Filesystems must be aware of the NAND peculiarities and
1095 restrictions. One major restriction of NAND Flash is that you cannot
1096 write as often as you want to a page. The consecutive writes to a page,
1097 before erasing it again, are restricted to 1-3 writes, depending on the
1098 manufacturers specifications. This applies similar to the spare area.
1099 </para>
1100 <para>
1101 Therefore NAND aware filesystems must either write in page size chunks
1102 or hold a writebuffer to collect smaller writes until they sum up to
1103 pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
1104 </para>
1105 <para>
1106 The spare area usage to store filesystem data is controlled by
1107 the spare area placement functionality which is described in one
1108 of the earlier chapters.
1109 </para>
1110 </chapter>
1111 <chapter id="tools">
1112 <title>Tools</title>
1113 <para>
1114 The MTD project provides a couple of helpful tools to handle NAND Flash.
1115 <itemizedlist>
1116 <listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem>
1117 <listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem>
1118 <listitem><para>nanddump: dump the contents of NAND FLASH partitions</para></listitem>
1119 </itemizedlist>
1120 </para>
1121 <para>
1122 These tools are aware of the NAND restrictions. Please use those tools
1123 instead of complaining about errors which are caused by non NAND aware
1124 access methods.
1125 </para>
1126 </chapter>
1127
1128 <chapter id="defines">
1129 <title>Constants</title>
1130 <para>
1131 This chapter describes the constants which might be relevant for a driver developer.
1132 </para>
1133 <sect1>
1134 <title>Chip option constants</title>
1135 <sect2>
1136 <title>Constants for chip id table</title>
1137 <para>
1138 These constants are defined in nand.h. They are ored together to describe
1139 the chip functionality.
1140 <programlisting>
1141/* Chip can not auto increment pages */
1142#define NAND_NO_AUTOINCR 0x00000001
1143/* Buswidth is 16 bit */
1144#define NAND_BUSWIDTH_16 0x00000002
1145/* Device supports partial programming without padding */
1146#define NAND_NO_PADDING 0x00000004
1147/* Chip has cache program function */
1148#define NAND_CACHEPRG 0x00000008
1149/* Chip has copy back function */
1150#define NAND_COPYBACK 0x00000010
1151/* AND Chip which has 4 banks and a confusing page / block
1152 * assignment. See Renesas datasheet for further information */
1153#define NAND_IS_AND 0x00000020
1154/* Chip has an array of 4 pages which can be read without
1155 * additional ready /busy waits */
1156#define NAND_4PAGE_ARRAY 0x00000040
1157 </programlisting>
1158 </para>
1159 </sect2>
1160 <sect2>
1161 <title>Constants for runtime options</title>
1162 <para>
1163 These constants are defined in nand.h. They are ored together to describe
1164 the functionality.
1165 <programlisting>
1166/* Use a flash based bad block table. This option is parsed by the
1167 * default bad block table function (nand_default_bbt). */
1168#define NAND_USE_FLASH_BBT 0x00010000
1169/* The hw ecc generator provides a syndrome instead of an ecc value on read
1170 * This can only work if we have the ecc bytes directly behind the
1171 * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
1172#define NAND_HWECC_SYNDROME 0x00020000
1173 </programlisting>
1174 </para>
1175 </sect2>
1176 </sect1>
1177
1178 <sect1>
1179 <title>ECC selection constants</title>
1180 <para>
1181 Use these constants to select the ECC algorithm.
1182 <programlisting>
1183/* No ECC. Usage is not recommended ! */
1184#define NAND_ECC_NONE 0
1185/* Software ECC 3 byte ECC per 256 Byte data */
1186#define NAND_ECC_SOFT 1
1187/* Hardware ECC 3 byte ECC per 256 Byte data */
1188#define NAND_ECC_HW3_256 2
1189/* Hardware ECC 3 byte ECC per 512 Byte data */
1190#define NAND_ECC_HW3_512 3
1191/* Hardware ECC 6 byte ECC per 512 Byte data */
1192#define NAND_ECC_HW6_512 4
1193/* Hardware ECC 8 byte ECC per 512 Byte data */
1194#define NAND_ECC_HW8_512 6
1195 </programlisting>
1196 </para>
1197 </sect1>
1198
1199 <sect1>
1200 <title>Hardware control related constants</title>
1201 <para>
1202 These constants describe the requested hardware access function when
1203 the boardspecific hardware control function is called
1204 <programlisting>
1205/* Select the chip by setting nCE to low */
1206#define NAND_CTL_SETNCE 1
1207/* Deselect the chip by setting nCE to high */
1208#define NAND_CTL_CLRNCE 2
1209/* Select the command latch by setting CLE to high */
1210#define NAND_CTL_SETCLE 3
1211/* Deselect the command latch by setting CLE to low */
1212#define NAND_CTL_CLRCLE 4
1213/* Select the address latch by setting ALE to high */
1214#define NAND_CTL_SETALE 5
1215/* Deselect the address latch by setting ALE to low */
1216#define NAND_CTL_CLRALE 6
1217/* Set write protection by setting WP to high. Not used! */
1218#define NAND_CTL_SETWP 7
1219/* Clear write protection by setting WP to low. Not used! */
1220#define NAND_CTL_CLRWP 8
1221 </programlisting>
1222 </para>
1223 </sect1>
1224
1225 <sect1>
1226 <title>Bad block table related constants</title>
1227 <para>
1228 These constants describe the options used for bad block
1229 table descriptors.
1230 <programlisting>
1231/* Options for the bad block table descriptors */
1232
1233/* The number of bits used per block in the bbt on the device */
1234#define NAND_BBT_NRBITS_MSK 0x0000000F
1235#define NAND_BBT_1BIT 0x00000001
1236#define NAND_BBT_2BIT 0x00000002
1237#define NAND_BBT_4BIT 0x00000004
1238#define NAND_BBT_8BIT 0x00000008
1239/* The bad block table is in the last good block of the device */
1240#define NAND_BBT_LASTBLOCK 0x00000010
1241/* The bbt is at the given page, else we must scan for the bbt */
1242#define NAND_BBT_ABSPAGE 0x00000020
1243/* The bbt is at the given page, else we must scan for the bbt */
1244#define NAND_BBT_SEARCH 0x00000040
1245/* bbt is stored per chip on multichip devices */
1246#define NAND_BBT_PERCHIP 0x00000080
1247/* bbt has a version counter at offset veroffs */
1248#define NAND_BBT_VERSION 0x00000100
1249/* Create a bbt if none exists */
1250#define NAND_BBT_CREATE 0x00000200
1251/* Search good / bad pattern through all pages of a block */
1252#define NAND_BBT_SCANALLPAGES 0x00000400
1253/* Scan block empty during good / bad block scan */
1254#define NAND_BBT_SCANEMPTY 0x00000800
1255/* Write bbt if necessary */
1256#define NAND_BBT_WRITE 0x00001000
1257/* Read and write back block contents when writing bbt */
1258#define NAND_BBT_SAVECONTENT 0x00002000
1259 </programlisting>
1260 </para>
1261 </sect1>
1262
1263 </chapter>
1264
1265 <chapter id="structs">
1266 <title>Structures</title>
1267 <para>
1268 This chapter contains the autogenerated documentation of the structures which are
1269 used in the NAND driver and might be relevant for a driver developer. Each
1270 struct member has a short description which is marked with an [XXX] identifier.
1271 See the chapter "Documentation hints" for an explanation.
1272 </para>
1273!Iinclude/linux/mtd/nand.h
1274 </chapter>
1275
1276 <chapter id="pubfunctions">
1277 <title>Public Functions Provided</title>
1278 <para>
1279 This chapter contains the autogenerated documentation of the NAND kernel API functions
1280 which are exported. Each function has a short description which is marked with an [XXX] identifier.
1281 See the chapter "Documentation hints" for an explanation.
1282 </para>
1283!Edrivers/mtd/nand/nand_base.c
1284!Edrivers/mtd/nand/nand_bbt.c
1285!Edrivers/mtd/nand/nand_ecc.c
1286 </chapter>
1287
1288 <chapter id="intfunctions">
1289 <title>Internal Functions Provided</title>
1290 <para>
1291 This chapter contains the autogenerated documentation of the NAND driver internal functions.
1292 Each function has a short description which is marked with an [XXX] identifier.
1293 See the chapter "Documentation hints" for an explanation.
1294 The functions marked with [DEFAULT] might be relevant for a board driver developer.
1295 </para>
1296!Idrivers/mtd/nand/nand_base.c
1297!Idrivers/mtd/nand/nand_bbt.c
1298!Idrivers/mtd/nand/nand_ecc.c
1299 </chapter>
1300
1301 <chapter id="credits">
1302 <title>Credits</title>
1303 <para>
1304 The following people have contributed to the NAND driver:
1305 <orderedlist>
1306 <listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem>
1307 <listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem>
1308 <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
1309 </orderedlist>
1310 A lot of users have provided bugfixes, improvements and helping hands for testing.
1311 Thanks a lot.
1312 </para>
1313 <para>
1314 The following people have contributed to this document:
1315 <orderedlist>
1316 <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
1317 </orderedlist>
1318 </para>
1319 </chapter>
1320</book>
diff --git a/Documentation/DocBook/procfs-guide.tmpl b/Documentation/DocBook/procfs-guide.tmpl
new file mode 100644
index 000000000000..45cad23efefa
--- /dev/null
+++ b/Documentation/DocBook/procfs-guide.tmpl
@@ -0,0 +1,591 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
4<!ENTITY procfsexample SYSTEM "procfs_example.xml">
5]>
6
7<book id="LKProcfsGuide">
8 <bookinfo>
9 <title>Linux Kernel Procfs Guide</title>
10
11 <authorgroup>
12 <author>
13 <firstname>Erik</firstname>
14 <othername>(J.A.K.)</othername>
15 <surname>Mouw</surname>
16 <affiliation>
17 <orgname>Delft University of Technology</orgname>
18 <orgdiv>Faculty of Information Technology and Systems</orgdiv>
19 <address>
20 <email>J.A.K.Mouw@its.tudelft.nl</email>
21 <pob>PO BOX 5031</pob>
22 <postcode>2600 GA</postcode>
23 <city>Delft</city>
24 <country>The Netherlands</country>
25 </address>
26 </affiliation>
27 </author>
28 </authorgroup>
29
30 <revhistory>
31 <revision>
32 <revnumber>1.0&nbsp;</revnumber>
33 <date>May 30, 2001</date>
34 <revremark>Initial revision posted to linux-kernel</revremark>
35 </revision>
36 <revision>
37 <revnumber>1.1&nbsp;</revnumber>
38 <date>June 3, 2001</date>
39 <revremark>Revised after comments from linux-kernel</revremark>
40 </revision>
41 </revhistory>
42
43 <copyright>
44 <year>2001</year>
45 <holder>Erik Mouw</holder>
46 </copyright>
47
48
49 <legalnotice>
50 <para>
51 This documentation is free software; you can redistribute it
52 and/or modify it under the terms of the GNU General Public
53 License as published by the Free Software Foundation; either
54 version 2 of the License, or (at your option) any later
55 version.
56 </para>
57
58 <para>
59 This documentation is distributed in the hope that it will be
60 useful, but WITHOUT ANY WARRANTY; without even the implied
61 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
62 PURPOSE. See the GNU General Public License for more details.
63 </para>
64
65 <para>
66 You should have received a copy of the GNU General Public
67 License along with this program; if not, write to the Free
68 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
69 MA 02111-1307 USA
70 </para>
71
72 <para>
73 For more details see the file COPYING in the source
74 distribution of Linux.
75 </para>
76 </legalnotice>
77 </bookinfo>
78
79
80
81
82 <toc>
83 </toc>
84
85
86
87
88 <preface>
89 <title>Preface</title>
90
91 <para>
92 This guide describes the use of the procfs file system from
93 within the Linux kernel. The idea to write this guide came up on
94 the #kernelnewbies IRC channel (see <ulink
95 url="http://www.kernelnewbies.org/">http://www.kernelnewbies.org/</ulink>),
96 when Jeff Garzik explained the use of procfs and forwarded me a
97 message Alexander Viro wrote to the linux-kernel mailing list. I
98 agreed to write it up nicely, so here it is.
99 </para>
100
101 <para>
102 I'd like to thank Jeff Garzik
103 <email>jgarzik@pobox.com</email> and Alexander Viro
104 <email>viro@parcelfarce.linux.theplanet.co.uk</email> for their input,
105 Tim Waugh <email>twaugh@redhat.com</email> for his <ulink
106 url="http://people.redhat.com/twaugh/docbook/selfdocbook/">Selfdocbook</ulink>,
107 and Marc Joosen <email>marcj@historia.et.tudelft.nl</email> for
108 proofreading.
109 </para>
110
111 <para>
112 This documentation was written while working on the LART
113 computing board (<ulink
114 url="http://www.lart.tudelft.nl/">http://www.lart.tudelft.nl/</ulink>),
115 which is sponsored by the Mobile Multi-media Communications
116 (<ulink
117 url="http://www.mmc.tudelft.nl/">http://www.mmc.tudelft.nl/</ulink>)
118 and Ubiquitous Communications (<ulink
119 url="http://www.ubicom.tudelft.nl/">http://www.ubicom.tudelft.nl/</ulink>)
120 projects.
121 </para>
122
123 <para>
124 Erik
125 </para>
126 </preface>
127
128
129
130
131 <chapter id="intro">
132 <title>Introduction</title>
133
134 <para>
135 The <filename class="directory">/proc</filename> file system
136 (procfs) is a special file system in the linux kernel. It's a
137 virtual file system: it is not associated with a block device
138 but exists only in memory. The files in the procfs are there to
139 allow userland programs access to certain information from the
140 kernel (like process information in <filename
141 class="directory">/proc/[0-9]+/</filename>), but also for debug
142 purposes (like <filename>/proc/ksyms</filename>).
143 </para>
144
145 <para>
146 This guide describes the use of the procfs file system from
147 within the Linux kernel. It starts by introducing all relevant
148 functions to manage the files within the file system. After that
149 it shows how to communicate with userland, and some tips and
150 tricks will be pointed out. Finally a complete example will be
151 shown.
152 </para>
153
154 <para>
155 Note that the files in <filename
156 class="directory">/proc/sys</filename> are sysctl files: they
157 don't belong to procfs and are governed by a completely
158 different API described in the Kernel API book.
159 </para>
160 </chapter>
161
162
163
164
165 <chapter id="managing">
166 <title>Managing procfs entries</title>
167
168 <para>
169 This chapter describes the functions that various kernel
170 components use to populate the procfs with files, symlinks,
171 device nodes, and directories.
172 </para>
173
174 <para>
175 A minor note before we start: if you want to use any of the
176 procfs functions, be sure to include the correct header file!
177 This should be one of the first lines in your code:
178 </para>
179
180 <programlisting>
181#include &lt;linux/proc_fs.h&gt;
182 </programlisting>
183
184
185
186
187 <sect1 id="regularfile">
188 <title>Creating a regular file</title>
189
190 <funcsynopsis>
191 <funcprototype>
192 <funcdef>struct proc_dir_entry* <function>create_proc_entry</function></funcdef>
193 <paramdef>const char* <parameter>name</parameter></paramdef>
194 <paramdef>mode_t <parameter>mode</parameter></paramdef>
195 <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
196 </funcprototype>
197 </funcsynopsis>
198
199 <para>
200 This function creates a regular file with the name
201 <parameter>name</parameter>, file mode
202 <parameter>mode</parameter> in the directory
203 <parameter>parent</parameter>. To create a file in the root of
204 the procfs, use <constant>NULL</constant> as
205 <parameter>parent</parameter> parameter. When successful, the
206 function will return a pointer to the freshly created
207 <structname>struct proc_dir_entry</structname>; otherwise it
208 will return <constant>NULL</constant>. <xref
209 linkend="userland"/> describes how to do something useful with
210 regular files.
211 </para>
212
213 <para>
214 Note that it is specifically supported that you can pass a
215 path that spans multiple directories. For example
216 <function>create_proc_entry</function>(<parameter>"drivers/via0/info"</parameter>)
217 will create the <filename class="directory">via0</filename>
218 directory if necessary, with standard
219 <constant>0755</constant> permissions.
220 </para>
221
222 <para>
223 If you only want to be able to read the file, the function
224 <function>create_proc_read_entry</function> described in <xref
225 linkend="convenience"/> may be used to create and initialise
226 the procfs entry in one single call.
227 </para>
228 </sect1>
229
230
231
232
233 <sect1>
234 <title>Creating a symlink</title>
235
236 <funcsynopsis>
237 <funcprototype>
238 <funcdef>struct proc_dir_entry*
239 <function>proc_symlink</function></funcdef> <paramdef>const
240 char* <parameter>name</parameter></paramdef>
241 <paramdef>struct proc_dir_entry*
242 <parameter>parent</parameter></paramdef> <paramdef>const
243 char* <parameter>dest</parameter></paramdef>
244 </funcprototype>
245 </funcsynopsis>
246
247 <para>
248 This creates a symlink in the procfs directory
249 <parameter>parent</parameter> that points from
250 <parameter>name</parameter> to
251 <parameter>dest</parameter>. This translates in userland to
252 <literal>ln -s</literal> <parameter>dest</parameter>
253 <parameter>name</parameter>.
254 </para>
255 </sect1>
256
257 <sect1>
258 <title>Creating a directory</title>
259
260 <funcsynopsis>
261 <funcprototype>
262 <funcdef>struct proc_dir_entry* <function>proc_mkdir</function></funcdef>
263 <paramdef>const char* <parameter>name</parameter></paramdef>
264 <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
265 </funcprototype>
266 </funcsynopsis>
267
268 <para>
269 Create a directory <parameter>name</parameter> in the procfs
270 directory <parameter>parent</parameter>.
271 </para>
272 </sect1>
273
274
275
276
277 <sect1>
278 <title>Removing an entry</title>
279
280 <funcsynopsis>
281 <funcprototype>
282 <funcdef>void <function>remove_proc_entry</function></funcdef>
283 <paramdef>const char* <parameter>name</parameter></paramdef>
284 <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
285 </funcprototype>
286 </funcsynopsis>
287
288 <para>
289 Removes the entry <parameter>name</parameter> in the directory
290 <parameter>parent</parameter> from the procfs. Entries are
291 removed by their <emphasis>name</emphasis>, not by the
292 <structname>struct proc_dir_entry</structname> returned by the
293 various create functions. Note that this function doesn't
294 recursively remove entries.
295 </para>
296
297 <para>
298 Be sure to free the <structfield>data</structfield> entry from
299 the <structname>struct proc_dir_entry</structname> before
300 <function>remove_proc_entry</function> is called (that is: if
301 there was some <structfield>data</structfield> allocated, of
302 course). See <xref linkend="usingdata"/> for more information
303 on using the <structfield>data</structfield> entry.
304 </para>
305 </sect1>
306 </chapter>
307
308
309
310
311 <chapter id="userland">
312 <title>Communicating with userland</title>
313
314 <para>
315 Instead of reading (or writing) information directly from
316 kernel memory, procfs works with <emphasis>call back
317 functions</emphasis> for files: functions that are called when
318 a specific file is being read or written. Such functions have
319 to be initialised after the procfs file is created by setting
320 the <structfield>read_proc</structfield> and/or
321 <structfield>write_proc</structfield> fields in the
322 <structname>struct proc_dir_entry*</structname> that the
323 function <function>create_proc_entry</function> returned:
324 </para>
325
326 <programlisting>
327struct proc_dir_entry* entry;
328
329entry->read_proc = read_proc_foo;
330entry->write_proc = write_proc_foo;
331 </programlisting>
332
333 <para>
334 If you only want to use the
335 <structfield>read_proc</structfield>, the function
336 <function>create_proc_read_entry</function> described in <xref
337 linkend="convenience"/> may be used to create and initialise the
338 procfs entry in one single call.
339 </para>
340
341
342
343 <sect1>
344 <title>Reading data</title>
345
346 <para>
347 The read function is a call back function that allows userland
348 processes to read data from the kernel. The read function
349 should have the following format:
350 </para>
351
352 <funcsynopsis>
353 <funcprototype>
354 <funcdef>int <function>read_func</function></funcdef>
355 <paramdef>char* <parameter>page</parameter></paramdef>
356 <paramdef>char** <parameter>start</parameter></paramdef>
357 <paramdef>off_t <parameter>off</parameter></paramdef>
358 <paramdef>int <parameter>count</parameter></paramdef>
359 <paramdef>int* <parameter>eof</parameter></paramdef>
360 <paramdef>void* <parameter>data</parameter></paramdef>
361 </funcprototype>
362 </funcsynopsis>
363
364 <para>
365 The read function should write its information into the
366 <parameter>page</parameter>. For proper use, the function
367 should start writing at an offset of
368 <parameter>off</parameter> in <parameter>page</parameter> and
369 write at most <parameter>count</parameter> bytes, but because
370 most read functions are quite simple and only return a small
371 amount of information, these two parameters are usually
372 ignored (it breaks pagers like <literal>more</literal> and
373 <literal>less</literal>, but <literal>cat</literal> still
374 works).
375 </para>
376
377 <para>
378 If the <parameter>off</parameter> and
379 <parameter>count</parameter> parameters are properly used,
380 <parameter>eof</parameter> should be used to signal that the
381 end of the file has been reached by writing
382 <literal>1</literal> to the memory location
383 <parameter>eof</parameter> points to.
384 </para>
385
386 <para>
387 The parameter <parameter>start</parameter> doesn't seem to be
388 used anywhere in the kernel. The <parameter>data</parameter>
389 parameter can be used to create a single call back function for
390 several files, see <xref linkend="usingdata"/>.
391 </para>
392
393 <para>
394 The <function>read_func</function> function must return the
395 number of bytes written into the <parameter>page</parameter>.
396 </para>
397
398 <para>
399 <xref linkend="example"/> shows how to use a read call back
400 function.
401 </para>
402 </sect1>
403
404
405
406
407 <sect1>
408 <title>Writing data</title>
409
410 <para>
411 The write call back function allows a userland process to write
412 data to the kernel, so it has some kind of control over the
413 kernel. The write function should have the following format:
414 </para>
415
416 <funcsynopsis>
417 <funcprototype>
418 <funcdef>int <function>write_func</function></funcdef>
419 <paramdef>struct file* <parameter>file</parameter></paramdef>
420 <paramdef>const char* <parameter>buffer</parameter></paramdef>
421 <paramdef>unsigned long <parameter>count</parameter></paramdef>
422 <paramdef>void* <parameter>data</parameter></paramdef>
423 </funcprototype>
424 </funcsynopsis>
425
426 <para>
427 The write function should read <parameter>count</parameter>
428 bytes at maximum from the <parameter>buffer</parameter>. Note
429 that the <parameter>buffer</parameter> doesn't live in the
430 kernel's memory space, so it should first be copied to kernel
431 space with <function>copy_from_user</function>. The
432 <parameter>file</parameter> parameter is usually
433 ignored. <xref linkend="usingdata"/> shows how to use the
434 <parameter>data</parameter> parameter.
435 </para>
436
437 <para>
438 Again, <xref linkend="example"/> shows how to use this call back
439 function.
440 </para>
441 </sect1>
442
443
444
445
446 <sect1 id="usingdata">
447 <title>A single call back for many files</title>
448
449 <para>
450 When a large number of almost identical files is used, it's
451 quite inconvenient to use a separate call back function for
452 each file. A better approach is to have a single call back
453 function that distinguishes between the files by using the
454 <structfield>data</structfield> field in <structname>struct
455 proc_dir_entry</structname>. First of all, the
456 <structfield>data</structfield> field has to be initialised:
457 </para>
458
459 <programlisting>
460struct proc_dir_entry* entry;
461struct my_file_data *file_data;
462
463file_data = kmalloc(sizeof(struct my_file_data), GFP_KERNEL);
464entry->data = file_data;
465 </programlisting>
466
467 <para>
468 The <structfield>data</structfield> field is a <type>void
469 *</type>, so it can be initialised with anything.
470 </para>
471
472 <para>
473 Now that the <structfield>data</structfield> field is set, the
474 <function>read_proc</function> and
475 <function>write_proc</function> can use it to distinguish
476 between files because they get it passed into their
477 <parameter>data</parameter> parameter:
478 </para>
479
480 <programlisting>
481int foo_read_func(char *page, char **start, off_t off,
482 int count, int *eof, void *data)
483{
484 int len;
485
486 if(data == file_data) {
487 /* special case for this file */
488 } else {
489 /* normal processing */
490 }
491
492 return len;
493}
494 </programlisting>
495
496 <para>
497 Be sure to free the <structfield>data</structfield> field
498 when removing the procfs entry.
499 </para>
500 </sect1>
501 </chapter>
502
503
504
505
506 <chapter id="tips">
507 <title>Tips and tricks</title>
508
509
510
511
512 <sect1 id="convenience">
513 <title>Convenience functions</title>
514
515 <funcsynopsis>
516 <funcprototype>
517 <funcdef>struct proc_dir_entry* <function>create_proc_read_entry</function></funcdef>
518 <paramdef>const char* <parameter>name</parameter></paramdef>
519 <paramdef>mode_t <parameter>mode</parameter></paramdef>
520 <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
521 <paramdef>read_proc_t* <parameter>read_proc</parameter></paramdef>
522 <paramdef>void* <parameter>data</parameter></paramdef>
523 </funcprototype>
524 </funcsynopsis>
525
526 <para>
527 This function creates a regular file in exactly the same way
528 as <function>create_proc_entry</function> from <xref
529 linkend="regularfile"/> does, but also allows you to set the read
530 function <parameter>read_proc</parameter> in one call. This
531 function can set the <parameter>data</parameter> as well, as
532 explained in <xref linkend="usingdata"/>.
533 </para>
534 </sect1>
535
536
537
538 <sect1>
539 <title>Modules</title>
540
541 <para>
542 If procfs is being used from within a module, be sure to set
543 the <structfield>owner</structfield> field in the
544 <structname>struct proc_dir_entry</structname> to
545 <constant>THIS_MODULE</constant>.
546 </para>
547
548 <programlisting>
549struct proc_dir_entry* entry;
550
551entry->owner = THIS_MODULE;
552 </programlisting>
553 </sect1>
554
555
556
557
558 <sect1>
559 <title>Mode and ownership</title>
560
561 <para>
562 Sometimes it is useful to change the mode and/or ownership of
563 a procfs entry. Here is an example that shows how to achieve
564 that:
565 </para>
566
567 <programlisting>
568struct proc_dir_entry* entry;
569
570entry->mode = S_IWUSR |S_IRUSR | S_IRGRP | S_IROTH;
571entry->uid = 0;
572entry->gid = 100;
573 </programlisting>
574
575 </sect1>
576 </chapter>
577
578
579
580
581 <chapter id="example">
582 <title>Example</title>
583
584 <!-- be careful with the example code: it shouldn't be wider than
585 approx. 60 columns, or otherwise it won't fit properly on a page
586 -->
587
588&procfsexample;
589
590 </chapter>
591</book>
diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c
new file mode 100644
index 000000000000..7064084c1c5e
--- /dev/null
+++ b/Documentation/DocBook/procfs_example.c
@@ -0,0 +1,224 @@
1/*
2 * procfs_example.c: an example proc interface
3 *
4 * Copyright (C) 2001, Erik Mouw (J.A.K.Mouw@its.tudelft.nl)
5 *
6 * This file accompanies the procfs-guide in the Linux kernel
7 * source. Its main use is to demonstrate the concepts and
8 * functions described in the guide.
9 *
10 * This software has been developed while working on the LART
11 * computing board (http://www.lart.tudelft.nl/), which is
12 * sponsored by the Mobile Multi-media Communications
13 * (http://www.mmc.tudelft.nl/) and Ubiquitous Communications
14 * (http://www.ubicom.tudelft.nl/) projects.
15 *
16 * The author can be reached at:
17 *
18 * Erik Mouw
19 * Information and Communication Theory Group
20 * Faculty of Information Technology and Systems
21 * Delft University of Technology
22 * P.O. Box 5031
23 * 2600 GA Delft
24 * The Netherlands
25 *
26 *
27 * This program is free software; you can redistribute
28 * it and/or modify it under the terms of the GNU General
29 * Public License as published by the Free Software
30 * Foundation; either version 2 of the License, or (at your
31 * option) any later version.
32 *
33 * This program is distributed in the hope that it will be
34 * useful, but WITHOUT ANY WARRANTY; without even the implied
35 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
36 * PURPOSE. See the GNU General Public License for more
37 * details.
38 *
39 * You should have received a copy of the GNU General Public
40 * License along with this program; if not, write to the
41 * Free Software Foundation, Inc., 59 Temple Place,
42 * Suite 330, Boston, MA 02111-1307 USA
43 *
44 */
45
46#include <linux/module.h>
47#include <linux/kernel.h>
48#include <linux/init.h>
49#include <linux/proc_fs.h>
50#include <linux/jiffies.h>
51#include <asm/uaccess.h>
52
53
/* Version and name printed in the module's log messages.
 * MODULE_NAME is also the name of the directory the module
 * creates in the root of procfs.
 */
#define MODULE_VERS "1.0"
#define MODULE_NAME "procfs_example"

/* Maximum number of characters (excluding the terminating
 * '\0') stored in the name and value of a foo/bar file.
 */
#define FOOBAR_LEN 8

/* Per-file data attached to the "foo" and "bar" entries */
struct fb_data_t {
	char name[FOOBAR_LEN + 1];	/* label printed on read */
	char value[FOOBAR_LEN + 1];	/* text replaced on write */
};


/* Entries created by this module */
static struct proc_dir_entry *example_dir, *foo_file,
	*bar_file, *jiffies_file, *symlink;


/* Backing storage for the two files. Only used inside this
 * file, so keep the symbols out of the kernel's global
 * namespace.
 */
static struct fb_data_t foo_data, bar_data;
70
71
72static int proc_read_jiffies(char *page, char **start,
73 off_t off, int count,
74 int *eof, void *data)
75{
76 int len;
77
78 len = sprintf(page, "jiffies = %ld\n",
79 jiffies);
80
81 return len;
82}
83
84
85static int proc_read_foobar(char *page, char **start,
86 off_t off, int count,
87 int *eof, void *data)
88{
89 int len;
90 struct fb_data_t *fb_data = (struct fb_data_t *)data;
91
92 /* DON'T DO THAT - buffer overruns are bad */
93 len = sprintf(page, "%s = '%s'\n",
94 fb_data->name, fb_data->value);
95
96 return len;
97}
98
99
100static int proc_write_foobar(struct file *file,
101 const char *buffer,
102 unsigned long count,
103 void *data)
104{
105 int len;
106 struct fb_data_t *fb_data = (struct fb_data_t *)data;
107
108 if(count > FOOBAR_LEN)
109 len = FOOBAR_LEN;
110 else
111 len = count;
112
113 if(copy_from_user(fb_data->value, buffer, len))
114 return -EFAULT;
115
116 fb_data->value[len] = '\0';
117
118 return len;
119}
120
121
122static int __init init_procfs_example(void)
123{
124 int rv = 0;
125
126 /* create directory */
127 example_dir = proc_mkdir(MODULE_NAME, NULL);
128 if(example_dir == NULL) {
129 rv = -ENOMEM;
130 goto out;
131 }
132
133 example_dir->owner = THIS_MODULE;
134
135 /* create jiffies using convenience function */
136 jiffies_file = create_proc_read_entry("jiffies",
137 0444, example_dir,
138 proc_read_jiffies,
139 NULL);
140 if(jiffies_file == NULL) {
141 rv = -ENOMEM;
142 goto no_jiffies;
143 }
144
145 jiffies_file->owner = THIS_MODULE;
146
147 /* create foo and bar files using same callback
148 * functions
149 */
150 foo_file = create_proc_entry("foo", 0644, example_dir);
151 if(foo_file == NULL) {
152 rv = -ENOMEM;
153 goto no_foo;
154 }
155
156 strcpy(foo_data.name, "foo");
157 strcpy(foo_data.value, "foo");
158 foo_file->data = &foo_data;
159 foo_file->read_proc = proc_read_foobar;
160 foo_file->write_proc = proc_write_foobar;
161 foo_file->owner = THIS_MODULE;
162
163 bar_file = create_proc_entry("bar", 0644, example_dir);
164 if(bar_file == NULL) {
165 rv = -ENOMEM;
166 goto no_bar;
167 }
168
169 strcpy(bar_data.name, "bar");
170 strcpy(bar_data.value, "bar");
171 bar_file->data = &bar_data;
172 bar_file->read_proc = proc_read_foobar;
173 bar_file->write_proc = proc_write_foobar;
174 bar_file->owner = THIS_MODULE;
175
176 /* create symlink */
177 symlink = proc_symlink("jiffies_too", example_dir,
178 "jiffies");
179 if(symlink == NULL) {
180 rv = -ENOMEM;
181 goto no_symlink;
182 }
183
184 symlink->owner = THIS_MODULE;
185
186 /* everything OK */
187 printk(KERN_INFO "%s %s initialised\n",
188 MODULE_NAME, MODULE_VERS);
189 return 0;
190
191no_symlink:
192 remove_proc_entry("tty", example_dir);
193no_tty:
194 remove_proc_entry("bar", example_dir);
195no_bar:
196 remove_proc_entry("foo", example_dir);
197no_foo:
198 remove_proc_entry("jiffies", example_dir);
199no_jiffies:
200 remove_proc_entry(MODULE_NAME, NULL);
201out:
202 return rv;
203}
204
205
206static void __exit cleanup_procfs_example(void)
207{
208 remove_proc_entry("jiffies_too", example_dir);
209 remove_proc_entry("tty", example_dir);
210 remove_proc_entry("bar", example_dir);
211 remove_proc_entry("foo", example_dir);
212 remove_proc_entry("jiffies", example_dir);
213 remove_proc_entry(MODULE_NAME, NULL);
214
215 printk(KERN_INFO "%s %s removed\n",
216 MODULE_NAME, MODULE_VERS);
217}
218
219
220module_init(init_procfs_example);
221module_exit(cleanup_procfs_example);
222
223MODULE_AUTHOR("Erik Mouw");
224MODULE_DESCRIPTION("procfs examples");
diff --git a/Documentation/DocBook/scsidrivers.tmpl b/Documentation/DocBook/scsidrivers.tmpl
new file mode 100644
index 000000000000..d058e65daf19
--- /dev/null
+++ b/Documentation/DocBook/scsidrivers.tmpl
@@ -0,0 +1,193 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="scsidrivers">
6 <bookinfo>
7 <title>SCSI Subsystem Interfaces</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Douglas</firstname>
12 <surname>Gilbert</surname>
13 <affiliation>
14 <address>
15 <email>dgilbert@interlog.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20 <pubdate>2003-08-11</pubdate>
21
22 <copyright>
23 <year>2002</year>
24 <year>2003</year>
25 <holder>Douglas Gilbert</holder>
26 </copyright>
27
28 <legalnotice>
29 <para>
30 This documentation is free software; you can redistribute
31 it and/or modify it under the terms of the GNU General Public
32 License as published by the Free Software Foundation; either
33 version 2 of the License, or (at your option) any later
34 version.
35 </para>
36
37 <para>
38 This program is distributed in the hope that it will be
39 useful, but WITHOUT ANY WARRANTY; without even the implied
40 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
41 See the GNU General Public License for more details.
42 </para>
43
44 <para>
45 You should have received a copy of the GNU General Public
46 License along with this program; if not, write to the Free
47 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
48 MA 02111-1307 USA
49 </para>
50
51 <para>
52 For more details see the file COPYING in the source
53 distribution of Linux.
54 </para>
55 </legalnotice>
56
57 </bookinfo>
58
59<toc></toc>
60
61 <chapter id="intro">
62 <title>Introduction</title>
63 <para>
64This document outlines the interface between the Linux scsi mid level
65and lower level drivers. Lower level drivers are variously called HBA
66(host bus adapter) drivers, host drivers (HD) or pseudo adapter drivers.
67The latter alludes to the fact that a lower level driver may be a
68bridge to another IO subsystem (and the "ide-scsi" driver is an example
69of this). There can be many lower level drivers active in a running
70system, but only one per hardware type. For example, the aic7xxx driver
71controls Adaptec controllers based on the 7xxx chip series. Most lower
72level drivers can control one or more scsi hosts (a.k.a. scsi initiators).
73 </para>
74<para>
75This document can be found in an ASCII text file in the Linux kernel
76source: <filename>Documentation/scsi/scsi_mid_low_api.txt</filename> .
77It currently holds a little more information than this document. The
78<filename>drivers/scsi/hosts.h</filename> and <filename>
79drivers/scsi/scsi.h</filename> headers contain descriptions of members
80of important structures for the scsi subsystem.
81</para>
82 </chapter>
83
84 <chapter id="driver-struct">
85 <title>Driver structure</title>
86 <para>
87Traditionally a lower level driver for the scsi subsystem has been
88at least two files in the drivers/scsi directory. For example, a
89driver called "xyz" has a header file "xyz.h" and a source file
90"xyz.c". [Actually there is no good reason why this couldn't all
91be in one file.] Some drivers that have been ported to several operating
92systems (e.g. aic7xxx which has separate files for generic and
93OS-specific code) have more than two files. Such drivers tend to have
94their own directory under the drivers/scsi directory.
95 </para>
96 <para>
97scsi_module.c is normally included at the end of a lower
98level driver. For it to work a declaration like this is needed before
99it is included:
100<programlisting>
101 static Scsi_Host_Template driver_template = DRIVER_TEMPLATE;
102 /* DRIVER_TEMPLATE should contain pointers to supported interface
103 functions. Scsi_Host_Template is defined in hosts.h */
104 #include "scsi_module.c"
105</programlisting>
106 </para>
107 <para>
108The scsi_module.c assumes the name "driver_template" is appropriately
109defined. It contains 2 functions:
110<orderedlist>
111<listitem><para>
112 init_this_scsi_driver() called during builtin and module driver
113 initialization: invokes mid level's scsi_register_host()
114</para></listitem>
115<listitem><para>
116 exit_this_scsi_driver() called during closedown: invokes
117 mid level's scsi_unregister_host()
118</para></listitem>
119</orderedlist>
120 </para>
121<para>
122When a new, lower level driver is being added to Linux, the following
123files (all found in the drivers/scsi directory) will need some attention:
124Makefile, Config.help and Config.in . It is probably best to look at what
125an existing lower level driver does in this regard.
126</para>
127 </chapter>
128
129 <chapter id="intfunctions">
130 <title>Interface Functions</title>
131!EDocumentation/scsi/scsi_mid_low_api.txt
132 </chapter>
133
134 <chapter id="locks">
135 <title>Locks</title>
136<para>
137Each Scsi_Host instance has a spin_lock called Scsi_Host::default_lock
138which is initialized in scsi_register() [found in hosts.c]. Within the
139same function the Scsi_Host::host_lock pointer is initialized to point
140at default_lock with the scsi_assign_lock() function. Thereafter
141lock and unlock operations performed by the mid level use the
142Scsi_Host::host_lock pointer.
143</para>
144<para>
145Lower level drivers can override the use of Scsi_Host::default_lock by
146using scsi_assign_lock(). The earliest opportunity to do this would
147be in the detect() function after it has invoked scsi_register(). It
148could be replaced by a coarser grain lock (e.g. per driver) or a
149lock of equal granularity (i.e. per host). Using finer grain locks
150(e.g. per scsi device) may be possible by juggling locks in
151queuecommand().
152</para>
153 </chapter>
154
155 <chapter id="changes">
156 <title>Changes since lk 2.4 series</title>
157<para>
158io_request_lock has been replaced by several finer grained locks. The lock
159relevant to lower level drivers is Scsi_Host::host_lock and there is one
160per scsi host.
161</para>
162<para>
163The older error handling mechanism has been removed. This means the
164lower level interface functions abort() and reset() have been removed.
165</para>
166<para>
167In the 2.4 series the scsi subsystem configuration descriptions were
168aggregated with the configuration descriptions from all other Linux
169subsystems in the Documentation/Configure.help file. In the 2.5 series,
170the scsi subsystem now has its own (much smaller) drivers/scsi/Config.help
171file.
172</para>
173 </chapter>
174
175 <chapter id="credits">
176 <title>Credits</title>
177<para>
178The following people have contributed to this document:
179<orderedlist>
180<listitem><para>
181Mike Anderson <email>andmike@us.ibm.com</email>
182</para></listitem>
183<listitem><para>
184James Bottomley <email>James.Bottomley@steeleye.com</email>
185</para></listitem>
186<listitem><para>
187Patrick Mansfield <email>patmans@us.ibm.com</email>
188</para></listitem>
189</orderedlist>
190</para>
191 </chapter>
192
193</book>
diff --git a/Documentation/DocBook/sis900.tmpl b/Documentation/DocBook/sis900.tmpl
new file mode 100644
index 000000000000..6c2cbac93c3f
--- /dev/null
+++ b/Documentation/DocBook/sis900.tmpl
@@ -0,0 +1,585 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="SiS900Guide">
6
7<bookinfo>
8
9<title>SiS 900/7016 Fast Ethernet Device Driver</title>
10
11<authorgroup>
12<author>
13<firstname>Ollie</firstname>
14<surname>Lho</surname>
15</author>
16
17<author>
18<firstname>Lei Chun</firstname>
19<surname>Chang</surname>
20</author>
21</authorgroup>
22
23<edition>Document Revision: 0.3 for SiS900 driver v1.06 &amp; v1.07</edition>
24<pubdate>November 16, 2000</pubdate>
25
26<copyright>
27 <year>1999</year>
28 <holder>Silicon Integrated System Corp.</holder>
29</copyright>
30
31<legalnotice>
32 <para>
33 This program is free software; you can redistribute it and/or modify
34 it under the terms of the GNU General Public License as published by
35 the Free Software Foundation; either version 2 of the License, or
36 (at your option) any later version.
37 </para>
38
39 <para>
40 This program is distributed in the hope that it will be useful,
41 but WITHOUT ANY WARRANTY; without even the implied warranty of
42 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
43 GNU General Public License for more details.
44 </para>
45
46 <para>
47 You should have received a copy of the GNU General Public License
48 along with this program; if not, write to the Free Software
49 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
50 </para>
51</legalnotice>
52
53<abstract>
54<para>
55This document gives some information on installation and usage of SiS 900/7016
56device driver under Linux.
57</para>
58</abstract>
59
60</bookinfo>
61
62<toc></toc>
63
64<chapter id="intro">
65 <title>Introduction</title>
66
67<para>
68This document describes the revision 1.06 and 1.07 of SiS 900/7016 Fast Ethernet
69device driver under Linux. The driver is developed by Silicon Integrated
70System Corp. and distributed freely under the GNU General Public License (GPL).
71The driver can be compiled as a loadable module and used under Linux kernel
72version 2.2.x. (rev. 1.06)
73With minimal changes, the driver can also be used under 2.3.x and 2.4.x kernel
74(rev. 1.07), please see
75<xref linkend="install"/>. If you intend to
76use the driver for earlier kernels, you are on your own.
77</para>
78
79<para>
80The driver is tested with usual TCP/IP applications including
81FTP, Telnet, Netscape etc. and is used constantly by the developers.
82</para>
83
84<para>
85Please send all comments/fixes/questions to
86<ulink url="mailto:lcchang@sis.com.tw">Lei-Chun Chang</ulink>.
87</para>
88</chapter>
89
90<chapter id="changes">
91 <title>Changes</title>
92
93<para>
94Changes made in Revision 1.07
95
96<orderedlist>
97<listitem>
98<para>
99Separation of sis900.c and sis900.h in order to move most
100constant definition to sis900.h (many of those constants were
101corrected)
102</para>
103</listitem>
104
105<listitem>
106<para>
107Clean up PCI detection; the pci-scan from Donald Becker was not used,
108just simple pci&lowbar;find&lowbar;*.
109</para>
110</listitem>
111
112<listitem>
113<para>
114MII detection is modified to support multiple MII transceivers.
115</para>
116</listitem>
117
118<listitem>
119<para>
120Bugs in read&lowbar;eeprom, mdio&lowbar;* were removed.
121</para>
122</listitem>
123
124<listitem>
125<para>
126Lots of comments irrelevant to sis900 were removed/changed and
127more comments were added to reflect the real situation.
128</para>
129</listitem>
130
131<listitem>
132<para>
133Clean up of physical/virtual address space mess in buffer
134descriptors.
135</para>
136</listitem>
137
138<listitem>
139<para>
140Better transmit/receive error handling.
141</para>
142</listitem>
143
144<listitem>
145<para>
146The driver now uses zero-copy single buffer management
147scheme to improve performance.
148</para>
149</listitem>
150
151<listitem>
152<para>
153Names of variables were changed to be more consistent.
154</para>
155</listitem>
156
157<listitem>
158<para>
159Clean up of auto-negotiation and timer code.
160</para>
161</listitem>
162
163<listitem>
164<para>
165Automatic detection and change of PHY on the fly.
166</para>
167</listitem>
168
169<listitem>
170<para>
171Bug in mac probing fixed.
172</para>
173</listitem>
174
175<listitem>
176<para>
177Fix 630E equalizer problem by modifying the equalizer workaround rule.
178</para>
179</listitem>
180
181<listitem>
182<para>
183Support for ICS1893 10/100 Integrated PHYceiver.
184</para>
185</listitem>
186
187<listitem>
188<para>
189Support for media select by ifconfig.
190</para>
191</listitem>
192
193<listitem>
194<para>
195Added kernel-doc extractable documentation.
196</para>
197</listitem>
198
199</orderedlist>
200</para>
201</chapter>
202
203<chapter id="tested">
204 <title>Tested Environment</title>
205
206<para>
207This driver is developed on the following hardware
208
209<itemizedlist>
210<listitem>
211
212<para>
213Intel Celeron 500 with SiS 630 (rev 02) chipset
214</para>
215</listitem>
216<listitem>
217
218<para>
219SiS 900 (rev 01) and SiS 7016/7014 Fast Ethernet Card
220</para>
221</listitem>
222
223</itemizedlist>
224
225and tested with these software environments
226
227<itemizedlist>
228<listitem>
229
230<para>
231Red Hat Linux version 6.2
232</para>
233</listitem>
234<listitem>
235
236<para>
237Linux kernel version 2.4.0
238</para>
239</listitem>
240<listitem>
241
242<para>
243Netscape version 4.6
244</para>
245</listitem>
246<listitem>
247
248<para>
249NcFTP 3.0.0 beta 18
250</para>
251</listitem>
252<listitem>
253
254<para>
255Samba version 2.0.3
256</para>
257</listitem>
258
259</itemizedlist>
260
261</para>
262
263</chapter>
264
265<chapter id="files">
266<title>Files in This Package</title>
267
268<para>
269In the package you can find these files:
270</para>
271
272<para>
273<variablelist>
274
275<varlistentry>
276<term>sis900.c</term>
277<listitem>
278<para>
279Driver source file in C
280</para>
281</listitem>
282</varlistentry>
283
284<varlistentry>
285<term>sis900.h</term>
286<listitem>
287<para>
288Header file for sis900.c
289</para>
290</listitem>
291</varlistentry>
292
293<varlistentry>
294<term>sis900.sgml</term>
295<listitem>
296<para>
297DocBook SGML source of the document
298</para>
299</listitem>
300</varlistentry>
301
302<varlistentry>
303<term>sis900.txt</term>
304<listitem>
305<para>
306Driver document in plain text
307</para>
308</listitem>
309</varlistentry>
310
311</variablelist>
312</para>
313</chapter>
314
315<chapter id="install">
316 <title>Installation</title>
317
318<para>
319Silicon Integrated System Corp. is cooperating closely with core Linux Kernel
320developers. The revisions of SiS 900 driver are distributed by the usual channels
321for kernel tar files and patches. Those kernel tar files for official kernel and
322patches for kernel pre-release can be downloaded from the
323<ulink url="http://ftp.kernel.org/pub/linux/kernel/">official kernel ftp site</ulink>
324and its mirrors.
325The 1.06 revision can be found in kernel version later than 2.3.15 and pre-2.2.14,
326and 1.07 revision can be found in kernel version 2.4.0.
327If you have no prior experience in networking under Linux, please read
328<ulink url="http://www.tldp.org/">Ethernet HOWTO</ulink> and
329<ulink url="http://www.tldp.org/">Networking HOWTO</ulink> available from
330Linux Documentation Project (LDP).
331</para>
332
333<para>
334The driver is bundled in releases later than 2.2.11 and 2.3.15, so this
335is the easiest case.
336Be sure you have the appropriate packages for compiling kernel source.
337Those packages are listed in Documentation/Changes in kernel source
338distribution. If you have to install the driver other than those bundled
339in kernel release, you should have your driver file
340<filename>sis900.c</filename> and <filename>sis900.h</filename>
341copied into <filename class="directory">/usr/src/linux/drivers/net/</filename> first.
342There are two alternative ways to install the driver
343</para>
344
345<sect1>
346<title>Building the driver as loadable module</title>
347
348<para>
349To build the driver as a loadable kernel module you have to reconfigure
350the kernel to activate network support by
351</para>
352
353<para><screen>
354make menuconfig
355</screen></para>
356
357<para>
358Choose <quote>Loadable module support ---></quote>,
359then select <quote>Enable loadable module support</quote>.
360</para>
361
362<para>
363Choose <quote>Network Device Support ---></quote>, select
364<quote>Ethernet (10 or 100Mbit)</quote>.
365Then select <quote>EISA, VLB, PCI and on board controllers</quote>,
366and choose <quote>SiS 900/7016 PCI Fast Ethernet Adapter support</quote>
367to <quote>M</quote>.
368</para>
369
370<para>
371After reconfiguring the kernel, you can make the driver module by
372</para>
373
374<para><screen>
375make modules
376</screen></para>
377
378<para>
379The driver should be compiled with no errors. After compiling the driver,
380the driver can be installed to proper place by
381</para>
382
383<para><screen>
384make modules_install
385</screen></para>
386
387<para>
388Load the driver into kernel by
389</para>
390
391<para><screen>
392insmod sis900
393</screen></para>
394
395<para>
396When loading the driver into memory, some informational messages can be viewed with
397</para>
398
399<para>
400<screen>
401dmesg
402</screen>
403
404or
405
406<screen>
407cat /var/log/messages
408</screen>
409</para>
410
411<para>
412If the driver is loaded properly you will have messages similar to this:
413</para>
414
415<para><screen>
416sis900.c: v1.07.06 11/07/2000
417eth0: SiS 900 PCI Fast Ethernet at 0xd000, IRQ 10, 00:00:e8:83:7f:a4.
418eth0: SiS 900 Internal MII PHY transceiver found at address 1.
419eth0: Using SiS 900 Internal MII PHY as default
420</screen></para>
421
422<para>
423showing the version of the driver and the results of probing routine.
424</para>
425
426<para>
427Once the driver is loaded, network can be brought up by
428</para>
429
430<para><screen>
431/sbin/ifconfig eth0 IPADDR broadcast BROADCAST netmask NETMASK media TYPE
432</screen></para>
433
434<para>
435where IPADDR, BROADCAST, NETMASK are your IP address, broadcast address and
436netmask respectively. TYPE is used to set medium type used by the device.
437Typical values are "10baseT"(twisted-pair 10Mbps Ethernet) or "100baseT"
438(twisted-pair 100Mbps Ethernet). For more information on how to configure
439network interface, please refer to
440<ulink url="http://www.tldp.org/">Networking HOWTO</ulink>.
441</para>
442
443<para>
444The link status is also shown by kernel messages. For example, after the
445network interface is activated, you may have the message:
446</para>
447
448<para><screen>
449eth0: Media Link On 100mbps full-duplex
450</screen></para>
451
452<para>
453If you try to unplug the twisted pair (TP) cable you will get
454</para>
455
456<para><screen>
457eth0: Media Link Off
458</screen></para>
459
460<para>
461indicating that the link has failed.
462</para>
463</sect1>
464
465<sect1>
466<title>Building the driver into kernel</title>
467
468<para>
469If you want to build the driver into the kernel, choose <quote>Y</quote>
470rather than <quote>M</quote> on
471<quote>SiS 900/7016 PCI Fast Ethernet Adapter support</quote>
472when configuring the kernel. Build the kernel image in the usual way
473</para>
474
475<para><screen>
476make clean
477
478make bzlilo
479</screen></para>
480
481<para>
482The next time the system reboots, the driver will be in memory.
483</para>
484
485</sect1>
486</chapter>
487
488<chapter id="problems">
489 <title>Known Problems and Bugs</title>
490
491<para>
492There are some known problems and bugs. If you find any other bugs please
493mail to <ulink url="mailto:lcchang@sis.com.tw">lcchang@sis.com.tw</ulink>
494
495<orderedlist>
496
497<listitem>
498<para>
499AM79C901 HomePNA PHY is not thoroughly tested, there may be some
500bugs in the <quote>on the fly</quote> change of transceiver.
501</para>
502</listitem>
503
504<listitem>
505<para>
506A bug is hidden somewhere in the receive buffer management code,
507the bug causes NULL pointer reference in the kernel. This fault is
508caught before bad things happen and reported with the message:
509
510<computeroutput>
511eth0: NULL pointer encountered in Rx ring, skipping
512</computeroutput>
513
514which can be viewed with <literal remap="tt">dmesg</literal> or
515<literal remap="tt">cat /var/log/messages</literal>.
516</para>
517</listitem>
518
519<listitem>
520<para>
521The media type change from 10Mbps to 100Mbps twisted-pair ethernet
522by ifconfig causes the media link down.
523</para>
524</listitem>
525
526</orderedlist>
527</para>
528</chapter>
529
530<chapter id="RHistory">
531 <title>Revision History</title>
532
533<para>
534<itemizedlist>
535
536<listitem>
537<para>
538November 13, 2000, Revision 1.07, seventh release, 630E problem fixed
539and further clean up.
540</para>
541</listitem>
542
543<listitem>
544<para>
545November 4, 1999, Revision 1.06, Second release, lots of clean up
546and optimization.
547</para>
548</listitem>
549
550<listitem>
551<para>
552August 8, 1999, Revision 1.05, Initial Public Release
553</para>
554</listitem>
555
556</itemizedlist>
557</para>
558</chapter>
559
560<chapter id="acknowledgements">
561 <title>Acknowledgements</title>
562
563<para>
564This driver was originally derived from
565<ulink url="mailto:becker@cesdis1.gsfc.nasa.gov">Donald Becker</ulink>'s
566<ulink url="ftp://cesdis.gsfc.nasa.gov/pub/linux/drivers/kern-2.3/pci-skeleton.c"
567>pci-skeleton</ulink> and
568<ulink url="ftp://cesdis.gsfc.nasa.gov/pub/linux/drivers/kern-2.3/rtl8139.c"
569>rtl8139</ulink> drivers. Donald also provided various suggestions
570regarding improvements made in revision 1.06.
571</para>
572
573<para>
574The 1.05 revision was created by
575<ulink url="mailto:cmhuang@sis.com.tw">Jim Huang</ulink>, AMD 79c901
576support was added by <ulink url="mailto:lcs@sis.com.tw">Chin-Shan Li</ulink>.
577</para>
578</chapter>
579
580<chapter id="functions">
581<title>List of Functions</title>
582!Idrivers/net/sis900.c
583</chapter>
584
585</book>
diff --git a/Documentation/DocBook/tulip-user.tmpl b/Documentation/DocBook/tulip-user.tmpl
new file mode 100644
index 000000000000..6520d7a1b132
--- /dev/null
+++ b/Documentation/DocBook/tulip-user.tmpl
@@ -0,0 +1,327 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="TulipUserGuide">
6 <bookinfo>
7 <title>Tulip Driver User's Guide</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Jeff</firstname>
12 <surname>Garzik</surname>
13 <affiliation>
14 <address>
15 <email>jgarzik@pobox.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2001</year>
23 <holder>Jeff Garzik</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License as published by the Free Software Foundation; either
31 version 2 of the License, or (at your option) any later
32 version.
33 </para>
34
35 <para>
36 This program is distributed in the hope that it will be
37 useful, but WITHOUT ANY WARRANTY; without even the implied
38 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
39 See the GNU General Public License for more details.
40 </para>
41
42 <para>
43 You should have received a copy of the GNU General Public
44 License along with this program; if not, write to the Free
45 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
46 MA 02111-1307 USA
47 </para>
48
49 <para>
50 For more details see the file COPYING in the source
51 distribution of Linux.
52 </para>
53 </legalnotice>
54 </bookinfo>
55
56 <toc></toc>
57
58 <chapter id="intro">
59 <title>Introduction</title>
60<para>
61The Tulip Ethernet Card Driver
62is maintained by Jeff Garzik (<email>jgarzik@pobox.com</email>).
63</para>
64
65<para>
66The Tulip driver was developed by Donald Becker and changed by
67Jeff Garzik, Takashi Manabe and a cast of thousands.
68</para>
69
70<para>
71For 2.4.x and later kernels, the Linux Tulip driver is available at
72<ulink url="http://sourceforge.net/projects/tulip/">http://sourceforge.net/projects/tulip/</ulink>
73</para>
74
75<para>
76 This driver is for the Digital "Tulip" Ethernet adapter interface.
77 It should work with most DEC 21*4*-based chips/ethercards, as well as
78 with work-alike chips from Lite-On (PNIC) and Macronix (MXIC) and ASIX.
79</para>
80
81<para>
82 The original author may be reached as becker@scyld.com, or C/O
83 Scyld Computing Corporation,
84 410 Severn Ave., Suite 210,
85 Annapolis MD 21403
86</para>
87
88<para>
89 Additional information on Donald Becker's tulip.c
90 is available at <ulink url="http://www.scyld.com/network/tulip.html">http://www.scyld.com/network/tulip.html</ulink>
91</para>
92
93 </chapter>
94
95 <chapter id="drvr-compat">
96 <title>Driver Compatibility</title>
97
98<para>
99This device driver is designed for the DECchip "Tulip", Digital's
100single-chip ethernet controllers for PCI (now owned by Intel).
101Supported members of the family
102are the 21040, 21041, 21140, 21140A, 21142, and 21143. Similar work-alike
103chips from Lite-On, Macronix, ASIX, Compex and others listed below are also
104supported.
105</para>
106
107<para>
108These chips are used on at least 140 unique PCI board designs. The great
109number of chips and board designs supported is the reason for the
110driver size and complexity. Almost all of the increasing complexity is in the
111board configuration and media selection code. There is very little
112increase in the operational critical path length.
113</para>
114 </chapter>
115
116 <chapter id="board-settings">
117 <title>Board-specific Settings</title>
118
119<para>
120PCI bus devices are configured by the system at boot time, so no jumpers
121need to be set on the board. The system BIOS preferably should assign the
122PCI INTA signal to an otherwise unused system IRQ line.
123</para>
124
125<para>
126Some boards have EEPROM tables with a default media entry. The factory default
127is usually "autoselect". This should only be overridden when using
128transceiver connections without link beat e.g. 10base2 or AUI, or (rarely!)
129for forcing full-duplex when used with old link partners that do not do
130autonegotiation.
131</para>
132 </chapter>
133
134 <chapter id="driver-operation">
135 <title>Driver Operation</title>
136
137<sect1><title>Ring buffers</title>
138
139<para>
140The Tulip can use either ring buffers or lists of Tx and Rx descriptors.
141This driver uses statically allocated rings of Rx and Tx descriptors, set at
142compile time by RX/TX_RING_SIZE. This version of the driver allocates skbuffs
143for the Rx ring buffers at open() time and passes the skb->data field to the
144Tulip as receive data buffers. When an incoming frame is less than
145RX_COPYBREAK bytes long, a fresh skbuff is allocated and the frame is
146copied to the new skbuff. When the incoming frame is larger, the skbuff is
147passed directly up the protocol stack and replaced by a newly allocated
148skbuff.
149</para>
150
151<para>
152The RX_COPYBREAK value is chosen to trade-off the memory wasted by
153using a full-sized skbuff for small frames vs. the copying costs of larger
154frames. For small frames the copying cost is negligible (esp. considering
155that we are pre-loading the cache with immediately useful header
156information). For large frames the copying cost is non-trivial, and the
157larger copy might flush the cache of useful data. A subtle aspect of this
158choice is that the Tulip only receives into longword aligned buffers, thus
159the IP header at offset 14 isn't longword aligned for further processing.
160Copied frames are put into the new skbuff at an offset of "+2", thus copying
161has the beneficial effect of aligning the IP header and preloading the
162cache.
163</para>
164
165</sect1>
166
167<sect1><title>Synchronization</title>
168<para>
169The driver runs as two independent, single-threaded flows of control. One
170is the send-packet routine, which enforces single-threaded use by the
171dev->tbusy flag. The other thread is the interrupt handler, which is single
172threaded by the hardware and other software.
173</para>
174
175<para>
176The send packet thread has partial control over the Tx ring and 'dev->tbusy'
177flag. It sets the tbusy flag whenever it's queuing a Tx packet. If the next
178queue slot is empty, it clears the tbusy flag when finished; otherwise it sets
179the 'tp->tx_full' flag.
180</para>
181
182<para>
183The interrupt handler has exclusive control over the Rx ring and records stats
184from the Tx ring. (The Tx-done interrupt can't be selectively turned off, so
185we can't avoid the interrupt overhead by having the Tx routine reap the Tx
186stats.) After reaping the stats, it marks the queue entry as empty by setting
187the 'base' to zero. Iff the 'tp->tx_full' flag is set, it clears both the
188tx_full and tbusy flags.
189</para>
190
191</sect1>
192
193 </chapter>
194
195 <chapter id="errata">
196 <title>Errata</title>
197
198<para>
199The old DEC databooks were light on details.
200The 21040 databook claims that CSR13, CSR14, and CSR15 should each be the last
201register of the set CSR12-15 written. Hmmm, now how is that possible?
202</para>
203
204<para>
205The DEC SROM format is very badly designed and not precisely defined, leading to
206part of the media selection junkheap below. Some boards do not have EEPROM
207media tables and need to be patched up. Worse, other boards use the DEC
208design kit media table when it isn't correct for their board.
209</para>
210
211<para>
212We cannot use MII interrupts because there is no defined GPIO pin to attach
213them. The MII transceiver status is polled using a kernel timer.
214</para>
215 </chapter>
216
217 <chapter id="changelog">
218 <title>Driver Change History</title>
219
220 <sect1><title>Version 0.9.14 (February 20, 2001)</title>
221 <itemizedlist>
222 <listitem><para>Fix PNIC problems (Manfred Spraul)</para></listitem>
223 <listitem><para>Add new PCI id for Accton comet</para></listitem>
224 <listitem><para>Support Davicom tulips</para></listitem>
225 <listitem><para>Fix oops in eeprom parsing</para></listitem>
226 <listitem><para>Enable workarounds for early PCI chipsets</para></listitem>
227 <listitem><para>IA64, hppa csr0 support</para></listitem>
228 <listitem><para>Support media types 5, 6</para></listitem>
229 <listitem><para>Interpret a bit more of the 21142 SROM extended media type 3</para></listitem>
230 <listitem><para>Add missing delay in eeprom reading</para></listitem>
231 </itemizedlist>
232 </sect1>
233
234 <sect1><title>Version 0.9.11 (November 3, 2000)</title>
235 <itemizedlist>
236 <listitem><para>Eliminate extra bus accesses when sharing interrupts (prumpf)</para></listitem>
237 <listitem><para>Barrier following ownership descriptor bit flip (prumpf)</para></listitem>
238 <listitem><para>Endianness fixes for >14 addresses in setup frames (prumpf)</para></listitem>
239 <listitem><para>Report link beat to kernel/userspace via netif_carrier_*. (kuznet)</para></listitem>
240 <listitem><para>Better spinlocking in set_rx_mode.</para></listitem>
241 <listitem><para>Fix I/O resource request failure error messages (DaveM catch)</para></listitem>
242 <listitem><para>Handle DMA allocation failure.</para></listitem>
243 </itemizedlist>
244 </sect1>
245
246 <sect1><title>Version 0.9.10 (September 6, 2000)</title>
247 <itemizedlist>
248 <listitem><para>Simple interrupt mitigation (via jamal)</para></listitem>
249 <listitem><para>More PCI ids</para></listitem>
250 </itemizedlist>
251 </sect1>
252
253 <sect1><title>Version 0.9.9 (August 11, 2000)</title>
254 <itemizedlist>
255 <listitem><para>More PCI ids</para></listitem>
256 </itemizedlist>
257 </sect1>
258
259 <sect1><title>Version 0.9.8 (July 13, 2000)</title>
260 <itemizedlist>
261 <listitem><para>Correct signed/unsigned comparison for dummy frame index</para></listitem>
262 <listitem><para>Remove outdated references to struct enet_statistics</para></listitem>
263 </itemizedlist>
264 </sect1>
265
266 <sect1><title>Version 0.9.7 (June 17, 2000)</title>
267 <itemizedlist>
268 <listitem><para>Timer cleanups (Andrew Morton)</para></listitem>
269 <listitem><para>Alpha compile fix (somebody?)</para></listitem>
270 </itemizedlist>
271 </sect1>
272
273 <sect1><title>Version 0.9.6 (May 31, 2000)</title>
274 <itemizedlist>
275 <listitem><para>Revert 21143-related support flag patch</para></listitem>
276 <listitem><para>Add HPPA/media-table debugging printk</para></listitem>
277 </itemizedlist>
278 </sect1>
279
280 <sect1><title>Version 0.9.5 (May 30, 2000)</title>
281 <itemizedlist>
282 <listitem><para>HPPA support (willy@puffingroup)</para></listitem>
283 <listitem><para>CSR6 bits and tulip.h cleanup (Chris Smith)</para></listitem>
284 <listitem><para>Improve debugging messages a bit</para></listitem>
285 <listitem><para>Add delay after CSR13 write in t21142_start_nway</para></listitem>
286 <listitem><para>Remove unused ETHER_STATS code</para></listitem>
287 <listitem><para>Convert 'extern inline' to 'static inline' in tulip.h (Chris Smith)</para></listitem>
288 <listitem><para>Update DS21143 support flags in tulip_chip_info[]</para></listitem>
289 <listitem><para>Use spin_lock_irq, not _irqsave/restore, in tulip_start_xmit()</para></listitem>
290 <listitem><para>Add locking to set_rx_mode()</para></listitem>
291 <listitem><para>Fix race with chip setting DescOwned bit (Hal Murray)</para></listitem>
292 <listitem><para>Request 100% of PIO and MMIO resource space assigned to card</para></listitem>
293 <listitem><para>Remove error message from pci_enable_device failure</para></listitem>
294 </itemizedlist>
295 </sect1>
296
297 <sect1><title>Version 0.9.4.3 (April 14, 2000)</title>
298 <itemizedlist>
299 <listitem><para>mod_timer fix (Hal Murray)</para></listitem>
300 <listitem><para>PNIC2 resuscitation (Chris Smith)</para></listitem>
301 </itemizedlist>
302 </sect1>
303
304 <sect1><title>Version 0.9.4.2 (March 21, 2000)</title>
305 <itemizedlist>
306 <listitem><para>Fix 21041 CSR7, CSR13/14/15 handling</para></listitem>
307 <listitem><para>Merge some PCI ids from tulip 0.91x</para></listitem>
308 <listitem><para>Merge some HAS_xxx flags and flag settings from tulip 0.91x</para></listitem>
309 <listitem><para>asm/io.h fix (submitted by many) and cleanup</para></listitem>
310 <listitem><para>s/HAS_NWAY143/HAS_NWAY/</para></listitem>
311 <listitem><para>Cleanup 21041 mode reporting</para></listitem>
312 <listitem><para>Small code cleanups</para></listitem>
313 </itemizedlist>
314 </sect1>
315
316 <sect1><title>Version 0.9.4.1 (March 18, 2000)</title>
317 <itemizedlist>
318 <listitem><para>Finish PCI DMA conversion (davem)</para></listitem>
319 <listitem><para>Do not netif_start_queue() at end of tulip_tx_timeout() (kuznet)</para></listitem>
320 <listitem><para>PCI DMA fix (kuznet)</para></listitem>
321 <listitem><para>eeprom.c code cleanup</para></listitem>
322 <listitem><para>Remove Xircom Tulip crud</para></listitem>
323 </itemizedlist>
324 </sect1>
325 </chapter>
326
327</book>
diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl
new file mode 100644
index 000000000000..f3ef0bf435e9
--- /dev/null
+++ b/Documentation/DocBook/usb.tmpl
@@ -0,0 +1,979 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="Linux-USB-API">
6 <bookinfo>
7 <title>The Linux-USB Host Side API</title>
8
9 <legalnotice>
10 <para>
11 This documentation is free software; you can redistribute
12 it and/or modify it under the terms of the GNU General Public
13 License as published by the Free Software Foundation; either
14 version 2 of the License, or (at your option) any later
15 version.
16 </para>
17
18 <para>
19 This program is distributed in the hope that it will be
20 useful, but WITHOUT ANY WARRANTY; without even the implied
21 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 See the GNU General Public License for more details.
23 </para>
24
25 <para>
26 You should have received a copy of the GNU General Public
27 License along with this program; if not, write to the Free
28 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
29 MA 02111-1307 USA
30 </para>
31
32 <para>
33 For more details see the file COPYING in the source
34 distribution of Linux.
35 </para>
36 </legalnotice>
37 </bookinfo>
38
39<toc></toc>
40
41<chapter id="intro">
42 <title>Introduction to USB on Linux</title>
43
44 <para>A Universal Serial Bus (USB) is used to connect a host,
45 such as a PC or workstation, to a number of peripheral
46 devices. USB uses a tree structure, with the host at the
47 root (the system's master), hubs as interior nodes, and
48 peripheral devices as leaves (and slaves).
49 Modern PCs support several such trees of USB devices, usually
50 one USB 2.0 tree (480 Mbit/sec each) with
51 a few USB 1.1 trees (12 Mbit/sec each) that are used when you
52 connect a USB 1.1 device directly to the machine's "root hub".
53 </para>
54
55 <para>That master/slave asymmetry was designed in part for
56 ease of use. It is not physically possible to assemble
57 (legal) USB cables incorrectly: all upstream "to-the-host"
58 connectors are the rectangular type, matching the sockets on
59 root hubs, and the downstream type are the squarish type
60 (or they are built in to the peripheral).
61 Software doesn't need to deal with distributed autoconfiguration
62 since the pre-designated master node manages all that.
63 At the electrical level, bus protocol overhead is reduced by
64 eliminating arbitration and moving scheduling into host software.
65 </para>
66
67 <para>USB 1.0 was announced in January 1996, and was revised
68 as USB 1.1 (with improvements in hub specification and
69 support for interrupt-out transfers) in September 1998.
70 USB 2.0 was released in April 2000, including high speed
71 transfers and transaction translating hubs (used for USB 1.1
72 and 1.0 backward compatibility).
73 </para>
74
75 <para>USB support was added to Linux early in the 2.2 kernel series
76 shortly before the 2.3 development forked off. Updates
77 from 2.3 were regularly folded back into 2.2 releases, bringing
78 new features such as <filename>/sbin/hotplug</filename> support,
79 more drivers, and more robustness.
80 The 2.5 kernel series continued such improvements, and also
81 worked on USB 2.0 support,
82 higher performance,
83 better consistency between host controller drivers,
84 API simplification (to make bugs less likely),
85 and providing internal "kerneldoc" documentation.
86 </para>
87
88 <para>Linux can run inside USB devices as well as on
89 the hosts that control the devices.
90 Because the Linux 2.x USB support evolved to support mass market
91 platforms such as Apple Macintosh or PC-compatible systems,
92 it didn't address design concerns for those types of USB systems.
93 So it can't be used inside mass-market PDAs, or other peripherals.
94 USB device drivers running inside those Linux peripherals
95 don't do the same things as the ones running inside hosts,
96 and so they've been given a different name:
97 they're called <emphasis>gadget drivers</emphasis>.
98 This document does not present gadget drivers.
99 </para>
100
101 </chapter>
102
103<chapter id="host">
104 <title>USB Host-Side API Model</title>
105
106 <para>Within the kernel,
107 host-side drivers for USB devices talk to the "usbcore" APIs.
108 There are two types of public "usbcore" APIs, targeted at two different
109 layers of USB driver. Those are
110 <emphasis>general purpose</emphasis> drivers, exposed through
111 driver frameworks such as block, character, or network devices;
112 and drivers that are <emphasis>part of the core</emphasis>,
113 which are involved in managing a USB bus.
114 Such core drivers include the <emphasis>hub</emphasis> driver,
115 which manages trees of USB devices, and several different kinds
116 of <emphasis>host controller driver (HCD)</emphasis>,
117 which control individual busses.
118 </para>
119
120 <para>The device model seen by USB drivers is relatively complex.
121 </para>
122
123 <itemizedlist>
124
125 <listitem><para>USB supports four kinds of data transfer
126 (control, bulk, interrupt, and isochronous). Two transfer
127 types use bandwidth as it's available (control and bulk),
128 while the other two types of transfer (interrupt and isochronous)
129 are scheduled to provide guaranteed bandwidth.
130 </para></listitem>
131
132 <listitem><para>The device description model includes one or more
133 "configurations" per device, only one of which is active at a time.
134 Devices that are capable of high speed operation must also support
135 full speed configurations, along with a way to ask about the
136 "other speed" configurations that might be used.
137 </para></listitem>
138
139 <listitem><para>Configurations have one or more "interfaces", each
140 of which may have "alternate settings". Interfaces may be
141 standardized by USB "Class" specifications, or may be specific to
142 a vendor or device.</para>
143
144 <para>USB device drivers actually bind to interfaces, not devices.
145 Think of them as "interface drivers", though you
146 may not see many devices where the distinction is important.
147 <emphasis>Most USB devices are simple, with only one configuration,
148 one interface, and one alternate setting.</emphasis>
149 </para></listitem>
150
151 <listitem><para>Interfaces have one or more "endpoints", each of
152 which supports one type and direction of data transfer such as
153 "bulk out" or "interrupt in". The entire configuration may have
154 up to sixteen endpoints in each direction, allocated as needed
155 among all the interfaces.
156 </para></listitem>
157
158 <listitem><para>Data transfer on USB is packetized; each endpoint
159 has a maximum packet size.
160 Drivers must often be aware of conventions such as flagging the end
161 of bulk transfers using "short" (including zero length) packets.
162 </para></listitem>
163
164 <listitem><para>The Linux USB API supports synchronous calls for
165 control and bulk messaging.
166 It also supports asynchronous calls for all kinds of data transfer,
167 using request structures called "URBs" (USB Request Blocks).
168 </para></listitem>
169
170 </itemizedlist>
171
172 <para>Accordingly, the USB Core API exposed to device drivers
173 covers quite a lot of territory. You'll probably need to consult
174 the USB 2.0 specification, available online from www.usb.org at
175 no cost, as well as class or device specifications.
176 </para>
177
178 <para>The only host-side drivers that actually touch hardware
179 (reading/writing registers, handling IRQs, and so on) are the HCDs.
180 In theory, all HCDs provide the same functionality through the same
181 API. In practice, that's becoming more true on the 2.5 kernels,
182 but there are still differences that crop up especially with
183 fault handling. Different controllers don't necessarily report
184 the same aspects of failures, and recovery from faults (including
185 software-induced ones like unlinking an URB) isn't yet fully
186 consistent.
187 Device driver authors should make a point of doing disconnect
188 testing (while the device is active) with each different host
189 controller driver, to make sure drivers don't have bugs of
190 their own as well as to make sure they aren't relying on some
191 HCD-specific behavior.
192 (You will need external USB 1.1 and/or
193 USB 2.0 hubs to perform all those tests.)
194 </para>
195
196 </chapter>
197
198<chapter><title>USB-Standard Types</title>
199
200 <para>In <filename>&lt;linux/usb_ch9.h&gt;</filename> you will find
201 the USB data types defined in chapter 9 of the USB specification.
202 These data types are used throughout USB, and in APIs including
203 this host side API, gadget APIs, and usbfs.
204 </para>
205
206!Iinclude/linux/usb_ch9.h
207
208 </chapter>
209
210<chapter><title>Host-Side Data Types and Macros</title>
211
212 <para>The host side API exposes several layers to drivers, some of
213 which are more necessary than others.
214 These support lifecycle models for host side drivers
215 and devices, and support passing buffers through usbcore to
216 some HCD that performs the I/O for the device driver.
217 </para>
218
219
220!Iinclude/linux/usb.h
221
222 </chapter>
223
224 <chapter><title>USB Core APIs</title>
225
226 <para>There are two basic I/O models in the USB API.
227 The most elemental one is asynchronous: drivers submit requests
228 in the form of an URB, and the URB's completion callback
229 handles the next step.
230 All USB transfer types support that model, although there
231 are special cases for control URBs (which always have setup
232 and status stages, but may not have a data stage) and
233 isochronous URBs (which allow large packets and include
234 per-packet fault reports).
235 Built on top of that is synchronous API support, where a
236 driver calls a routine that allocates one or more URBs,
237 submits them, and waits until they complete.
238 There are synchronous wrappers for single-buffer control
239 and bulk transfers (which are awkward to use in some
240 driver disconnect scenarios), and for scatterlist based
241 streaming i/o (bulk or interrupt).
242 </para>
243
244 <para>USB drivers need to provide buffers that can be
245 used for DMA, although they don't necessarily need to
246 provide the DMA mapping themselves.
247 There are APIs to use when allocating DMA buffers,
248 which can prevent use of bounce buffers on some systems.
249 In some cases, drivers may be able to rely on 64bit DMA
250 to eliminate another kind of bounce buffer.
251 </para>
252
253!Edrivers/usb/core/urb.c
254!Edrivers/usb/core/message.c
255!Edrivers/usb/core/file.c
256!Edrivers/usb/core/usb.c
257!Edrivers/usb/core/hub.c
258 </chapter>
259
260 <chapter><title>Host Controller APIs</title>
261
262 <para>These APIs are only for use by host controller drivers,
263 most of which implement standard register interfaces such as
264 EHCI, OHCI, or UHCI.
265 UHCI was one of the first interfaces, designed by Intel and
266 also used by VIA; it doesn't do much in hardware.
267 OHCI was designed later, to have the hardware do more work
268 (bigger transfers, tracking protocol state, and so on).
269 EHCI was designed with USB 2.0; its design has features that
270 resemble OHCI (hardware does much more work) as well as
271 UHCI (some parts of ISO support, TD list processing).
272 </para>
273
274 <para>There are host controllers other than the "big three",
275 although most PCI based controllers (and a few non-PCI based
276 ones) use one of those interfaces.
277 Not all host controllers use DMA; some use PIO, and there
278 is also a simulator.
279 </para>
280
281 <para>The same basic APIs are available to drivers for all
282 those controllers.
283 For historical reasons they are in two layers:
284 <structname>struct usb_bus</structname> is a rather thin
285 layer that became available in the 2.2 kernels, while
286 <structname>struct usb_hcd</structname> is a more featureful
287 layer (available in later 2.4 kernels and in 2.5) that
288 lets HCDs share common code, to shrink driver size
289 and significantly reduce hcd-specific behaviors.
290 </para>
291
292!Edrivers/usb/core/hcd.c
293!Edrivers/usb/core/hcd-pci.c
294!Edrivers/usb/core/buffer.c
295 </chapter>
296
297 <chapter>
298 <title>The USB Filesystem (usbfs)</title>
299
300 <para>This chapter presents the Linux <emphasis>usbfs</emphasis>.
301 You may prefer to avoid writing new kernel code for your
302 USB driver; that's the problem that usbfs set out to solve.
303 User mode device drivers are usually packaged as applications
304 or libraries, and may use usbfs through some programming library
305 that wraps it. Such libraries include
306 <ulink url="http://libusb.sourceforge.net">libusb</ulink>
307 for C/C++, and
308 <ulink url="http://jUSB.sourceforge.net">jUSB</ulink> for Java.
309 </para>
310
311 <note><title>Unfinished</title>
312 <para>This particular documentation is incomplete,
313 especially with respect to the asynchronous mode.
314 As of kernel 2.5.66 the code and this (new) documentation
315 need to be cross-reviewed.
316 </para>
317 </note>
318
319 <para>Configure usbfs into Linux kernels by enabling the
320 <emphasis>USB filesystem</emphasis> option (CONFIG_USB_DEVICEFS),
321 and you get basic support for user mode USB device drivers.
322 Until relatively recently it was often (confusingly) called
323 <emphasis>usbdevfs</emphasis> although it wasn't solving what
324 <emphasis>devfs</emphasis> was.
325 Every USB device will appear in usbfs, regardless of whether or
326 not it has a kernel driver; but only devices with kernel drivers
327 show up in devfs.
328 </para>
329
330 <sect1>
331 <title>What files are in "usbfs"?</title>
332
333 <para>Conventionally mounted at
334 <filename>/proc/bus/usb</filename>, usbfs
335 features include:
336 <itemizedlist>
337 <listitem><para><filename>/proc/bus/usb/devices</filename>
338 ... a text file
339 showing each of the USB devices known to the kernel,
340 and their configuration descriptors.
341 You can also poll() this to learn about new devices.
342 </para></listitem>
343 <listitem><para><filename>/proc/bus/usb/BBB/DDD</filename>
344 ... magic files
345 exposing each device's configuration descriptors, and
346 supporting a series of ioctls for making device requests,
347 including I/O to devices. (Purely for access by programs.)
348 </para></listitem>
349 </itemizedlist>
350 </para>
351
352 <para> Each bus is given a number (BBB) based on when it was
353 enumerated; within each bus, each device is given a similar
354 number (DDD).
355 Those BBB/DDD paths are not "stable" identifiers;
356 expect them to change even if you always leave the devices
357 plugged in to the same hub port.
358 <emphasis>Don't even think of saving these in application
359 configuration files.</emphasis>
360 Stable identifiers are available, for user mode applications
361 that want to use them. HID and networking devices expose
362 these stable IDs, so that for example you can be sure that
363 you told the right UPS to power down its second server.
364 "usbfs" doesn't (yet) expose those IDs.
365 </para>
366
367 </sect1>
368
369 <sect1>
370 <title>Mounting and Access Control</title>
371
372 <para>There are a number of mount options for usbfs, which will
373 be of most interest to you if you need to override the default
374 access control policy.
375 That policy is that only root may read or write device files
376 (<filename>/proc/bus/usb/BBB/DDD</filename>) although anyone may read
377 the <filename>devices</filename>
378 or <filename>drivers</filename> files.
379 I/O requests to the device also need the CAP_SYS_RAWIO capability.
380 </para>
381
382 <para>The significance of that is that by default, all user mode
383 device drivers need super-user privileges.
384 You can change modes or ownership in a driver setup
385 when the device hotplugs, or maybe just start the
386 driver right then, as a privileged server (or some activity
387 within one).
388 That's the most secure approach for multi-user systems,
389 but for single user systems ("trusted" by that user)
390 it's more convenient just to grant everyone all access
391 (using the <emphasis>devmode=0666</emphasis> option)
392 so the driver can start whenever it's needed.
393 </para>
394
395 <para>The mount options for usbfs, usable in /etc/fstab or
396 in command line invocations of <emphasis>mount</emphasis>, are:
397
398 <variablelist>
399 <varlistentry>
400 <term><emphasis>busgid</emphasis>=NNNNN</term>
401 <listitem><para>Controls the GID used for the
402 /proc/bus/usb/BBB
403 directories. (Default: 0)</para></listitem></varlistentry>
404 <varlistentry><term><emphasis>busmode</emphasis>=MMM</term>
405 <listitem><para>Controls the file mode used for the
406 /proc/bus/usb/BBB
407 directories. (Default: 0555)
408 </para></listitem></varlistentry>
409 <varlistentry><term><emphasis>busuid</emphasis>=NNNNN</term>
410 <listitem><para>Controls the UID used for the
411 /proc/bus/usb/BBB
412 directories. (Default: 0)</para></listitem></varlistentry>
413
414 <varlistentry><term><emphasis>devgid</emphasis>=NNNNN</term>
415 <listitem><para>Controls the GID used for the
416 /proc/bus/usb/BBB/DDD
417 files. (Default: 0)</para></listitem></varlistentry>
418 <varlistentry><term><emphasis>devmode</emphasis>=MMM</term>
419 <listitem><para>Controls the file mode used for the
420 /proc/bus/usb/BBB/DDD
421 files. (Default: 0644)</para></listitem></varlistentry>
422 <varlistentry><term><emphasis>devuid</emphasis>=NNNNN</term>
423 <listitem><para>Controls the UID used for the
424 /proc/bus/usb/BBB/DDD
425 files. (Default: 0)</para></listitem></varlistentry>
426
427 <varlistentry><term><emphasis>listgid</emphasis>=NNNNN</term>
428 <listitem><para>Controls the GID used for the
429 /proc/bus/usb/devices and drivers files.
430 (Default: 0)</para></listitem></varlistentry>
431 <varlistentry><term><emphasis>listmode</emphasis>=MMM</term>
432 <listitem><para>Controls the file mode used for the
433 /proc/bus/usb/devices and drivers files.
434 (Default: 0444)</para></listitem></varlistentry>
435 <varlistentry><term><emphasis>listuid</emphasis>=NNNNN</term>
436 <listitem><para>Controls the UID used for the
437 /proc/bus/usb/devices and drivers files.
438 (Default: 0)</para></listitem></varlistentry>
439 </variablelist>
440
441 </para>
442
443 <para>Note that many Linux distributions hard-wire the mount options
444 for usbfs in their init scripts, such as
445 <filename>/etc/rc.d/rc.sysinit</filename>,
446 rather than making it easy to set this per-system
447 policy in <filename>/etc/fstab</filename>.
448 </para>
449
450 </sect1>
451
452 <sect1>
453 <title>/proc/bus/usb/devices</title>
454
455 <para>This file is handy for status viewing tools in user
456 mode, which can scan the text format and ignore most of it.
457 More detailed device status (including class and vendor
458 status) is available from device-specific files.
459 For information about the current format of this file,
460 see the
461 <filename>Documentation/usb/proc_usb_info.txt</filename>
462 file in your Linux kernel sources.
463 </para>
464
465 <para>Otherwise the main use for this file from programs
466 is to poll() it to get notifications of usb devices
467 as they're plugged or unplugged.
468 To see what changed, you'd need to read the file and
469 compare "before" and "after" contents, scan the filesystem,
470 or see its hotplug event.
471 </para>
472
473 </sect1>
474
475 <sect1>
476 <title>/proc/bus/usb/BBB/DDD</title>
477
478 <para>Use these files in one of these basic ways:
479 </para>
480
481 <para><emphasis>They can be read,</emphasis>
482 producing first the device descriptor
483 (18 bytes) and then the descriptors for the current configuration.
484 See the USB 2.0 spec for details about those binary data formats.
485 You'll need to convert most multibyte values from little endian
486 format to your native host byte order, although a few of the
487 fields in the device descriptor (both of the BCD-encoded fields,
488 and the vendor and product IDs) will be byteswapped for you.
489 Note that configuration descriptors include descriptors for
490 interfaces, altsettings, endpoints, and maybe additional
491 class descriptors.
492 </para>
493
494 <para><emphasis>Perform USB operations</emphasis> using
495 <emphasis>ioctl()</emphasis> requests to make endpoint I/O
496 requests (synchronously or asynchronously) or manage
497 the device.
498 These requests need the CAP_SYS_RAWIO capability,
499 as well as filesystem access permissions.
500 Only one ioctl request can be made on one of these
501 device files at a time.
502 This means that if you are synchronously reading an endpoint
503 from one thread, you won't be able to write to a different
504 endpoint from another thread until the read completes.
505 This works for <emphasis>half duplex</emphasis> protocols,
506 but otherwise you'd use asynchronous i/o requests.
507 </para>
508
509 </sect1>
510
511
512 <sect1>
513 <title>Life Cycle of User Mode Drivers</title>
514
515 <para>Such a driver first needs to find a device file
516 for a device it knows how to handle.
517 Maybe it was told about it because a
518 <filename>/sbin/hotplug</filename> event handling agent
519 chose that driver to handle the new device.
520 Or maybe it's an application that scans all the
521 /proc/bus/usb device files, and ignores most devices.
522 In either case, it should <function>read()</function> all
523 the descriptors from the device file,
524 and check them against what it knows how to handle.
525 It might just reject everything except a particular
526 vendor and product ID, or need a more complex policy.
527 </para>
528
529 <para>Never assume there will only be one such device
530 on the system at a time!
531 If your code can't handle more than one device at
532 a time, at least detect when there's more than one, and
533 have your users choose which device to use.
534 </para>
535
536 <para>Once your user mode driver knows what device to use,
537 it interacts with it in either of two styles.
538 The simple style is to make only control requests; some
539 devices don't need more complex interactions than those.
540 (An example might be software using vendor-specific control
541 requests for some initialization or configuration tasks,
542 with a kernel driver for the rest.)
543 </para>
544
545 <para>More likely, you need a more complex style driver:
546 one using non-control endpoints, reading or writing data
547 and claiming exclusive use of an interface.
548 <emphasis>Bulk</emphasis> transfers are easiest to use,
549 but only their sibling <emphasis>interrupt</emphasis> transfers
550 work with low speed devices.
551 Both interrupt and <emphasis>isochronous</emphasis> transfers
552 offer service guarantees because their bandwidth is reserved.
553 Such "periodic" transfers are awkward to use through usbfs,
554 unless you're using the asynchronous calls. However, interrupt
555 transfers can also be used in a synchronous "one shot" style.
556 </para>
557
558 <para>Your user-mode driver should never need to worry
559 about cleaning up request state when the device is
560 disconnected, although it should close its open file
561 descriptors as soon as it starts seeing the ENODEV
562 errors.
563 </para>
564
565 </sect1>
566
567 <sect1><title>The ioctl() Requests</title>
568
569 <para>To use these ioctls, you need to include the following
570 headers in your userspace program:
571<programlisting>#include &lt;linux/usb.h&gt;
572#include &lt;linux/usbdevice_fs.h&gt;
573#include &lt;asm/byteorder.h&gt;</programlisting>
574 The standard USB device model requests, from "Chapter 9" of
575 the USB 2.0 specification, are automatically included from
576 the <filename>&lt;linux/usb_ch9.h&gt;</filename> header.
577 </para>
578
579 <para>Unless noted otherwise, the ioctl requests
580 described here will
581 update the modification time on the usbfs file to which
582 they are applied (unless they fail).
583 A return of zero indicates success; otherwise, a
584 standard USB error code is returned. (These are
585 documented in
586 <filename>Documentation/usb/error-codes.txt</filename>
587 in your kernel sources.)
588 </para>
589
590 <para>Each of these files multiplexes access to several
591 I/O streams, one per endpoint.
592 Each device has one control endpoint (endpoint zero)
593 which supports a limited RPC style access.
594 Devices are configured
595 by khubd (in the kernel) setting a device-wide
596 <emphasis>configuration</emphasis> that affects things
597 like power consumption and basic functionality.
598 The endpoints are part of USB <emphasis>interfaces</emphasis>,
599 which may have <emphasis>altsettings</emphasis>
600 affecting things like which endpoints are available.
601 Many devices only have a single configuration and interface,
602 so drivers for them will ignore configurations and altsettings.
603 </para>
604
605
606 <sect2>
607 <title>Management/Status Requests</title>
608
609 <para>A number of usbfs requests don't deal very directly
610 with device I/O.
611 They mostly relate to device management and status.
612 These are all synchronous requests.
613 </para>
614
615 <variablelist>
616
617 <varlistentry><term>USBDEVFS_CLAIMINTERFACE</term>
618 <listitem><para>This is used to force usbfs to
619 claim a specific interface,
620 which has not previously been claimed by usbfs or any other
621 kernel driver.
622 The ioctl parameter is an integer holding the number of
623 the interface (bInterfaceNumber from descriptor).
624 </para><para>
625 Note that if your driver doesn't claim an interface
626 before trying to use one of its endpoints, and no
627 other driver has bound to it, then the interface is
628 automatically claimed by usbfs.
629 </para><para>
630 This claim will be released by a RELEASEINTERFACE ioctl,
631 or by closing the file descriptor.
632 File modification time is not updated by this request.
633 </para></listitem></varlistentry>
634
635 <varlistentry><term>USBDEVFS_CONNECTINFO</term>
636 <listitem><para>Says whether the device is lowspeed.
637 The ioctl parameter points to a structure like this:
638<programlisting>struct usbdevfs_connectinfo {
639 unsigned int devnum;
640 unsigned char slow;
641}; </programlisting>
642 File modification time is not updated by this request.
643 </para><para>
644 <emphasis>You can't tell whether a "not slow"
645 device is connected at high speed (480 MBit/sec)
646 or just full speed (12 MBit/sec).</emphasis>
647 You should know the devnum value already,
648 it's the DDD value of the device file name.
649 </para></listitem></varlistentry>
650
651 <varlistentry><term>USBDEVFS_GETDRIVER</term>
652 <listitem><para>Returns the name of the kernel driver
653 bound to a given interface (a string). Parameter
654 is a pointer to this structure, which is modified:
655<programlisting>struct usbdevfs_getdriver {
656 unsigned int interface;
657 char driver[USBDEVFS_MAXDRIVERNAME + 1];
658};</programlisting>
659 File modification time is not updated by this request.
660 </para></listitem></varlistentry>
661
662 <varlistentry><term>USBDEVFS_IOCTL</term>
663 <listitem><para>Passes a request from userspace through
664 to a kernel driver that has an ioctl entry in the
665 <emphasis>struct usb_driver</emphasis> it registered.
666<programlisting>struct usbdevfs_ioctl {
667 int ifno;
668 int ioctl_code;
669 void *data;
670};
671
672/* user mode call looks like this.
673 * 'request' becomes the driver->ioctl() 'code' parameter.
674 * the size of 'param' is encoded in 'request', and that data
675 * is copied to or from the driver->ioctl() 'buf' parameter.
676 */
677static int
678usbdev_ioctl (int fd, int ifno, unsigned request, void *param)
679{
680 struct usbdevfs_ioctl wrapper;
681
682 wrapper.ifno = ifno;
683 wrapper.ioctl_code = request;
684 wrapper.data = param;
685
686 return ioctl (fd, USBDEVFS_IOCTL, &amp;wrapper);
687} </programlisting>
688 File modification time is not updated by this request.
689 </para><para>
690 This request lets kernel drivers talk to user mode code
691 through filesystem operations even when they don't create
692 a character or block special device.
693 It's also been used to do things like ask devices what
694 device special file should be used.
695 Two pre-defined ioctls are used
696 to disconnect and reconnect kernel drivers, so
697 that user mode code can completely manage binding
698 and configuration of devices.
699 </para></listitem></varlistentry>
700
701 <varlistentry><term>USBDEVFS_RELEASEINTERFACE</term>
702 <listitem><para>This is used to release the claim usbfs
703 made on an interface, either implicitly or because of a
704 USBDEVFS_CLAIMINTERFACE call, before the file
705 descriptor is closed.
706 The ioctl parameter is an integer holding the number of
707 the interface (bInterfaceNumber from descriptor).
708 File modification time is not updated by this request.
709 </para><warning><para>
710 <emphasis>No security check is made to ensure
711 that the task which made the claim is the one
712 which is releasing it.
713 This means that one user mode driver may interfere
714 with other ones. </emphasis>
715 </para></warning></listitem></varlistentry>
716
717 <varlistentry><term>USBDEVFS_RESETEP</term>
718 <listitem><para>Resets the data toggle value for an endpoint
719 (bulk or interrupt) to DATA0.
720 The ioctl parameter is an integer endpoint number
721 (1 to 15, as identified in the endpoint descriptor),
722 with USB_DIR_IN added if the device's endpoint sends
723 data to the host.
724 </para><warning><para>
725 <emphasis>Avoid using this request.
726 It should probably be removed.</emphasis>
727 Using it typically means the device and driver will lose
728 toggle synchronization. If you really lost synchronization,
729 you likely need to completely handshake with the device,
730 using a request like CLEAR_HALT
731 or SET_INTERFACE.
732 </para></warning></listitem></varlistentry>
733
734 </variablelist>
735
736 </sect2>
737
738 <sect2>
739 <title>Synchronous I/O Support</title>
740
741 <para>Synchronous requests involve the kernel blocking
742 until the user mode request completes, either by
743 finishing successfully or by reporting an error.
744 In most cases this is the simplest way to use usbfs,
745 although as noted above it does prevent performing I/O
746 to more than one endpoint at a time.
747 </para>
748
749 <variablelist>
750
751 <varlistentry><term>USBDEVFS_BULK</term>
752 <listitem><para>Issues a bulk read or write request to the
753 device.
754 The ioctl parameter is a pointer to this structure:
755<programlisting>struct usbdevfs_bulktransfer {
756 unsigned int ep;
757 unsigned int len;
758 unsigned int timeout; /* in milliseconds */
759 void *data;
760};</programlisting>
761 </para><para>The "ep" value identifies a
762 bulk endpoint number (1 to 15, as identified in an endpoint
763 descriptor),
764 masked with USB_DIR_IN when referring to an endpoint which
765 sends data to the host from the device.
766 The length of the data buffer is identified by "len".
767 Recent kernels support requests up to about 128KBytes.
768 <emphasis>FIXME say how read length is returned,
769 and how short reads are handled.</emphasis>
770 </para></listitem></varlistentry>
771
772 <varlistentry><term>USBDEVFS_CLEAR_HALT</term>
773 <listitem><para>Clears endpoint halt (stall) and
774 resets the endpoint toggle. This is only
775 meaningful for bulk or interrupt endpoints.
776 The ioctl parameter is an integer endpoint number
777 (1 to 15, as identified in an endpoint descriptor),
778 masked with USB_DIR_IN when referring to an endpoint which
779 sends data to the host from the device.
780 </para><para>
781 Use this on bulk or interrupt endpoints which have
782 stalled, returning <emphasis>-EPIPE</emphasis> status
783 to a data transfer request.
784 Do not issue the control request directly, since
785 that could invalidate the host's record of the
786 data toggle.
787 </para></listitem></varlistentry>
788
789 <varlistentry><term>USBDEVFS_CONTROL</term>
790 <listitem><para>Issues a control request to the device.
791 The ioctl parameter points to a structure like this:
792<programlisting>struct usbdevfs_ctrltransfer {
793 __u8 bRequestType;
794 __u8 bRequest;
795 __u16 wValue;
796 __u16 wIndex;
797 __u16 wLength;
798 __u32 timeout; /* in milliseconds */
799 void *data;
800};</programlisting>
801 </para><para>
802 The first eight bytes of this structure are the contents
803 of the SETUP packet to be sent to the device; see the
804 USB 2.0 specification for details.
805 The bRequestType value is composed by combining a
806 USB_TYPE_* value, a USB_DIR_* value, and a
807 USB_RECIP_* value (from
808 <emphasis>&lt;linux/usb.h&gt;</emphasis>).
809 If wLength is nonzero, it describes the length of the data
810 buffer, which is either written to the device
811 (USB_DIR_OUT) or read from the device (USB_DIR_IN).
812 </para><para>
813 At this writing, you can't transfer more than 4 KBytes
814 of data to or from a device; usbfs has a limit, and
815 some host controller drivers have a limit.
816 (That's not usually a problem.)
817 <emphasis>Also</emphasis> there's no way to say it's
818 not OK to get a short read back from the device.
819 </para></listitem></varlistentry>
820
821 <varlistentry><term>USBDEVFS_RESET</term>
822 <listitem><para>Does a USB level device reset.
823 The ioctl parameter is ignored.
824 After the reset, this rebinds all device interfaces.
825 File modification time is not updated by this request.
826 </para><warning><para>
827 <emphasis>Avoid using this call</emphasis>
828 until some usbcore bugs get fixed,
829 since it does not fully synchronize device, interface,
830 and driver (not just usbfs) state.
831 </para></warning></listitem></varlistentry>
832
833 <varlistentry><term>USBDEVFS_SETINTERFACE</term>
834 <listitem><para>Sets the alternate setting for an
835 interface. The ioctl parameter is a pointer to a
836 structure like this:
837<programlisting>struct usbdevfs_setinterface {
838 unsigned int interface;
839 unsigned int altsetting;
840}; </programlisting>
841 File modification time is not updated by this request.
842 </para><para>
843 Those struct members are from some interface descriptor
844 applying to the current configuration.
845 The interface number is the bInterfaceNumber value, and
846 the altsetting number is the bAlternateSetting value.
847 (This resets each endpoint in the interface.)
848 </para></listitem></varlistentry>
849
850 <varlistentry><term>USBDEVFS_SETCONFIGURATION</term>
851 <listitem><para>Issues the
852 <function>usb_set_configuration</function> call
853 for the device.
854 The parameter is an integer holding the number of
855 a configuration (bConfigurationValue from descriptor).
856 File modification time is not updated by this request.
857 </para><warning><para>
858 <emphasis>Avoid using this call</emphasis>
859 until some usbcore bugs get fixed,
860 since it does not fully synchronize device, interface,
861 and driver (not just usbfs) state.
862 </para></warning></listitem></varlistentry>
863
864 </variablelist>
865 </sect2>
866
867 <sect2>
868 <title>Asynchronous I/O Support</title>
869
870 <para>As mentioned above, there are situations where it may be
871 important to initiate concurrent operations from user mode code.
872 This is particularly important for periodic transfers
873 (interrupt and isochronous), but it can be used for other
874 kinds of USB requests too.
875 In such cases, the asynchronous requests described here
876 are essential. Rather than submitting one request and having
877 the kernel block until it completes, the blocking is separate.
878 </para>
879
880 <para>These requests are packaged into a structure that
881 resembles the URB used by kernel device drivers.
882 (No POSIX Async I/O support here, sorry.)
883 It identifies the endpoint type (USBDEVFS_URB_TYPE_*),
884 endpoint (number, masked with USB_DIR_IN as appropriate),
885 buffer and length, and a user "context" value serving to
886 uniquely identify each request.
887 (It's usually a pointer to per-request data.)
888 Flags can modify requests (not as many as supported for
889 kernel drivers).
890 </para>
891
892 <para>Each request can specify a realtime signal number
893 (between SIGRTMIN and SIGRTMAX, inclusive) to request a
894 signal be sent when the request completes.
895 </para>
896
897 <para>When usbfs returns these urbs, the status value
898 is updated, and the buffer may have been modified.
899 Except for isochronous transfers, the actual_length is
900 updated to say how many bytes were transferred; if the
901 USBDEVFS_URB_DISABLE_SPD flag is set
902 ("short packets are not OK") and fewer bytes were read
903 than were requested, then you get an error report.
904 </para>
905
906<programlisting>struct usbdevfs_iso_packet_desc {
907 unsigned int length;
908 unsigned int actual_length;
909 unsigned int status;
910};
911
912struct usbdevfs_urb {
913 unsigned char type;
914 unsigned char endpoint;
915 int status;
916 unsigned int flags;
917 void *buffer;
918 int buffer_length;
919 int actual_length;
920 int start_frame;
921 int number_of_packets;
922 int error_count;
923 unsigned int signr;
924 void *usercontext;
925 struct usbdevfs_iso_packet_desc iso_frame_desc[];
926};</programlisting>
927
928 <para> For these asynchronous requests, the file modification
929 time reflects when the request was initiated.
930 This contrasts with their use with the synchronous requests,
931 where it reflects when requests complete.
932 </para>
933
934 <variablelist>
935
936 <varlistentry><term>USBDEVFS_DISCARDURB</term>
937 <listitem><para>
938 <emphasis>TBS</emphasis>
939 File modification time is not updated by this request.
940 </para><para>
941 </para></listitem></varlistentry>
942
943 <varlistentry><term>USBDEVFS_DISCSIGNAL</term>
944 <listitem><para>
945 <emphasis>TBS</emphasis>
946 File modification time is not updated by this request.
947 </para><para>
948 </para></listitem></varlistentry>
949
950 <varlistentry><term>USBDEVFS_REAPURB</term>
951 <listitem><para>
952 <emphasis>TBS</emphasis>
953 File modification time is not updated by this request.
954 </para><para>
955 </para></listitem></varlistentry>
956
957 <varlistentry><term>USBDEVFS_REAPURBNDELAY</term>
958 <listitem><para>
959 <emphasis>TBS</emphasis>
960 File modification time is not updated by this request.
961 </para><para>
962 </para></listitem></varlistentry>
963
964 <varlistentry><term>USBDEVFS_SUBMITURB</term>
965 <listitem><para>
966 <emphasis>TBS</emphasis>
967 </para><para>
968 </para></listitem></varlistentry>
969
970 </variablelist>
971 </sect2>
972
973 </sect1>
974
975 </chapter>
976
977</book>
978<!-- vim:syntax=sgml:sw=4
979-->
diff --git a/Documentation/DocBook/via-audio.tmpl b/Documentation/DocBook/via-audio.tmpl
new file mode 100644
index 000000000000..36e642147d6b
--- /dev/null
+++ b/Documentation/DocBook/via-audio.tmpl
@@ -0,0 +1,597 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="ViaAudioGuide">
6 <bookinfo>
7 <title>Via 686 Audio Driver for Linux</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Jeff</firstname>
12 <surname>Garzik</surname>
13 </author>
14 </authorgroup>
15
16 <copyright>
17 <year>1999-2001</year>
18 <holder>Jeff Garzik</holder>
19 </copyright>
20
21 <legalnotice>
22 <para>
23 This documentation is free software; you can redistribute
24 it and/or modify it under the terms of the GNU General Public
25 License as published by the Free Software Foundation; either
26 version 2 of the License, or (at your option) any later
27 version.
28 </para>
29
30 <para>
31 This program is distributed in the hope that it will be
32 useful, but WITHOUT ANY WARRANTY; without even the implied
33 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
34 See the GNU General Public License for more details.
35 </para>
36
37 <para>
38 You should have received a copy of the GNU General Public
39 License along with this program; if not, write to the Free
40 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
41 MA 02111-1307 USA
42 </para>
43
44 <para>
45 For more details see the file COPYING in the source
46 distribution of Linux.
47 </para>
48 </legalnotice>
49 </bookinfo>
50
51<toc></toc>
52
53 <chapter id="intro">
54 <title>Introduction</title>
55 <para>
56 The Via VT82C686A "super southbridge" chips contain
57 AC97-compatible audio logic which features dual 16-bit stereo
58 PCM sound channels (full duplex), plus a third PCM channel intended for use
59 in hardware-assisted FM synthesis.
60 </para>
61 <para>
62 The current Linux kernel audio driver for this family of chips
63 supports audio playback and recording, but hardware-assisted
64 FM features, and hardware buffer direct-access (mmap)
65 support are not yet available.
66 </para>
67 <para>
68 This driver supports any Linux kernel version after 2.4.10.
69 </para>
70 <para>
71 Please send bug reports to the mailing list <email>linux-via@gtf.org</email>.
72 To subscribe, e-mail <email>majordomo@gtf.org</email> with
73 </para>
74 <programlisting>
75 subscribe linux-via
76 </programlisting>
77 <para>
78 in the body of the message.
79 </para>
80 </chapter>
81
82 <chapter id="install">
83 <title>Driver Installation</title>
84 <para>
85 To use this audio driver, select the
86 CONFIG_SOUND_VIA82CXXX option in the section Sound during kernel configuration.
87 Follow the usual kernel procedures for rebuilding the kernel,
88 or building and installing driver modules.
89 </para>
90 <para>
91 To make this driver the default audio driver, you can add the
92 following to your /etc/conf.modules file:
93 </para>
94 <programlisting>
95 alias sound via82cxxx_audio
96 </programlisting>
97 <para>
98 Note that soundcore and ac97_codec support modules
99 are also required for working audio, in addition to
100 the via82cxxx_audio module itself.
101 </para>
102 </chapter>
103
104 <chapter id="reportbug">
105 <title>Submitting a bug report</title>
106 <sect1 id="bugrepdesc"><title>Description of problem</title>
107 <para>
108 Describe the application you were using to play/record sound, and how
109 to reproduce the problem.
110 </para>
111 </sect1>
112 <sect1 id="bugrepdiag"><title>Diagnostic output</title>
113 <para>
114 Obtain the via-audio-diag diagnostics program from
115 http://sf.net/projects/gkernel/ and provide a dump of the
116 audio chip's registers while the problem is occurring. Sample command line:
117 </para>
118 <programlisting>
119 ./via-audio-diag -aps > diag-output.txt
120 </programlisting>
121 </sect1>
122 <sect1 id="bugrepdebug"><title>Driver debug output</title>
123 <para>
124 Define <constant>VIA_DEBUG</constant> at the beginning of the driver, then capture and email
125 the kernel log output. This can be viewed in the system kernel log (if
126 enabled), or via the dmesg program. Sample command line:
127 </para>
128 <programlisting>
129 dmesg > /tmp/dmesg-output.txt
130 </programlisting>
131 </sect1>
132 <sect1 id="bugrepprintk"><title>Bigger kernel message buffer</title>
133 <para>
134 If you wish to increase the size of the buffer displayed by dmesg, then
135 change the <constant>LOG_BUF_LEN</constant> macro at the top of linux/kernel/printk.c, recompile
136 your kernel, and pass the <constant>LOG_BUF_LEN</constant> value to dmesg. Sample command line with
137 <constant>LOG_BUF_LEN</constant> == 32768:
138 </para>
139 <programlisting>
140 dmesg -s 32768 > /tmp/dmesg-output.txt
141 </programlisting>
142 </sect1>
143 </chapter>
144
145 <chapter id="bugs">
146 <title>Known Bugs And Assumptions</title>
147 <para>
148 <variablelist>
149 <varlistentry><term>Low volume</term>
150 <listitem>
151 <para>
152 Volume too low on many systems. Workaround: use mixer program
153 such as xmixer to increase volume.
154 </para>
155 </listitem></varlistentry>
156
157 </variablelist>
158
159 </para>
160 </chapter>
161
162 <chapter id="thanks">
163 <title>Thanks</title>
164 <para>
165 Via for providing e-mail support, specs, and NDA'd source code.
166 </para>
167 <para>
168 MandrakeSoft for providing hacking time.
169 </para>
170 <para>
171 AC97 mixer interface fixes and debugging by Ron Cemer <email>roncemer@gte.net</email>.
172 </para>
173 <para>
174 Rui Sousa <email>rui.sousa@conexant.com</email>, for bugfixing
175 MMAP support, and several other notable fixes that resulted from
176 his hard work and testing.
177 </para>
178 <para>
179 Adrian Cox <email>adrian@humboldt.co.uk</email>, for bugfixing
180 MMAP support, and several other notable fixes that resulted from
181 his hard work and testing.
182 </para>
183 <para>
184 Thomas Sailer for further bugfixes.
185 </para>
186 </chapter>
187
188 <chapter id="notes">
189 <title>Random Notes</title>
190 <para>
191 Two /proc pseudo-files provide diagnostic information. This is generally
192 not useful to most users. Power users can disable CONFIG_SOUND_VIA82CXXX_PROCFS,
193 and remove the /proc support code. Once
194 version 2.0.0 is released, the /proc support code will be disabled by
195 default. Available /proc pseudo-files:
196 </para>
197 <programlisting>
198 /proc/driver/via/0/info
199 /proc/driver/via/0/ac97
200 </programlisting>
201 <para>
202 This driver by default supports all PCI audio devices which report
203 a vendor id of 0x1106, and a device id of 0x3058. Subsystem vendor
204 and device ids are not examined.
205 </para>
206 <para>
207 GNU indent formatting options:
208 <programlisting>
209-kr -i8 -ts8 -br -ce -bap -sob -l80 -pcs -cs -ss -bs -di1 -nbc -lp -psl
210 </programlisting>
211 </para>
212 <para>
213 Via has graciously donated e-mail support and source code to help further
214 the development of this driver. Their assistance has been invaluable
215 in the design and coding of the next major version of this driver.
216 </para>
217 <para>
218 The Via audio chip apparently provides a second PCM scatter-gather
219 DMA channel just for FM data, but does not have a full hardware MIDI
220 processor. I haven't put much thought towards a solution here, but it
221 might involve using SoftOSS midi wave table, or simply disabling MIDI
222 support altogether and using the FM PCM channel as a second (input? output?)
223 </para>
224 </chapter>
225
226 <chapter id="changelog">
227 <title>Driver ChangeLog</title>
228
229<sect1 id="version191"><title>
230Version 1.9.1
231</title>
232 <itemizedlist spacing="compact">
233 <listitem>
234 <para>
235 DSP read/write bugfixes from Thomas Sailer.
236 </para>
237 </listitem>
238
239 <listitem>
240 <para>
241 Add new PCI id for single-channel use of Via 8233.
242 </para>
243 </listitem>
244
245 <listitem>
246 <para>
247 Other bug fixes, tweaks, new ioctls.
248 </para>
249 </listitem>
250
251 </itemizedlist>
252</sect1>
253
254<sect1 id="version1115"><title>
255Version 1.1.15
256</title>
257 <itemizedlist spacing="compact">
258 <listitem>
259 <para>
260 Support for variable fragment size and variable fragment number (Rui
261 Sousa)
262 </para>
263 </listitem>
264
265 <listitem>
266 <para>
267 Fixes for the SPEED, STEREO, CHANNELS, FMT ioctls when in read &amp;
268 write mode (Rui Sousa)
269 </para>
270 </listitem>
271
272 <listitem>
273 <para>
274 Mmaped sound is now fully functional. (Rui Sousa)
275 </para>
276 </listitem>
277
278 <listitem>
279 <para>
280 Make sure to enable PCI device before reading any of its PCI
281 config information. (fixes potential hotplug problems)
282 </para>
283 </listitem>
284
285 <listitem>
286 <para>
287 Clean up code a bit and add more internal function documentation.
288 </para>
289 </listitem>
290
291 <listitem>
292 <para>
293 AC97 codec access fixes (Adrian Cox)
294 </para>
295 </listitem>
296
297 <listitem>
298 <para>
299 Big endian fixes (Adrian Cox)
300 </para>
301 </listitem>
302
303 <listitem>
304 <para>
305 MIDI support (Adrian Cox)
306 </para>
307 </listitem>
308
309 <listitem>
310 <para>
311 Detect and report locked-rate AC97 codecs. If your hardware only
312 supports 48kHz (locked rate), then your recording/playback software
313 must upsample or downsample accordingly. The hardware cannot do it.
314 </para>
315 </listitem>
316
317 <listitem>
318 <para>
319 Use new pci_request_regions and pci_disable_device functions in
320 kernel 2.4.6.
321 </para>
322 </listitem>
323
324 </itemizedlist>
325</sect1>
326
327<sect1 id="version1114"><title>
328Version 1.1.14
329</title>
330 <itemizedlist spacing="compact">
331 <listitem>
332 <para>
333 Use VM_RESERVE when available, to eliminate unnecessary page faults.
334 </para>
335 </listitem>
336 </itemizedlist>
337</sect1>
338
339<sect1 id="version1112"><title>
340Version 1.1.12
341</title>
342 <itemizedlist spacing="compact">
343 <listitem>
344 <para>
345 mmap bug fixes from Linus.
346 </para>
347 </listitem>
348 </itemizedlist>
349</sect1>
350
351<sect1 id="version1111"><title>
352Version 1.1.11
353</title>
354 <itemizedlist spacing="compact">
355 <listitem>
356 <para>
357 Many more bug fixes. mmap enabled by default, but may still be buggy.
358 </para>
359 </listitem>
360
361 <listitem>
362 <para>
363 Uses new and spiffy method of mmap'ing the DMA buffer, based
364 on a suggestion from Linus.
365 </para>
366 </listitem>
367 </itemizedlist>
368</sect1>
369
370<sect1 id="version1110"><title>
371Version 1.1.10
372</title>
373 <itemizedlist spacing="compact">
374 <listitem>
375 <para>
376 Many bug fixes. mmap enabled by default, but may still be buggy.
377 </para>
378 </listitem>
379 </itemizedlist>
380</sect1>
381
382<sect1 id="version119"><title>
383Version 1.1.9
384</title>
385 <itemizedlist spacing="compact">
386 <listitem>
387 <para>
388 Redesign and rewrite audio playback implementation. (faster and smaller, hopefully)
389 </para>
390 </listitem>
391
392 <listitem>
393 <para>
394 Implement recording and full duplex (DSP_CAP_DUPLEX) support.
395 </para>
396 </listitem>
397
398 <listitem>
399 <para>
400 Make procfs support optional.
401 </para>
402 </listitem>
403
404 <listitem>
405 <para>
406 Quick interrupt status check, to lessen overhead in interrupt
407 sharing situations.
408 </para>
409 </listitem>
410
411 <listitem>
412 <para>
413 Add mmap(2) support. Disabled for now, it is still buggy and experimental.
414 </para>
415 </listitem>
416
417 <listitem>
418 <para>
419 Surround all syscalls with a semaphore for cheap and easy SMP protection.
420 </para>
421 </listitem>
422
423 <listitem>
424 <para>
425 Fix bug in channel shutdown (hardware channel reset) code.
426 </para>
427 </listitem>
428
429 <listitem>
430 <para>
431 Remove unnecessary spinlocks (better performance).
432 </para>
433 </listitem>
434
435 <listitem>
436 <para>
437 Eliminate "unknown AFMT" message by using a different method
438 of selecting the best AFMT_xxx sound sample format for use.
439 </para>
440 </listitem>
441
442 <listitem>
443 <para>
444 Support for realtime hardware pointer position reporting
445 (DSP_CAP_REALTIME, SNDCTL_DSP_GETxPTR ioctls)
446 </para>
447 </listitem>
448
449 <listitem>
450 <para>
451 Support for capture/playback triggering
452 (DSP_CAP_TRIGGER, SNDCTL_DSP_SETTRIGGER ioctls)
453 </para>
454 </listitem>
455
456 <listitem>
457 <para>
458 SNDCTL_DSP_SETDUPLEX and SNDCTL_DSP_POST ioctls now handled.
459 </para>
460 </listitem>
461
462 <listitem>
463 <para>
464 Rewrite open(2) and close(2) logic to allow only one user at
465 a time. All other open(2) attempts will sleep until they succeed.
466 FIXME: open(O_RDONLY) and open(O_WRONLY) should be allowed to succeed.
467 </para>
468 </listitem>
469
470 <listitem>
471 <para>
472 Reviewed code to ensure that SMP and multiple audio devices
473 are fully supported.
474 </para>
475 </listitem>
476
477 </itemizedlist>
478</sect1>
479
480<sect1 id="version118"><title>
481Version 1.1.8
482</title>
483 <itemizedlist spacing="compact">
484 <listitem>
485 <para>
486 Clean up interrupt handler output. Fixes the following kernel error message:
487 </para>
488 <programlisting>
489 unhandled interrupt ...
490 </programlisting>
491 </listitem>
492
493 <listitem>
494 <para>
495 Convert documentation to DocBook, so that PDF, HTML and PostScript (.ps) output is readily
496 available.
497 </para>
498 </listitem>
499
500 </itemizedlist>
501</sect1>
502
503<sect1 id="version117"><title>
504Version 1.1.7
505</title>
506 <itemizedlist spacing="compact">
507 <listitem>
508 <para>
509 Fix module unload bug where mixer device left registered
510 after driver exit
511 </para>
512 </listitem>
513 </itemizedlist>
514</sect1>
515
516<sect1 id="version116"><title>
517Version 1.1.6
518</title>
519 <itemizedlist spacing="compact">
520 <listitem>
521 <para>
522 Rewrite via_set_rate to mimic ALSA basic AC97 rate setting
523 </para>
524 </listitem>
525 <listitem>
526 <para>
527 Remove much dead code
528 </para>
529 </listitem>
530 <listitem>
531 <para>
532 Complete spin_lock_irqsave -> spin_lock_irq conversion in via_dsp_ioctl
533 </para>
534 </listitem>
535 <listitem>
536 <para>
537 Fix build problem in via_dsp_ioctl
538 </para>
539 </listitem>
540 <listitem>
541 <para>
542 Optimize included headers to eliminate headers found in linux/sound
543 </para>
544 </listitem>
545 </itemizedlist>
546</sect1>
547
548<sect1 id="version115"><title>
549Version 1.1.5
550</title>
551 <itemizedlist spacing="compact">
552 <listitem>
553 <para>
554 Disable some overly-verbose debugging code
555 </para>
556 </listitem>
557 <listitem>
558 <para>
559 Remove unnecessary sound locks
560 </para>
561 </listitem>
562 <listitem>
563 <para>
564 Fix some ioctls for better time resolution
565 </para>
566 </listitem>
567 <listitem>
568 <para>
569 Begin spin_lock_irqsave -> spin_lock_irq conversion in via_dsp_ioctl
570 </para>
571 </listitem>
572 </itemizedlist>
573</sect1>
574
575<sect1 id="version114"><title>
576Version 1.1.4
577</title>
578 <itemizedlist spacing="compact">
579 <listitem>
580 <para>
581 Completed rewrite of driver. Eliminated SoundBlaster compatibility
582 completely, and now uses the much-faster scatter-gather DMA engine.
583 </para>
584 </listitem>
585 </itemizedlist>
586</sect1>
587
588 </chapter>
589
590 <chapter id="intfunctions">
591 <title>Internal Functions</title>
592!Isound/oss/via82cxxx_audio.c
593 </chapter>
594
595</book>
596
597
diff --git a/Documentation/DocBook/videobook.tmpl b/Documentation/DocBook/videobook.tmpl
new file mode 100644
index 000000000000..3ec6c875588a
--- /dev/null
+++ b/Documentation/DocBook/videobook.tmpl
@@ -0,0 +1,1663 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="V4LGuide">
6 <bookinfo>
7 <title>Video4Linux Programming</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Alan</firstname>
12 <surname>Cox</surname>
13 <affiliation>
14 <address>
15 <email>alan@redhat.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2000</year>
23 <holder>Alan Cox</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License as published by the Free Software Foundation; either
31 version 2 of the License, or (at your option) any later
32 version.
33 </para>
34
35 <para>
36 This program is distributed in the hope that it will be
37 useful, but WITHOUT ANY WARRANTY; without even the implied
38 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
39 See the GNU General Public License for more details.
40 </para>
41
42 <para>
43 You should have received a copy of the GNU General Public
44 License along with this program; if not, write to the Free
45 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
46 MA 02111-1307 USA
47 </para>
48
49 <para>
50 For more details see the file COPYING in the source
51 distribution of Linux.
52 </para>
53 </legalnotice>
54 </bookinfo>
55
56<toc></toc>
57
58 <chapter id="intro">
59 <title>Introduction</title>
60 <para>
61 Parts of this document first appeared in Linux Magazine under a
62 ninety day exclusivity.
63 </para>
64 <para>
65 Video4Linux is intended to provide a common programming interface
66 for the many TV and capture cards now on the market, as well as
67 parallel port and USB video cameras. Radio, teletext decoders and
68 vertical blanking data interfaces are also provided.
69 </para>
70 </chapter>
71 <chapter id="radio">
72 <title>Radio Devices</title>
73 <para>
74 There are a wide variety of radio interfaces available for PC's, and these
75 are generally very simple to program. The biggest problem with supporting
76 such devices is normally extracting documentation from the vendor.
77 </para>
78 <para>
79 The radio interface supports a simple set of control ioctls standardised
80 across all radio and tv interfaces. It does not support read or write, which
81 are used for video streams. The reason radio cards do not allow you to read
82 the audio stream into an application is that without exception they provide
83 a connection on to a soundcard. Soundcards can be used to read the radio
84 data just fine.
85 </para>
86 <sect1 id="registerradio">
87 <title>Registering Radio Devices</title>
88 <para>
89 The Video4Linux core provides an interface for registering devices. The
90 first step in writing our radio card driver is to register it.
91 </para>
92 <programlisting>
93
94
95static struct video_device my_radio =
96{
97 "My radio",
98 VID_TYPE_TUNER,
99 VID_HARDWARE_MYRADIO,
100 radio_open,
101 radio_close,
102 NULL, /* no read */
103 NULL, /* no write */
104 NULL, /* no poll */
105 radio_ioctl,
106 NULL, /* no special init function */
107 NULL /* no private data */
108};
109
110
111 </programlisting>
112 <para>
113 This declares our video4linux device driver interface. The VID_TYPE_ value
114 defines what kind of an interface we are, and defines basic capabilities.
115 </para>
116 <para>
117 The only defined value relevant for a radio card is VID_TYPE_TUNER which
118 indicates that the device can be tuned. Clearly our radio is going to have some
119 way to change channel so it is tuneable.
120 </para>
121 <para>
122 The VID_HARDWARE_ types are unique to each device. Numbers are assigned by
123 <email>alan@redhat.com</email> when device drivers are going to be released. Until then you
124 can pull a suitably large number out of your hat and use it. 10000 should be
125 safe for a very long time even allowing for the huge number of vendors
126 making new and different radio cards at the moment.
127 </para>
128 <para>
129 We declare an open and close routine, but we do not need read or write,
130 which are used to read and write video data to or from the card itself. As
131 we have no read or write there is no poll function.
132 </para>
133 <para>
134 The private initialise function is run when the device is registered. In
135 this driver we've already done all the work needed. The final pointer is a
136 private data pointer that can be used by the device driver to attach and
137 retrieve private data structures. We set this field "priv" to NULL for
138 the moment.
139 </para>
140 <para>
141 Having the structure defined is all very well but we now need to register it
142 with the kernel.
143 </para>
144 <programlisting>
145
146
147static int io = 0x320;
148
149int __init myradio_init(struct video_init *v)
150{
151 if(!request_region(io, MY_IO_SIZE, "myradio"))
152 {
153 printk(KERN_ERR
154 "myradio: port 0x%03X is in use.\n", io);
155 return -EBUSY;
156 }
157
158 if(video_device_register(&amp;my_radio, VFL_TYPE_RADIO)==-1) {
159 release_region(io, MY_IO_SIZE);
160 return -EINVAL;
161 }
162 return 0;
163}
164
165 </programlisting>
166 <para>
167 The first stage of the initialisation, as is normally the case, is to check
168 that the I/O space we are about to fiddle with doesn't belong to some other
169 driver. If it does, we leave it well alone. If the user gives the address of the
170 wrong device then we will spot this. These policies will generally avoid
171 crashing the machine.
172 </para>
173 <para>
174 Now we ask the Video4Linux layer to register the device for us. We hand it
175 our carefully designed video_device structure and also tell it which group
176 of devices we want it registered with. In this case VFL_TYPE_RADIO.
177 </para>
178 <para>
179 The types available are
180 </para>
181 <table frame="all"><title>Device Types</title>
182 <tgroup cols="3" align="left">
183 <tbody>
184 <row>
185 <entry>VFL_TYPE_RADIO</entry><entry>/dev/radio{n}</entry><entry>
186
187 Radio devices are assigned in this block. As with all of these
188 selections the actual number assignment is done by the video layer
189 according to what is free.</entry>
190 </row><row>
191 <entry>VFL_TYPE_GRABBER</entry><entry>/dev/video{n}</entry><entry>
192 Video capture devices and also -- counter-intuitively for the name --
193 hardware video playback devices such as MPEG2 cards.</entry>
194 </row><row>
195 <entry>VFL_TYPE_VBI</entry><entry>/dev/vbi{n}</entry><entry>
196 The VBI devices capture the hidden lines on a television picture
197 that carry further information like closed caption data, teletext
198 (primarily in Europe) and now Intercast and the ATVEC internet
199 television encodings.</entry>
200 </row><row>
201 <entry>VFL_TYPE_VTX</entry><entry>/dev/vtx{n}</entry><entry>
202 VTX is 'Videotext' also known as 'Teletext'. This is a system for
203 sending numbered, 40x25, mostly textual page images over the hidden
204 lines. Unlike the /dev/vbi interfaces, this is for 'smart' decoder
205 chips. (The use of the word smart here has to be taken in context,
206 the smartest teletext chips are fairly dumb pieces of technology).
207 </entry>
208 </row>
209 </tbody>
210 </tgroup>
211 </table>
212 <para>
213 We are most definitely a radio.
214 </para>
215 <para>
216 Finally we allocate our I/O space so that nobody treads on us and return 0
217 to signify general happiness with the state of the universe.
218 </para>
219 </sect1>
220 <sect1 id="openradio">
221 <title>Opening And Closing The Radio</title>
222
223 <para>
224 The functions we declared in our video_device are mostly very simple.
225 Firstly we can drop in what is basically standard code for open and close.
226 </para>
227 <programlisting>
228
229
230static int users = 0;
231
232static int radio_open(struct video_device *dev, int flags)
233{
234 if(users)
235 return -EBUSY;
236 users++;
237 return 0;
238}
239
240 </programlisting>
241 <para>
242 At open time we need to do nothing but check if someone else is also using
243 the radio card. If nobody is using it we make a note that we are using it,
244 then we ensure that nobody unloads our driver on us.
245 </para>
246 <programlisting>
247
248
249static int radio_close(struct video_device *dev)
250{
251 users--;
252}
253
254 </programlisting>
255 <para>
256 At close time we simply need to reduce the user count and allow the module
257 to become unloadable.
258 </para>
259 <para>
260 If you are sharp you will have noticed neither the open nor the close
261 routines attempt to reset or change the radio settings. This is intentional.
262 It allows an application to set up the radio and exit. It avoids a user
263 having to leave an application running all the time just to listen to the
264 radio.
265 </para>
266 </sect1>
267 <sect1 id="ioctlradio">
268 <title>The Ioctl Interface</title>
269 <para>
270 This leaves the ioctl routine, without which the driver will not be
271 terribly useful to anyone.
272 </para>
273 <programlisting>
274
275
276static int radio_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
277{
278 switch(cmd)
279 {
280 case VIDIOCGCAP:
281 {
282 struct video_capability v;
283 v.type = VID_TYPE_TUNER;
284 v.channels = 1;
285 v.audios = 1;
286 v.maxwidth = 0;
287 v.minwidth = 0;
288 v.maxheight = 0;
289 v.minheight = 0;
290 strcpy(v.name, "My Radio");
291 if(copy_to_user(arg, &amp;v, sizeof(v)))
292 return -EFAULT;
293 return 0;
294 }
295
296 </programlisting>
297 <para>
298 VIDIOCGCAP is the first ioctl all video4linux devices must support. It
299 allows the applications to find out what sort of a card they have found and
300 to figure out what they want to do about it. The fields in the structure are
301 </para>
302 <table frame="all"><title>struct video_capability fields</title>
303 <tgroup cols="2" align="left">
304 <tbody>
305 <row>
306 <entry>name</entry><entry>The device text name. This is intended for the user.</entry>
307 </row><row>
308 <entry>channels</entry><entry>The number of different channels you can tune on
309 this card. It could even be zero for a card that has
310 no tuning capability. For our simple FM radio it is 1.
311 An AM/FM radio would report 2.</entry>
312 </row><row>
313 <entry>audios</entry><entry>The number of audio inputs on this device. For our
314 radio there is only one audio input.</entry>
315 </row><row>
316 <entry>minwidth,minheight</entry><entry>The smallest size the card is capable of capturing
317 images in. We set these to zero. Radios do not
318 capture pictures</entry>
319 </row><row>
320 <entry>maxwidth,maxheight</entry><entry>The largest image size the card is capable of
321 capturing. For our radio we report 0.
322 </entry>
323 </row><row>
324 <entry>type</entry><entry>This reports the capabilities of the device, and
325 matches the field we filled in in the struct
326 video_device when registering.</entry>
327 </row>
328 </tbody>
329 </tgroup>
330 </table>
331 <para>
332 Having filled in the fields, we use copy_to_user to copy the structure into
333 the user's buffer. If the copy fails we return an EFAULT to the application
334 so that it knows it tried to feed us garbage.
335 </para>
336 <para>
337 The next pair of ioctl operations select which tuner is to be used and let
338 the application find the tuner properties. We have only a single FM band
339 tuner in our example device.
340 </para>
341 <programlisting>
342
343
344 case VIDIOCGTUNER:
345 {
346 struct video_tuner v;
347 if(copy_from_user(&amp;v, arg, sizeof(v))!=0)
348 return -EFAULT;
349 if(v.tuner)
350 return -EINVAL;
351 v.rangelow=(87*16000);
352 v.rangehigh=(108*16000);
353 v.flags = VIDEO_TUNER_LOW;
354 v.mode = VIDEO_MODE_AUTO;
355 v.signal = 0xFFFF;
356 strcpy(v.name, "FM");
357 if(copy_to_user(arg, &amp;v, sizeof(v))!=0)
358 return -EFAULT;
359 return 0;
360 }
361
362 </programlisting>
363 <para>
364 The VIDIOCGTUNER ioctl allows applications to query a tuner. The application
365 sets the tuner field to the tuner number it wishes to query. The query does
366 not change the tuner that is being used, it merely enquires about the tuner
367 in question.
368 </para>
369 <para>
370 We have exactly one tuner so after copying the user buffer to our temporary
371 structure we complain if they asked for a tuner other than tuner 0.
372 </para>
373 <para>
374 The video_tuner structure has the following fields
375 </para>
376 <table frame="all"><title>struct video_tuner fields</title>
377 <tgroup cols="2" align="left">
378 <tbody>
379 <row>
380 <entry>int tuner</entry><entry>The number of the tuner in question</entry>
381 </row><row>
382 <entry>char name[32]</entry><entry>A text description of this tuner. "FM" will do fine.
383 This is intended for the application.</entry>
384 </row><row>
385 <entry>u32 flags</entry>
386 <entry>Tuner capability flags</entry>
387 </row>
388 <row>
389 <entry>u16 mode</entry><entry>The current reception mode</entry>
390
391 </row><row>
392 <entry>u16 signal</entry><entry>The signal strength scaled between 0 and 65535. If
393 a device cannot tell the signal strength it should
394 report 65535. Many simple cards contain only a
395 signal/no signal bit. Such cards will report either
396 0 or 65535.</entry>
397
398 </row><row>
399 <entry>u32 rangelow, rangehigh</entry><entry>
400 The range of frequencies supported by the radio
401 or TV. It is scaled according to the VIDEO_TUNER_LOW
402 flag.</entry>
403
404 </row>
405 </tbody>
406 </tgroup>
407 </table>
408
409 <table frame="all"><title>struct video_tuner flags</title>
410 <tgroup cols="2" align="left">
411 <tbody>
412 <row>
413 <entry>VIDEO_TUNER_PAL</entry><entry>A PAL TV tuner</entry>
414 </row><row>
415 <entry>VIDEO_TUNER_NTSC</entry><entry>An NTSC (US) TV tuner</entry>
416 </row><row>
417 <entry>VIDEO_TUNER_SECAM</entry><entry>A SECAM (French) TV tuner</entry>
418 </row><row>
419 <entry>VIDEO_TUNER_LOW</entry><entry>
420 The tuner frequency is scaled in 1/16th of a KHz
421 steps. If not it is in 1/16th of a MHz steps
422 </entry>
423 </row><row>
424 <entry>VIDEO_TUNER_NORM</entry><entry>The tuner can set its format</entry>
425 </row><row>
426 <entry>VIDEO_TUNER_STEREO_ON</entry><entry>The tuner is currently receiving a stereo signal</entry>
427 </row>
428 </tbody>
429 </tgroup>
430 </table>
431
432 <table frame="all"><title>struct video_tuner modes</title>
433 <tgroup cols="2" align="left">
434 <tbody>
435 <row>
436 <entry>VIDEO_MODE_PAL</entry><entry>PAL Format</entry>
437 </row><row>
438 <entry>VIDEO_MODE_NTSC</entry><entry>NTSC Format (USA)</entry>
439 </row><row>
440 <entry>VIDEO_MODE_SECAM</entry><entry>French Format</entry>
441 </row><row>
442 <entry>VIDEO_MODE_AUTO</entry><entry>A device that does not need to do
443 TV format switching</entry>
444 </row>
445 </tbody>
446 </tgroup>
447 </table>
448 <para>
449 The settings for the radio card are thus fairly simple. We report that we
450 are a tuner called "FM" for FM radio. In order to get the best tuning
451 resolution we report VIDEO_TUNER_LOW and select tuning to 1/16th of a KHz. It is
452 unlikely our card can do that resolution but it is a fair bet the card can
453 do better than 1/16th of a MHz. VIDEO_TUNER_LOW is appropriate to almost all
454 radio usage.
455 </para>
456 <para>
457 We report that the tuner automatically handles deciding what format it is
458 receiving - true enough as it only handles FM radio. Our example card is
459 also incapable of detecting stereo or signal strengths so it reports a
460 strength of 0xFFFF (maximum) and no stereo detected.
461 </para>
462 <para>
463 To finish off we set the range that can be tuned to be 87-108MHz, the normal
464 FM broadcast radio range. It is important to find out what the card is
465 actually capable of tuning. It is easy enough to simply use the FM broadcast
466 range. Unfortunately if you do this you will discover the FM broadcast
467 ranges in the USA, Europe and Japan are all subtly different and some users
468 cannot receive all the stations they wish.
469 </para>
470 <para>
471 The application also needs to be able to set the tuner it wishes to use. In
472 our case, with a single tuner this is rather simple to arrange.
473 </para>
474 <programlisting>
475
476 case VIDIOCSTUNER:
477 {
478 struct video_tuner v;
479 if(copy_from_user(&amp;v, arg, sizeof(v)))
480 return -EFAULT;
481 if(v.tuner != 0)
482 return -EINVAL;
483 return 0;
484 }
485
486 </programlisting>
487 <para>
488 We copy the user supplied structure into kernel memory so we can examine it.
489 If the user has selected a tuner other than zero we reject the request. If
490 they wanted tuner 0 then, surprisingly enough, that is the current tuner already.
491 </para>
492 <para>
493 The next two ioctls we need to provide are to get and set the frequency of
494 the radio. These both use an unsigned long argument which is the frequency.
495 The scale of the frequency depends on the VIDEO_TUNER_LOW flag as I
496 mentioned earlier on. Since we have VIDEO_TUNER_LOW set this will be in
497 1/16ths of a KHz.
498 </para>
499 <programlisting>
500
501static unsigned long current_freq;
502
503
504
505 case VIDIOCGFREQ:
506 if(copy_to_user(arg, &amp;current_freq,
507 sizeof(unsigned long)))
508 return -EFAULT;
509 return 0;
510
511 </programlisting>
512 <para>
513 Querying the frequency in our case is relatively simple. Our radio card is
514 too dumb to let us query the current frequency so we remember our setting if
515 we know it. All we have to do is copy it to the user.
516 </para>
517 <programlisting>
518
519
520 case VIDIOCSFREQ:
521 {
522 unsigned long freq;
523 if(copy_from_user(&amp;freq, arg,
524 sizeof(unsigned long))!=0)
525 return -EFAULT;
526 if(hardware_set_freq(freq)&lt;0)
527 return -EINVAL;
528 current_freq = freq;
529 return 0;
530 }
531
532 </programlisting>
533 <para>
534 Setting the frequency is a little more complex. We begin by copying the
535 desired frequency into kernel space. Next we call a hardware specific routine
536 to set the radio up. This might be as simple as some scaling and a few
537 writes to an I/O port. For most radio cards it turns out a good deal more
538 complicated and may involve programming things like a phase locked loop on
539 the card. This is what documentation is for.
540 </para>
541 <para>
542 The final set of operations we need to provide for our radio are the
543 volume controls. Not all radio cards can even do volume control. After all
544 there is a perfectly good volume control on the sound card. We will assume
545 our radio card has a simple 4 step volume control.
546 </para>
547 <para>
548 There are two ioctls with audio we need to support
549 </para>
550 <programlisting>
551
552static int current_volume=0;
553
554 case VIDIOCGAUDIO:
555 {
556 struct video_audio v;
557 if(copy_from_user(&amp;v, arg, sizeof(v)))
558 return -EFAULT;
559 if(v.audio != 0)
560 return -EINVAL;
561 v.volume = 16384*current_volume;
562 v.step = 16384;
563 strcpy(v.name, "Radio");
564 v.mode = VIDEO_SOUND_MONO;
565 v.balance = 0;
566 v.bass = 0;
567 v.treble = 0;
568
569 if(copy_to_user(arg, &amp;v, sizeof(v)))
570 return -EFAULT;
571 return 0;
572 }
573
574 </programlisting>
575 <para>
576 Much like the tuner we start by copying the user structure into kernel
577 space. Again we check if the user has asked for a valid audio input. We have
578 only input 0 and we punt if they ask for another input.
579 </para>
580 <para>
581 Then we fill in the video_audio structure. This has the following format
582 </para>
583 <table frame="all"><title>struct video_audio fields</title>
584 <tgroup cols="2" align="left">
585 <tbody>
586 <row>
587 <entry>audio</entry><entry>The input the user wishes to query</entry>
588 </row><row>
589 <entry>volume</entry><entry>The volume setting on a scale of 0-65535</entry>
590 </row><row>
591 <entry>bass</entry><entry>The bass level on a scale of 0-65535</entry>
592 </row><row>
593 <entry>treble</entry><entry>The treble level on a scale of 0-65535</entry>
594 </row><row>
595 <entry>flags</entry><entry>The features this audio device supports
596 </entry>
597 </row><row>
598 <entry>name</entry><entry>A text name to display to the user. We picked
599 "Radio" as it explains things quite nicely.</entry>
600 </row><row>
601 <entry>mode</entry><entry>The current reception mode for the audio
602
603 We report MONO because our card is too stupid to know if it is in
604 mono or stereo.
605 </entry>
606 </row><row>
607 <entry>balance</entry><entry>The stereo balance on a scale of 0-65535, 32768 is
608 middle.</entry>
609 </row><row>
610 <entry>step</entry><entry>The step by which the volume control jumps. This is
611 used to help make it easy for applications to set
612 slider behaviour.</entry>
613 </row>
614 </tbody>
615 </tgroup>
616 </table>
617
618 <table frame="all"><title>struct video_audio flags</title>
619 <tgroup cols="2" align="left">
620 <tbody>
621 <row>
622 <entry>VIDEO_AUDIO_MUTE</entry><entry>The audio is currently muted. We
623 could fake this in our driver but we
624 choose not to bother.</entry>
625 </row><row>
626 <entry>VIDEO_AUDIO_MUTABLE</entry><entry>The input has a mute option</entry>
627 </row><row>
628 <entry>VIDEO_AUDIO_TREBLE</entry><entry>The input has a treble control</entry>
629 </row><row>
630 <entry>VIDEO_AUDIO_BASS</entry><entry>The input has a bass control</entry>
631 </row>
632 </tbody>
633 </tgroup>
634 </table>
635
636 <table frame="all"><title>struct video_audio modes</title>
637 <tgroup cols="2" align="left">
638 <tbody>
639 <row>
640 <entry>VIDEO_SOUND_MONO</entry><entry>Mono sound</entry>
641 </row><row>
642 <entry>VIDEO_SOUND_STEREO</entry><entry>Stereo sound</entry>
643 </row><row>
644 <entry>VIDEO_SOUND_LANG1</entry><entry>Alternative language 1 (TV specific)</entry>
645 </row><row>
646 <entry>VIDEO_SOUND_LANG2</entry><entry>Alternative language 2 (TV specific)</entry>
647 </row>
648 </tbody>
649 </tgroup>
650 </table>
651 <para>
652 Having filled in the structure we copy it back to user space.
653 </para>
654 <para>
655 The VIDIOCSAUDIO ioctl allows the user to set the audio parameters in the
656 video_audio structure. The driver does its best to honour the request.
657 </para>
658 <programlisting>
659
660 case VIDIOCSAUDIO:
661 {
662 struct video_audio v;
663 if(copy_from_user(&amp;v, arg, sizeof(v)))
664 return -EFAULT;
665 if(v.audio)
666 return -EINVAL;
667 current_volume = v.volume/16384;
668 hardware_set_volume(current_volume);
669 return 0;
670 }
671
672 </programlisting>
673 <para>
674 In our case there is very little that the user can set. The volume is
675 basically the limit. Note that we could pretend to have a mute feature
676 by rewriting this to
677 </para>
678 <programlisting>
679
680 case VIDIOCSAUDIO:
681 {
682 struct video_audio v;
683 if(copy_from_user(&amp;v, arg, sizeof(v)))
684 return -EFAULT;
685 if(v.audio)
686 return -EINVAL;
687 current_volume = v.volume/16384;
688 if(v.flags&amp;VIDEO_AUDIO_MUTE)
689 hardware_set_volume(0);
690 else
691 hardware_set_volume(current_volume);
692 current_muted = v.flags &amp;
693 VIDEO_AUDIO_MUTE;
694 return 0;
695 }
696
697 </programlisting>
698 <para>
699 This with the corresponding changes to the VIDIOCGAUDIO code to report the
700 state of the mute flag we save and to report the card has a mute function,
701 will allow applications to use a mute facility with this card. It is
702 questionable whether this is a good idea however. User applications can already
703 fake this themselves and kernel space is precious.
704 </para>
705 <para>
706 We now have a working radio ioctl handler. So we just wrap up the function
707 </para>
708 <programlisting>
709
710
711 }
712 return -ENOIOCTLCMD;
713}
714
715 </programlisting>
716 <para>
717 and pass the Video4Linux layer back an error so that it knows we did not
718 understand the request we got passed.
719 </para>
720 </sect1>
721 <sect1 id="modradio">
722 <title>Module Wrapper</title>
723 <para>
724 Finally we add in the usual module wrapping and the driver is done.
725 </para>
726 <programlisting>
727
728#ifndef MODULE
729
730static int io = 0x300;
731
732#else
733
734static int io = -1;
735
736#endif
737
738MODULE_AUTHOR("Alan Cox");
739MODULE_DESCRIPTION("A driver for an imaginary radio card.");
740module_param(io, int, 0444);
741MODULE_PARM_DESC(io, "I/O address of the card.");
742
743static int __init init(void)
744{
745 if(io==-1)
746 {
747 printk(KERN_ERR
748 "You must set an I/O address with io=0x???\n");
749 return -EINVAL;
750 }
751 return myradio_init(NULL);
752}
753
754static void __exit cleanup(void)
755{
756 video_unregister_device(&amp;my_radio);
757 release_region(io, MY_IO_SIZE);
758}
759
760module_init(init);
761module_exit(cleanup);
762
763 </programlisting>
764 <para>
765 In this example we set the IO base by default if the driver is compiled into
766 the kernel: you can still set it using "my_radio.io" if this file is called <filename>my_radio.c</filename>. For the module we require the
767 user sets the parameter. We set io to a nonsense port (-1) so that we can
768 tell if the user supplied an io parameter or not.
769 </para>
770 <para>
771 We use MODULE_ defines to give an author for the card driver and a
772 description. We also use them to declare that io is an integer and it is the
773 address of the card, and can be read by anyone from sysfs.
774 </para>
775 <para>
776 The clean-up routine unregisters the video_device we registered, and frees
777 up the I/O space. Note that the unregister takes the actual video_device
778 structure as its argument. Unlike the file operations structure which can be
779 shared by all instances of a device, a video_device structure is an actual
780 instance of the device. If you are registering multiple radio devices you
781 need to fill in one structure per device (most likely by setting up a
782 template and copying it to each of the actual device structures).
783 </para>
784 </sect1>
785 </chapter>
786 <chapter>
787 <title>Video Capture Devices</title>
788 <sect1 id="introvid">
789 <title>Video Capture Device Types</title>
790 <para>
791 The video capture devices share the same interfaces as radio devices. In
792 order to explain the video capture interface I will use the example of a
793 camera that has no tuners or audio input. This keeps the example relatively
794 clean. To get both combine the two driver examples.
795 </para>
796 <para>
797 Video capture devices divide into four categories. A little technology
798 backgrounder. Full motion video even at television resolution (which is
799 actually fairly low) is pretty resource-intensive. You are continually
800 passing megabytes of data every second from the capture card to the display.
801 Several alternative approaches have emerged because copying this through the
802 processor and the user program is a particularly bad idea.
803 </para>
804 <para>
805 The first is to add the television image onto the video output directly.
806 This is also how some 3D cards work. These basic cards can generally drop the
807 video into any chosen rectangle of the display. Cards like this, which
808 include most mpeg1 cards that used the feature connector, aren't very
809 friendly in a windowing environment. They don't understand windows or
810 clipping. The video window is always on the top of the display.
811 </para>
812 <para>
813 Chroma keying is a technique used by cards to get around this. It is an old
814 television mixing trick where you mark all the areas you wish to replace
815 with a single clear colour that isn't used in the image - TV people use an
816 incredibly bright blue while computing people often use a particularly
817 virulent purple. Bright blue occurs on the desktop. Anyone with virulent
818 purple windows has another problem besides their TV overlay.
819 </para>
820 <para>
821 The third approach is to copy the data from the capture card to the video
822 card, but to do it directly across the PCI bus. This relieves the processor
823 from doing the work but does require some smartness on the part of the video
824 capture chip, as well as a suitable video card. Programming this kind of
825 card and more so debugging it can be extremely tricky. There are some quite
826 complicated interactions with the display and you may also have to cope with
827 various chipset bugs that show up when PCI cards start talking to each
828 other.
829 </para>
830 <para>
831 To keep our example fairly simple we will assume a card that supports
832 overlaying a flat rectangular image onto the frame buffer output, and which
833 can also capture stuff into processor memory.
834 </para>
835 </sect1>
836 <sect1 id="regvid">
837 <title>Registering Video Capture Devices</title>
838 <para>
839 This time we need to add more functions for our camera device.
840 </para>
841 <programlisting>
842static struct video_device my_camera =
843{
844 "My Camera",
845 VID_TYPE_OVERLAY|VID_TYPE_SCALES|\
846 VID_TYPE_CAPTURE|VID_TYPE_CHROMAKEY,
847 VID_HARDWARE_MYCAMERA,
848 camera_open,
849 camera_close,
850 camera_read, /* read: returns a captured frame */
851 NULL, /* no write */
852 camera_poll, /* poll: waits for a frame */
853 camera_ioctl,
854 NULL, /* no special init function */
855 NULL /* no private data */
856};
857 </programlisting>
858 <para>
859 We need a read() function which is used for capturing data from
860 the card, and we need a poll function so that an application can wait for the next
861 frame to be captured.
862 </para>
863 <para>
864 We use the extra video capability flags that did not apply to the
865 radio interface. The video related flags are
866 </para>
867 <table frame="all"><title>Capture Capabilities</title>
868 <tgroup cols="2" align="left">
869 <tbody>
870 <row>
871<entry>VID_TYPE_CAPTURE</entry><entry>We support image capture</entry>
872</row><row>
873<entry>VID_TYPE_TELETEXT</entry><entry>A teletext capture device (vbi[n])</entry>
874</row><row>
875<entry>VID_TYPE_OVERLAY</entry><entry>The image can be directly overlaid onto the
876 frame buffer</entry>
877</row><row>
878<entry>VID_TYPE_CHROMAKEY</entry><entry>Chromakey can be used to select which parts
879 of the image to display</entry>
880</row><row>
881<entry>VID_TYPE_CLIPPING</entry><entry>It is possible to give the board a list of
882 rectangles to draw around. </entry>
883</row><row>
884<entry>VID_TYPE_FRAMERAM</entry><entry>The video capture goes into the video memory
885 and actually changes it. Applications need
886 to know this so they can clean up after the
887 card</entry>
888</row><row>
889<entry>VID_TYPE_SCALES</entry><entry>The image can be scaled to various sizes,
890 rather than being a single fixed size.</entry>
891</row><row>
892<entry>VID_TYPE_MONOCHROME</entry><entry>The capture will be monochrome. This isn't a
893 complete answer to the question since a mono
894 camera on a colour capture card will still
895 produce mono output.</entry>
896</row><row>
897<entry>VID_TYPE_SUBCAPTURE</entry><entry>The card allows only part of its field of
898 view to be captured. This enables
899 applications to avoid copying all of a large
900 image into memory when only some section is
901 relevant.</entry>
902 </row>
903 </tbody>
904 </tgroup>
905 </table>
906 <para>
907 We set VID_TYPE_CAPTURE so that we are seen as a capture card,
908 VID_TYPE_CHROMAKEY so the application knows it is time to draw in virulent
909 purple, and VID_TYPE_SCALES because we can be resized.
910 </para>
911 <para>
912 Our setup is fairly similar. This time we also want an interrupt line
913 for the 'frame captured' signal. Not all cards have this so some of them
914 cannot handle poll().
915 </para>
916 <programlisting>
917
918
919static int io = 0x320;
920static int irq = 11;
921
922int __init mycamera_init(struct video_init *v)
923{
924 if(!request_region(io, MY_IO_SIZE, "mycamera"))
925 {
926 printk(KERN_ERR
927 "mycamera: port 0x%03X is in use.\n", io);
928 return -EBUSY;
929 }
930
931 if(video_register_device(&amp;my_camera,
932 VFL_TYPE_GRABBER)==-1) {
933 release_region(io, MY_IO_SIZE);
934 return -EINVAL;
935 }
936 return 0;
937}
938
939 </programlisting>
940 <para>
941 This is little changed from the needs of the radio card. We specify
942 VFL_TYPE_GRABBER this time as we want to be allocated a /dev/video name.
943 </para>
944 </sect1>
945 <sect1 id="opvid">
946 <title>Opening And Closing The Capture Device</title>
947 <programlisting>
948
949
950static int users = 0;
951
952static int camera_open(struct video_device *dev, int flags)
953{
954 if(users)
955 return -EBUSY;
956 if(request_irq(irq, camera_irq, 0, "camera", dev)&lt;0)
957 return -EBUSY;
958 users++;
959 return 0;
960}
961
962
963static void camera_close(struct video_device *dev)
964{
965 users--;
966 free_irq(irq, dev);
967}
968 </programlisting>
969 <para>
970 The open and close routines are also quite similar. The only real change is
971 that we now request an interrupt for the camera device interrupt line. If we
972 cannot get the interrupt we report EBUSY to the application and give up.
973 </para>
974 </sect1>
975 <sect1 id="irqvid">
976 <title>Interrupt Handling</title>
977 <para>
978 Our example handler is for an ISA bus device. If it was PCI you would be
979 able to share the interrupt and would have set SA_SHIRQ to indicate a
980 shared IRQ. We pass the device pointer as the interrupt routine argument. We
981 don't need to since we only support one card but doing this will make it
982 easier to upgrade the driver for multiple devices in the future.
983 </para>
984 <para>
985 Our interrupt routine needs to do little if we assume the card can simply
986 queue one frame to be read after it captures it.
987 </para>
988 <programlisting>
989
990
991static struct wait_queue *capture_wait;
992static int capture_ready = 0;
993
994static void camera_irq(int irq, void *dev_id,
995 struct pt_regs *regs)
996{
997 capture_ready=1;
998 wake_up_interruptible(&amp;capture_wait);
999}
1000 </programlisting>
1001 <para>
1002 The interrupt handler is nice and simple for this card as we are assuming
1003 the card is buffering the frame for us. This means we have little to do but
1004 wake up anybody interested. We also set a capture_ready flag, as we may
1005 capture a frame before an application needs it. In this case we need to know
1006 that a frame is ready. If we had to collect the frame on the interrupt life
1007 would be more complex.
1008 </para>
1009 <para>
1010 The two new routines we need to supply are camera_read which returns a
1011 frame, and camera_poll which waits for a frame to become ready.
1012 </para>
1013 <programlisting>
1014
1015
1016static int camera_poll(struct video_device *dev,
1017 struct file *file, struct poll_table *wait)
1018{
1019 poll_wait(file, &amp;capture_wait, wait);
1020 if(capture_ready)
1021 return POLLIN|POLLRDNORM;
1022 return 0;
1023}
1024
1025 </programlisting>
1026 <para>
1027 Our wait queue for polling is the capture_wait queue. This will cause the
1028 task to be woken up by our camera_irq routine. We check capture_ready to see
1029 if there is an image present and if so report that it is readable.
1030 </para>
1031 </sect1>
1032 <sect1 id="rdvid">
1033 <title>Reading The Video Image</title>
1034 <programlisting>
1035
1036
1037static long camera_read(struct video_device *dev, char *buf,
1038 unsigned long count, int noblock)
1039{
1040 struct wait_queue wait = { current, NULL };
1041 u8 *ptr;
1042 int len;
1043 int i;
1044
1045 add_wait_queue(&amp;capture_wait, &amp;wait);
1046
1047 while(!capture_ready)
1048 {
1049 if(noblock)
1050 {
1051 remove_wait_queue(&amp;capture_wait, &amp;wait);
1052 current->state = TASK_RUNNING;
1053 return -EWOULDBLOCK;
1054 }
1055 if(signal_pending(current))
1056 {
1057 remove_wait_queue(&amp;capture_wait, &amp;wait);
1058 current->state = TASK_RUNNING;
1059 return -ERESTARTSYS;
1060 }
1061 schedule();
1062 current->state = TASK_INTERRUPTIBLE;
1063 }
1064 remove_wait_queue(&amp;capture_wait, &amp;wait);
1065 current->state = TASK_RUNNING;
1066
1067 </programlisting>
1068 <para>
1069 The first thing we have to do is to ensure that the application waits until
1070 the next frame is ready. The code here is almost identical to the mouse code
1071 we used earlier in this chapter. It is one of the common building blocks of
1072 Linux device driver code and probably one which you will find occurs in any
1073 drivers you write.
1074 </para>
1075 <para>
1076 We wait for a frame to be ready, or for a signal to interrupt our waiting. If a
1077 signal occurs we need to return from the system call so that the signal can
1078 be sent to the application itself. We also check to see if the user actually
1079 wanted to avoid waiting - i.e. if they are using non-blocking I/O and have other things
1080 to get on with.
1081 </para>
1082 <para>
1083 Next we copy the data from the card to the user application. This is rarely
1084 as easy as our example makes out. We will add capture_w, and capture_h here
1085 to hold the width and height of the captured image. We assume the card only
1086 supports 24bit RGB for now.
1087 </para>
1088 <programlisting>
1089
1090
1091
1092 capture_ready = 0;
1093
1094 ptr=(u8 *)buf;
1095 len = capture_w * 3 * capture_h; /* 24bit RGB */
1096
1097 if(len>count)
1098 len=count; /* Doesn't all fit */
1099
1100 for(i=0; i&lt;len; i++)
1101 {
1102 put_user(inb(io+IMAGE_DATA), ptr);
1103 ptr++;
1104 }
1105
1106 hardware_restart_capture();
1107
1108 return i;
1109}
1110
1111 </programlisting>
1112 <para>
1113 For a real hardware device you would try to avoid the loop with put_user().
1114 Each call to put_user() has a time overhead checking whether the accesses to user
1115 space are allowed. It would be better to read a line into a temporary buffer
1116 then copy this to user space in one go.
1117 </para>
1118 <para>
1119 Having captured the image and put it into user space we can kick the card to
1120 get the next frame acquired.
1121 </para>
1122 </sect1>
1123 <sect1 id="iocvid">
1124 <title>Video Ioctl Handling</title>
1125 <para>
1126 As with the radio driver the major control interface is via the ioctl()
1127 function. Video capture devices support the same tuner calls as a radio
1128 device and also support additional calls to control how the video functions
1129 are handled. In this simple example the card has no tuners to avoid making
1130 the code complex.
1131 </para>
1132 <programlisting>
1133
1134
1135
1136static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
1137{
1138 switch(cmd)
1139 {
1140 case VIDIOCGCAP:
1141 {
1142 struct video_capability v;
1143 v.type = VID_TYPE_CAPTURE|\
1144 VID_TYPE_CHROMAKEY|\
1145 VID_TYPE_SCALES|\
1146 VID_TYPE_OVERLAY;
1147 v.channels = 1;
1148 v.audios = 0;
1149 v.maxwidth = 640;
1150 v.minwidth = 16;
1151 v.maxheight = 480;
1152 v.minheight = 16;
1153 strcpy(v.name, "My Camera");
1154 if(copy_to_user(arg, &amp;v, sizeof(v)))
1155 return -EFAULT;
1156 return 0;
1157 }
1158
1159
1160 </programlisting>
1161 <para>
1162 The first ioctl we must support and which all video capture and radio
1163 devices are required to support is VIDIOCGCAP. This behaves exactly the same
1164 as with a radio device. This time, however, we report the extra capabilities
1165 we outlined earlier on when defining our video_device structure.
1166 </para>
1167 <para>
1168 We now set the video flags saying that we support overlay, capture,
1169 scaling and chromakey. We also report size limits - our smallest image is
1170 16x16 pixels, our largest is 640x480.
1171 </para>
1172 <para>
1173 To keep things simple we report no audio and no tuning capabilities at all.
1174 </para>
1175 <programlisting>
1176
1177 case VIDIOCGCHAN:
1178 {
1179 struct video_channel v;
1180 if(copy_from_user(&amp;v, arg, sizeof(v)))
1181 return -EFAULT;
1182 if(v.channel != 0)
1183 return -EINVAL;
1184 v.flags = 0;
1185 v.tuners = 0;
1186 v.type = VIDEO_TYPE_CAMERA;
1187 v.norm = VIDEO_MODE_AUTO;
1188 strcpy(v.name, "Camera Input");
1189 if(copy_to_user(arg, &amp;v, sizeof(v)))
1190 return -EFAULT;
1191 return 0;
1192 }
1193
1194
1195 </programlisting>
1196 <para>
1197 This follows what is very much the standard way an ioctl handler looks
1198 in Linux. We copy the data into a kernel space variable and we check that the
1199 request is valid (in this case that the input is 0). Finally we copy the
1200 camera info back to the user.
1201 </para>
1202 <para>
1203 The VIDIOCGCHAN ioctl allows a user to ask about video channels (that is
1204 inputs to the video card). Our example card has a single camera input. The
1205 fields in the structure are
1206 </para>
1207 <table frame="all"><title>struct video_channel fields</title>
1208 <tgroup cols="2" align="left">
1209 <tbody>
1210 <row>
1211
1212 <entry>channel</entry><entry>The channel number we are selecting</entry>
1213 </row><row>
1214 <entry>name</entry><entry>The name for this channel. This is intended
1215 to describe the port to the user.
1216 Appropriate names are therefore things like
1217 "Camera" or "SCART input"</entry>
1218 </row><row>
1219 <entry>flags</entry><entry>Channel properties</entry>
1220 </row><row>
1221 <entry>type</entry><entry>Input type</entry>
1222 </row><row>
1223 <entry>norm</entry><entry>The current television encoding being used
1224 if relevant for this channel.
1225 </entry>
1226 </row>
1227 </tbody>
1228 </tgroup>
1229 </table>
1230 <table frame="all"><title>struct video_channel flags</title>
1231 <tgroup cols="2" align="left">
1232 <tbody>
1233 <row>
1234 <entry>VIDEO_VC_TUNER</entry><entry>Channel has a tuner.</entry>
1235 </row><row>
1236 <entry>VIDEO_VC_AUDIO</entry><entry>Channel has audio.</entry>
1237 </row>
1238 </tbody>
1239 </tgroup>
1240 </table>
1241 <table frame="all"><title>struct video_channel types</title>
1242 <tgroup cols="2" align="left">
1243 <tbody>
1244 <row>
1245 <entry>VIDEO_TYPE_TV</entry><entry>Television input.</entry>
1246 </row><row>
1247 <entry>VIDEO_TYPE_CAMERA</entry><entry>Fixed camera input.</entry>
1248 </row><row>
1249 <entry>0</entry><entry>Type is unknown.</entry>
1250 </row>
1251 </tbody>
1252 </tgroup>
1253 </table>
1254 <table frame="all"><title>struct video_channel norms</title>
1255 <tgroup cols="2" align="left">
1256 <tbody>
1257 <row>
1258 <entry>VIDEO_MODE_PAL</entry><entry>PAL encoded Television</entry>
1259 </row><row>
1260 <entry>VIDEO_MODE_NTSC</entry><entry>NTSC (US) encoded Television</entry>
1261 </row><row>
1262 <entry>VIDEO_MODE_SECAM</entry><entry>SECAM (French) Television </entry>
1263 </row><row>
1264 <entry>VIDEO_MODE_AUTO</entry><entry>Automatic switching, or format does not
1265 matter</entry>
1266 </row>
1267 </tbody>
1268 </tgroup>
1269 </table>
1270 <para>
1271 The corresponding VIDIOCSCHAN ioctl allows a user to change channel and to
1272 request the norm is changed - for example to switch between a PAL or an NTSC
1273 format camera.
1274 </para>
1275 <programlisting>
1276
1277
1278 case VIDIOCSCHAN:
1279 {
1280 struct video_channel v;
1281 if(copy_from_user(&amp;v, arg, sizeof(v)))
1282 return -EFAULT;
1283 if(v.channel != 0)
1284 return -EINVAL;
1285 if(v.norm != VIDEO_MODE_AUTO)
1286 return -EINVAL;
1287 return 0;
1288 }
1289
1290
1291 </programlisting>
1292 <para>
1293 The implementation of this call in our driver is remarkably easy. Because we
1294 are assuming fixed format hardware we need only check that the user has not
1295 tried to change anything.
1296 </para>
1297 <para>
1298 The user also needs to be able to configure and adjust the picture they are
1299 seeing. This is much like adjusting a television set. A user application
1300 also needs to know the palette being used so that it knows how to display
1301 the image that has been captured. The VIDIOCGPICT and VIDIOCSPICT ioctl
1302 calls provide this information.
1303 </para>
1304 <programlisting>
1305
1306
1307 case VIDIOCGPICT:
1308 {
1309 struct video_picture v;
1310 v.brightness = hardware_brightness();
1311 v.hue = hardware_hue();
1312 v.colour = hardware_saturation();
1313 v.contrast = hardware_contrast();
1314 /* Not settable */
1315 v.whiteness = 32768;
1316 v.depth = 24; /* 24bit */
1317 v.palette = VIDEO_PALETTE_RGB24;
1318 if(copy_to_user(arg, &amp;v,
1319 sizeof(v)))
1320 return -EFAULT;
1321 return 0;
1322 }
1323
1324
1325 </programlisting>
1326 <para>
1327 The brightness, hue, color, and contrast provide the picture controls that
1328 are akin to a conventional television. Whiteness provides additional
1329 control for greyscale images. All of these values are scaled between 0-65535
1330 and have 32768 as the mid point setting. The scaling means that applications
1331 do not have to worry about the capability range of the hardware but can let
1332 it make a best effort attempt.
1333 </para>
1334 <para>
1335 Our depth is 24, as this is in bits. We will be returning RGB24 format. This
1336 has one byte of red, then one of green, then one of blue. This then repeats
1337 for every other pixel in the image. The other common formats the interface
1338 defines are
1339 </para>
1340 <table frame="all"><title>Framebuffer Encodings</title>
1341 <tgroup cols="2" align="left">
1342 <tbody>
1343 <row>
1344 <entry>GREY</entry><entry>Linear greyscale. This is for simple cameras and the
1345 like</entry>
1346 </row><row>
1347 <entry>RGB565</entry><entry>The top 5 bits hold 32 red levels, the next six bits
1348 hold green and the low 5 bits hold blue. </entry>
1349 </row><row>
1350 <entry>RGB555</entry><entry>The top bit is clear. The red green and blue levels
1351 each occupy five bits.</entry>
1352 </row>
1353 </tbody>
1354 </tgroup>
1355 </table>
1356 <para>
1357 Additional modes are supported for YUV capture formats. These are common for
1358 TV and video conferencing applications.
1359 </para>
1360 <para>
1361 The VIDIOCSPICT ioctl allows a user to set some of the picture parameters.
1362 Exactly which ones are supported depends heavily on the card itself. It is
1363 possible to support many modes and effects in software. In general doing
1364 this in the kernel is a bad idea. Video capture is a performance-sensitive
1365 application and the programs can often do better if they aren't being
1366 'helped' by an overkeen driver writer. Thus for our device we will report
1367 RGB24 only and refuse to allow a change.
1368 </para>
1369 <programlisting>
1370
1371
1372 case VIDIOCSPICT:
1373 {
1374 struct video_picture v;
1375 if(copy_from_user(&amp;v, arg, sizeof(v)))
1376 return -EFAULT;
1377 if(v.depth!=24 ||
1378 v.palette != VIDEO_PALETTE_RGB24)
1379 return -EINVAL;
1380 set_hardware_brightness(v.brightness);
1381 set_hardware_hue(v.hue);
1382 set_hardware_saturation(v.colour);
1383 set_hardware_contrast(v.contrast);
1384 return 0;
1385 }
1386
1387
1388 </programlisting>
1389 <para>
1390 We check the user has not tried to change the palette or the depth. We do
1391 not want to carry out some of the changes and then return an error. This may
1392 confuse the application which will be assuming no change occurred.
1393 </para>
1394 <para>
1395 In much the same way as you need to be able to set the picture controls to
1396 get the right capture images, many cards need to know what they are
1397 displaying onto when generating overlay output. In some cases getting this
1398 wrong even makes a nasty mess or may crash the computer. For that reason
1399 the VIDIOCSFBUF ioctl used to set up the frame buffer information may well
1400 only be usable by root.
1401 </para>
1402 <para>
1403 We will assume our card is one of the old ISA devices with feature connector
1404 and only supports a couple of standard video modes. Very common for older
1405 cards although the PCI devices are way smarter than this.
1406 </para>
1407 <programlisting>
1408
1409
1410static struct video_buffer capture_fb;
1411
1412 case VIDIOCGFBUF:
1413 {
1414 if(copy_to_user(arg, &amp;capture_fb,
1415 sizeof(capture_fb)))
1416 return -EFAULT;
1417 return 0;
1418
1419 }
1420
1421
1422 </programlisting>
1423 <para>
1424 We keep the frame buffer information in the format the ioctl uses. This
1425 makes it nice and easy to work with in the ioctl calls.
1426 </para>
1427 <programlisting>
1428
1429 case VIDIOCSFBUF:
1430 {
1431 struct video_buffer v;
1432
1433 if(!capable(CAP_SYS_ADMIN))
1434 return -EPERM;
1435
1436 if(copy_from_user(&amp;v, arg, sizeof(v)))
1437 return -EFAULT;
1438 if(v.width!=320 &amp;&amp; v.width!=640)
1439 return -EINVAL;
1440 if(v.height!=200 &amp;&amp; v.height!=240
1441 &amp;&amp; v.height!=400
1442 &amp;&amp; v.height !=480)
1443 return -EINVAL;
1444 memcpy(&amp;capture_fb, &amp;v, sizeof(v));
1445 hardware_set_fb(&amp;v);
1446 return 0;
1447 }
1448
1449
1450
1451 </programlisting>
1452 <para>
1453 The capable() function checks a user has the required capability. The Linux
1454 operating system has a set of about 30 capabilities indicating privileged
1455 access to services. The default set up gives the superuser (uid 0) all of
1456 them and nobody else has any.
1457 </para>
1458 <para>
1459 We check that the user has the SYS_ADMIN capability, that is they are
1460 allowed to operate as the machine administrator. We don't want anyone but
1461 the administrator making a mess of the display.
1462 </para>
1463 <para>
1464 Next we check for standard PC video modes (320 or 640 wide with either
1465 EGA or VGA heights). If the mode is not a standard video mode we reject it as
1466 not supported by our card. If the mode is acceptable we save it so that
1467 VIDIOCGFBUF will give the right answer next time it is called. The
1468 hardware_set_fb() function is some undescribed card specific function to
1469 program the card for the desired mode.
1470 </para>
1471 <para>
1472 Before the driver can display an overlay window it needs to know where the
1473 window should be placed, and also how large it should be. If the card
1474 supports clipping it needs to know which rectangles to omit from the
1475 display. The video_window structure is used to describe the way the image
1476 should be displayed.
1477 </para>
1478 <table frame="all"><title>struct video_window fields</title>
1479 <tgroup cols="2" align="left">
1480 <tbody>
1481 <row>
1482 <entry>width</entry><entry>The width in pixels of the desired image. The card
1483 may use a smaller size if this size is not available</entry>
1484 </row><row>
1485 <entry>height</entry><entry>The height of the image. The card may use a smaller
1486 size if this size is not available.</entry>
1487 </row><row>
1488 <entry>x</entry><entry> The X position of the top left of the window. This
1489 is in pixels relative to the left hand edge of the
1490 picture. Not all cards can display images aligned on
1491 any pixel boundary. If the position is unsuitable
1492 the card adjusts the image right and reduces the
1493 width.</entry>
1494 </row><row>
1495 <entry>y</entry><entry> The Y position of the top left of the window. This
1496 is counted in pixels relative to the top edge of the
1497 picture. As with the width if the card cannot
1498 display starting on this line it will adjust the
1499 values.</entry>
1500 </row><row>
1501 <entry>chromakey</entry><entry>The colour (expressed in RGB32 format) for the
1502 chromakey colour if chroma keying is being used. </entry>
1503 </row><row>
1504 <entry>clips</entry><entry>An array of rectangles that must not be drawn
1505 over.</entry>
1506 </row><row>
1507 <entry>clipcount</entry><entry>The number of clips in this array.</entry>
1508 </row>
1509 </tbody>
1510 </tgroup>
1511 </table>
1512 <para>
1513 Each clip is a struct video_clip which has the following fields
1514 </para>
1515 <table frame="all"><title>video_clip fields</title>
1516 <tgroup cols="2" align="left">
1517 <tbody>
1518 <row>
1519 <entry>x, y</entry><entry>Co-ordinates relative to the display</entry>
1520 </row><row>
1521 <entry>width, height</entry><entry>Width and height in pixels</entry>
1522 </row><row>
1523 <entry>next</entry><entry>A spare field for the application to use</entry>
1524 </row>
1525 </tbody>
1526 </tgroup>
1527 </table>
1528 <para>
1529 The driver is required to ensure it always draws in the area requested or a smaller area, and that it never draws in any of the areas that are clipped.
1530 This may well mean it has to leave alone small areas the application wished to be
1531 drawn.
1532 </para>
1533 <para>
1534 Our example card uses chromakey so does not have to address most of the
1535 clipping. We will add a video_window structure to our global variables to
1536 remember our parameters, as we did with the frame buffer.
1537 </para>
1538 <programlisting>
1539
1540
1541 case VIDIOCGWIN:
1542 {
1543 if(copy_to_user(arg, &amp;capture_win,
1544 sizeof(capture_win)))
1545 return -EFAULT;
1546 return 0;
1547 }
1548
1549
1550 case VIDIOCSWIN:
1551 {
1552 struct video_window v;
1553 if(copy_from_user(&amp;v, arg, sizeof(v)))
1554 return -EFAULT;
1555 if(v.width &gt; 640 || v.height &gt; 480)
1556 return -EINVAL;
1557 if(v.width &lt; 16 || v.height &lt; 16)
1558 return -EINVAL;
1559 hardware_set_key(v.chromakey);
1560 hardware_set_window(v);
1561 memcpy(&amp;capture_win, &amp;v, sizeof(v));
1562 capture_w = v.width;
1563 capture_h = v.height;
1564 return 0;
1565 }
1566
1567
1568 </programlisting>
1569 <para>
1570 Because we are using Chromakey our setup is fairly simple. Mostly we have to
1571 check the values are sane and load them into the capture card.
1572 </para>
1573 <para>
1574 With all the setup done we can now turn on the actual capture/overlay. This
1575 is done with the VIDIOCCAPTURE ioctl. This takes a single integer argument
1576 where 0 is off and any non-zero value is on.
1577 </para>
1578 <programlisting>
1579
1580
1581 case VIDIOCCAPTURE:
1582 {
1583 int v;
1584 if(get_user(v, (int *)arg))
1585 return -EFAULT;
1586 if(v==0)
1587 hardware_capture_off();
1588 else
1589 {
1590 if(capture_fb.width == 0
1591 || capture_w == 0)
1592 return -EINVAL;
1593 hardware_capture_on();
1594 }
1595 return 0;
1596 }
1597
1598
1599 </programlisting>
1600 <para>
1601 We grab the flag from user space and either enable or disable according to
1602 its value. There is one small corner case we have to consider here. Suppose
1603 that the capture was requested before the video window or the frame buffer
1604 had been set up. In those cases there will be unconfigured fields in our
1605 card data, as well as unconfigured hardware settings. We check for this case and
1606 return an error if the frame buffer or the capture window width is zero.
1607 </para>
1608 <programlisting>
1609
1610
1611 default:
1612 return -ENOIOCTLCMD;
1613 }
1614}
1615 </programlisting>
1616 <para>
1617
1618 We don't need to support any other ioctls, so if we get this far, it is time
1619 to tell the video layer that we don't know what the user is talking about.
1620 </para>
1621 </sect1>
1622 <sect1 id="endvid">
1623 <title>Other Functionality</title>
1624 <para>
1625 The Video4Linux layer supports additional features, including a high
1626 performance mmap() based capture mode and capturing part of the image.
1627 These features are out of the scope of the book. You should however have enough
1628 example code to implement most simple video4linux devices for radio and TV
1629 cards.
1630 </para>
1631 </sect1>
1632 </chapter>
1633 <chapter id="bugs">
1634 <title>Known Bugs And Assumptions</title>
1635 <para>
1636 <variablelist>
1637 <varlistentry><term>Multiple Opens</term>
1638 <listitem>
1639 <para>
1640 The driver assumes multiple opens should not be allowed. A driver
1641 can work around this but not cleanly.
1642 </para>
1643 </listitem></varlistentry>
1644
1645 <varlistentry><term>API Deficiencies</term>
1646 <listitem>
1647 <para>
1648 The existing API poorly reflects compression capable devices. There
1649 are plans afoot to merge V4L, V4L2 and some other ideas into a
1650 better interface.
1651 </para>
1652 </listitem></varlistentry>
1653 </variablelist>
1654
1655 </para>
1656 </chapter>
1657
1658 <chapter id="pubfunctions">
1659 <title>Public Functions Provided</title>
1660!Edrivers/media/video/videodev.c
1661 </chapter>
1662
1663</book>
diff --git a/Documentation/DocBook/wanbook.tmpl b/Documentation/DocBook/wanbook.tmpl
new file mode 100644
index 000000000000..9eebcc304de4
--- /dev/null
+++ b/Documentation/DocBook/wanbook.tmpl
@@ -0,0 +1,99 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="WANGuide">
6 <bookinfo>
7 <title>Synchronous PPP and Cisco HDLC Programming Guide</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Alan</firstname>
12 <surname>Cox</surname>
13 <affiliation>
14 <address>
15 <email>alan@redhat.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2000</year>
23 <holder>Alan Cox</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License as published by the Free Software Foundation; either
31 version 2 of the License, or (at your option) any later
32 version.
33 </para>
34
35 <para>
36 This program is distributed in the hope that it will be
37 useful, but WITHOUT ANY WARRANTY; without even the implied
38 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
39 See the GNU General Public License for more details.
40 </para>
41
42 <para>
43 You should have received a copy of the GNU General Public
44 License along with this program; if not, write to the Free
45 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
46 MA 02111-1307 USA
47 </para>
48
49 <para>
50 For more details see the file COPYING in the source
51 distribution of Linux.
52 </para>
53 </legalnotice>
54 </bookinfo>
55
56<toc></toc>
57
58 <chapter id="intro">
59 <title>Introduction</title>
60 <para>
61 The syncppp drivers in Linux provide a fairly complete
62 implementation of Cisco HDLC and a minimal implementation of
63 PPP. The longer term goal is to switch the PPP layer to the
64 generic PPP interface that is new in Linux 2.3.x. The API should
65 remain unchanged when this is done, but support will then be
66 available for IPX, compression and other PPP features.
67 </para>
68 </chapter>
69 <chapter id="bugs">
70 <title>Known Bugs And Assumptions</title>
71 <para>
72 <variablelist>
73 <varlistentry><term>PPP is minimal</term>
74 <listitem>
75 <para>
76 The current PPP implementation is very basic, although sufficient
77 for most wan usages.
78 </para>
79 </listitem></varlistentry>
80
81 <varlistentry><term>Cisco HDLC Quirks</term>
82 <listitem>
83 <para>
84 Currently we do not send all packets with the correct Cisco multicast
85 or unicast flags. Nothing appears to mind too much but this should
86 be corrected.
87 </para>
88 </listitem></varlistentry>
89 </variablelist>
90
91 </para>
92 </chapter>
93
94 <chapter id="pubfunctions">
95 <title>Public Functions Provided</title>
96!Edrivers/net/wan/syncppp.c
97 </chapter>
98
99</book>
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl
new file mode 100644
index 000000000000..51f3bfb6fb6e
--- /dev/null
+++ b/Documentation/DocBook/writing_usb_driver.tmpl
@@ -0,0 +1,419 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="USBDeviceDriver">
6 <bookinfo>
7 <title>Writing USB Device Drivers</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Greg</firstname>
12 <surname>Kroah-Hartman</surname>
13 <affiliation>
14 <address>
15 <email>greg@kroah.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2001-2002</year>
23 <holder>Greg Kroah-Hartman</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License as published by the Free Software Foundation; either
31 version 2 of the License, or (at your option) any later
32 version.
33 </para>
34
35 <para>
36 This program is distributed in the hope that it will be
37 useful, but WITHOUT ANY WARRANTY; without even the implied
38 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
39 See the GNU General Public License for more details.
40 </para>
41
42 <para>
43 You should have received a copy of the GNU General Public
44 License along with this program; if not, write to the Free
45 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
46 MA 02111-1307 USA
47 </para>
48
49 <para>
50 For more details see the file COPYING in the source
51 distribution of Linux.
52 </para>
53
54 <para>
55 This documentation is based on an article published in
56 Linux Journal Magazine, October 2001, Issue 90.
57 </para>
58 </legalnotice>
59 </bookinfo>
60
61<toc></toc>
62
63 <chapter id="intro">
64 <title>Introduction</title>
65 <para>
66 The Linux USB subsystem has grown from supporting only two different
67 types of devices in the 2.2.7 kernel (mice and keyboards), to over 20
68 different types of devices in the 2.4 kernel. Linux currently supports
69 almost all USB class devices (standard types of devices like keyboards,
70 mice, modems, printers and speakers) and an ever-growing number of
71 vendor-specific devices (such as USB to serial converters, digital
72 cameras, Ethernet devices and MP3 players). For a full list of the
73 different USB devices currently supported, see Resources.
74 </para>
75 <para>
76 The remaining kinds of USB devices that do not have support on Linux are
77 almost all vendor-specific devices. Each vendor decides to implement a
78 custom protocol to talk to their device, so a custom driver usually needs
79 to be created. Some vendors are open with their USB protocols and help
80 with the creation of Linux drivers, while others do not publish them, and
81 developers are forced to reverse-engineer. See Resources for some links
82 to handy reverse-engineering tools.
83 </para>
84 <para>
85 Because each different protocol causes a new driver to be created, I have
86 written a generic USB driver skeleton, modeled after the pci-skeleton.c
87 file in the kernel source tree upon which many PCI network drivers have
88 been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c
89 in the kernel source tree. In this article I will walk through the basics
90 of the skeleton driver, explaining the different pieces and what needs to
91 be done to customize it to your specific device.
92 </para>
93 </chapter>
94
95 <chapter id="basics">
96 <title>Linux USB Basics</title>
97 <para>
98 If you are going to write a Linux USB driver, please become familiar with
99 the USB protocol specification. It can be found, along with many other
100 useful documents, at the USB home page (see Resources). An excellent
101 introduction to the Linux USB subsystem can be found at the USB Working
102 Devices List (see Resources). It explains how the Linux USB subsystem is
103 structured and introduces the reader to the concept of USB urbs, which
104 are essential to USB drivers.
105 </para>
106 <para>
107 The first thing a Linux USB driver needs to do is register itself with
108 the Linux USB subsystem, giving it some information about which devices
109 the driver supports and which functions to call when a device supported
110 by the driver is inserted or removed from the system. All of this
111 information is passed to the USB subsystem in the usb_driver structure.
112 The skeleton driver declares a usb_driver as:
113 </para>
114 <programlisting>
115static struct usb_driver skel_driver = {
116 .name = "skeleton",
117 .probe = skel_probe,
118 .disconnect = skel_disconnect,
119 .fops = &amp;skel_fops,
120 .minor = USB_SKEL_MINOR_BASE,
121 .id_table = skel_table,
122};
123 </programlisting>
124 <para>
125 The variable name is a string that describes the driver. It is used in
126 informational messages printed to the system log. The probe and
127 disconnect function pointers are called when a device that matches the
128 information provided in the id_table variable is either seen or removed.
129 </para>
130 <para>
131 The fops and minor variables are optional. Most USB drivers hook into
132 another kernel subsystem, such as the SCSI, network or TTY subsystem.
133 These types of drivers register themselves with the other kernel
134 subsystem, and any user-space interactions are provided through that
135 interface. But for drivers that do not have a matching kernel subsystem,
136 such as MP3 players or scanners, a method of interacting with user space
137 is needed. The USB subsystem provides a way to register a minor device
138 number and a set of file_operations function pointers that enable this
139 user-space interaction. The skeleton driver needs this kind of interface,
140 so it provides a minor starting number and a pointer to its
141 file_operations functions.
142 </para>
143 <para>
144 The USB driver is then registered with a call to usb_register, usually in
145 the driver's init function, as shown here:
146 </para>
147 <programlisting>
148static int __init usb_skel_init(void)
149{
150 int result;
151
152 /* register this driver with the USB subsystem */
153 result = usb_register(&amp;skel_driver);
154 if (result &lt; 0) {
155 err(&quot;usb_register failed for the &quot;__FILE__ &quot;driver.&quot;
156 &quot;Error number %d&quot;, result);
157 return -1;
158 }
159
160 return 0;
161}
162module_init(usb_skel_init);
163 </programlisting>
164 <para>
165 When the driver is unloaded from the system, it needs to unregister
166 itself with the USB subsystem. This is done with the usb_deregister
167 function:
168 </para>
169 <programlisting>
170static void __exit usb_skel_exit(void)
171{
172 /* deregister this driver with the USB subsystem */
173 usb_deregister(&amp;skel_driver);
174}
175module_exit(usb_skel_exit);
176 </programlisting>
177 <para>
178 To enable the linux-hotplug system to load the driver automatically when
179 the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The
180 following code tells the hotplug scripts that this module supports a
181 single device with a specific vendor and product ID:
182 </para>
183 <programlisting>
184/* table of devices that work with this driver */
185static struct usb_device_id skel_table [] = {
186 { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) },
187 { } /* Terminating entry */
188};
189MODULE_DEVICE_TABLE (usb, skel_table);
190 </programlisting>
191 <para>
192 There are other macros that can be used in describing a usb_device_id for
193 drivers that support a whole class of USB devices. See usb.h for more
194 information on this.
195 </para>
196 </chapter>
197
198 <chapter id="device">
199 <title>Device operation</title>
200 <para>
201 When a device is plugged into the USB bus that matches the device ID
202 pattern that your driver registered with the USB core, the probe function
203 is called. A pointer to the usb_interface structure and the matching
204 usb_device_id table entry are passed to the function:
205 </para>
206 <programlisting>
207static int skel_probe(struct usb_interface *interface,
208 const struct usb_device_id *id)
209 </programlisting>
210 <para>
211 The driver now needs to verify that this device is actually one that it
212 can accept. If so, it returns 0.
213 If not, or if any error occurs during initialization, an errorcode
214 (such as <literal>-ENOMEM</literal> or <literal>-ENODEV</literal>)
215 is returned from the probe function.
216 </para>
217 <para>
218 In the skeleton driver, we determine what end points are marked as bulk-in
219 and bulk-out. We create buffers to hold the data that will be sent and
220 received from the device, and a USB urb to write data to the device is
221 initialized.
222 </para>
223 <para>
224 Conversely, when the device is removed from the USB bus, the disconnect
225 function is called with the device pointer. The driver needs to clean any
226 private data that has been allocated at this time and to shut down any
227 pending urbs that are in the USB system. The driver also unregisters
228 itself from the devfs subsystem with the call:
229 </para>
230 <programlisting>
231/* remove our devfs node */
232devfs_unregister(skel->devfs);
233 </programlisting>
234 <para>
235 Now that the device is plugged into the system and the driver is bound to
236 the device, any of the functions in the file_operations structure that
237 were passed to the USB subsystem will be called from a user program trying
238 to talk to the device. The first function called will be open, as the
239 program tries to open the device for I/O. We increment our private usage
240 count and save off a pointer to our internal structure in the file
241 structure. This is done so that future calls to file operations will
242 enable the driver to determine which device the user is addressing. All
243 of this is done with the following code:
244 </para>
245 <programlisting>
246/* increment our usage count for the module */
247++skel->open_count;
248
249/* save our object in the file's private structure */
250file->private_data = skel;
251 </programlisting>
252 <para>
253 After the open function is called, the read and write functions are called
254 to receive and send data to the device. In the skel_write function, we
255 receive a pointer to some data that the user wants to send to the device
256 and the size of the data. The function determines how much data it can
257 send to the device based on the size of the write urb it has created (this
258 size depends on the size of the bulk out end point that the device has).
259 Then it copies the data from user space to kernel space, points the urb to
260 the data and submits the urb to the USB subsystem. This can be shown in
261 the following code:
262 </para>
263 <programlisting>
264/* we can only write as much as 1 urb will hold */
265bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count;
266
267/* copy the data from user space into our urb */
268copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written);
269
270/* set up our urb */
271usb_fill_bulk_urb(skel->write_urb,
272 skel->dev,
273 usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr),
274 skel->write_urb->transfer_buffer,
275 bytes_written,
276 skel_write_bulk_callback,
277 skel);
278
279/* send the data out the bulk port */
280result = usb_submit_urb(skel->write_urb, GFP_KERNEL);
281if (result) {
282 err(&quot;Failed submitting write urb, error %d&quot;, result);
283}
284 </programlisting>
285 <para>
286 When the write urb is filled up with the proper information using the
287 usb_fill_bulk_urb function, we point the urb's completion callback to call our
288 own skel_write_bulk_callback function. This function is called when the
289 urb is finished by the USB subsystem. The callback function is called in
290 interrupt context, so caution must be taken not to do very much processing
291 at that time. Our implementation of skel_write_bulk_callback merely
292 reports if the urb was completed successfully or not and then returns.
293 </para>
294 <para>
295 The read function works a bit differently from the write function in that
296 we do not use an urb to transfer data from the device to the driver.
297 Instead we call the usb_bulk_msg function, which can be used to send or
298 receive data from a device without having to create urbs and handle
299 urb completion callback functions. We call the usb_bulk_msg function,
300 giving it a buffer into which to place any data received from the device
301 and a timeout value. If the timeout period expires without receiving any
302 data from the device, the function will fail and return an error code.
303 This can be shown with the following code:
304 </para>
305 <programlisting>
306/* do an immediate bulk read to get data from the device */
307retval = usb_bulk_msg (skel->dev,
308 usb_rcvbulkpipe (skel->dev,
309 skel->bulk_in_endpointAddr),
310 skel->bulk_in_buffer,
311 skel->bulk_in_size,
312 &amp;count, HZ*10);
313/* if the read was successful, copy the data to user space */
314if (!retval) {
315 if (copy_to_user (buffer, skel->bulk_in_buffer, count))
316 retval = -EFAULT;
317 else
318 retval = count;
319}
320 </programlisting>
321 <para>
322 The usb_bulk_msg function can be very useful for doing single reads or
323 writes to a device; however, if you need to read or write constantly to a
324 device, it is recommended to set up your own urbs and submit them to the
325 USB subsystem.
326 </para>
327 <para>
328 When the user program releases the file handle that it has been using to
329 talk to the device, the release function in the driver is called. In this
330 function we decrement our private usage count and wait for possible
331 pending writes:
332 </para>
333 <programlisting>
334/* decrement our usage count for the device */
335--skel->open_count;
336 </programlisting>
337 <para>
338 One of the more difficult problems that USB drivers must be able to handle
339 smoothly is the fact that the USB device may be removed from the system at
340 any point in time, even if a program is currently talking to it. It needs
341 to be able to shut down any current reads and writes and notify the
342 user-space programs that the device is no longer there. The following
343 code (function <function>skel_delete</function>)
344 is an example of how to do this: </para>
345 <programlisting>
346static inline void skel_delete (struct usb_skel *dev)
347{
348 if (dev->bulk_in_buffer != NULL)
349 kfree (dev->bulk_in_buffer);
350 if (dev->bulk_out_buffer != NULL)
351 usb_buffer_free (dev->udev, dev->bulk_out_size,
352 dev->bulk_out_buffer,
353 dev->write_urb->transfer_dma);
354 if (dev->write_urb != NULL)
355 usb_free_urb (dev->write_urb);
356 kfree (dev);
357}
358 </programlisting>
359 <para>
360 If a program currently has an open handle to the device, we reset the flag
361 <literal>device_present</literal>. For
362 every read, write, release and other functions that expect a device to be
363 present, the driver first checks this flag to see if the device is
364 still present. If not, it realizes that the device has disappeared, and a
365 -ENODEV error is returned to the user-space program. When the release
366 function is eventually called, it determines if there is no device
367 and if not, it does the cleanup that the skel_disconnect
368 function normally does if there are no open files on the device (see
369 Listing 5).
370 </para>
371 </chapter>
372
373 <chapter id="iso">
374 <title>Isochronous Data</title>
375 <para>
376 This usb-skeleton driver does not have any examples of interrupt or
377 isochronous data being sent to or from the device. Interrupt data is sent
378 almost exactly as bulk data is, with a few minor exceptions. Isochronous
379 data works differently with continuous streams of data being sent to or
380 from the device. The audio and video camera drivers are very good examples
381 of drivers that handle isochronous data and will be useful if you also
382 need to do this.
383 </para>
384 </chapter>
385
386 <chapter id="Conclusion">
387 <title>Conclusion</title>
388 <para>
389 Writing Linux USB device drivers is not a difficult task as the
390 usb-skeleton driver shows. This driver, combined with the other current
391 USB drivers, should provide enough examples to help a beginning author
392 create a working driver in a minimal amount of time. The linux-usb-devel
393 mailing list archives also contain a lot of helpful information.
394 </para>
395 </chapter>
396
397 <chapter id="resources">
398 <title>Resources</title>
399 <para>
400 The Linux USB Project: <ulink url="http://www.linux-usb.org">http://www.linux-usb.org/</ulink>
401 </para>
402 <para>
403 Linux Hotplug Project: <ulink url="http://linux-hotplug.sourceforge.net">http://linux-hotplug.sourceforge.net/</ulink>
404 </para>
405 <para>
406 Linux USB Working Devices List: <ulink url="http://www.qbik.ch/usb/devices">http://www.qbik.ch/usb/devices/</ulink>
407 </para>
408 <para>
409 linux-usb-devel Mailing List Archives: <ulink url="http://marc.theaimsgroup.com/?l=linux-usb-devel">http://marc.theaimsgroup.com/?l=linux-usb-devel</ulink>
410 </para>
411 <para>
412 Programming Guide for Linux USB Device Drivers: <ulink url="http://usb.cs.tum.edu/usbdoc">http://usb.cs.tum.edu/usbdoc</ulink>
413 </para>
414 <para>
415 USB Home Page: <ulink url="http://www.usb.org">http://www.usb.org</ulink>
416 </para>
417 </chapter>
418
419</book>
diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl
new file mode 100644
index 000000000000..a507876447aa
--- /dev/null
+++ b/Documentation/DocBook/z8530book.tmpl
@@ -0,0 +1,385 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="Z85230Guide">
6 <bookinfo>
7 <title>Z8530 Programming Guide</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Alan</firstname>
12 <surname>Cox</surname>
13 <affiliation>
14 <address>
15 <email>alan@redhat.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <copyright>
22 <year>2000</year>
23 <holder>Alan Cox</holder>
24 </copyright>
25
26 <legalnotice>
27 <para>
28 This documentation is free software; you can redistribute
29 it and/or modify it under the terms of the GNU General Public
30 License as published by the Free Software Foundation; either
31 version 2 of the License, or (at your option) any later
32 version.
33 </para>
34
35 <para>
36 This program is distributed in the hope that it will be
37 useful, but WITHOUT ANY WARRANTY; without even the implied
38 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
39 See the GNU General Public License for more details.
40 </para>
41
42 <para>
43 You should have received a copy of the GNU General Public
44 License along with this program; if not, write to the Free
45 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
46 MA 02111-1307 USA
47 </para>
48
49 <para>
50 For more details see the file COPYING in the source
51 distribution of Linux.
52 </para>
53 </legalnotice>
54 </bookinfo>
55
56<toc></toc>
57
58 <chapter id="intro">
59 <title>Introduction</title>
60 <para>
61 The Z85x30 family synchronous/asynchronous controller chips are
62 used on a large number of cheap network interface cards. The
63 kernel provides a core interface layer that is designed to make
64 it easy to provide WAN services using this chip.
65 </para>
66 <para>
67 The current driver only supports synchronous operation. Merging the
68 asynchronous driver support into this code to allow any Z85x30
69 device to be used as both a tty interface and as a synchronous
70 controller is a project for Linux post the 2.4 release.
71 </para>
72 <para>
73 The support code handles most common card configurations and
74 supports running both Cisco HDLC and Synchronous PPP. With extra
75 glue the frame relay and X.25 protocols can also be used with this
76 driver.
77 </para>
78 </chapter>
79
80 <chapter>
81 <title>Driver Modes</title>
82 <para>
83 The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
84 in three different modes. Each mode can be applied to an individual
85 channel on the chip (each chip has two channels).
86 </para>
87 <para>
88 The PIO synchronous mode supports the most common Z8530 wiring. Here
89 the chip is interfaced to the I/O and interrupt facilities of the
90 host machine but not to the DMA subsystem. When running PIO the
91 Z8530 has extremely tight timing requirements. Doing high speeds,
92 even with a Z85230 will be tricky. Typically you should expect to
93 achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230.
94 </para>
95 <para>
96 The DMA mode supports the chip when it is configured to use dual DMA
97 channels on an ISA bus. The better cards tend to support this mode
98 of operation for a single channel. With DMA running the Z85230 tops
99 out when it starts to hit ISA DMA constraints at about 512Kbits. It
100 is worth noting here that many PC machines hang or crash when the
101 chip is driven fast enough to hold the ISA bus solid.
102 </para>
103 <para>
104 Transmit DMA mode uses a single DMA channel. The DMA channel is used
105 for transmission as the transmit FIFO is smaller than the receive
106 FIFO. It gives better performance than pure PIO mode but is nowhere
107 near as ideal as pure DMA mode.
108 </para>
109 </chapter>
110
111 <chapter>
112 <title>Using the Z85230 driver</title>
113 <para>
114 The Z85230 driver provides the back end interface to your board. To
115 configure a Z8530 interface you need to detect the board and to
116 identify its ports and interrupt resources. It is also your problem
117 to verify the resources are available.
118 </para>
119 <para>
120 Having identified the chip you need to fill in a struct z8530_dev,
121 which describes each chip. This object must exist until you finally
122 shut down the board. Firstly zero the active field. This ensures
123 nothing goes off without you intending it. The irq field should
124 be set to the interrupt number of the chip. (Each chip has a single
125 interrupt source rather than each channel). You are responsible
126 for allocating the interrupt line. The interrupt handler should be
127 set to <function>z8530_interrupt</function>. The device id should
128 be set to the z8530_dev structure pointer. Whether the interrupt can
129 be shared or not is board dependent, and up to you to initialise.
130 </para>
131 <para>
132 The structure holds two channel structures.
133 Initialise chanA.ctrlio and chanA.dataio with the address of the
134 control and data ports. You can OR this with Z8530_PORT_SLEEP to
135 indicate your interface needs the 5uS delay for chip settling done
136 in software. The PORT_SLEEP option is architecture specific. Other
137 flags may become available on future platforms, e.g. for MMIO.
138 Initialise the chanA.irqs to &amp;z8530_nop to start the chip up
139 as disabled and discarding interrupt events. This ensures that
140 stray interrupts will be mopped up and not hang the bus. Set
141 chanA.dev to point to the device structure itself. The
142 private and name field you may use as you wish. The private field
143 is unused by the Z85230 layer. The name is used for error reporting
144 and it may thus make sense to make it match the network name.
145 </para>
146 <para>
147 Repeat the same operation with the B channel if your chip has
148 both channels wired to something useful. This isn't always the
149 case. If it is not wired then the I/O values do not matter, but
150 you must initialise chanB.dev.
151 </para>
152 <para>
153 If your board has DMA facilities then initialise the txdma and
154 rxdma fields for the relevant channels. You must also allocate the
155 ISA DMA channels and do any necessary board level initialisation
156 to configure them. The low level driver will do the Z8530 and
157 DMA controller programming but not board specific magic.
158 </para>
159 <para>
160 Having initialised the device you can then call
161 <function>z8530_init</function>. This will probe the chip and
162 reset it into a known state. An identification sequence is then
163 run to identify the chip type. If the checks fail to pass the
164 function returns a non zero error code. Typically this indicates
165 that the port given is not valid. After this call the
166 type field of the z8530_dev structure is initialised to either
167 Z8530, Z85C30 or Z85230 according to the chip found.
168 </para>
169 <para>
170 Once you have called z8530_init you can also make use of the utility
171 function <function>z8530_describe</function>. This provides a
172 consistent reporting format for the Z8530 devices, and allows all
173 the drivers to provide consistent reporting.
174 </para>
175 </chapter>
176
177 <chapter>
178 <title>Attaching Network Interfaces</title>
179 <para>
180 If you wish to use the network interface facilities of the driver,
181 then you need to attach a network device to each channel that is
182 present and in use. In addition, to use the SyncPPP and Cisco HDLC
183 you need to follow some additional plumbing rules. They may seem
184 complex but a look at the example hostess_sv11 driver should
185 reassure you.
186 </para>
187 <para>
188 The network device used for each channel should be pointed to by
189 the netdevice field of each channel. The dev-&gt;priv field of the
190 network device points to your private data - you will need to be
191 able to find your ppp device from this. In addition, to use the
192 sync ppp layer the private data must start with a void * pointer
193 to the syncppp structures.
194 </para>
195 <para>
196 The way most drivers approach this particular problem is to
197 create a structure holding the Z8530 device definition and
198 put that and the syncppp pointer into the private field of
199 the network device. The network device fields of the channels
200 then point back to the network devices. The ppp_device can also
201 be put in the private structure conveniently.
202 </para>
203 <para>
204 If you wish to use the synchronous ppp then you need to attach
205 the syncppp layer to the network device. You should do this before
206 you register the network device. The
207 <function>sppp_attach</function> requires that the first void *
208 pointer in your private data is pointing to an empty struct
209 ppp_device. The function fills in the initial data for the
210 ppp/hdlc layer.
211 </para>
212 <para>
213 Before you register your network device you will also need to
214 provide suitable handlers for most of the network device callbacks.
215 See the network device documentation for more details on this.
216 </para>
217 </chapter>
218
219 <chapter>
220 <title>Configuring And Activating The Port</title>
221 <para>
222 The Z85230 driver provides helper functions and tables to load the
223 port registers on the Z8530 chips. When programming the register
224 settings for a channel be aware that the documentation recommends
225 initialisation orders. Strange things happen when these are not
226 followed.
227 </para>
228 <para>
229 <function>z8530_channel_load</function> takes an array of
230 pairs of initialisation values in an array of u8 type. The first
231 value is the Z8530 register number. Add 16 to indicate the alternate
232 register bank on the later chips. The array is terminated by a 255.
233 </para>
234 <para>
235 The driver provides a pair of public tables. The
236 z8530_hdlc_kilostream table is for the UK 'Kilostream' service and
237 also happens to cover most other end host configurations. The
238 z8530_hdlc_kilostream_85230 table is the same configuration using
239 the enhancements of the 85230 chip. The configuration loaded is
240 standard NRZ encoded synchronous data with HDLC bitstuffing. All
241 of the timing is taken from the other end of the link.
242 </para>
243 <para>
244 When writing your own tables be aware that the driver internally
245 tracks register values. It may need to reload values. You should
246 therefore be sure to set registers 1-7, 9-11, 14 and 15 in all
247 configurations. Where the register settings depend on DMA selection
248 the driver will update the bits itself when you open or close.
249 Loading a new table with the interface open is not recommended.
250 </para>
251 <para>
252 There are three standard configurations supported by the core
253 code. In PIO mode the interface is programmed up to use
254 interrupt driven PIO. This places high demands on the host processor
255 to avoid latency. The driver is written to take account of latency
256 issues but it cannot avoid latencies caused by other drivers,
257 notably IDE in PIO mode. Because the drivers allocate buffers you
258 must also prevent MTU changes while the port is open.
259 </para>
260 <para>
261 Once the port is open it will call the rx_function of each channel
262 whenever a completed packet arrives. This is invoked from
263 interrupt context and passes you the channel and a network
264 buffer (struct sk_buff) holding the data. The data includes
265 the CRC bytes so most users will want to trim the last two
266 bytes before processing the data. This function is very timing
267 critical. When you wish to simply discard data the support
268 code provides the function <function>z8530_null_rx</function>
269 to discard the data.
270 </para>
271 <para>
272 To activate PIO mode sending and receiving the <function>
273 z8530_sync_open</function> is called. This expects to be passed
274 the network device and the channel. Typically this is called from
275 your network device open callback. On a failure a non zero error
276 status is returned. The <function>z8530_sync_close</function>
277 function shuts down a PIO channel. This must be done before the
278 channel is opened again and before the driver shuts down
279 and unloads.
280 </para>
281 <para>
282 The ideal mode of operation is dual channel DMA mode. Here the
283 kernel driver will configure the board for DMA in both directions.
284 The driver also handles ISA DMA issues such as controller
285 programming and the memory range limit for you. This mode is
286 activated by calling the <function>z8530_sync_dma_open</function>
287 function. On failure a non zero error value is returned.
288 Once this mode is activated it can be shut down by calling the
289 <function>z8530_sync_dma_close</function>. You must call the close
290 function matching the open mode you used.
291 </para>
292 <para>
293 The final supported mode uses a single DMA channel to drive the
294 transmit side. As the Z85C30 has a larger FIFO on the receive
295 channel this tends to increase the maximum speed a little.
296 This is activated by calling the <function>z8530_sync_txdma_open
297 </function>. This returns a non zero error code on failure. The
298 <function>z8530_sync_txdma_close</function> function closes down
299 the Z8530 interface from this mode.
300 </para>
301 </chapter>
302
303 <chapter>
304 <title>Network Layer Functions</title>
305 <para>
306 The Z8530 layer provides functions to queue packets for
307 transmission. The driver internally buffers the frame currently
308 being transmitted and one further frame (in order to keep back
309 to back transmission running). Any further buffering is up to
310 the caller.
311 </para>
312 <para>
313 The function <function>z8530_queue_xmit</function> takes a network
314 buffer in sk_buff format and queues it for transmission. The
315 caller must provide the entire packet with the exception of the
316 bitstuffing and CRC. This is normally done by the caller via
317 the syncppp interface layer. It returns 0 if the buffer has been
318 queued and non zero values for queue full. If the function accepts
319 the buffer it becomes property of the Z8530 layer and the caller
320 should not free it.
321 </para>
322 <para>
323 The function <function>z8530_get_stats</function> returns a pointer
324 to an internally maintained per interface statistics block. This
325 provides most of the interface code needed to implement the network
326 layer get_stats callback.
327 </para>
328 </chapter>
329
330 <chapter>
331 <title>Porting The Z8530 Driver</title>
332 <para>
333 The Z8530 driver is written to be portable. In DMA mode it makes
334 assumptions about the use of ISA DMA. These are probably warranted
335 in most cases as the Z85230 in particular was designed to glue to PC
336 type machines. The PIO mode makes no real assumptions.
337 </para>
338 <para>
339 Should you need to retarget the Z8530 driver to another architecture
340 the only code that should need changing are the port I/O functions.
341 At the moment these assume PC I/O port accesses. This may not be
342 appropriate for all platforms. Replacing
343 <function>z8530_read_port</function> and <function>z8530_write_port
344 </function> is intended to be all that is required to port this
345 driver layer.
346 </para>
347 </chapter>
348
349 <chapter id="bugs">
350 <title>Known Bugs And Assumptions</title>
351 <para>
352 <variablelist>
353 <varlistentry><term>Interrupt Locking</term>
354 <listitem>
355 <para>
356 The locking in the driver is done via the global cli/sti lock. This
357 makes for relatively poor SMP performance. Switching this to use a
358 per device spin lock would probably materially improve performance.
359 </para>
360 </listitem></varlistentry>
361
362 <varlistentry><term>Occasional Failures</term>
363 <listitem>
364 <para>
365 We have reports of occasional failures when run for very long
366 periods of time and the driver starts to receive junk frames. At
367 the moment the cause of this is not clear.
368 </para>
369 </listitem></varlistentry>
370 </variablelist>
371
372 </para>
373 </chapter>
374
375 <chapter id="pubfunctions">
376 <title>Public Functions Provided</title>
377!Edrivers/net/wan/z85230.c
378 </chapter>
379
380 <chapter id="intfunctions">
381 <title>Internal Functions</title>
382!Idrivers/net/wan/z85230.c
383 </chapter>
384
385</book>