author      Mauro Carvalho Chehab <mchehab+samsung@kernel.org>    2019-04-18 18:45:00 -0400
committer   Mauro Carvalho Chehab <mchehab+samsung@kernel.org>    2019-07-15 08:20:27 -0400
commit      898bd37a92063e46bc8d7b870781cecd66234f92 (patch)
tree        1eac9c597d45080cc2ff366f6e882a87fcea2d2b
parent      53b9537509654a6267c3f56b4d2e7409b9089686 (diff)
docs: block: convert to ReST
Rename the block documentation files to ReST, add an
index for them, and adjust them in order to produce nice HTML
output via the Sphinx build system.

In its new index.rst, add an :orphan: marker while the file is not yet
linked from the main index.rst, in order to avoid build warnings.
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 8
-rw-r--r-- | Documentation/block/bfq-iosched.rst (renamed from Documentation/block/bfq-iosched.txt) | 66
-rw-r--r-- | Documentation/block/biodoc.rst (renamed from Documentation/block/biodoc.txt) | 330
-rw-r--r-- | Documentation/block/biovecs.rst (renamed from Documentation/block/biovecs.txt) | 20
-rw-r--r-- | Documentation/block/capability.rst | 18
-rw-r--r-- | Documentation/block/capability.txt | 15
-rw-r--r-- | Documentation/block/cmdline-partition.rst (renamed from Documentation/block/cmdline-partition.txt) | 13
-rw-r--r-- | Documentation/block/data-integrity.rst (renamed from Documentation/block/data-integrity.txt) | 60
-rw-r--r-- | Documentation/block/deadline-iosched.rst (renamed from Documentation/block/deadline-iosched.txt) | 21
-rw-r--r-- | Documentation/block/index.rst | 25
-rw-r--r-- | Documentation/block/ioprio.rst (renamed from Documentation/block/ioprio.txt) | 103
-rw-r--r-- | Documentation/block/kyber-iosched.rst (renamed from Documentation/block/kyber-iosched.txt) | 3
-rw-r--r-- | Documentation/block/null_blk.rst (renamed from Documentation/block/null_blk.txt) | 65
-rw-r--r-- | Documentation/block/pr.rst (renamed from Documentation/block/pr.txt) | 18
-rw-r--r-- | Documentation/block/queue-sysfs.rst (renamed from Documentation/block/queue-sysfs.txt) | 7
-rw-r--r-- | Documentation/block/request.rst (renamed from Documentation/block/request.txt) | 47
-rw-r--r-- | Documentation/block/stat.rst (renamed from Documentation/block/stat.txt) | 13
-rw-r--r-- | Documentation/block/switching-sched.rst (renamed from Documentation/block/switching-sched.txt) | 28
-rw-r--r-- | Documentation/block/writeback_cache_control.rst (renamed from Documentation/block/writeback_cache_control.txt) | 12
-rw-r--r-- | Documentation/blockdev/zram.rst | 2
-rw-r--r-- | MAINTAINERS | 2
-rw-r--r-- | block/Kconfig | 2
-rw-r--r-- | block/Kconfig.iosched | 2
-rw-r--r-- | block/bfq-iosched.c | 2
-rw-r--r-- | block/blk-integrity.c | 2
-rw-r--r-- | block/ioprio.c | 2
-rw-r--r-- | block/mq-deadline.c | 2
-rw-r--r-- | block/partitions/cmdline.c | 2
28 files changed, 545 insertions, 345 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 01123f1de354..e8e28cac32a3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -430,7 +430,7 @@ | |||
430 | 430 | ||
431 | blkdevparts= Manual partition parsing of block device(s) for | 431 | blkdevparts= Manual partition parsing of block device(s) for |
432 | embedded devices based on command line input. | 432 | embedded devices based on command line input. |
433 | See Documentation/block/cmdline-partition.txt | 433 | See Documentation/block/cmdline-partition.rst |
434 | 434 | ||
435 | boot_delay= Milliseconds to delay each printk during boot. | 435 | boot_delay= Milliseconds to delay each printk during boot. |
436 | Values larger than 10 seconds (10000) are changed to | 436 | Values larger than 10 seconds (10000) are changed to |
@@ -1199,9 +1199,9 @@ | |||
1199 | 1199 | ||
1200 | elevator= [IOSCHED] | 1200 | elevator= [IOSCHED] |
1201 | Format: { "mq-deadline" | "kyber" | "bfq" } | 1201 | Format: { "mq-deadline" | "kyber" | "bfq" } |
1202 | See Documentation/block/deadline-iosched.txt, | 1202 | See Documentation/block/deadline-iosched.rst, |
1203 | Documentation/block/kyber-iosched.txt and | 1203 | Documentation/block/kyber-iosched.rst and |
1204 | Documentation/block/bfq-iosched.txt for details. | 1204 | Documentation/block/bfq-iosched.rst for details. |
1205 | 1205 | ||
1206 | elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390] | 1206 | elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390] |
1207 | Specifies physical address of start of kernel core | 1207 | Specifies physical address of start of kernel core |
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.rst
index bbd6eb5bbb07..2c13b2fc1888 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.rst
@@ -1,9 +1,11 @@ | |||
1 | ========================== | ||
1 | BFQ (Budget Fair Queueing) | 2 | BFQ (Budget Fair Queueing) |
2 | ========================== | 3 | ========================== |
3 | 4 | ||
4 | BFQ is a proportional-share I/O scheduler, with some extra | 5 | BFQ is a proportional-share I/O scheduler, with some extra |
5 | low-latency capabilities. In addition to cgroups support (blkio or io | 6 | low-latency capabilities. In addition to cgroups support (blkio or io |
6 | controllers), BFQ's main features are: | 7 | controllers), BFQ's main features are: |
8 | |||
7 | - BFQ guarantees a high system and application responsiveness, and a | 9 | - BFQ guarantees a high system and application responsiveness, and a |
8 | low latency for time-sensitive applications, such as audio or video | 10 | low latency for time-sensitive applications, such as audio or video |
9 | players; | 11 | players; |
@@ -55,18 +57,18 @@ sustainable throughputs, on the same systems as above: | |||
55 | 57 | ||
56 | BFQ works for multi-queue devices too. | 58 | BFQ works for multi-queue devices too. |
57 | 59 | ||
58 | The table of contents follow. Impatients can just jump to Section 3. | 60 | .. The table of contents follow. Impatients can just jump to Section 3. |
59 | 61 | ||
60 | CONTENTS | 62 | .. CONTENTS |
61 | 63 | ||
62 | 1. When may BFQ be useful? | 64 | 1. When may BFQ be useful? |
63 | 1-1 Personal systems | 65 | 1-1 Personal systems |
64 | 1-2 Server systems | 66 | 1-2 Server systems |
65 | 2. How does BFQ work? | 67 | 2. How does BFQ work? |
66 | 3. What are BFQ's tunables and how to properly configure BFQ? | 68 | 3. What are BFQ's tunables and how to properly configure BFQ? |
67 | 4. BFQ group scheduling | 69 | 4. BFQ group scheduling |
68 | 4-1 Service guarantees provided | 70 | 4-1 Service guarantees provided |
69 | 4-2 Interface | 71 | 4-2 Interface |
70 | 72 | ||
71 | 1. When may BFQ be useful? | 73 | 1. When may BFQ be useful? |
72 | ========================== | 74 | ========================== |
@@ -77,17 +79,20 @@ BFQ provides the following benefits on personal and server systems. | |||
77 | -------------------- | 79 | -------------------- |
78 | 80 | ||
79 | Low latency for interactive applications | 81 | Low latency for interactive applications |
82 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
80 | 83 | ||
81 | Regardless of the actual background workload, BFQ guarantees that, for | 84 | Regardless of the actual background workload, BFQ guarantees that, for |
82 | interactive tasks, the storage device is virtually as responsive as if | 85 | interactive tasks, the storage device is virtually as responsive as if |
83 | it was idle. For example, even if one or more of the following | 86 | it was idle. For example, even if one or more of the following |
84 | background workloads are being executed: | 87 | background workloads are being executed: |
88 | |||
85 | - one or more large files are being read, written or copied, | 89 | - one or more large files are being read, written or copied, |
86 | - a tree of source files is being compiled, | 90 | - a tree of source files is being compiled, |
87 | - one or more virtual machines are performing I/O, | 91 | - one or more virtual machines are performing I/O, |
88 | - a software update is in progress, | 92 | - a software update is in progress, |
89 | - indexing daemons are scanning filesystems and updating their | 93 | - indexing daemons are scanning filesystems and updating their |
90 | databases, | 94 | databases, |
95 | |||
91 | starting an application or loading a file from within an application | 96 | starting an application or loading a file from within an application |
92 | takes about the same time as if the storage device was idle. As a | 97 | takes about the same time as if the storage device was idle. As a |
93 | comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, | 98 | comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, |
@@ -95,13 +100,14 @@ applications experience high latencies, or even become unresponsive | |||
95 | until the background workload terminates (also on SSDs). | 100 | until the background workload terminates (also on SSDs). |
96 | 101 | ||
97 | Low latency for soft real-time applications | 102 | Low latency for soft real-time applications |
98 | 103 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
99 | Also soft real-time applications, such as audio and video | 104 | Also soft real-time applications, such as audio and video |
100 | players/streamers, enjoy a low latency and a low drop rate, regardless | 105 | players/streamers, enjoy a low latency and a low drop rate, regardless |
101 | of the background I/O workload. As a consequence, these applications | 106 | of the background I/O workload. As a consequence, these applications |
102 | do not suffer from almost any glitch due to the background workload. | 107 | do not suffer from almost any glitch due to the background workload. |
103 | 108 | ||
104 | Higher speed for code-development tasks | 109 | Higher speed for code-development tasks |
110 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
105 | 111 | ||
106 | If some additional workload happens to be executed in parallel, then | 112 | If some additional workload happens to be executed in parallel, then |
107 | BFQ executes the I/O-related components of typical code-development | 113 | BFQ executes the I/O-related components of typical code-development |
@@ -109,6 +115,7 @@ tasks (compilation, checkout, merge, ...) much more quickly than CFQ, | |||
109 | NOOP or DEADLINE. | 115 | NOOP or DEADLINE. |
110 | 116 | ||
111 | High throughput | 117 | High throughput |
118 | ^^^^^^^^^^^^^^^ | ||
112 | 119 | ||
113 | On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and | 120 | On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and |
114 | up to 150% higher throughput than DEADLINE and NOOP, with all the | 121 | up to 150% higher throughput than DEADLINE and NOOP, with all the |
@@ -117,6 +124,7 @@ and with all the workloads on flash-based devices, BFQ achieves, | |||
117 | instead, about the same throughput as the other schedulers. | 124 | instead, about the same throughput as the other schedulers. |
118 | 125 | ||
119 | Strong fairness, bandwidth and delay guarantees | 126 | Strong fairness, bandwidth and delay guarantees |
127 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
120 | 128 | ||
121 | BFQ distributes the device throughput, and not just the device time, | 129 | BFQ distributes the device throughput, and not just the device time, |
122 | among I/O-bound applications in proportion their weights, with any | 130 | among I/O-bound applications in proportion their weights, with any |
@@ -133,15 +141,15 @@ Most benefits for server systems follow from the same service | |||
133 | properties as above. In particular, regardless of whether additional, | 141 | properties as above. In particular, regardless of whether additional, |
134 | possibly heavy workloads are being served, BFQ guarantees: | 142 | possibly heavy workloads are being served, BFQ guarantees: |
135 | 143 | ||
136 | . audio and video-streaming with zero or very low jitter and drop | 144 | * audio and video-streaming with zero or very low jitter and drop |
137 | rate; | 145 | rate; |
138 | 146 | ||
139 | . fast retrieval of WEB pages and embedded objects; | 147 | * fast retrieval of WEB pages and embedded objects; |
140 | 148 | ||
141 | . real-time recording of data in live-dumping applications (e.g., | 149 | * real-time recording of data in live-dumping applications (e.g., |
142 | packet logging); | 150 | packet logging); |
143 | 151 | ||
144 | . responsiveness in local and remote access to a server. | 152 | * responsiveness in local and remote access to a server. |
145 | 153 | ||
146 | 154 | ||
147 | 2. How does BFQ work? | 155 | 2. How does BFQ work? |
@@ -151,7 +159,7 @@ BFQ is a proportional-share I/O scheduler, whose general structure, | |||
151 | plus a lot of code, are borrowed from CFQ. | 159 | plus a lot of code, are borrowed from CFQ. |
152 | 160 | ||
153 | - Each process doing I/O on a device is associated with a weight and a | 161 | - Each process doing I/O on a device is associated with a weight and a |
154 | (bfq_)queue. | 162 | `(bfq_)queue`. |
155 | 163 | ||
156 | - BFQ grants exclusive access to the device, for a while, to one queue | 164 | - BFQ grants exclusive access to the device, for a while, to one queue |
157 | (process) at a time, and implements this service model by | 165 | (process) at a time, and implements this service model by |
@@ -540,11 +548,12 @@ created, and kept up-to-date by bfq, depends on whether | |||
540 | CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all | 548 | CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all |
541 | the stat files documented in | 549 | the stat files documented in |
542 | Documentation/cgroup-v1/blkio-controller.rst. If, instead, | 550 | Documentation/cgroup-v1/blkio-controller.rst. If, instead, |
543 | CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files | 551 | CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files:: |
544 | blkio.bfq.io_service_bytes | 552 | |
545 | blkio.bfq.io_service_bytes_recursive | 553 | blkio.bfq.io_service_bytes |
546 | blkio.bfq.io_serviced | 554 | blkio.bfq.io_service_bytes_recursive |
547 | blkio.bfq.io_serviced_recursive | 555 | blkio.bfq.io_serviced |
556 | blkio.bfq.io_serviced_recursive | ||
548 | 557 | ||
549 | The value of CONFIG_BFQ_CGROUP_DEBUG greatly influences the maximum | 558 | The value of CONFIG_BFQ_CGROUP_DEBUG greatly influences the maximum |
550 | throughput sustainable with bfq, because updating the blkio.bfq.* | 559 | throughput sustainable with bfq, because updating the blkio.bfq.* |
@@ -567,17 +576,22 @@ weight of the queues associated with interactive and soft real-time | |||
567 | applications. Unset this tunable if you need/want to control weights. | 576 | applications. Unset this tunable if you need/want to control weights. |
568 | 577 | ||
569 | 578 | ||
570 | [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O | 579 | [1] |
580 | P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O | ||
571 | Scheduler", Proceedings of the First Workshop on Mobile System | 581 | Scheduler", Proceedings of the First Workshop on Mobile System |
572 | Technologies (MST-2015), May 2015. | 582 | Technologies (MST-2015), May 2015. |
583 | |||
573 | http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf | 584 | http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf |
574 | 585 | ||
575 | [2] P. Valente and M. Andreolini, "Improving Application | 586 | [2] |
587 | P. Valente and M. Andreolini, "Improving Application | ||
576 | Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of | 588 | Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of |
577 | the 5th Annual International Systems and Storage Conference | 589 | the 5th Annual International Systems and Storage Conference |
578 | (SYSTOR '12), June 2012. | 590 | (SYSTOR '12), June 2012. |
591 | |||
579 | Slightly extended version: | 592 | Slightly extended version: |
580 | http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- | ||
581 | results.pdf | ||
582 | 593 | ||
583 | [3] https://github.com/Algodev-github/S | 594 | http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-results.pdf |
595 | |||
596 | [3] | ||
597 | https://github.com/Algodev-github/S | ||
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.rst
index 31c177663ed5..d6e30b680405 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.rst
@@ -1,15 +1,24 @@ | |||
1 | Notes on the Generic Block Layer Rewrite in Linux 2.5 | 1 | ===================================================== |
2 | ===================================================== | 2 | Notes on the Generic Block Layer Rewrite in Linux 2.5 |
3 | ===================================================== | ||
4 | |||
5 | .. note:: | ||
6 | |||
7 | It seems that there are lot of outdated stuff here. This seems | ||
8 | to be written somewhat as a task list. Yet, eventually, something | ||
9 | here might still be useful. | ||
3 | 10 | ||
4 | Notes Written on Jan 15, 2002: | 11 | Notes Written on Jan 15, 2002: |
5 | Jens Axboe <jens.axboe@oracle.com> | 12 | - Jens Axboe <jens.axboe@oracle.com> |
6 | Suparna Bhattacharya <suparna@in.ibm.com> | 13 | - Suparna Bhattacharya <suparna@in.ibm.com> |
7 | 14 | ||
8 | Last Updated May 2, 2002 | 15 | Last Updated May 2, 2002 |
16 | |||
9 | September 2003: Updated I/O Scheduler portions | 17 | September 2003: Updated I/O Scheduler portions |
10 | Nick Piggin <npiggin@kernel.dk> | 18 | - Nick Piggin <npiggin@kernel.dk> |
11 | 19 | ||
12 | Introduction: | 20 | Introduction |
21 | ============ | ||
13 | 22 | ||
14 | These are some notes describing some aspects of the 2.5 block layer in the | 23 | These are some notes describing some aspects of the 2.5 block layer in the |
15 | context of the bio rewrite. The idea is to bring out some of the key | 24 | context of the bio rewrite. The idea is to bring out some of the key |
@@ -17,11 +26,11 @@ changes and a glimpse of the rationale behind those changes. | |||
17 | 26 | ||
18 | Please mail corrections & suggestions to suparna@in.ibm.com. | 27 | Please mail corrections & suggestions to suparna@in.ibm.com. |
19 | 28 | ||
20 | Credits: | 29 | Credits |
21 | --------- | 30 | ======= |
22 | 31 | ||
23 | 2.5 bio rewrite: | 32 | 2.5 bio rewrite: |
24 | Jens Axboe <jens.axboe@oracle.com> | 33 | - Jens Axboe <jens.axboe@oracle.com> |
25 | 34 | ||
26 | Many aspects of the generic block layer redesign were driven by and evolved | 35 | Many aspects of the generic block layer redesign were driven by and evolved |
27 | over discussions, prior patches and the collective experience of several | 36 | over discussions, prior patches and the collective experience of several |
@@ -29,62 +38,63 @@ people. See sections 8 and 9 for a list of some related references. | |||
29 | 38 | ||
30 | The following people helped with review comments and inputs for this | 39 | The following people helped with review comments and inputs for this |
31 | document: | 40 | document: |
32 | Christoph Hellwig <hch@infradead.org> | 41 | |
33 | Arjan van de Ven <arjanv@redhat.com> | 42 | - Christoph Hellwig <hch@infradead.org> |
34 | Randy Dunlap <rdunlap@xenotime.net> | 43 | - Arjan van de Ven <arjanv@redhat.com> |
35 | Andre Hedrick <andre@linux-ide.org> | 44 | - Randy Dunlap <rdunlap@xenotime.net> |
45 | - Andre Hedrick <andre@linux-ide.org> | ||
36 | 46 | ||
37 | The following people helped with fixes/contributions to the bio patches | 47 | The following people helped with fixes/contributions to the bio patches |
38 | while it was still work-in-progress: | 48 | while it was still work-in-progress: |
39 | David S. Miller <davem@redhat.com> | ||
40 | 49 | ||
50 | - David S. Miller <davem@redhat.com> | ||
41 | 51 | ||
42 | Description of Contents: | ||
43 | ------------------------ | ||
44 | 52 | ||
45 | 1. Scope for tuning of logic to various needs | 53 | .. Description of Contents: |
46 | 1.1 Tuning based on device or low level driver capabilities | 54 | |
55 | 1. Scope for tuning of logic to various needs | ||
56 | 1.1 Tuning based on device or low level driver capabilities | ||
47 | - Per-queue parameters | 57 | - Per-queue parameters |
48 | - Highmem I/O support | 58 | - Highmem I/O support |
49 | - I/O scheduler modularization | 59 | - I/O scheduler modularization |
50 | 1.2 Tuning based on high level requirements/capabilities | 60 | 1.2 Tuning based on high level requirements/capabilities |
51 | 1.2.1 Request Priority/Latency | 61 | 1.2.1 Request Priority/Latency |
52 | 1.3 Direct access/bypass to lower layers for diagnostics and special | 62 | 1.3 Direct access/bypass to lower layers for diagnostics and special |
53 | device operations | 63 | device operations |
54 | 1.3.1 Pre-built commands | 64 | 1.3.1 Pre-built commands |
55 | 2. New flexible and generic but minimalist i/o structure or descriptor | 65 | 2. New flexible and generic but minimalist i/o structure or descriptor |
56 | (instead of using buffer heads at the i/o layer) | 66 | (instead of using buffer heads at the i/o layer) |
57 | 2.1 Requirements/Goals addressed | 67 | 2.1 Requirements/Goals addressed |
58 | 2.2 The bio struct in detail (multi-page io unit) | 68 | 2.2 The bio struct in detail (multi-page io unit) |
59 | 2.3 Changes in the request structure | 69 | 2.3 Changes in the request structure |
60 | 3. Using bios | 70 | 3. Using bios |
61 | 3.1 Setup/teardown (allocation, splitting) | 71 | 3.1 Setup/teardown (allocation, splitting) |
62 | 3.2 Generic bio helper routines | 72 | 3.2 Generic bio helper routines |
63 | 3.2.1 Traversing segments and completion units in a request | 73 | 3.2.1 Traversing segments and completion units in a request |
64 | 3.2.2 Setting up DMA scatterlists | 74 | 3.2.2 Setting up DMA scatterlists |
65 | 3.2.3 I/O completion | 75 | 3.2.3 I/O completion |
66 | 3.2.4 Implications for drivers that do not interpret bios (don't handle | 76 | 3.2.4 Implications for drivers that do not interpret bios (don't handle |
67 | multiple segments) | 77 | multiple segments) |
68 | 3.3 I/O submission | 78 | 3.3 I/O submission |
69 | 4. The I/O scheduler | 79 | 4. The I/O scheduler |
70 | 5. Scalability related changes | 80 | 5. Scalability related changes |
71 | 5.1 Granular locking: Removal of io_request_lock | 81 | 5.1 Granular locking: Removal of io_request_lock |
72 | 5.2 Prepare for transition to 64 bit sector_t | 82 | 5.2 Prepare for transition to 64 bit sector_t |
73 | 6. Other Changes/Implications | 83 | 6. Other Changes/Implications |
74 | 6.1 Partition re-mapping handled by the generic block layer | 84 | 6.1 Partition re-mapping handled by the generic block layer |
75 | 7. A few tips on migration of older drivers | 85 | 7. A few tips on migration of older drivers |
76 | 8. A list of prior/related/impacted patches/ideas | 86 | 8. A list of prior/related/impacted patches/ideas |
77 | 9. Other References/Discussion Threads | 87 | 9. Other References/Discussion Threads |
78 | 88 | ||
79 | --------------------------------------------------------------------------- | ||
80 | 89 | ||
81 | Bio Notes | 90 | Bio Notes |
82 | -------- | 91 | ========= |
83 | 92 | ||
84 | Let us discuss the changes in the context of how some overall goals for the | 93 | Let us discuss the changes in the context of how some overall goals for the |
85 | block layer are addressed. | 94 | block layer are addressed. |
86 | 95 | ||
87 | 1. Scope for tuning the generic logic to satisfy various requirements | 96 | 1. Scope for tuning the generic logic to satisfy various requirements |
97 | ===================================================================== | ||
88 | 98 | ||
89 | The block layer design supports adaptable abstractions to handle common | 99 | The block layer design supports adaptable abstractions to handle common |
90 | processing with the ability to tune the logic to an appropriate extent | 100 | processing with the ability to tune the logic to an appropriate extent |
@@ -97,6 +107,7 @@ and application/middleware software designed to take advantage of these | |||
97 | capabilities. | 107 | capabilities. |
98 | 108 | ||
99 | 1.1 Tuning based on low level device / driver capabilities | 109 | 1.1 Tuning based on low level device / driver capabilities |
110 | ---------------------------------------------------------- | ||
100 | 111 | ||
101 | Sophisticated devices with large built-in caches, intelligent i/o scheduling | 112 | Sophisticated devices with large built-in caches, intelligent i/o scheduling |
102 | optimizations, high memory DMA support, etc may find some of the | 113 | optimizations, high memory DMA support, etc may find some of the |
@@ -133,12 +144,12 @@ Some new queue property settings: | |||
133 | Sets two variables that limit the size of the request. | 144 | Sets two variables that limit the size of the request. |
134 | 145 | ||
135 | - The request queue's max_sectors, which is a soft size in | 146 | - The request queue's max_sectors, which is a soft size in |
136 | units of 512 byte sectors, and could be dynamically varied | 147 | units of 512 byte sectors, and could be dynamically varied |
137 | by the core kernel. | 148 | by the core kernel. |
138 | 149 | ||
139 | - The request queue's max_hw_sectors, which is a hard limit | 150 | - The request queue's max_hw_sectors, which is a hard limit |
140 | and reflects the maximum size request a driver can handle | 151 | and reflects the maximum size request a driver can handle |
141 | in units of 512 byte sectors. | 152 | in units of 512 byte sectors. |
142 | 153 | ||
143 | The default for both max_sectors and max_hw_sectors is | 154 | The default for both max_sectors and max_hw_sectors is |
144 | 255. The upper limit of max_sectors is 1024. | 155 | 255. The upper limit of max_sectors is 1024. |
@@ -234,6 +245,7 @@ I/O scheduler wrappers are to be used instead of accessing the queue directly. | |||
234 | See section 4. The I/O scheduler for details. | 245 | See section 4. The I/O scheduler for details. |
235 | 246 | ||
236 | 1.2 Tuning Based on High level code capabilities | 247 | 1.2 Tuning Based on High level code capabilities |
248 | ------------------------------------------------ | ||
237 | 249 | ||
238 | i. Application capabilities for raw i/o | 250 | i. Application capabilities for raw i/o |
239 | 251 | ||
@@ -258,9 +270,11 @@ would need an additional mechanism either via open flags or ioctls, or some | |||
258 | other upper level mechanism to communicate such settings to block. | 270 | other upper level mechanism to communicate such settings to block. |
259 | 271 | ||
260 | 1.2.1 Request Priority/Latency | 272 | 1.2.1 Request Priority/Latency |
273 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
261 | 274 | ||
262 | Todo/Under discussion: | 275 | Todo/Under discussion:: |
263 | Arjan's proposed request priority scheme allows higher levels some broad | 276 | |
277 | Arjan's proposed request priority scheme allows higher levels some broad | ||
264 | control (high/med/low) over the priority of an i/o request vs other pending | 278 | control (high/med/low) over the priority of an i/o request vs other pending |
265 | requests in the queue. For example it allows reads for bringing in an | 279 | requests in the queue. For example it allows reads for bringing in an |
266 | executable page on demand to be given a higher priority over pending write | 280 | executable page on demand to be given a higher priority over pending write |
@@ -272,7 +286,9 @@ Arjan's proposed request priority scheme allows higher levels some broad | |||
272 | 286 | ||
273 | 287 | ||
274 | 1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode) | 288 | 1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode) |
275 | (e.g Diagnostics, Systems Management) | 289 | ----------------------------------------------------------------------- |
290 | |||
291 | (e.g Diagnostics, Systems Management) | ||
276 | 292 | ||
277 | There are situations where high-level code needs to have direct access to | 293 | There are situations where high-level code needs to have direct access to |
278 | the low level device capabilities or requires the ability to issue commands | 294 | the low level device capabilities or requires the ability to issue commands |
@@ -308,28 +324,32 @@ involved. In the latter case, the driver would modify and manage the | |||
308 | request->buffer, request->sector and request->nr_sectors or | 324 | request->buffer, request->sector and request->nr_sectors or |
309 | request->current_nr_sectors fields itself rather than using the block layer | 325 | request->current_nr_sectors fields itself rather than using the block layer |
310 | end_request or end_that_request_first completion interfaces. | 326 | end_request or end_that_request_first completion interfaces. |
311 | (See 2.3 or Documentation/block/request.txt for a brief explanation of | 327 | (See 2.3 or Documentation/block/request.rst for a brief explanation of |
312 | the request structure fields) | 328 | the request structure fields) |
313 | 329 | ||
314 | [TBD: end_that_request_last should be usable even in this case; | 330 | :: |
315 | Perhaps an end_that_direct_request_first routine could be implemented to make | 331 | |
316 | handling direct requests easier for such drivers; Also for drivers that | 332 | [TBD: end_that_request_last should be usable even in this case; |
317 | expect bios, a helper function could be provided for setting up a bio | 333 | Perhaps an end_that_direct_request_first routine could be implemented to make |
318 | corresponding to a data buffer] | 334 | handling direct requests easier for such drivers; Also for drivers that |
319 | 335 | expect bios, a helper function could be provided for setting up a bio | |
320 | <JENS: I dont understand the above, why is end_that_request_first() not | 336 | corresponding to a data buffer] |
321 | usable? Or _last for that matter. I must be missing something> | 337 | |
322 | <SUP: What I meant here was that if the request doesn't have a bio, then | 338 | <JENS: I dont understand the above, why is end_that_request_first() not |
323 | end_that_request_first doesn't modify nr_sectors or current_nr_sectors, | 339 | usable? Or _last for that matter. I must be missing something> |
324 | and hence can't be used for advancing request state settings on the | 340 | |
325 | completion of partial transfers. The driver has to modify these fields | 341 | <SUP: What I meant here was that if the request doesn't have a bio, then |
326 | directly by hand. | 342 | end_that_request_first doesn't modify nr_sectors or current_nr_sectors, |
327 | This is because end_that_request_first only iterates over the bio list, | 343 | and hence can't be used for advancing request state settings on the |
328 | and always returns 0 if there are none associated with the request. | 344 | completion of partial transfers. The driver has to modify these fields |
329 | _last works OK in this case, and is not a problem, as I mentioned earlier | 345 | directly by hand. |
330 | > | 346 | This is because end_that_request_first only iterates over the bio list, |
347 | and always returns 0 if there are none associated with the request. | ||
348 | _last works OK in this case, and is not a problem, as I mentioned earlier | ||
349 | > | ||
331 | 350 | ||
332 | 1.3.1 Pre-built Commands | 351 | 1.3.1 Pre-built Commands |
352 | ^^^^^^^^^^^^^^^^^^^^^^^^ | ||
333 | 353 | ||
334 | A request can be created with a pre-built custom command to be sent directly | 354 | A request can be created with a pre-built custom command to be sent directly |
335 | to the device. The cmd block in the request structure has room for filling | 355 | to the device. The cmd block in the request structure has room for filling |
@@ -360,9 +380,11 @@ Aside: | |||
360 | the pre-builder hook can be invoked there. | 380 | the pre-builder hook can be invoked there. |
361 | 381 | ||
362 | 382 | ||
363 | 2. Flexible and generic but minimalist i/o structure/descriptor. | 383 | 2. Flexible and generic but minimalist i/o structure/descriptor |
384 | =============================================================== | ||
364 | 385 | ||
365 | 2.1 Reason for a new structure and requirements addressed | 386 | 2.1 Reason for a new structure and requirements addressed |
387 | --------------------------------------------------------- | ||
366 | 388 | ||
367 | Prior to 2.5, buffer heads were used as the unit of i/o at the generic block | 389 | Prior to 2.5, buffer heads were used as the unit of i/o at the generic block |
368 | layer, and the low level request structure was associated with a chain of | 390 | layer, and the low level request structure was associated with a chain of |
@@ -378,26 +400,26 @@ which were generated for each such chunk. | |||
378 | The following were some of the goals and expectations considered in the | 400 | The following were some of the goals and expectations considered in the |
379 | redesign of the block i/o data structure in 2.5. | 401 | redesign of the block i/o data structure in 2.5. |
380 | 402 | ||
381 | i. Should be appropriate as a descriptor for both raw and buffered i/o - | 403 | 1. Should be appropriate as a descriptor for both raw and buffered i/o - |
382 | avoid cache related fields which are irrelevant in the direct/page i/o path, | 404 | avoid cache related fields which are irrelevant in the direct/page i/o path, |
383 | or filesystem block size alignment restrictions which may not be relevant | 405 | or filesystem block size alignment restrictions which may not be relevant |
384 | for raw i/o. | 406 | for raw i/o. |
385 | ii. Ability to represent high-memory buffers (which do not have a virtual | 407 | 2. Ability to represent high-memory buffers (which do not have a virtual |
386 | address mapping in kernel address space). | 408 | address mapping in kernel address space). |
387 | iii.Ability to represent large i/os w/o unnecessarily breaking them up (i.e | 409 | 3. Ability to represent large i/os w/o unnecessarily breaking them up (i.e |
388 | greater than PAGE_SIZE chunks in one shot) | 410 | greater than PAGE_SIZE chunks in one shot) |
389 | iv. At the same time, ability to retain independent identity of i/os from | 411 | 4. At the same time, ability to retain independent identity of i/os from |
390 | different sources or i/o units requiring individual completion (e.g. for | 412 | different sources or i/o units requiring individual completion (e.g. for |
391 | latency reasons) | 413 | latency reasons) |
392 | v. Ability to represent an i/o involving multiple physical memory segments | 414 | 5. Ability to represent an i/o involving multiple physical memory segments |
393 | (including non-page aligned page fragments, as specified via readv/writev) | 415 | (including non-page aligned page fragments, as specified via readv/writev) |
394 | without unnecessarily breaking it up, if the underlying device is capable of | 416 | without unnecessarily breaking it up, if the underlying device is capable of |
395 | handling it. | 417 | handling it. |
396 | vi. Preferably should be based on a memory descriptor structure that can be | 418 | 6. Preferably should be based on a memory descriptor structure that can be |
397 | passed around different types of subsystems or layers, maybe even | 419 | passed around different types of subsystems or layers, maybe even |
398 | networking, without duplication or extra copies of data/descriptor fields | 420 | networking, without duplication or extra copies of data/descriptor fields |
399 | themselves in the process | 421 | themselves in the process |
400 | vii.Ability to handle the possibility of splits/merges as the structure passes | 422 | 7. Ability to handle the possibility of splits/merges as the structure passes |
401 | through layered drivers (lvm, md, evms), with minimal overhead. | 423 | through layered drivers (lvm, md, evms), with minimal overhead. |
402 | 424 | ||
403 | The solution was to define a new structure (bio) for the block layer, | 425 | The solution was to define a new structure (bio) for the block layer, |
@@ -408,6 +430,7 @@ bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are | |||
408 | mapped to bio structures. | 430 | mapped to bio structures. |
409 | 431 | ||
410 | 2.2 The bio struct | 432 | 2.2 The bio struct |
433 | ------------------ | ||
411 | 434 | ||
412 | The bio structure uses a vector representation pointing to an array of tuples | 435 | The bio structure uses a vector representation pointing to an array of tuples |
413 | of <page, offset, len> to describe the i/o buffer, and has various other | 436 | of <page, offset, len> to describe the i/o buffer, and has various other |
@@ -417,16 +440,18 @@ performing the i/o. | |||
417 | Notice that this representation means that a bio has no virtual address | 440 | Notice that this representation means that a bio has no virtual address |
418 | mapping at all (unlike buffer heads). | 441 | mapping at all (unlike buffer heads). |
419 | 442 | ||
420 | struct bio_vec { | 443 | :: |
444 | |||
445 | struct bio_vec { | ||
421 | struct page *bv_page; | 446 | struct page *bv_page; |
422 | unsigned short bv_len; | 447 | unsigned short bv_len; |
423 | unsigned short bv_offset; | 448 | unsigned short bv_offset; |
424 | }; | 449 | }; |
425 | 450 | ||
426 | /* | 451 | /* |
427 | * main unit of I/O for the block layer and lower layers (ie drivers) | 452 | * main unit of I/O for the block layer and lower layers (ie drivers) |
428 | */ | 453 | */ |
429 | struct bio { | 454 | struct bio { |
430 | struct bio *bi_next; /* request queue link */ | 455 | struct bio *bi_next; /* request queue link */ |
431 | struct block_device *bi_bdev; /* target device */ | 456 | struct block_device *bi_bdev; /* target device */ |
432 | unsigned long bi_flags; /* status, command, etc */ | 457 | unsigned long bi_flags; /* status, command, etc */ |
@@ -443,7 +468,7 @@ struct bio { | |||
443 | bio_end_io_t *bi_end_io; /* bi_end_io (bio) */ | 468 | bio_end_io_t *bi_end_io; /* bi_end_io (bio) */ |
444 | atomic_t bi_cnt; /* pin count: free when it hits zero */ | 469 | atomic_t bi_cnt; /* pin count: free when it hits zero */ |
445 | void *bi_private; | 470 | void *bi_private; |
446 | }; | 471 | }; |
447 | 472 | ||
448 | With this multipage bio design: | 473 | With this multipage bio design: |
449 | 474 | ||
@@ -453,7 +478,7 @@ With this multipage bio design: | |||
453 | - Splitting of an i/o request across multiple devices (as in the case of | 478 | - Splitting of an i/o request across multiple devices (as in the case of |
454 | lvm or raid) is achieved by cloning the bio (where the clone points to | 479 | lvm or raid) is achieved by cloning the bio (where the clone points to |
455 | the same bi_io_vec array, but with the index and size accordingly modified) | 480 | the same bi_io_vec array, but with the index and size accordingly modified) |
456 | - A linked list of bios is used as before for unrelated merges (*) - this | 481 | - A linked list of bios is used as before for unrelated merges [*]_ - this |
457 | avoids reallocs and makes independent completions easier to handle. | 482 | avoids reallocs and makes independent completions easier to handle. |
458 | - Code that traverses the req list can find all the segments of a bio | 483 | - Code that traverses the req list can find all the segments of a bio |
459 | by using rq_for_each_segment. This handles the fact that a request | 484 | by using rq_for_each_segment. This handles the fact that a request |
@@ -462,10 +487,12 @@ With this multipage bio design: | |||
462 | field to keep track of the next bio_vec entry to process. | 487 | field to keep track of the next bio_vec entry to process. |
463 | (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE) | 488 | (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE) |
464 | [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying | 489 | [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying |
465 | bi_offset an len fields] | 490 | bi_offset an len fields] |
466 | 491 | ||
467 | (*) unrelated merges -- a request ends up containing two or more bios that | 492 | .. [*] |
468 | didn't originate from the same place. | 493 | |
494 | unrelated merges -- a request ends up containing two or more bios that | ||
495 | didn't originate from the same place. | ||
469 | 496 | ||
470 | bi_end_io() i/o callback gets called on i/o completion of the entire bio. | 497 | bi_end_io() i/o callback gets called on i/o completion of the entire bio. |
471 | 498 | ||
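As a rough, hedged illustration of the interfaces described in the hunk above, the sketch below builds and submits a one-page bio using the helpers of that era (bio_alloc(), bio_add_page(), bi_end_io, the two-argument submit_bio()). The my_* names are hypothetical, and the bi_bdev/bi_sector fields belong to this generation of the API; current kernels use bio_set_dev(), bio->bi_iter.bi_sector and a one-argument submit_bio()::

	#include <linux/bio.h>
	#include <linux/blkdev.h>
	#include <linux/mm.h>

	/* Completion callback; its exact prototype has changed several times
	 * since these notes were written, so only the typedef is relied on here. */
	static bio_end_io_t my_read_done;

	static void my_submit_one_page_read(struct block_device *bdev,
					    struct page *page, sector_t sector)
	{
		struct bio *bio = bio_alloc(GFP_NOIO, 1);	/* room for one bio_vec */

		bio->bi_bdev = bdev;			/* target device */
		bio->bi_sector = sector;		/* starting sector */
		/* fill the single <page, offset, len> tuple */
		bio_add_page(bio, page, PAGE_SIZE, 0);
		bio->bi_end_io = my_read_done;		/* called when the whole bio completes */
		submit_bio(READ, bio);			/* legacy two-argument form */
	}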
@@ -483,10 +510,11 @@ which in turn means that only raw I/O uses it (direct i/o may not work | |||
483 | right now). The intent however is to enable clustering of pages etc to | 510 | right now). The intent however is to enable clustering of pages etc to |
484 | become possible. The pagebuf abstraction layer from SGI also uses multi-page | 511 | become possible. The pagebuf abstraction layer from SGI also uses multi-page |
485 | bios, but that is currently not included in the stock development kernels. | 512 | bios, but that is currently not included in the stock development kernels. |
486 | The same is true of Andrew Morton's work-in-progress multipage bio writeout | 513 | The same is true of Andrew Morton's work-in-progress multipage bio writeout |
487 | and readahead patches. | 514 | and readahead patches. |
488 | 515 | ||
489 | 2.3 Changes in the Request Structure | 516 | 2.3 Changes in the Request Structure |
517 | ------------------------------------ | ||
490 | 518 | ||
491 | The request structure is the structure that gets passed down to low level | 519 | The request structure is the structure that gets passed down to low level |
492 | drivers. The block layer make_request function builds up a request structure, | 520 | drivers. The block layer make_request function builds up a request structure, |
@@ -499,11 +527,11 @@ request structure. | |||
499 | Only some relevant fields (mainly those which changed or may be referred | 527 | Only some relevant fields (mainly those which changed or may be referred |
500 | to in some of the discussion here) are listed below, not necessarily in | 528 | to in some of the discussion here) are listed below, not necessarily in |
501 | the order in which they occur in the structure (see include/linux/blkdev.h) | 529 | the order in which they occur in the structure (see include/linux/blkdev.h) |
502 | Refer to Documentation/block/request.txt for details about all the request | 530 | Refer to Documentation/block/request.rst for details about all the request |
503 | structure fields and a quick reference about the layers which are | 531 | structure fields and a quick reference about the layers which are |
504 | supposed to use or modify those fields. | 532 | supposed to use or modify those fields:: |
505 | 533 | ||
506 | struct request { | 534 | struct request { |
507 | struct list_head queuelist; /* Not meant to be directly accessed by | 535 | struct list_head queuelist; /* Not meant to be directly accessed by |
508 | the driver. | 536 | the driver. |
509 | Used by q->elv_next_request_fn | 537 | Used by q->elv_next_request_fn |
@@ -548,11 +576,11 @@ struct request { | |||
548 | . | 576 | . |
549 | struct bio *bio, *biotail; /* bio list instead of bh */ | 577 | struct bio *bio, *biotail; /* bio list instead of bh */ |
550 | struct request_list *rl; | 578 | struct request_list *rl; |
551 | } | 579 | } |
552 | 580 | ||
553 | See the req_ops and req_flag_bits definitions for an explanation of the various | 581 | See the req_ops and req_flag_bits definitions for an explanation of the various |
554 | flags available. Some bits are used by the block layer or i/o scheduler. | 582 | flags available. Some bits are used by the block layer or i/o scheduler. |
555 | 583 | ||
556 | The behaviour of the various sector counts are almost the same as before, | 584 | The behaviour of the various sector counts are almost the same as before, |
557 | except that since we have multi-segment bios, current_nr_sectors refers | 585 | except that since we have multi-segment bios, current_nr_sectors refers |
558 | to the numbers of sectors in the current segment being processed which could | 586 | to the numbers of sectors in the current segment being processed which could |
@@ -578,8 +606,10 @@ a driver needs to be careful about interoperation with the block layer helper | |||
578 | functions which the driver uses. (Section 1.3) | 606 | functions which the driver uses. (Section 1.3) |
579 | 607 | ||
580 | 3. Using bios | 608 | 3. Using bios |
609 | ============= | ||
581 | 610 | ||
582 | 3.1 Setup/Teardown | 611 | 3.1 Setup/Teardown |
612 | ------------------ | ||
583 | 613 | ||
584 | There are routines for managing the allocation, and reference counting, and | 614 | There are routines for managing the allocation, and reference counting, and |
585 | freeing of bios (bio_alloc, bio_get, bio_put). | 615 | freeing of bios (bio_alloc, bio_get, bio_put). |
@@ -606,10 +636,13 @@ case of bio, these routines make use of the standard slab allocator. | |||
606 | The caller of bio_alloc is expected to taken certain steps to avoid | 636 | The caller of bio_alloc is expected to taken certain steps to avoid |
607 | deadlocks, e.g. avoid trying to allocate more memory from the pool while | 637 | deadlocks, e.g. avoid trying to allocate more memory from the pool while |
608 | already holding memory obtained from the pool. | 638 | already holding memory obtained from the pool. |
609 | [TBD: This is a potential issue, though a rare possibility | 639 | |
610 | in the bounce bio allocation that happens in the current code, since | 640 | :: |
611 | it ends up allocating a second bio from the same pool while | 641 | |
612 | holding the original bio ] | 642 | [TBD: This is a potential issue, though a rare possibility |
643 | in the bounce bio allocation that happens in the current code, since | ||
644 | it ends up allocating a second bio from the same pool while | ||
645 | holding the original bio ] | ||
613 | 646 | ||
614 | Memory allocated from the pool should be released back within a limited | 647 | Memory allocated from the pool should be released back within a limited |
615 | amount of time (in the case of bio, that would be after the i/o is completed). | 648 | amount of time (in the case of bio, that would be after the i/o is completed). |
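A minimal sketch of the get/put discipline implied above, with hypothetical my_* driver hooks; the point is simply that any extra reference taken with bio_get() must be dropped with bio_put() promptly, so the mempool-backed memory goes back to the pool::

	#include <linux/bio.h>

	static void my_queue_for_later(struct bio *bio);	/* hypothetical async machinery */

	static void my_start_async_io(struct bio *bio)
	{
		bio_get(bio);		/* extra reference across the async path */
		my_queue_for_later(bio);
	}

	static void my_async_io_done(struct bio *bio)
	{
		bio_put(bio);		/* freed once the last reference is dropped */
	}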
@@ -635,14 +668,18 @@ same bio_vec_list). This would typically be used for splitting i/o requests | |||
635 | in lvm or md. | 668 | in lvm or md. |
636 | 669 | ||
637 | 3.2 Generic bio helper Routines | 670 | 3.2 Generic bio helper Routines |
671 | ------------------------------- | ||
638 | 672 | ||
639 | 3.2.1 Traversing segments and completion units in a request | 673 | 3.2.1 Traversing segments and completion units in a request |
674 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
640 | 675 | ||
641 | The macro rq_for_each_segment() should be used for traversing the bios | 676 | The macro rq_for_each_segment() should be used for traversing the bios |
642 | in the request list (drivers should avoid directly trying to do it | 677 | in the request list (drivers should avoid directly trying to do it |
643 | themselves). Using these helpers should also make it easier to cope | 678 | themselves). Using these helpers should also make it easier to cope |
644 | with block changes in the future. | 679 | with block changes in the future. |
645 | 680 | ||
681 | :: | ||
682 | |||
646 | struct req_iterator iter; | 683 | struct req_iterator iter; |
647 | rq_for_each_segment(bio_vec, rq, iter) | 684 | rq_for_each_segment(bio_vec, rq, iter) |
648 | /* bio_vec is now current segment */ | 685 | /* bio_vec is now current segment */ |
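A slightly fuller, hedged version of the skeleton above: my_device_xfer() is a hypothetical driver routine, recent kernels pass the iteration variable by value, and the direct page_address() access assumes a lowmem page::

	#include <linux/blkdev.h>
	#include <linux/bio.h>
	#include <linux/mm.h>

	static void my_device_xfer(void *buf, unsigned int len);	/* hypothetical */

	static void my_process_request(struct request *rq)
	{
		struct req_iterator iter;
		struct bio_vec bvec;

		rq_for_each_segment(bvec, rq, iter) {
			/* bvec is now the current segment; a highmem page
			 * would need kmap() instead of page_address() */
			my_device_xfer(page_address(bvec.bv_page) + bvec.bv_offset,
				       bvec.bv_len);
		}
	}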
@@ -653,6 +690,7 @@ which don't make a distinction between segments and completion units would | |||
653 | need to be reorganized to support multi-segment bios. | 690 | need to be reorganized to support multi-segment bios. |
654 | 691 | ||
655 | 3.2.2 Setting up DMA scatterlists | 692 | 3.2.2 Setting up DMA scatterlists |
693 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
656 | 694 | ||
657 | The blk_rq_map_sg() helper routine would be used for setting up scatter | 695 | The blk_rq_map_sg() helper routine would be used for setting up scatter |
658 | gather lists from a request, so a driver need not do it on its own. | 696 | gather lists from a request, so a driver need not do it on its own. |
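For instance, a driver might use the helper along these lines (a hedged sketch; MY_MAX_SEGMENTS and the surrounding driver structure are hypothetical)::

	#include <linux/blkdev.h>
	#include <linux/scatterlist.h>

	#define MY_MAX_SEGMENTS	128	/* hypothetical per-queue segment limit */

	static int my_prepare_dma(struct request_queue *q, struct request *rq,
				  struct scatterlist *sglist)
	{
		int nents;

		sg_init_table(sglist, MY_MAX_SEGMENTS);
		nents = blk_rq_map_sg(q, rq, sglist);	/* coalesces adjacent segments */
		/* the first nents entries of sglist can now be handed to dma_map_sg() */
		return nents;
	}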
@@ -683,6 +721,7 @@ of physical data segments in a request (i.e. the largest sized scatter list | |||
683 | a driver could handle) | 721 | a driver could handle) |
684 | 722 | ||
685 | 3.2.3 I/O completion | 723 | 3.2.3 I/O completion |
724 | ^^^^^^^^^^^^^^^^^^^^ | ||
686 | 725 | ||
687 | The existing generic block layer helper routines end_request, | 726 | The existing generic block layer helper routines end_request, |
688 | end_that_request_first and end_that_request_last can be used for i/o | 727 | end_that_request_first and end_that_request_last can be used for i/o |
@@ -691,8 +730,10 @@ request can be kicked of) as before. With the introduction of multi-page | |||
691 | bio support, end_that_request_first requires an additional argument indicating | 730 | bio support, end_that_request_first requires an additional argument indicating |
692 | the number of sectors completed. | 731 | the number of sectors completed. |
693 | 732 | ||
694 | 3.2.4 Implications for drivers that do not interpret bios (don't handle | 733 | 3.2.4 Implications for drivers that do not interpret bios |
695 | multiple segments) | 734 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
735 | |||
736 | (don't handle multiple segments) | ||
696 | 737 | ||
697 | Drivers that do not interpret bios e.g those which do not handle multiple | 738 | Drivers that do not interpret bios e.g those which do not handle multiple |
698 | segments and do not support i/o into high memory addresses (require bounce | 739 | segments and do not support i/o into high memory addresses (require bounce |
@@ -707,15 +748,18 @@ be used if only if the request has come down from block/bio path, not for | |||
707 | direct access requests which only specify rq->buffer without a valid rq->bio) | 748 | direct access requests which only specify rq->buffer without a valid rq->bio) |
708 | 749 | ||
709 | 3.3 I/O Submission | 750 | 3.3 I/O Submission |
751 | ------------------ | ||
710 | 752 | ||
711 | The routine submit_bio() is used to submit a single io. Higher level i/o | 753 | The routine submit_bio() is used to submit a single io. Higher level i/o |
712 | routines make use of this: | 754 | routines make use of this: |
713 | 755 | ||
714 | (a) Buffered i/o: | 756 | (a) Buffered i/o: |
757 | |||
715 | The routine submit_bh() invokes submit_bio() on a bio corresponding to the | 758 | The routine submit_bh() invokes submit_bio() on a bio corresponding to the |
716 | bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before. | 759 | bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before. |
717 | 760 | ||
718 | (b) Kiobuf i/o (for raw/direct i/o): | 761 | (b) Kiobuf i/o (for raw/direct i/o): |
762 | |||
719 | The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and | 763 | The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and |
720 | maps the array to one or more multi-page bios, issuing submit_bio() to | 764 | maps the array to one or more multi-page bios, issuing submit_bio() to |
721 | perform the i/o on each of these. | 765 | perform the i/o on each of these. |
@@ -738,6 +782,7 @@ Todo/Observation: | |||
738 | 782 | ||
739 | 783 | ||
740 | (c) Page i/o: | 784 | (c) Page i/o: |
785 | |||
741 | Todo/Under discussion: | 786 | Todo/Under discussion: |
742 | 787 | ||
743 | Andrew Morton's multi-page bio patches attempt to issue multi-page | 788 | Andrew Morton's multi-page bio patches attempt to issue multi-page |
@@ -753,6 +798,7 @@ Todo/Under discussion: | |||
753 | abstraction, but intended to be as lightweight as possible). | 798 | abstraction, but intended to be as lightweight as possible). |
754 | 799 | ||
755 | (d) Direct access i/o: | 800 | (d) Direct access i/o: |
801 | |||
756 | Direct access requests that do not contain bios would be submitted differently | 802 | Direct access requests that do not contain bios would be submitted differently |
757 | as discussed earlier in section 1.3. | 803 | as discussed earlier in section 1.3. |
758 | 804 | ||
@@ -780,11 +826,13 @@ Aside: | |||
780 | 826 | ||
781 | 827 | ||
782 | 4. The I/O scheduler | 828 | 4. The I/O scheduler |
829 | ==================== | ||
830 | |||
783 | I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch | 831 | I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch |
784 | queue and specific I/O schedulers. Unless stated otherwise, elevator is used | 832 | queue and specific I/O schedulers. Unless stated otherwise, elevator is used |
785 | to refer to both parts and I/O scheduler to specific I/O schedulers. | 833 | to refer to both parts and I/O scheduler to specific I/O schedulers. |
786 | 834 | ||
787 | Block layer implements generic dispatch queue in block/*.c. | 835 | Block layer implements generic dispatch queue in `block/*.c`. |
788 | The generic dispatch queue is responsible for requeueing, handling non-fs | 836 | The generic dispatch queue is responsible for requeueing, handling non-fs |
789 | requests and all other subtleties. | 837 | requests and all other subtleties. |
790 | 838 | ||
@@ -802,8 +850,11 @@ doesn't implement a function, the switch does nothing or some minimal house | |||
802 | keeping work. | 850 | keeping work. |
803 | 851 | ||
804 | 4.1. I/O scheduler API | 852 | 4.1. I/O scheduler API |
853 | ---------------------- | ||
805 | 854 | ||
806 | The functions an elevator may implement are: (* are mandatory) | 855 | The functions an elevator may implement are: (* are mandatory) |
856 | |||
857 | =============================== ================================================ | ||
807 | elevator_merge_fn called to query requests for merge with a bio | 858 | elevator_merge_fn called to query requests for merge with a bio |
808 | 859 | ||
809 | elevator_merge_req_fn called when two requests get merged. the one | 860 | elevator_merge_req_fn called when two requests get merged. the one |
@@ -862,8 +913,11 @@ elevator_deactivate_req_fn Called when device driver decides to delay | |||
862 | elevator_init_fn* | 913 | elevator_init_fn* |
863 | elevator_exit_fn Allocate and free any elevator specific storage | 914 | elevator_exit_fn Allocate and free any elevator specific storage |
864 | for a queue. | 915 | for a queue. |
916 | =============================== ================================================ | ||
865 | 917 | ||
866 | 4.2 Request flows seen by I/O schedulers | 918 | 4.2 Request flows seen by I/O schedulers |
919 | ---------------------------------------- | ||
920 | |||
867 | All requests seen by I/O schedulers strictly follow one of the following three | 921 | All requests seen by I/O schedulers strictly follow one of the following three |
868 | flows. | 922 | flows. |
869 | 923 | ||
@@ -877,9 +931,12 @@ flows. | |||
877 | -> put_req_fn | 931 | -> put_req_fn |
878 | 932 | ||
879 | 4.3 I/O scheduler implementation | 933 | 4.3 I/O scheduler implementation |
934 | -------------------------------- | ||
935 | |||
880 | The generic i/o scheduler algorithm attempts to sort/merge/batch requests for | 936 | The generic i/o scheduler algorithm attempts to sort/merge/batch requests for |
881 | optimal disk scan and request servicing performance (based on generic | 937 | optimal disk scan and request servicing performance (based on generic |
882 | principles and device capabilities), optimized for: | 938 | principles and device capabilities), optimized for: |
939 | |||
883 | i. improved throughput | 940 | i. improved throughput |
884 | ii. improved latency | 941 | ii. improved latency |
885 | iii. better utilization of h/w & CPU time | 942 | iii. better utilization of h/w & CPU time |
@@ -933,15 +990,19 @@ Aside: | |||
933 | a big request from the broken up pieces coming by. | 990 | a big request from the broken up pieces coming by. |
934 | 991 | ||
935 | 4.4 I/O contexts | 992 | 4.4 I/O contexts |
993 | ---------------- | ||
994 | |||
936 | I/O contexts provide a dynamically allocated per process data area. They may | 995 | I/O contexts provide a dynamically allocated per process data area. They may |
937 | be used in I/O schedulers, and in the block layer (could be used for IO stats, | 996 | be used in I/O schedulers, and in the block layer (could be used for IO stats, |
938 | priorities for example). See *io_context in block/ll_rw_blk.c, and as-iosched.c | 997 | priorities for example). See `*io_context` in block/ll_rw_blk.c, and as-iosched.c |
939 | for an example of usage in an i/o scheduler. | 998 | for an example of usage in an i/o scheduler. |
940 | 999 | ||
941 | 1000 | ||
942 | 5. Scalability related changes | 1001 | 5. Scalability related changes |
1002 | ============================== | ||
943 | 1003 | ||
944 | 5.1 Granular Locking: io_request_lock replaced by a per-queue lock | 1004 | 5.1 Granular Locking: io_request_lock replaced by a per-queue lock |
1005 | ------------------------------------------------------------------ | ||
945 | 1006 | ||
946 | The global io_request_lock has been removed as of 2.5, to avoid | 1007 | The global io_request_lock has been removed as of 2.5, to avoid |
947 | the scalability bottleneck it was causing, and has been replaced by more | 1008 | the scalability bottleneck it was causing, and has been replaced by more |
@@ -956,20 +1017,23 @@ request_fn execution which it means that lots of older drivers | |||
956 | should still be SMP safe. Drivers are free to drop the queue | 1017 | should still be SMP safe. Drivers are free to drop the queue |
957 | lock themselves, if required. Drivers that explicitly used the | 1018 | lock themselves, if required. Drivers that explicitly used the |
958 | io_request_lock for serialization need to be modified accordingly. | 1019 | io_request_lock for serialization need to be modified accordingly. |
959 | Usually it's as easy as adding a global lock: | 1020 | Usually it's as easy as adding a global lock:: |
960 | 1021 | ||
961 | static DEFINE_SPINLOCK(my_driver_lock); | 1022 | static DEFINE_SPINLOCK(my_driver_lock); |
962 | 1023 | ||
963 | and passing the address to that lock to blk_init_queue(). | 1024 | and passing the address to that lock to blk_init_queue(). |
964 | 1025 | ||
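Putting the two steps together, a minimal sketch of that pattern, assuming the blk_init_queue(request_fn, lock) interface of the time; the driver and handler names below are placeholders::

    #include <linux/blkdev.h>
    #include <linux/errno.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(my_driver_lock);

    /* my_request_fn is the driver's (placeholder) request handler; the
     * block layer takes my_driver_lock around request_fn execution, so
     * the driver keeps the old io_request_lock style serialization. */
    static int my_driver_setup_queue(void)
    {
            struct request_queue *q;

            q = blk_init_queue(my_request_fn, &my_driver_lock);
            if (!q)
                    return -ENOMEM;
            return 0;
    }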
965 | 5.2 64 bit sector numbers (sector_t prepares for 64 bit support) | 1026 | 5.2 64 bit sector numbers (sector_t prepares for 64 bit support) |
1027 | ---------------------------------------------------------------- | ||
966 | 1028 | ||
967 | The sector number used in the bio structure has been changed to sector_t, | 1029 | The sector number used in the bio structure has been changed to sector_t, |
968 | which could be defined as 64 bit in preparation for 64 bit sector support. | 1030 | which could be defined as 64 bit in preparation for 64 bit sector support. |
969 | 1031 | ||
970 | 6. Other Changes/Implications | 1032 | 6. Other Changes/Implications |
1033 | ============================= | ||
971 | 1034 | ||
972 | 6.1 Partition re-mapping handled by the generic block layer | 1035 | 6.1 Partition re-mapping handled by the generic block layer |
1036 | ----------------------------------------------------------- | ||
973 | 1037 | ||
974 | In 2.5 some of the gendisk/partition related code has been reorganized. | 1038 | In 2.5 some of the gendisk/partition related code has been reorganized. |
975 | Now the generic block layer performs partition-remapping early and thus | 1039 | Now the generic block layer performs partition-remapping early and thus |
@@ -984,6 +1048,7 @@ sent are offset from the beginning of the device. | |||
984 | 1048 | ||
985 | 1049 | ||
986 | 7. A Few Tips on Migration of older drivers | 1050 | 7. A Few Tips on Migration of older drivers |
1051 | =========================================== | ||
987 | 1052 | ||
988 | Old-style drivers that just use CURRENT and ignore clustered requests | 1053 | Old-style drivers that just use CURRENT and ignore clustered requests |
989 | may not need much change. The generic layer will automatically handle | 1054 | may not need much change. The generic layer will automatically handle |
@@ -1017,12 +1082,12 @@ blk_init_queue time. | |||
1017 | 1082 | ||
1018 | Drivers no longer have to map a {partition, sector offset} into the | 1083 | Drivers no longer have to map a {partition, sector offset} into the |
1019 | correct absolute location anymore, this is done by the block layer, so | 1084 | correct absolute location anymore, this is done by the block layer, so |
1020 | where a driver received a request like this before: | 1085 | where a driver received a request like this before:: |
1021 | 1086 | ||
1022 | rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */ | 1087 | rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */ |
1023 | rq->sector = 0; /* first sector on hda5 */ | 1088 | rq->sector = 0; /* first sector on hda5 */ |
1024 | 1089 | ||
1025 | it will now see | 1090 | it will now see:: |
1026 | 1091 | ||
1027 | rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */ | 1092 | rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */ |
1028 | rq->sector = 123128; /* offset from start of disk */ | 1093 | rq->sector = 123128; /* offset from start of disk */ |
@@ -1039,38 +1104,65 @@ a bio into the virtual address space. | |||
1039 | 1104 | ||
1040 | 1105 | ||
1041 | 8. Prior/Related/Impacted patches | 1106 | 8. Prior/Related/Impacted patches |
1107 | ================================= | ||
1042 | 1108 | ||
1043 | 8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp) | 1109 | 8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp) |
1110 | ----------------------------------------------------- | ||
1111 | |||
1044 | - orig kiobuf & raw i/o patches (now in 2.4 tree) | 1112 | - orig kiobuf & raw i/o patches (now in 2.4 tree) |
1045 | - direct kiobuf based i/o to devices (no intermediate bh's) | 1113 | - direct kiobuf based i/o to devices (no intermediate bh's) |
1046 | - page i/o using kiobuf | 1114 | - page i/o using kiobuf |
1047 | - kiobuf splitting for lvm (mkp) | 1115 | - kiobuf splitting for lvm (mkp) |
1048 | - elevator support for kiobuf request merging (axboe) | 1116 | - elevator support for kiobuf request merging (axboe) |
1117 | |||
1049 | 8.2. Zero-copy networking (Dave Miller) | 1118 | 8.2. Zero-copy networking (Dave Miller) |
1119 | --------------------------------------- | ||
1120 | |||
1050 | 8.3. SGI XFS - pagebuf patches - use of kiobufs | 1121 | 8.3. SGI XFS - pagebuf patches - use of kiobufs |
1122 | ----------------------------------------------- | ||
1051 | 8.4. Multi-page pioent patch for bio (Christoph Hellwig) | 1123 | 8.4. Multi-page pioent patch for bio (Christoph Hellwig) |
1124 | -------------------------------------------------------- | ||
1052 | 8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11 | 1125 | 8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11 |
1126 | -------------------------------------------------------------------- | ||
1053 | 8.6. Async i/o implementation patch (Ben LaHaise) | 1127 | 8.6. Async i/o implementation patch (Ben LaHaise) |
1128 | ------------------------------------------------- | ||
1054 | 8.7. EVMS layering design (IBM EVMS team) | 1129 | 8.7. EVMS layering design (IBM EVMS team) |
1055 | 8.8. Larger page cache size patch (Ben LaHaise) and | 1130 | ----------------------------------------- |
1056 | Large page size (Daniel Phillips) | 1131 | 8.8. Larger page cache size patch (Ben LaHaise) and Large page size (Daniel Phillips) |
1132 | ------------------------------------------------------------------------------------- | ||
1133 | |||
1057 | => larger contiguous physical memory buffers | 1134 | => larger contiguous physical memory buffers |
1135 | |||
1058 | 8.9. VM reservations patch (Ben LaHaise) | 1136 | 8.9. VM reservations patch (Ben LaHaise) |
1137 | ---------------------------------------- | ||
1059 | 8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?) | 1138 | 8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?) |
1139 | ---------------------------------------------------------- | ||
1060 | 8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+ | 1140 | 8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+ |
1061 | 8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar, | 1141 | --------------------------------------------------------------------------- |
1062 | Badari) | 1142 | 8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar, Badari) |
1143 | ------------------------------------------------------------------------------- | ||
1063 | 8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven) | 1144 | 8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven) |
1145 | ------------------------------------------------------------------ | ||
1064 | 8.14 IDE Taskfile i/o patch (Andre Hedrick) | 1146 | 8.14 IDE Taskfile i/o patch (Andre Hedrick) |
1147 | -------------------------------------------- | ||
1065 | 8.15 Multi-page writeout and readahead patches (Andrew Morton) | 1148 | 8.15 Multi-page writeout and readahead patches (Andrew Morton) |
1149 | --------------------------------------------------------------- | ||
1066 | 8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy) | 1150 | 8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy) |
1151 | ----------------------------------------------------------------------- | ||
1067 | 1152 | ||
1068 | 9. Other References: | 1153 | 9. Other References |
1154 | =================== | ||
1069 | 1155 | ||
1070 | 9.1 The Splice I/O Model - Larry McVoy (and subsequent discussions on lkml, | 1156 | 9.1 The Splice I/O Model |
1071 | and Linus' comments - Jan 2001) | 1157 | ------------------------ |
1072 | 9.2 Discussions about kiobuf and bh design on lkml between sct, linus, alan | 1158 | |
1073 | et al - Feb-March 2001 (many of the initial thoughts that led to bio were | 1159 | Larry McVoy (and subsequent discussions on lkml, and Linus' comments - Jan 2001) |
1074 | brought up in this discussion thread) | 1160 | |
1075 | 9.3 Discussions on mempool on lkml - Dec 2001. | 1161 | 9.2 Discussions about kiobuf and bh design |
1162 | ------------------------------------------ | ||
1076 | 1163 | ||
1164 | On lkml between sct, linus, alan et al - Feb-March 2001 (many of the | ||
1165 | initial thoughts that led to bio were brought up in this discussion thread) | ||
1166 | |||
1167 | 9.3 Discussions on mempool on lkml - Dec 2001. | ||
1168 | ---------------------------------------------- | ||
diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.rst index ce6eccaf5df7..86fa66c87172 100644 --- a/Documentation/block/biovecs.txt +++ b/Documentation/block/biovecs.rst | |||
@@ -1,6 +1,6 @@ | |||
1 | 1 | ====================================== | |
2 | Immutable biovecs and biovec iterators: | 2 | Immutable biovecs and biovec iterators |
3 | ======================================= | 3 | ====================================== |
4 | 4 | ||
5 | Kent Overstreet <kmo@daterainc.com> | 5 | Kent Overstreet <kmo@daterainc.com> |
6 | 6 | ||
@@ -121,10 +121,12 @@ Other implications: | |||
121 | Usage of helpers: | 121 | Usage of helpers: |
122 | ================= | 122 | ================= |
123 | 123 | ||
124 | * The following helpers whose names have the suffix of "_all" can only be used | 124 | * The following helpers whose names have the suffix of `_all` can only be used |
125 | on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers | 125 | on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers |
126 | shouldn't use them because the bio may have been split before it reached the | 126 | shouldn't use them because the bio may have been split before it reached the |
127 | driver. | 127 | driver. |
128 | |||
129 | :: | ||
128 | 130 | ||
129 | bio_for_each_segment_all() | 131 | bio_for_each_segment_all() |
130 | bio_first_bvec_all() | 132 | bio_first_bvec_all() |
@@ -132,13 +134,13 @@ driver. | |||
132 | bio_last_bvec_all() | 134 | bio_last_bvec_all() |
133 | 135 | ||
134 | * The following helpers iterate over single-page segment. The passed 'struct | 136 | * The following helpers iterate over single-page segment. The passed 'struct |
135 | bio_vec' will contain a single-page IO vector during the iteration | 137 | bio_vec' will contain a single-page IO vector during the iteration:: |
136 | 138 | ||
137 | bio_for_each_segment() | 139 | bio_for_each_segment() |
138 | bio_for_each_segment_all() | 140 | bio_for_each_segment_all() |
139 | 141 | ||
140 | * The following helpers iterate over multi-page bvec. The passed 'struct | 142 | * The following helpers iterate over multi-page bvec. The passed 'struct |
141 | bio_vec' will contain a multi-page IO vector during the iteration | 143 | bio_vec' will contain a multi-page IO vector during the iteration:: |
142 | 144 | ||
143 | bio_for_each_bvec() | 145 | bio_for_each_bvec() |
144 | rq_for_each_bvec() | 146 | rq_for_each_bvec() |
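A short sketch of the single-page iteration described above; the zeroing body is only an example and assumes the pages are directly mappable::

    struct bio_vec bvec;
    struct bvec_iter iter;

    /* bvec is a copy of each single-page segment, not a pointer into
     * the bio, so the underlying biovec stays untouched. */
    bio_for_each_segment(bvec, bio, iter) {
            void *addr = page_address(bvec.bv_page) + bvec.bv_offset;

            memset(addr, 0, bvec.bv_len);
    }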
diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst new file mode 100644 index 000000000000..2cf258d64bbe --- /dev/null +++ b/Documentation/block/capability.rst | |||
@@ -0,0 +1,18 @@ | |||
1 | =============================== | ||
2 | Generic Block Device Capability | ||
3 | =============================== | ||
4 | |||
5 | This file documents the sysfs file block/<disk>/capability | ||
6 | |||
7 | capability is a hex word indicating which capabilities a specific disk | ||
8 | supports. For more information on bits not listed here, see | ||
9 | include/linux/genhd.h | ||
10 | |||
11 | GENHD_FL_MEDIA_CHANGE_NOTIFY | ||
12 | ---------------------------- | ||
13 | |||
14 | Value: 4 | ||
15 | |||
16 | When this bit is set, the disk supports Asynchronous Notification | ||
17 | of media change events. These events will be broadcast to user | ||
18 | space via kernel uevent. | ||
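A small userspace sketch of checking that bit; "sr0" is only an example disk name::

    #include <stdio.h>

    int main(void)
    {
            unsigned int cap;
            FILE *f = fopen("/sys/block/sr0/capability", "r");

            if (!f || fscanf(f, "%x", &cap) != 1)
                    return 1;
            /* GENHD_FL_MEDIA_CHANGE_NOTIFY == 4 */
            printf("media change notification: %s\n",
                   (cap & 4) ? "supported" : "not supported");
            fclose(f);
            return 0;
    }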
diff --git a/Documentation/block/capability.txt b/Documentation/block/capability.txt deleted file mode 100644 index 2f1729424ef4..000000000000 --- a/Documentation/block/capability.txt +++ /dev/null | |||
@@ -1,15 +0,0 @@ | |||
1 | Generic Block Device Capability | ||
2 | =============================================================================== | ||
3 | This file documents the sysfs file block/<disk>/capability | ||
4 | |||
5 | capability is a hex word indicating which capabilities a specific disk | ||
6 | supports. For more information on bits not listed here, see | ||
7 | include/linux/genhd.h | ||
8 | |||
9 | Capability Value | ||
10 | ------------------------------------------------------------------------------- | ||
11 | GENHD_FL_MEDIA_CHANGE_NOTIFY 4 | ||
12 | When this bit is set, the disk supports Asynchronous Notification | ||
13 | of media change events. These events will be broadcast to user | ||
14 | space via kernel uevent. | ||
15 | |||
diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.rst index 760a3f7c3ed4..530bedff548a 100644 --- a/Documentation/block/cmdline-partition.txt +++ b/Documentation/block/cmdline-partition.rst | |||
@@ -1,5 +1,6 @@ | |||
1 | ============================================== | ||
1 | Embedded device command line partition parsing | 2 | Embedded device command line partition parsing |
2 | ===================================================================== | 3 | ============================================== |
3 | 4 | ||
4 | The "blkdevparts" command line option adds support for reading the | 5 | The "blkdevparts" command line option adds support for reading the |
5 | block device partition table from the kernel command line. | 6 | block device partition table from the kernel command line. |
@@ -22,12 +23,15 @@ blkdevparts=<blkdev-def>[;<blkdev-def>] | |||
22 | <size> | 23 | <size> |
23 | partition size, in bytes, such as: 512, 1m, 1G. | 24 | partition size, in bytes, such as: 512, 1m, 1G. |
24 | size may contain an optional suffix of (upper or lower case): | 25 | size may contain an optional suffix of (upper or lower case): |
26 | |||
25 | K, M, G, T, P, E. | 27 | K, M, G, T, P, E. |
28 | |||
26 | "-" is used to denote all remaining space. | 29 | "-" is used to denote all remaining space. |
27 | 30 | ||
28 | <offset> | 31 | <offset> |
29 | partition start address, in bytes. | 32 | partition start address, in bytes. |
30 | offset may contain an optional suffix of (upper or lower case): | 33 | offset may contain an optional suffix of (upper or lower case): |
34 | |||
31 | K, M, G, T, P, E. | 35 | K, M, G, T, P, E. |
32 | 36 | ||
33 | (part-name) | 37 | (part-name) |
@@ -36,11 +40,14 @@ blkdevparts=<blkdev-def>[;<blkdev-def>] | |||
36 | User space application can access partition by partition name. | 40 | User space application can access partition by partition name. |
37 | 41 | ||
38 | Example: | 42 | Example: |
43 | |||
39 | eMMC disk names are "mmcblk0" and "mmcblk0boot0". | 44 | eMMC disk names are "mmcblk0" and "mmcblk0boot0". |
40 | 45 | ||
41 | bootargs: | 46 | bootargs:: |
47 | |||
42 | 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)' | 48 | 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)' |
43 | 49 | ||
44 | dmesg: | 50 | dmesg:: |
51 | |||
45 | mmcblk0: p1(data0) p2(data1) p3() | 52 | mmcblk0: p1(data0) p2(data1) p3() |
46 | mmcblk0boot0: p1(boot) p2(kernel) | 53 | mmcblk0boot0: p1(boot) p2(kernel) |
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.rst index 934c44ea0c57..4f2452a95c43 100644 --- a/Documentation/block/data-integrity.txt +++ b/Documentation/block/data-integrity.rst | |||
@@ -1,5 +1,9 @@ | |||
1 | ---------------------------------------------------------------------- | 1 | ============== |
2 | 1. INTRODUCTION | 2 | Data Integrity |
3 | ============== | ||
4 | |||
5 | 1. Introduction | ||
6 | =============== | ||
3 | 7 | ||
4 | Modern filesystems feature checksumming of data and metadata to | 8 | Modern filesystems feature checksumming of data and metadata to |
5 | protect against data corruption. However, the detection of the | 9 | protect against data corruption. However, the detection of the |
@@ -28,8 +32,8 @@ integrity of the I/O and reject it if corruption is detected. This | |||
28 | allows not only corruption prevention but also isolation of the point | 32 | allows not only corruption prevention but also isolation of the point |
29 | of failure. | 33 | of failure. |
30 | 34 | ||
31 | ---------------------------------------------------------------------- | 35 | 2. The Data Integrity Extensions |
32 | 2. THE DATA INTEGRITY EXTENSIONS | 36 | ================================ |
33 | 37 | ||
34 | As written, the protocol extensions only protect the path between | 38 | As written, the protocol extensions only protect the path between |
35 | controller and storage device. However, many controllers actually | 39 | controller and storage device. However, many controllers actually |
@@ -75,8 +79,8 @@ Extensions. As these extensions are outside the scope of the protocol | |||
75 | bodies (T10, T13), Oracle and its partners are trying to standardize | 79 | bodies (T10, T13), Oracle and its partners are trying to standardize |
76 | them within the Storage Networking Industry Association. | 80 | them within the Storage Networking Industry Association. |
77 | 81 | ||
78 | ---------------------------------------------------------------------- | 82 | 3. Kernel Changes |
79 | 3. KERNEL CHANGES | 83 | ================= |
80 | 84 | ||
81 | The data integrity framework in Linux enables protection information | 85 | The data integrity framework in Linux enables protection information |
82 | to be pinned to I/Os and sent to/received from controllers that | 86 | to be pinned to I/Os and sent to/received from controllers that |
@@ -123,10 +127,11 @@ access to manipulate the tags from user space. A passthrough | |||
123 | interface for this is being worked on. | 127 | interface for this is being worked on. |
124 | 128 | ||
125 | 129 | ||
126 | ---------------------------------------------------------------------- | 130 | 4. Block Layer Implementation Details |
127 | 4. BLOCK LAYER IMPLEMENTATION DETAILS | 131 | ===================================== |
128 | 132 | ||
129 | 4.1 BIO | 133 | 4.1 Bio |
134 | ------- | ||
130 | 135 | ||
131 | The data integrity patches add a new field to struct bio when | 136 | The data integrity patches add a new field to struct bio when |
132 | CONFIG_BLK_DEV_INTEGRITY is enabled. bio_integrity(bio) returns a | 137 | CONFIG_BLK_DEV_INTEGRITY is enabled. bio_integrity(bio) returns a |
@@ -145,7 +150,8 @@ attached using bio_integrity_add_page(). | |||
145 | bio_free() will automatically free the bip. | 150 | bio_free() will automatically free the bip. |
146 | 151 | ||
147 | 152 | ||
148 | 4.2 BLOCK DEVICE | 153 | 4.2 Block Device |
154 | ---------------- | ||
149 | 155 | ||
150 | Because the format of the protection data is tied to the physical | 156 | Because the format of the protection data is tied to the physical |
151 | disk, each block device has been extended with a block integrity | 157 | disk, each block device has been extended with a block integrity |
@@ -163,10 +169,11 @@ and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6 | |||
163 | will require extra work due to the application tag. | 169 | will require extra work due to the application tag. |
164 | 170 | ||
165 | 171 | ||
166 | ---------------------------------------------------------------------- | 172 | 5.0 Block Layer Integrity API |
167 | 5.0 BLOCK LAYER INTEGRITY API | 173 | ============================= |
168 | 174 | ||
169 | 5.1 NORMAL FILESYSTEM | 175 | 5.1 Normal Filesystem |
176 | --------------------- | ||
170 | 177 | ||
171 | The normal filesystem is unaware that the underlying block device | 178 | The normal filesystem is unaware that the underlying block device |
172 | is capable of sending/receiving integrity metadata. The IMD will | 179 | is capable of sending/receiving integrity metadata. The IMD will |
@@ -174,25 +181,26 @@ will require extra work due to the application tag. | |||
174 | in case of a WRITE. A READ request will cause the I/O integrity | 181 | in case of a WRITE. A READ request will cause the I/O integrity |
175 | to be verified upon completion. | 182 | to be verified upon completion. |
176 | 183 | ||
177 | IMD generation and verification can be toggled using the | 184 | IMD generation and verification can be toggled using the:: |
178 | 185 | ||
179 | /sys/block/<bdev>/integrity/write_generate | 186 | /sys/block/<bdev>/integrity/write_generate |
180 | 187 | ||
181 | and | 188 | and:: |
182 | 189 | ||
183 | /sys/block/<bdev>/integrity/read_verify | 190 | /sys/block/<bdev>/integrity/read_verify |
184 | 191 | ||
185 | flags. | 192 | flags. |
186 | 193 | ||
187 | 194 | ||
188 | 5.2 INTEGRITY-AWARE FILESYSTEM | 195 | 5.2 Integrity-Aware Filesystem |
196 | ------------------------------ | ||
189 | 197 | ||
190 | A filesystem that is integrity-aware can prepare I/Os with IMD | 198 | A filesystem that is integrity-aware can prepare I/Os with IMD |
191 | attached. It can also use the application tag space if this is | 199 | attached. It can also use the application tag space if this is |
192 | supported by the block device. | 200 | supported by the block device. |
193 | 201 | ||
194 | 202 | ||
195 | bool bio_integrity_prep(bio); | 203 | `bool bio_integrity_prep(bio);` |
196 | 204 | ||
197 | To generate IMD for WRITE and to set up buffers for READ, the | 205 | To generate IMD for WRITE and to set up buffers for READ, the |
198 | filesystem must call bio_integrity_prep(bio). | 206 | filesystem must call bio_integrity_prep(bio). |
@@ -204,14 +212,15 @@ will require extra work due to the application tag. | |||
204 | Complete bio with error if prepare failed for some reason. | 212 | Complete bio with error if prepare failed for some reason. |
205 | 213 | ||
206 | 214 | ||
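A minimal sketch of that calling convention as stated here; note that in some kernel versions the error completion happens inside bio_integrity_prep() itself, so treat this as illustrative only::

    /* write path: attach/generate protection information before the
     * bio goes down the stack */
    if (bio_integrity_prep(bio))
            submit_bio(bio);
    /* else: the bio is completed with an error, as described above */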
207 | 5.3 PASSING EXISTING INTEGRITY METADATA | 215 | 5.3 Passing Existing Integrity Metadata |
216 | --------------------------------------- | ||
208 | 217 | ||
209 | Filesystems that either generate their own integrity metadata or | 218 | Filesystems that either generate their own integrity metadata or |
210 | are capable of transferring IMD from user space can use the | 219 | are capable of transferring IMD from user space can use the |
211 | following calls: | 220 | following calls: |
212 | 221 | ||
213 | 222 | ||
214 | struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages); | 223 | `struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages);` |
215 | 224 | ||
216 | Allocates the bio integrity payload and hangs it off of the bio. | 225 | Allocates the bio integrity payload and hangs it off of the bio. |
217 | nr_pages indicates how many pages of protection data need to be | 226 | nr_pages indicates how many pages of protection data need to be |
@@ -220,7 +229,7 @@ will require extra work due to the application tag. | |||
220 | The integrity payload will be freed at bio_free() time. | 229 | The integrity payload will be freed at bio_free() time. |
221 | 230 | ||
222 | 231 | ||
223 | int bio_integrity_add_page(bio, page, len, offset); | 232 | `int bio_integrity_add_page(bio, page, len, offset);` |
224 | 233 | ||
225 | Attaches a page containing integrity metadata to an existing | 234 | Attaches a page containing integrity metadata to an existing |
226 | bio. The bio must have an existing bip, | 235 | bio. The bio must have an existing bip, |
@@ -241,21 +250,21 @@ will require extra work due to the application tag. | |||
241 | integrity upon completion. | 250 | integrity upon completion. |
242 | 251 | ||
243 | 252 | ||
244 | 5.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY | 253 | 5.4 Registering A Block Device As Capable Of Exchanging Integrity Metadata |
245 | METADATA | 254 | -------------------------------------------------------------------------- |
246 | 255 | ||
247 | To enable integrity exchange on a block device the gendisk must be | 256 | To enable integrity exchange on a block device the gendisk must be |
248 | registered as capable: | 257 | registered as capable: |
249 | 258 | ||
250 | int blk_integrity_register(gendisk, blk_integrity); | 259 | `int blk_integrity_register(gendisk, blk_integrity);` |
251 | 260 | ||
252 | The blk_integrity struct is a template and should contain the | 261 | The blk_integrity struct is a template and should contain the |
253 | following: | 262 | following:: |
254 | 263 | ||
255 | static struct blk_integrity my_profile = { | 264 | static struct blk_integrity my_profile = { |
256 | .name = "STANDARDSBODY-TYPE-VARIANT-CSUM", | 265 | .name = "STANDARDSBODY-TYPE-VARIANT-CSUM", |
257 | .generate_fn = my_generate_fn, | 266 | .generate_fn = my_generate_fn, |
258 | .verify_fn = my_verify_fn, | 267 | .verify_fn = my_verify_fn, |
259 | .tuple_size = sizeof(struct my_tuple_size), | 268 | .tuple_size = sizeof(struct my_tuple_size), |
260 | .tag_size = <tag bytes per hw sector>, | 269 | .tag_size = <tag bytes per hw sector>, |
261 | }; | 270 | }; |
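Following on from the template above, the registration call is then made against the disk before it is exposed; "disk" is an assumed struct gendisk pointer, not part of the text above::

    /* tell the block layer this disk can exchange protection
     * information in the my_profile format */
    blk_integrity_register(disk, &my_profile);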
@@ -278,4 +287,5 @@ will require extra work due to the application tag. | |||
278 | 0 depending on the value of the Control Mode Page ATO bit. | 287 | 0 depending on the value of the Control Mode Page ATO bit. |
279 | 288 | ||
280 | ---------------------------------------------------------------------- | 289 | ---------------------------------------------------------------------- |
290 | |||
281 | 2007-12-24 Martin K. Petersen <martin.petersen@oracle.com> | 291 | 2007-12-24 Martin K. Petersen <martin.petersen@oracle.com> |
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.rst index 2d82c80322cb..9f5c5a4c370e 100644 --- a/Documentation/block/deadline-iosched.txt +++ b/Documentation/block/deadline-iosched.rst | |||
@@ -1,3 +1,4 @@ | |||
1 | ============================== | ||
1 | Deadline IO scheduler tunables | 2 | Deadline IO scheduler tunables |
2 | ============================== | 3 | ============================== |
3 | 4 | ||
@@ -7,15 +8,13 @@ of interest to power users. | |||
7 | 8 | ||
8 | Selecting IO schedulers | 9 | Selecting IO schedulers |
9 | ----------------------- | 10 | ----------------------- |
10 | Refer to Documentation/block/switching-sched.txt for information on | 11 | Refer to Documentation/block/switching-sched.rst for information on |
11 | selecting an io scheduler on a per-device basis. | 12 | selecting an io scheduler on a per-device basis. |
12 | 13 | ||
13 | 14 | ------------------------------------------------------------------------------ | |
14 | ******************************************************************************** | ||
15 | |||
16 | 15 | ||
17 | read_expire (in ms) | 16 | read_expire (in ms) |
18 | ----------- | 17 | ----------------------- |
19 | 18 | ||
20 | The goal of the deadline io scheduler is to attempt to guarantee a start | 19 | The goal of the deadline io scheduler is to attempt to guarantee a start |
21 | service time for a request. As we focus mainly on read latencies, this is | 20 | service time for a request. As we focus mainly on read latencies, this is |
@@ -25,15 +24,15 @@ milliseconds. | |||
25 | 24 | ||
26 | 25 | ||
27 | write_expire (in ms) | 26 | write_expire (in ms) |
28 | ----------- | 27 | ----------------------- |
29 | 28 | ||
30 | Similar to read_expire mentioned above, but for writes. | 29 | Similar to read_expire mentioned above, but for writes. |
31 | 30 | ||
32 | 31 | ||
33 | fifo_batch (number of requests) | 32 | fifo_batch (number of requests) |
34 | ---------- | 33 | ------------------------------------ |
35 | 34 | ||
36 | Requests are grouped into ``batches'' of a particular data direction (read or | 35 | Requests are grouped into ``batches`` of a particular data direction (read or |
37 | write) which are serviced in increasing sector order. To limit extra seeking, | 36 | write) which are serviced in increasing sector order. To limit extra seeking, |
38 | deadline expiries are only checked between batches. fifo_batch controls the | 37 | deadline expiries are only checked between batches. fifo_batch controls the |
39 | maximum number of requests per batch. | 38 | maximum number of requests per batch. |
@@ -45,7 +44,7 @@ generally improves throughput, at the cost of latency variation. | |||
45 | 44 | ||
46 | 45 | ||
47 | writes_starved (number of dispatches) | 46 | writes_starved (number of dispatches) |
48 | -------------- | 47 | -------------------------------------- |
49 | 48 | ||
50 | When we have to move requests from the io scheduler queue to the block | 49 | When we have to move requests from the io scheduler queue to the block |
51 | device dispatch queue, we always give a preference to reads. However, we | 50 | device dispatch queue, we always give a preference to reads. However, we |
@@ -56,7 +55,7 @@ same criteria as reads. | |||
56 | 55 | ||
57 | 56 | ||
58 | front_merges (bool) | 57 | front_merges (bool) |
59 | ------------ | 58 | ---------------------- |
60 | 59 | ||
61 | Sometimes it happens that a request enters the io scheduler that is contiguous | 60 | Sometimes it happens that a request enters the io scheduler that is contiguous |
62 | with a request that is already on the queue. Either it fits in the back of that | 61 | with a request that is already on the queue. Either it fits in the back of that |
@@ -71,5 +70,3 @@ rbtree front sector lookup when the io scheduler merge function is called. | |||
71 | 70 | ||
72 | 71 | ||
73 | Nov 11 2002, Jens Axboe <jens.axboe@oracle.com> | 72 | Nov 11 2002, Jens Axboe <jens.axboe@oracle.com> |
74 | |||
75 | |||
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst new file mode 100644 index 000000000000..8cd226a0e86e --- /dev/null +++ b/Documentation/block/index.rst | |||
@@ -0,0 +1,25 @@ | |||
1 | :orphan: | ||
2 | |||
3 | ===== | ||
4 | Block | ||
5 | ===== | ||
6 | |||
7 | .. toctree:: | ||
8 | :maxdepth: 1 | ||
9 | |||
10 | bfq-iosched | ||
11 | biodoc | ||
12 | biovecs | ||
13 | capability | ||
14 | cmdline-partition | ||
15 | data-integrity | ||
16 | deadline-iosched | ||
17 | ioprio | ||
18 | kyber-iosched | ||
19 | null_blk | ||
20 | pr | ||
21 | queue-sysfs | ||
22 | request | ||
23 | stat | ||
24 | switching-sched | ||
25 | writeback_cache_control | ||
diff --git a/Documentation/block/ioprio.txt b/Documentation/block/ioprio.rst index 8ed8c59380b4..f72b0de65af7 100644 --- a/Documentation/block/ioprio.txt +++ b/Documentation/block/ioprio.rst | |||
@@ -1,3 +1,4 @@ | |||
1 | =================== | ||
1 | Block io priorities | 2 | Block io priorities |
2 | =================== | 3 | =================== |
3 | 4 | ||
@@ -40,81 +41,81 @@ class data, since it doesn't really apply here. | |||
40 | Tools | 41 | Tools |
41 | ----- | 42 | ----- |
42 | 43 | ||
43 | See below for a sample ionice tool. Usage: | 44 | See below for a sample ionice tool. Usage:: |
44 | 45 | ||
45 | # ionice -c<class> -n<level> -p<pid> | 46 | # ionice -c<class> -n<level> -p<pid> |
46 | 47 | ||
47 | If pid isn't given, the current process is assumed. IO priority settings | 48 | If pid isn't given, the current process is assumed. IO priority settings |
48 | are inherited on fork, so you can use ionice to start the process at a given | 49 | are inherited on fork, so you can use ionice to start the process at a given |
49 | level: | 50 | level:: |
50 | 51 | ||
51 | # ionice -c2 -n0 /bin/ls | 52 | # ionice -c2 -n0 /bin/ls |
52 | 53 | ||
53 | will run ls at the best-effort scheduling class at the highest priority. | 54 | will run ls at the best-effort scheduling class at the highest priority. |
54 | For a running process, you can give the pid instead: | 55 | For a running process, you can give the pid instead:: |
55 | 56 | ||
56 | # ionice -c1 -n2 -p100 | 57 | # ionice -c1 -n2 -p100 |
57 | 58 | ||
58 | will change pid 100 to run at the realtime scheduling class, at priority 2. | 59 | will change pid 100 to run at the realtime scheduling class, at priority 2. |
59 | 60 | ||
60 | ---> snip ionice.c tool <--- | 61 | ionice.c tool:: |
61 | 62 | ||
62 | #include <stdio.h> | 63 | #include <stdio.h> |
63 | #include <stdlib.h> | 64 | #include <stdlib.h> |
64 | #include <errno.h> | 65 | #include <errno.h> |
65 | #include <getopt.h> | 66 | #include <getopt.h> |
66 | #include <unistd.h> | 67 | #include <unistd.h> |
67 | #include <sys/ptrace.h> | 68 | #include <sys/ptrace.h> |
68 | #include <asm/unistd.h> | 69 | #include <asm/unistd.h> |
69 | 70 | ||
70 | extern int sys_ioprio_set(int, int, int); | 71 | extern int sys_ioprio_set(int, int, int); |
71 | extern int sys_ioprio_get(int, int); | 72 | extern int sys_ioprio_get(int, int); |
72 | 73 | ||
73 | #if defined(__i386__) | 74 | #if defined(__i386__) |
74 | #define __NR_ioprio_set 289 | 75 | #define __NR_ioprio_set 289 |
75 | #define __NR_ioprio_get 290 | 76 | #define __NR_ioprio_get 290 |
76 | #elif defined(__ppc__) | 77 | #elif defined(__ppc__) |
77 | #define __NR_ioprio_set 273 | 78 | #define __NR_ioprio_set 273 |
78 | #define __NR_ioprio_get 274 | 79 | #define __NR_ioprio_get 274 |
79 | #elif defined(__x86_64__) | 80 | #elif defined(__x86_64__) |
80 | #define __NR_ioprio_set 251 | 81 | #define __NR_ioprio_set 251 |
81 | #define __NR_ioprio_get 252 | 82 | #define __NR_ioprio_get 252 |
82 | #elif defined(__ia64__) | 83 | #elif defined(__ia64__) |
83 | #define __NR_ioprio_set 1274 | 84 | #define __NR_ioprio_set 1274 |
84 | #define __NR_ioprio_get 1275 | 85 | #define __NR_ioprio_get 1275 |
85 | #else | 86 | #else |
86 | #error "Unsupported arch" | 87 | #error "Unsupported arch" |
87 | #endif | 88 | #endif |
88 | 89 | ||
89 | static inline int ioprio_set(int which, int who, int ioprio) | 90 | static inline int ioprio_set(int which, int who, int ioprio) |
90 | { | 91 | { |
91 | return syscall(__NR_ioprio_set, which, who, ioprio); | 92 | return syscall(__NR_ioprio_set, which, who, ioprio); |
92 | } | 93 | } |
93 | 94 | ||
94 | static inline int ioprio_get(int which, int who) | 95 | static inline int ioprio_get(int which, int who) |
95 | { | 96 | { |
96 | return syscall(__NR_ioprio_get, which, who); | 97 | return syscall(__NR_ioprio_get, which, who); |
97 | } | 98 | } |
98 | 99 | ||
99 | enum { | 100 | enum { |
100 | IOPRIO_CLASS_NONE, | 101 | IOPRIO_CLASS_NONE, |
101 | IOPRIO_CLASS_RT, | 102 | IOPRIO_CLASS_RT, |
102 | IOPRIO_CLASS_BE, | 103 | IOPRIO_CLASS_BE, |
103 | IOPRIO_CLASS_IDLE, | 104 | IOPRIO_CLASS_IDLE, |
104 | }; | 105 | }; |
105 | 106 | ||
106 | enum { | 107 | enum { |
107 | IOPRIO_WHO_PROCESS = 1, | 108 | IOPRIO_WHO_PROCESS = 1, |
108 | IOPRIO_WHO_PGRP, | 109 | IOPRIO_WHO_PGRP, |
109 | IOPRIO_WHO_USER, | 110 | IOPRIO_WHO_USER, |
110 | }; | 111 | }; |
111 | 112 | ||
112 | #define IOPRIO_CLASS_SHIFT 13 | 113 | #define IOPRIO_CLASS_SHIFT 13 |
113 | 114 | ||
114 | const char *to_prio[] = { "none", "realtime", "best-effort", "idle", }; | 115 | const char *to_prio[] = { "none", "realtime", "best-effort", "idle", }; |
115 | 116 | ||
116 | int main(int argc, char *argv[]) | 117 | int main(int argc, char *argv[]) |
117 | { | 118 | { |
118 | int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE; | 119 | int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE; |
119 | int c, pid = 0; | 120 | int c, pid = 0; |
120 | 121 | ||
@@ -175,9 +176,7 @@ int main(int argc, char *argv[]) | |||
175 | } | 176 | } |
176 | 177 | ||
177 | return 0; | 178 | return 0; |
178 | } | 179 | } |
179 | |||
180 | ---> snip ionice.c tool <--- | ||
181 | 180 | ||
182 | 181 | ||
183 | March 11 2005, Jens Axboe <jens.axboe@oracle.com> | 182 | March 11 2005, Jens Axboe <jens.axboe@oracle.com> |
diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.rst index e94feacd7edc..3e164dd0617c 100644 --- a/Documentation/block/kyber-iosched.txt +++ b/Documentation/block/kyber-iosched.rst | |||
@@ -1,5 +1,6 @@ | |||
1 | ============================ | ||
1 | Kyber I/O scheduler tunables | 2 | Kyber I/O scheduler tunables |
2 | =========================== | 3 | ============================ |
3 | 4 | ||
4 | The only two tunables for the Kyber scheduler are the target latencies for | 5 | The only two tunables for the Kyber scheduler are the target latencies for |
5 | reads and synchronous writes. Kyber will throttle requests in order to meet | 6 | reads and synchronous writes. Kyber will throttle requests in order to meet |
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.rst index 41f0a3d33bbd..31451d80783c 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.rst | |||
@@ -1,33 +1,43 @@ | |||
1 | ======================== | ||
1 | Null block device driver | 2 | Null block device driver |
2 | ================================================================================ | 3 | ======================== |
3 | 4 | ||
4 | I. Overview | 5 | 1. Overview |
6 | =========== | ||
5 | 7 | ||
6 | The null block device (/dev/nullb*) is used for benchmarking the various | 8 | The null block device (/dev/nullb*) is used for benchmarking the various |
7 | block-layer implementations. It emulates a block device of X gigabytes in size. | 9 | block-layer implementations. It emulates a block device of X gigabytes in size. |
8 | The following instances are possible: | 10 | The following instances are possible: |
9 | 11 | ||
10 | Single-queue block-layer | 12 | Single-queue block-layer |
13 | |||
11 | - Request-based. | 14 | - Request-based. |
12 | - Single submission queue per device. | 15 | - Single submission queue per device. |
13 | - Implements IO scheduling algorithms (CFQ, Deadline, noop). | 16 | - Implements IO scheduling algorithms (CFQ, Deadline, noop). |
17 | |||
14 | Multi-queue block-layer | 18 | Multi-queue block-layer |
19 | |||
15 | - Request-based. | 20 | - Request-based. |
16 | - Configurable submission queues per device. | 21 | - Configurable submission queues per device. |
22 | |||
17 | No block-layer (Known as bio-based) | 23 | No block-layer (Known as bio-based) |
24 | |||
18 | - Bio-based. IO requests are submitted directly to the device driver. | 25 | - Bio-based. IO requests are submitted directly to the device driver. |
19 | - Directly accepts bio data structure and returns them. | 26 | - Directly accepts bio data structure and returns them. |
20 | 27 | ||
21 | All of them have a completion queue for each core in the system. | 28 | All of them have a completion queue for each core in the system. |
22 | 29 | ||
23 | II. Module parameters applicable for all instances: | 30 | 2. Module parameters applicable for all instances |
31 | ================================================= | ||
24 | 32 | ||
25 | queue_mode=[0-2]: Default: 2-Multi-queue | 33 | queue_mode=[0-2]: Default: 2-Multi-queue |
26 | Selects which block-layer the module should instantiate with. | 34 | Selects which block-layer the module should instantiate with. |
27 | 35 | ||
28 | 0: Bio-based. | 36 | = ============ |
29 | 1: Single-queue. | 37 | 0 Bio-based |
30 | 2: Multi-queue. | 38 | 1 Single-queue |
39 | 2 Multi-queue | ||
40 | = ============ | ||
31 | 41 | ||
32 | home_node=[0--nr_nodes]: Default: NUMA_NO_NODE | 42 | home_node=[0--nr_nodes]: Default: NUMA_NO_NODE |
33 | Selects what CPU node the data structures are allocated from. | 43 | Selects what CPU node the data structures are allocated from. |
@@ -45,12 +55,14 @@ nr_devices=[Number of devices]: Default: 1 | |||
45 | irqmode=[0-2]: Default: 1-Soft-irq | 55 | irqmode=[0-2]: Default: 1-Soft-irq |
46 | The completion mode used for completing IOs to the block-layer. | 56 | The completion mode used for completing IOs to the block-layer. |
47 | 57 | ||
48 | 0: None. | 58 | = =========================================================================== |
49 | 1: Soft-irq. Uses IPI to complete IOs across CPU nodes. Simulates the overhead | 59 | 0 None. |
60 | 1 Soft-irq. Uses IPI to complete IOs across CPU nodes. Simulates the overhead | ||
50 | when IOs are issued from another CPU node than the home node the device is | 61 | when IOs are issued from another CPU node than the home node the device is |
51 | connected to. | 62 | connected to. |
52 | 2: Timer: Waits a specific period (completion_nsec) for each IO before | 63 | 2 Timer: Waits a specific period (completion_nsec) for each IO before |
53 | completion. | 64 | completion. |
65 | = =========================================================================== | ||
54 | 66 | ||
55 | completion_nsec=[ns]: Default: 10,000ns | 67 | completion_nsec=[ns]: Default: 10,000ns |
56 | Combined with irqmode=2 (timer). The time each completion event must wait. | 68 | Combined with irqmode=2 (timer). The time each completion event must wait. |
@@ -66,30 +78,45 @@ hw_queue_depth=[0..qdepth]: Default: 64 | |||
66 | III: Multi-queue specific parameters | 78 | III: Multi-queue specific parameters |
67 | 79 | ||
68 | use_per_node_hctx=[0/1]: Default: 0 | 80 | use_per_node_hctx=[0/1]: Default: 0 |
69 | 0: The number of submit queues are set to the value of the submit_queues | 81 | |
82 | = ===================================================================== | ||
83 | 0 The number of submit queues are set to the value of the submit_queues | ||
70 | parameter. | 84 | parameter. |
71 | 1: The multi-queue block layer is instantiated with a hardware dispatch | 85 | 1 The multi-queue block layer is instantiated with a hardware dispatch |
72 | queue for each CPU node in the system. | 86 | queue for each CPU node in the system. |
87 | = ===================================================================== | ||
73 | 88 | ||
74 | no_sched=[0/1]: Default: 0 | 89 | no_sched=[0/1]: Default: 0 |
75 | 0: nullb* use default blk-mq io scheduler. | 90 | |
76 | 1: nullb* doesn't use io scheduler. | 91 | = ====================================== |
92 | 0 nullb* use default blk-mq io scheduler | ||
93 | 1 nullb* doesn't use io scheduler | ||
94 | = ====================================== | ||
77 | 95 | ||
78 | blocking=[0/1]: Default: 0 | 96 | blocking=[0/1]: Default: 0 |
79 | 0: Register as a non-blocking blk-mq driver device. | 97 | |
80 | 1: Register as a blocking blk-mq driver device, null_blk will set | 98 | = =============================================================== |
99 | 0 Register as a non-blocking blk-mq driver device. | ||
100 | 1 Register as a blocking blk-mq driver device, null_blk will set | ||
81 | the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always | 101 | the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always |
82 | needs to block in its ->queue_rq() function. | 102 | needs to block in its ->queue_rq() function. |
103 | = =============================================================== | ||
83 | 104 | ||
84 | shared_tags=[0/1]: Default: 0 | 105 | shared_tags=[0/1]: Default: 0 |
85 | 0: Tag set is not shared. | 106 | |
86 | 1: Tag set shared between devices for blk-mq. Only makes sense with | 107 | = ================================================================ |
108 | 0 Tag set is not shared. | ||
109 | 1 Tag set shared between devices for blk-mq. Only makes sense with | ||
87 | nr_devices > 1, otherwise there's no tag set to share. | 110 | nr_devices > 1, otherwise there's no tag set to share. |
111 | = ================================================================ | ||
88 | 112 | ||
89 | zoned=[0/1]: Default: 0 | 113 | zoned=[0/1]: Default: 0 |
90 | 0: Block device is exposed as a random-access block device. | 114 | |
91 | 1: Block device is exposed as a host-managed zoned block device. Requires | 115 | = ====================================================================== |
116 | 0 Block device is exposed as a random-access block device. | ||
117 | 1 Block device is exposed as a host-managed zoned block device. Requires | ||
92 | CONFIG_BLK_DEV_ZONED. | 118 | CONFIG_BLK_DEV_ZONED. |
119 | = ====================================================================== | ||
93 | 120 | ||
94 | zone_size=[MB]: Default: 256 | 121 | zone_size=[MB]: Default: 256 |
95 | Per zone size when exposed as a zoned block device. Must be a power of two. | 122 | Per zone size when exposed as a zoned block device. Must be a power of two. |
diff --git a/Documentation/block/pr.txt b/Documentation/block/pr.rst index ac9b8e70e64b..30ea1c2e39eb 100644 --- a/Documentation/block/pr.txt +++ b/Documentation/block/pr.rst | |||
@@ -1,4 +1,4 @@ | |||
1 | 1 | =============================================== | |
2 | Block layer support for Persistent Reservations | 2 | Block layer support for Persistent Reservations |
3 | =============================================== | 3 | =============================================== |
4 | 4 | ||
@@ -23,22 +23,18 @@ The following types of reservations are supported: | |||
23 | -------------------------------------------------- | 23 | -------------------------------------------------- |
24 | 24 | ||
25 | - PR_WRITE_EXCLUSIVE | 25 | - PR_WRITE_EXCLUSIVE |
26 | |||
27 | Only the initiator that owns the reservation can write to the | 26 | Only the initiator that owns the reservation can write to the |
28 | device. Any initiator can read from the device. | 27 | device. Any initiator can read from the device. |
29 | 28 | ||
30 | - PR_EXCLUSIVE_ACCESS | 29 | - PR_EXCLUSIVE_ACCESS |
31 | |||
32 | Only the initiator that owns the reservation can access the | 30 | Only the initiator that owns the reservation can access the |
33 | device. | 31 | device. |
34 | 32 | ||
35 | - PR_WRITE_EXCLUSIVE_REG_ONLY | 33 | - PR_WRITE_EXCLUSIVE_REG_ONLY |
36 | |||
37 | Only initiators with a registered key can write to the device, | 34 | Only initiators with a registered key can write to the device, |
38 | Any initiator can read from the device. | 35 | Any initiator can read from the device. |
39 | 36 | ||
40 | - PR_EXCLUSIVE_ACCESS_REG_ONLY | 37 | - PR_EXCLUSIVE_ACCESS_REG_ONLY |
41 | |||
42 | Only initiators with a registered key can access the device. | 38 | Only initiators with a registered key can access the device. |
43 | 39 | ||
44 | - PR_WRITE_EXCLUSIVE_ALL_REGS | 40 | - PR_WRITE_EXCLUSIVE_ALL_REGS |
@@ -48,21 +44,21 @@ The following types of reservations are supported: | |||
48 | All initiators with a registered key are considered reservation | 44 | All initiators with a registered key are considered reservation |
49 | holders. | 45 | holders. |
50 | Please reference the SPC spec on the meaning of a reservation | 46 | Please reference the SPC spec on the meaning of a reservation |
51 | holder if you want to use this type. | 47 | holder if you want to use this type. |
52 | 48 | ||
53 | - PR_EXCLUSIVE_ACCESS_ALL_REGS | 49 | - PR_EXCLUSIVE_ACCESS_ALL_REGS |
54 | |||
55 | Only initiators with a registered key can access the device. | 50 | Only initiators with a registered key can access the device. |
56 | All initiators with a registered key are considered reservation | 51 | All initiators with a registered key are considered reservation |
57 | holders. | 52 | holders. |
58 | Please reference the SPC spec on the meaning of a reservation | 53 | Please reference the SPC spec on the meaning of a reservation |
59 | holder if you want to use this type. | 54 | holder if you want to use this type. |
60 | 55 | ||
61 | 56 | ||
62 | The following ioctls are supported: | 57 | The following ioctls are supported: |
63 | ----------------------------------- | 58 | ----------------------------------- |
64 | 59 | ||
65 | 1. IOC_PR_REGISTER | 60 | 1. IOC_PR_REGISTER |
61 | ^^^^^^^^^^^^^^^^^^ | ||
66 | 62 | ||
67 | This ioctl command registers a new reservation if the new_key argument | 63 | This ioctl command registers a new reservation if the new_key argument |
68 | is non-null. If no existing reservation exists old_key must be zero, | 64 | is non-null. If no existing reservation exists old_key must be zero, |
@@ -74,6 +70,7 @@ in old_key. | |||
74 | 70 | ||
75 | 71 | ||
76 | 2. IOC_PR_RESERVE | 72 | 2. IOC_PR_RESERVE |
73 | ^^^^^^^^^^^^^^^^^ | ||
77 | 74 | ||
78 | This ioctl command reserves the device and thus restricts access for other | 75 | This ioctl command reserves the device and thus restricts access for other |
79 | devices based on the type argument. The key argument must be the existing | 76 | devices based on the type argument. The key argument must be the existing |
@@ -82,12 +79,14 @@ IOC_PR_REGISTER_IGNORE, IOC_PR_PREEMPT or IOC_PR_PREEMPT_ABORT commands. | |||
82 | 79 | ||
83 | 80 | ||
84 | 3. IOC_PR_RELEASE | 81 | 3. IOC_PR_RELEASE |
82 | ^^^^^^^^^^^^^^^^^ | ||
85 | 83 | ||
86 | This ioctl command releases the reservation specified by key and flags | 84 | This ioctl command releases the reservation specified by key and flags |
87 | and thus removes any access restriction implied by it. | 85 | and thus removes any access restriction implied by it. |
88 | 86 | ||
89 | 87 | ||
90 | 4. IOC_PR_PREEMPT | 88 | 4. IOC_PR_PREEMPT |
89 | ^^^^^^^^^^^^^^^^^ | ||
91 | 90 | ||
92 | This ioctl command releases the existing reservation referred to by | 91 | This ioctl command releases the existing reservation referred to by |
93 | old_key and replaces it with a new reservation of type for the | 92 | old_key and replaces it with a new reservation of type for the |
@@ -95,11 +94,13 @@ reservation key new_key. | |||
95 | 94 | ||
96 | 95 | ||
97 | 5. IOC_PR_PREEMPT_ABORT | 96 | 5. IOC_PR_PREEMPT_ABORT |
97 | ^^^^^^^^^^^^^^^^^^^^^^^ | ||
98 | 98 | ||
99 | This ioctl command works like IOC_PR_PREEMPT except that it also aborts | 99 | This ioctl command works like IOC_PR_PREEMPT except that it also aborts |
100 | any outstanding command sent over a connection identified by old_key. | 100 | any outstanding command sent over a connection identified by old_key. |
101 | 101 | ||
102 | 6. IOC_PR_CLEAR | 102 | 6. IOC_PR_CLEAR |
103 | ^^^^^^^^^^^^^^^ | ||
103 | 104 | ||
104 | This ioctl command unregisters both key and any other reservation key | 105 | This ioctl command unregisters both key and any other reservation key |
105 | registered with the device and drops any existing reservation. | 106 | registered with the device and drops any existing reservation. |
@@ -111,7 +112,6 @@ Flags | |||
111 | All the ioctls have a flag field. Currently only one flag is supported: | 112 | All the ioctls have a flag field. Currently only one flag is supported: |
112 | 113 | ||
113 | - PR_FL_IGNORE_KEY | 114 | - PR_FL_IGNORE_KEY |
114 | |||
115 | Ignore the existing reservation key. This is commonly supported for | 115 | Ignore the existing reservation key. This is commonly supported for |
116 | IOC_PR_REGISTER, and some implementation may support the flag for | 116 | IOC_PR_REGISTER, and some implementation may support the flag for |
117 | IOC_PR_RESERVE. | 117 | IOC_PR_RESERVE. |
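A hedged userspace sketch of the register-then-reserve flow described above, using the struct and ioctl names defined in <linux/pr.h>; the device path and key value are just examples, and the exact struct layout should be checked against the header::

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/pr.h>

    int main(void)
    {
            struct pr_registration reg;
            struct pr_reservation rsv;
            int fd = open("/dev/sdX", O_RDWR);   /* example device */

            if (fd < 0)
                    return 1;

            /* register a reservation key for this path */
            memset(&reg, 0, sizeof(reg));
            reg.new_key = 0x123abc;
            if (ioctl(fd, IOC_PR_REGISTER, &reg))
                    perror("IOC_PR_REGISTER");

            /* take a write-exclusive reservation with that key */
            memset(&rsv, 0, sizeof(rsv));
            rsv.key = 0x123abc;
            rsv.type = PR_WRITE_EXCLUSIVE;
            if (ioctl(fd, IOC_PR_RESERVE, &rsv))
                    perror("IOC_PR_RESERVE");

            close(fd);
            return 0;
    }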
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.rst index b40b5b7cebd9..6a8513af9201 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.rst | |||
@@ -1,3 +1,4 @@ | |||
1 | ================= | ||
1 | Queue sysfs files | 2 | Queue sysfs files |
2 | ================= | 3 | ================= |
3 | 4 | ||
@@ -10,7 +11,7 @@ Files denoted with a RO postfix are readonly and the RW postfix means | |||
10 | read-write. | 11 | read-write. |
11 | 12 | ||
12 | add_random (RW) | 13 | add_random (RW) |
13 | ---------------- | 14 | --------------- |
14 | This file allows to turn off the disk entropy contribution. Default | 15 | This file allows to turn off the disk entropy contribution. Default |
15 | value of this file is '1'(on). | 16 | value of this file is '1'(on). |
16 | 17 | ||
@@ -30,13 +31,13 @@ used by CPU-addressable storage to bypass the pagecache. It shows '1' | |||
30 | if true, '0' if not. | 31 | if true, '0' if not. |
31 | 32 | ||
32 | discard_granularity (RO) | 33 | discard_granularity (RO) |
33 | ----------------------- | 34 | ------------------------ |
34 | This shows the size of internal allocation of the device in bytes, if | 35 | This shows the size of internal allocation of the device in bytes, if |
35 | reported by the device. A value of '0' means device does not support | 36 | reported by the device. A value of '0' means device does not support |
36 | the discard functionality. | 37 | the discard functionality. |
37 | 38 | ||
38 | discard_max_hw_bytes (RO) | 39 | discard_max_hw_bytes (RO) |
39 | ---------------------- | 40 | ------------------------- |
40 | Devices that support discard functionality may have internal limits on | 41 | Devices that support discard functionality may have internal limits on |
41 | the number of bytes that can be trimmed or unmapped in a single operation. | 42 | the number of bytes that can be trimmed or unmapped in a single operation. |
42 | The discard_max_bytes parameter is set by the device driver to the maximum | 43 | The discard_max_bytes parameter is set by the device driver to the maximum |
diff --git a/Documentation/block/request.txt b/Documentation/block/request.rst index 754e104ed369..747021e1ffdb 100644 --- a/Documentation/block/request.txt +++ b/Documentation/block/request.rst | |||
@@ -1,26 +1,37 @@ | |||
1 | 1 | ============================ | |
2 | struct request documentation | 2 | struct request documentation |
3 | ============================ | ||
3 | 4 | ||
4 | Jens Axboe <jens.axboe@oracle.com> 27/05/02 | 5 | Jens Axboe <jens.axboe@oracle.com> 27/05/02 |
5 | 6 | ||
6 | 1.0 | ||
7 | Index | ||
8 | 7 | ||
9 | 2.0 Struct request members classification | 8 | .. FIXME: |
9 | No idea about what does mean - seems just some noise, so comment it | ||
10 | |||
11 | 1.0 | ||
12 | Index | ||
13 | |||
14 | 2.0 Struct request members classification | ||
15 | |||
16 | 2.1 struct request members explanation | ||
10 | 17 | ||
11 | 2.1 struct request members explanation | 18 | 3.0 |
19 | |||
20 | |||
21 | 2.0 | ||
12 | 22 | ||
13 | 3.0 | ||
14 | 23 | ||
15 | 24 | ||
16 | 2.0 | ||
17 | Short explanation of request members | 25 | Short explanation of request members |
26 | ==================================== | ||
18 | 27 | ||
19 | Classification flags: | 28 | Classification flags: |
20 | 29 | ||
30 | = ==================== | ||
21 | D driver member | 31 | D driver member |
22 | B block layer member | 32 | B block layer member |
23 | I I/O scheduler member | 33 | I I/O scheduler member |
34 | = ==================== | ||
24 | 35 | ||
25 | Unless an entry contains a D classification, a device driver must not access | 36 | Unless an entry contains a D classification, a device driver must not access |
26 | this member. Some members may contain D classifications, but should only be | 37 | this member. Some members may contain D classifications, but should only be |
@@ -28,14 +39,13 @@ access through certain macros or functions (eg ->flags). | |||
28 | 39 | ||
29 | <linux/blkdev.h> | 40 | <linux/blkdev.h> |
30 | 41 | ||
31 | 2.1 | 42 | =============================== ======= ======================================= |
32 | Member Flag Comment | 43 | Member Flag Comment |
33 | ------ ---- ------- | 44 | =============================== ======= ======================================= |
34 | |||
35 | struct list_head queuelist BI Organization on various internal | 45 | struct list_head queuelist BI Organization on various internal |
36 | queues | 46 | queues |
37 | 47 | ||
38 | void *elevator_private I I/O scheduler private data | 48 | ``void *elevator_private`` I I/O scheduler private data |
39 | 49 | ||
40 | unsigned char cmd[16] D Driver can use this for setting up | 50 | unsigned char cmd[16] D Driver can use this for setting up |
41 | a cdb before execution, see | 51 | a cdb before execution, see |
@@ -71,18 +81,19 @@ unsigned int hard_cur_sectors B Used to keep current_nr_sectors sane | |||
71 | 81 | ||
72 | int tag DB TCQ tag, if assigned | 82 | int tag DB TCQ tag, if assigned |
73 | 83 | ||
74 | void *special D Free to be used by driver | 84 | ``void *special`` D Free to be used by driver |
75 | 85 | ||
76 | char *buffer D Map of first segment, also see | 86 | ``char *buffer`` D Map of first segment, also see |
77 | section on bouncing SECTION | 87 | section on bouncing SECTION |
78 | 88 | ||
79 | struct completion *waiting D Can be used by driver to get signalled | 89 | ``struct completion *waiting`` D Can be used by driver to get signalled |
80 | on request completion | 90 | on request completion |
81 | 91 | ||
82 | struct bio *bio DBI First bio in request | 92 | ``struct bio *bio`` DBI First bio in request |
83 | 93 | ||
84 | struct bio *biotail DBI Last bio in request | 94 | ``struct bio *biotail`` DBI Last bio in request |
85 | 95 | ||
86 | struct request_queue *q DB Request queue this request belongs to | 96 | ``struct request_queue *q`` DB Request queue this request belongs to |
87 | 97 | ||
88 | struct request_list *rl B Request list this request came from | 98 | ``struct request_list *rl`` B Request list this request came from |
99 | =============================== ======= ======================================= | ||
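The classification column in the table above is the contract a driver has to respect; the usual way to stay within the D entries is to go through the block-layer accessors and iterators rather than the raw members. A hedged sketch of that pattern follows (the function name and the transfer step are made up for illustration)::

  #include <linux/blkdev.h>
  #include <linux/mm.h>

  static void exdrv_handle_rq(struct request *rq)
  {
          struct req_iterator iter;
          struct bio_vec bvec;

          /* blk_rq_pos()/blk_rq_bytes() wrap block-layer (B) members */
          pr_debug("request at sector %llu, %u bytes\n",
                   (unsigned long long)blk_rq_pos(rq), blk_rq_bytes(rq));

          /* ->bio/->biotail are DBI; walking the data through the
           * official iterator keeps the driver within the D contract */
          rq_for_each_segment(bvec, rq, iter) {
                  void *buf = page_address(bvec.bv_page) + bvec.bv_offset;

                  /* transfer bvec.bv_len bytes from/to buf here */
                  (void)buf;
          }
  }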
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.rst index 0aace9cc536c..9c07bc22b0bc 100644 --- a/Documentation/block/stat.txt +++ b/Documentation/block/stat.rst | |||
@@ -1,3 +1,4 @@ | |||
1 | =============================================== | ||
1 | Block layer statistics in /sys/block/<dev>/stat | 2 | Block layer statistics in /sys/block/<dev>/stat |
2 | =============================================== | 3 | =============================================== |
3 | 4 | ||
@@ -6,9 +7,12 @@ This file documents the contents of the /sys/block/<dev>/stat file. | |||
6 | The stat file provides several statistics about the state of block | 7 | The stat file provides several statistics about the state of block |
7 | device <dev>. | 8 | device <dev>. |
8 | 9 | ||
9 | Q. Why are there multiple statistics in a single file? Doesn't sysfs | 10 | Q. |
11 | Why are there multiple statistics in a single file? Doesn't sysfs | ||
10 | normally contain a single value per file? | 12 | normally contain a single value per file? |
11 | A. By having a single file, the kernel can guarantee that the statistics | 13 | |
14 | A. | ||
15 | By having a single file, the kernel can guarantee that the statistics | ||
12 | represent a consistent snapshot of the state of the device. If the | 16 | represent a consistent snapshot of the state of the device. If the |
13 | statistics were exported as multiple files containing one statistic | 17 | statistics were exported as multiple files containing one statistic |
14 | each, it would be impossible to guarantee that a set of readings | 18 | each, it would be impossible to guarantee that a set of readings |
@@ -18,8 +22,10 @@ The stat file consists of a single line of text containing 11 decimal | |||
18 | values separated by whitespace. The fields are summarized in the | 22 | values separated by whitespace. The fields are summarized in the |
19 | following table, and described in more detail below. | 23 | following table, and described in more detail below. |
20 | 24 | ||
25 | |||
26 | =============== ============= ================================================= | ||
21 | Name units description | 27 | Name units description |
22 | ---- ----- ----------- | 28 | =============== ============= ================================================= |
23 | read I/Os requests number of read I/Os processed | 29 | read I/Os requests number of read I/Os processed |
24 | read merges requests number of read I/Os merged with in-queue I/O | 30 | read merges requests number of read I/Os merged with in-queue I/O |
25 | read sectors sectors number of sectors read | 31 | read sectors sectors number of sectors read |
@@ -35,6 +41,7 @@ discard I/Os requests number of discard I/Os processed | |||
35 | discard merges requests number of discard I/Os merged with in-queue I/O | 41 | discard merges requests number of discard I/Os merged with in-queue I/O |
36 | discard sectors sectors number of sectors discarded | 42 | discard sectors sectors number of sectors discarded |
37 | discard ticks milliseconds total wait time for discard requests | 43 | discard ticks milliseconds total wait time for discard requests |
44 | =============== ============= ================================================= | ||
38 | 45 | ||
39 | read I/Os, write I/Os, discard I/Os | 46 | read I/Os, write I/Os, discard I/Os |
40 | =================================== | 47 | =================================== |
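Because all counters sit on one line, a consumer reads the file once and splits it on whitespace. A minimal userspace sketch, assuming a device named sda; newer kernels append the discard fields, so the field count may exceed 11::

  #include <stdio.h>

  int main(void)
  {
          unsigned long long v[16] = { 0 };
          FILE *f = fopen("/sys/block/sda/stat", "r");
          int n = 0;

          if (!f) {
                  perror("fopen");
                  return 1;
          }
          /* one line of whitespace-separated decimal counters */
          while (n < 16 && fscanf(f, "%llu", &v[n]) == 1)
                  n++;
          fclose(f);

          printf("fields read : %d\n", n);
          printf("read I/Os   : %llu\n", v[0]);
          printf("read merges : %llu\n", v[1]);
          printf("write I/Os  : %llu\n", v[4]);
          printf("in_flight   : %llu\n", v[8]);
          return 0;
  }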
diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.rst index 7977f6fb8b20..42042417380e 100644 --- a/Documentation/block/switching-sched.txt +++ b/Documentation/block/switching-sched.rst | |||
@@ -1,35 +1,39 @@ | |||
1 | =================== | ||
2 | Switching Scheduler | ||
3 | =================== | ||
4 | |||
1 | To choose IO schedulers at boot time, use the argument 'elevator=deadline'. | 5 | To choose IO schedulers at boot time, use the argument 'elevator=deadline'. |
2 | 'noop' and 'cfq' (the default) are also available. IO schedulers are assigned | 6 | 'noop' and 'cfq' (the default) are also available. IO schedulers are assigned |
3 | globally at boot time only presently. | 7 | globally at boot time only presently. |
4 | 8 | ||
5 | Each io queue has a set of io scheduler tunables associated with it. These | 9 | Each io queue has a set of io scheduler tunables associated with it. These |
6 | tunables control how the io scheduler works. You can find these entries | 10 | tunables control how the io scheduler works. You can find these entries |
7 | in: | 11 | in:: |
8 | 12 | ||
9 | /sys/block/<device>/queue/iosched | 13 | /sys/block/<device>/queue/iosched |
10 | 14 | ||
11 | assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted, | 15 | assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted, |
12 | you can do so by typing: | 16 | you can do so by typing:: |
13 | 17 | ||
14 | # mount none /sys -t sysfs | 18 | # mount none /sys -t sysfs |
15 | 19 | ||
16 | It is possible to change the IO scheduler for a given block device on | 20 | It is possible to change the IO scheduler for a given block device on |
17 | the fly to select one of mq-deadline, none, bfq, or kyber schedulers - | 21 | the fly to select one of mq-deadline, none, bfq, or kyber schedulers - |
18 | which can improve that device's throughput. | 22 | which can improve that device's throughput. |
19 | 23 | ||
20 | To set a specific scheduler, simply do this: | 24 | To set a specific scheduler, simply do this:: |
21 | 25 | ||
22 | echo SCHEDNAME > /sys/block/DEV/queue/scheduler | 26 | echo SCHEDNAME > /sys/block/DEV/queue/scheduler |
23 | 27 | ||
24 | where SCHEDNAME is the name of a defined IO scheduler, and DEV is the | 28 | where SCHEDNAME is the name of a defined IO scheduler, and DEV is the |
25 | device name (hda, hdb, sga, or whatever you happen to have). | 29 | device name (hda, hdb, sga, or whatever you happen to have). |
26 | 30 | ||
27 | The list of defined schedulers can be found by simply doing | 31 | The list of defined schedulers can be found by simply doing |
28 | a "cat /sys/block/DEV/queue/scheduler" - the list of valid names | 32 | a "cat /sys/block/DEV/queue/scheduler" - the list of valid names |
29 | will be displayed, with the currently selected scheduler in brackets: | 33 | will be displayed, with the currently selected scheduler in brackets:: |
30 | 34 | ||
31 | # cat /sys/block/sda/queue/scheduler | 35 | # cat /sys/block/sda/queue/scheduler |
32 | [mq-deadline] kyber bfq none | 36 | [mq-deadline] kyber bfq none |
33 | # echo none >/sys/block/sda/queue/scheduler | 37 | # echo none >/sys/block/sda/queue/scheduler |
34 | # cat /sys/block/sda/queue/scheduler | 38 | # cat /sys/block/sda/queue/scheduler |
35 | [none] mq-deadline kyber bfq | 39 | [none] mq-deadline kyber bfq |
diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.rst index 8a6bdada5f6b..2c752c57c14c 100644 --- a/Documentation/block/writeback_cache_control.txt +++ b/Documentation/block/writeback_cache_control.rst | |||
@@ -1,6 +1,6 @@ | |||
1 | 1 | ========================================== | |
2 | Explicit volatile write back cache control | 2 | Explicit volatile write back cache control |
3 | ===================================== | 3 | ========================================== |
4 | 4 | ||
5 | Introduction | 5 | Introduction |
6 | ------------ | 6 | ------------ |
@@ -31,7 +31,7 @@ the blkdev_issue_flush() helper for a pure cache flush. | |||
31 | 31 | ||
32 | 32 | ||
33 | Forced Unit Access | 33 | Forced Unit Access |
34 | ----------------- | 34 | ------------------ |
35 | 35 | ||
36 | The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the | 36 | The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the |
37 | filesystem and will make sure that I/O completion for this request is only | 37 | filesystem and will make sure that I/O completion for this request is only |
@@ -62,14 +62,14 @@ flags themselves without any help from the block layer. | |||
62 | 62 | ||
63 | 63 | ||
64 | Implementation details for request_fn based block drivers | 64 | Implementation details for request_fn based block drivers |
65 | -------------------------------------------------------------- | 65 | --------------------------------------------------------- |
66 | 66 | ||
67 | For devices that do not support volatile write caches there is no driver | 67 | For devices that do not support volatile write caches there is no driver |
68 | support required, the block layer completes empty REQ_PREFLUSH requests before | 68 | support required, the block layer completes empty REQ_PREFLUSH requests before |
69 | entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from | 69 | entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from |
70 | requests that have a payload. For devices with volatile write caches the | 70 | requests that have a payload. For devices with volatile write caches the |
71 | driver needs to tell the block layer that it supports flushing caches by | 71 | driver needs to tell the block layer that it supports flushing caches by |
72 | doing: | 72 | doing:: |
73 | 73 | ||
74 | blk_queue_write_cache(sdkp->disk->queue, true, false); | 74 | blk_queue_write_cache(sdkp->disk->queue, true, false); |
75 | 75 | ||
@@ -77,7 +77,7 @@ and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn. Note that | |||
77 | REQ_PREFLUSH requests with a payload are automatically turned into a sequence | 77 | REQ_PREFLUSH requests with a payload are automatically turned into a sequence |
78 | of an empty REQ_OP_FLUSH request followed by the actual write by the block | 78 | of an empty REQ_OP_FLUSH request followed by the actual write by the block |
79 | layer. For devices that also support the FUA bit the block layer needs | 79 | layer. For devices that also support the FUA bit the block layer needs |
80 | to be told to pass through the REQ_FUA bit using: | 80 | to be told to pass through the REQ_FUA bit using:: |
81 | 81 | ||
82 | blk_queue_write_cache(sdkp->disk->queue, true, true); | 82 | blk_queue_write_cache(sdkp->disk->queue, true, true); |
83 | 83 | ||
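Putting the two blk_queue_write_cache() calls above into context: a driver with a volatile write cache typically advertises it at queue setup and then special-cases empty flush requests in its request handler. A rough sketch under those assumptions, using invented exdrv_* names and a blk-mq style handler rather than any in-tree driver::

  #include <linux/blkdev.h>
  #include <linux/blk-mq.h>

  struct exdrv {                          /* hypothetical driver state */
          struct request_queue *queue;
  };

  static void exdrv_flush_cache(struct exdrv *dev) { /* drain device cache */ }
  static void exdrv_do_rw(struct exdrv *dev, struct request *rq, bool fua) { }

  static void exdrv_setup_queue(struct exdrv *dev)
  {
          /* volatile write-back cache present, FUA supported */
          blk_queue_write_cache(dev->queue, true, true);
  }

  static blk_status_t exdrv_queue_rq(struct exdrv *dev, struct request *rq)
  {
          if (req_op(rq) == REQ_OP_FLUSH) {
                  /* empty flush: write out the cache, then complete */
                  exdrv_flush_cache(dev);
                  return BLK_STS_OK;
          }

          /* honour REQ_FUA on writes by forcing the data to stable
           * media before signalling completion */
          exdrv_do_rw(dev, rq, rq->cmd_flags & REQ_FUA);
          return BLK_STS_OK;
  }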
diff --git a/Documentation/blockdev/zram.rst b/Documentation/blockdev/zram.rst index 2111231c9c0f..6eccf13219ff 100644 --- a/Documentation/blockdev/zram.rst +++ b/Documentation/blockdev/zram.rst | |||
@@ -215,7 +215,7 @@ User space is advised to use the following files to read the device statistics. | |||
215 | 215 | ||
216 | File /sys/block/zram<id>/stat | 216 | File /sys/block/zram<id>/stat |
217 | 217 | ||
218 | Represents block layer statistics. Read Documentation/block/stat.txt for | 218 | Represents block layer statistics. Read Documentation/block/stat.rst for |
219 | details. | 219 | details. |
220 | 220 | ||
221 | File /sys/block/zram<id>/io_stat | 221 | File /sys/block/zram<id>/io_stat |
diff --git a/MAINTAINERS b/MAINTAINERS index 93e5ac1de255..4b9fd11466a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -2968,7 +2968,7 @@ M: Jens Axboe <axboe@kernel.dk> | |||
2968 | L: linux-block@vger.kernel.org | 2968 | L: linux-block@vger.kernel.org |
2969 | S: Maintained | 2969 | S: Maintained |
2970 | F: block/bfq-* | 2970 | F: block/bfq-* |
2971 | F: Documentation/block/bfq-iosched.txt | 2971 | F: Documentation/block/bfq-iosched.rst |
2972 | 2972 | ||
2973 | BFS FILE SYSTEM | 2973 | BFS FILE SYSTEM |
2974 | M: "Tigran A. Aivazian" <aivazian.tigran@gmail.com> | 2974 | M: "Tigran A. Aivazian" <aivazian.tigran@gmail.com> |
diff --git a/block/Kconfig b/block/Kconfig index 56cb1695cd87..b16b3e075d31 100644 --- a/block/Kconfig +++ b/block/Kconfig | |||
@@ -110,7 +110,7 @@ config BLK_CMDLINE_PARSER | |||
110 | which don't otherwise have any standardized method for listing the | 110 | which don't otherwise have any standardized method for listing the |
111 | partitions on a block device. | 111 | partitions on a block device. |
112 | 112 | ||
113 | See Documentation/block/cmdline-partition.txt for more information. | 113 | See Documentation/block/cmdline-partition.rst for more information. |
114 | 114 | ||
115 | config BLK_WBT | 115 | config BLK_WBT |
116 | bool "Enable support for block device writeback throttling" | 116 | bool "Enable support for block device writeback throttling" |
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 7a6b2f29a582..b89310a022ad 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched | |||
@@ -26,7 +26,7 @@ config IOSCHED_BFQ | |||
26 | regardless of the device parameters and with any workload. It | 26 | regardless of the device parameters and with any workload. It |
27 | also guarantees a low latency to interactive and soft | 27 | also guarantees a low latency to interactive and soft |
28 | real-time applications. Details in | 28 | real-time applications. Details in |
29 | Documentation/block/bfq-iosched.txt | 29 | Documentation/block/bfq-iosched.rst |
30 | 30 | ||
31 | config BFQ_GROUP_IOSCHED | 31 | config BFQ_GROUP_IOSCHED |
32 | bool "BFQ hierarchical scheduling support" | 32 | bool "BFQ hierarchical scheduling support" |
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50c9d2598500..72860325245a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c | |||
@@ -17,7 +17,7 @@ | |||
17 | * low-latency capabilities. BFQ also supports full hierarchical | 17 | * low-latency capabilities. BFQ also supports full hierarchical |
18 | * scheduling through cgroups. Next paragraphs provide an introduction | 18 | * scheduling through cgroups. Next paragraphs provide an introduction |
19 | * on BFQ inner workings. Details on BFQ benefits, usage and | 19 | * on BFQ inner workings. Details on BFQ benefits, usage and |
20 | * limitations can be found in Documentation/block/bfq-iosched.txt. | 20 | * limitations can be found in Documentation/block/bfq-iosched.rst. |
21 | * | 21 | * |
22 | * BFQ is a proportional-share storage-I/O scheduling algorithm based | 22 | * BFQ is a proportional-share storage-I/O scheduling algorithm based |
23 | * on the slice-by-slice service scheme of CFQ. But BFQ assigns | 23 | * on the slice-by-slice service scheme of CFQ. But BFQ assigns |
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 825c9c070458..ca39b4624cf8 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c | |||
@@ -383,7 +383,7 @@ static const struct blk_integrity_profile nop_profile = { | |||
383 | * send/receive integrity metadata it must use this function to register | 383 | * send/receive integrity metadata it must use this function to register |
384 | * the capability with the block layer. The template is a blk_integrity | 384 | * the capability with the block layer. The template is a blk_integrity |
385 | * struct with values appropriate for the underlying hardware. See | 385 | * struct with values appropriate for the underlying hardware. See |
386 | * Documentation/block/data-integrity.txt. | 386 | * Documentation/block/data-integrity.rst. |
387 | */ | 387 | */ |
388 | void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) | 388 | void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) |
389 | { | 389 | { |
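For reference, the registration described in the comment above usually amounts to filling a blk_integrity template and handing it to the block layer. The sketch below is illustrative only: it picks the in-tree T10 PI Type 1 CRC profile and a 512-byte protection interval, which a real driver would derive from its hardware::

  #include <linux/blkdev.h>
  #include <linux/t10-pi.h>

  static void exdrv_register_integrity(struct gendisk *disk)
  {
          struct blk_integrity bi = {
                  .profile      = &t10_pi_type1_crc,         /* T10 PI Type 1, CRC */
                  .tuple_size   = sizeof(struct t10_pi_tuple),
                  .tag_size     = 0,
                  .interval_exp = 9,                         /* 512-byte intervals */
          };

          blk_integrity_register(disk, &bi);
  }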
diff --git a/block/ioprio.c b/block/ioprio.c index 2e0559f157c8..77bcab11dce5 100644 --- a/block/ioprio.c +++ b/block/ioprio.c | |||
@@ -17,7 +17,7 @@ | |||
17 | * | 17 | * |
18 | * ioprio_set(PRIO_PROCESS, pid, prio); | 18 | * ioprio_set(PRIO_PROCESS, pid, prio); |
19 | * | 19 | * |
20 | * See also Documentation/block/ioprio.txt | 20 | * See also Documentation/block/ioprio.rst |
21 | * | 21 | * |
22 | */ | 22 | */ |
23 | #include <linux/gfp.h> | 23 | #include <linux/gfp.h> |
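The ioprio_set() call named in the comment has no glibc wrapper, so userspace goes through syscall(). In the sketch below the priority macros are spelled out by hand to keep it self-contained, and the chosen class and level are arbitrary::

  #include <stdio.h>
  #include <unistd.h>
  #include <sys/syscall.h>

  #define IOPRIO_CLASS_SHIFT              13
  #define IOPRIO_PRIO_VALUE(class, data)  (((class) << IOPRIO_CLASS_SHIFT) | (data))

  enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };
  enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER };

  int main(void)
  {
          /* best-effort class, priority level 4, for the calling process */
          int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);

          if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) < 0) {
                  perror("ioprio_set");
                  return 1;
          }
          printf("I/O priority set to best-effort, level 4\n");
          return 0;
  }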
diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b8a682b5a1bb..2a2a2e82832e 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include "blk-mq-sched.h" | 25 | #include "blk-mq-sched.h" |
26 | 26 | ||
27 | /* | 27 | /* |
28 | * See Documentation/block/deadline-iosched.txt | 28 | * See Documentation/block/deadline-iosched.rst |
29 | */ | 29 | */ |
30 | static const int read_expire = HZ / 2; /* max time before a read is submitted. */ | 30 | static const int read_expire = HZ / 2; /* max time before a read is submitted. */ |
31 | static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ | 31 | static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ |
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 60fb3df9897c..f1edd5452249 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c | |||
@@ -11,7 +11,7 @@ | |||
11 | * | 11 | * |
12 | * The format for the command line is just like mtdparts. | 12 | * The format for the command line is just like mtdparts. |
13 | * | 13 | * |
14 | * For further information, see "Documentation/block/cmdline-partition.txt" | 14 | * For further information, see "Documentation/block/cmdline-partition.rst" |
15 | * | 15 | * |
16 | */ | 16 | */ |
17 | 17 | ||
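The mtdparts-like syntax referenced in the comment above lists partitions as <size>[@<offset>](name), comma-separated per device and with devices separated by semicolons, where "-" means the rest of the device. One plausible boot-time example, with invented device names and sizes::

  blkdevparts=mmcblk0:1G(boot),4G(rootfs),-(data);mmcblk1:-(backup)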