diff options
172 files changed, 6029 insertions, 2659 deletions
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats index f91a973a37fe..abac31d216de 100644 --- a/Documentation/ABI/testing/procfs-diskstats +++ b/Documentation/ABI/testing/procfs-diskstats | |||
| @@ -5,6 +5,7 @@ Description: | |||
| 5 | The /proc/diskstats file displays the I/O statistics | 5 | The /proc/diskstats file displays the I/O statistics |
| 6 | of block devices. Each line contains the following 14 | 6 | of block devices. Each line contains the following 14 |
| 7 | fields: | 7 | fields: |
| 8 | |||
| 8 | 1 - major number | 9 | 1 - major number |
| 9 | 2 - minor mumber | 10 | 2 - minor mumber |
| 10 | 3 - device name | 11 | 3 - device name |
| @@ -19,4 +20,13 @@ Description: | |||
| 19 | 12 - I/Os currently in progress | 20 | 12 - I/Os currently in progress |
| 20 | 13 - time spent doing I/Os (ms) | 21 | 13 - time spent doing I/Os (ms) |
| 21 | 14 - weighted time spent doing I/Os (ms) | 22 | 14 - weighted time spent doing I/Os (ms) |
| 23 | |||
| 24 | Kernel 4.18+ appends four more fields for discard | ||
| 25 | tracking putting the total at 18: | ||
| 26 | |||
| 27 | 15 - discards completed successfully | ||
| 28 | 16 - discards merged | ||
| 29 | 17 - sectors discarded | ||
| 30 | 18 - time spent discarding | ||
| 31 | |||
| 22 | For more details refer to Documentation/iostats.txt | 32 | For more details refer to Documentation/iostats.txt |
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8a2c52d5c53b..1746131bc9cb 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst | |||
| @@ -51,6 +51,9 @@ v1 is available under Documentation/cgroup-v1/. | |||
| 51 | 5-3. IO | 51 | 5-3. IO |
| 52 | 5-3-1. IO Interface Files | 52 | 5-3-1. IO Interface Files |
| 53 | 5-3-2. Writeback | 53 | 5-3-2. Writeback |
| 54 | 5-3-3. IO Latency | ||
| 55 | 5-3-3-1. How IO Latency Throttling Works | ||
| 56 | 5-3-3-2. IO Latency Interface Files | ||
| 54 | 5-4. PID | 57 | 5-4. PID |
| 55 | 5-4-1. PID Interface Files | 58 | 5-4-1. PID Interface Files |
| 56 | 5-5. Device | 59 | 5-5. Device |
| @@ -1314,17 +1317,19 @@ IO Interface Files | |||
| 1314 | Lines are keyed by $MAJ:$MIN device numbers and not ordered. | 1317 | Lines are keyed by $MAJ:$MIN device numbers and not ordered. |
| 1315 | The following nested keys are defined. | 1318 | The following nested keys are defined. |
| 1316 | 1319 | ||
| 1317 | ====== =================== | 1320 | ====== ===================== |
| 1318 | rbytes Bytes read | 1321 | rbytes Bytes read |
| 1319 | wbytes Bytes written | 1322 | wbytes Bytes written |
| 1320 | rios Number of read IOs | 1323 | rios Number of read IOs |
| 1321 | wios Number of write IOs | 1324 | wios Number of write IOs |
| 1322 | ====== =================== | 1325 | dbytes Bytes discarded |
| 1326 | dios Number of discard IOs | ||
| 1327 | ====== ===================== | ||
| 1323 | 1328 | ||
| 1324 | An example read output follows: | 1329 | An example read output follows: |
| 1325 | 1330 | ||
| 1326 | 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 | 1331 | 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0 |
| 1327 | 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 | 1332 | 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021 |
| 1328 | 1333 | ||
| 1329 | io.weight | 1334 | io.weight |
| 1330 | A read-write flat-keyed file which exists on non-root cgroups. | 1335 | A read-write flat-keyed file which exists on non-root cgroups. |
| @@ -1446,6 +1451,85 @@ writeback as follows. | |||
| 1446 | vm.dirty[_background]_ratio. | 1451 | vm.dirty[_background]_ratio. |
| 1447 | 1452 | ||
| 1448 | 1453 | ||
| 1454 | IO Latency | ||
| 1455 | ~~~~~~~~~~ | ||
| 1456 | |||
| 1457 | This is a cgroup v2 controller for IO workload protection. You provide a group | ||
| 1458 | with a latency target, and if the average latency exceeds that target the | ||
| 1459 | controller will throttle any peers that have a lower latency target than the | ||
| 1460 | protected workload. | ||
| 1461 | |||
| 1462 | The limits are only applied at the peer level in the hierarchy. This means that | ||
| 1463 | in the diagram below, only groups A, B, and C will influence each other, and | ||
| 1464 | groups D and F will influence each other. Group G will influence nobody. | ||
| 1465 | |||
| 1466 | [root] | ||
| 1467 | / | \ | ||
| 1468 | A B C | ||
| 1469 | / \ | | ||
| 1470 | D F G | ||
| 1471 | |||
| 1472 | |||
| 1473 | So the ideal way to configure this is to set io.latency in groups A, B, and C. | ||
| 1474 | Generally you do not want to set a value lower than the latency your device | ||
| 1475 | supports. Experiment to find the value that works best for your workload. | ||
| 1476 | Start at higher than the expected latency for your device and watch the | ||
| 1477 | avg_lat value in io.stat for your workload group to get an idea of the | ||
| 1478 | latency you see during normal operation. Use the avg_lat value as a basis for | ||
| 1479 | your real setting, setting at 10-15% higher than the value in io.stat. | ||
| 1480 | |||
| 1481 | How IO Latency Throttling Works | ||
| 1482 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
| 1483 | |||
| 1484 | io.latency is work conserving; so as long as everybody is meeting their latency | ||
| 1485 | target the controller doesn't do anything. Once a group starts missing its | ||
| 1486 | target it begins throttling any peer group that has a higher target than itself. | ||
| 1487 | This throttling takes 2 forms: | ||
| 1488 | |||
| 1489 | - Queue depth throttling. This is the number of outstanding IO's a group is | ||
| 1490 | allowed to have. We will clamp down relatively quickly, starting at no limit | ||
| 1491 | and going all the way down to 1 IO at a time. | ||
| 1492 | |||
| 1493 | - Artificial delay induction. There are certain types of IO that cannot be | ||
| 1494 | throttled without possibly adversely affecting higher priority groups. This | ||
| 1495 | includes swapping and metadata IO. These types of IO are allowed to occur | ||
| 1496 | normally, however they are "charged" to the originating group. If the | ||
| 1497 | originating group is being throttled you will see the use_delay and delay | ||
| 1498 | fields in io.stat increase. The delay value is how many microseconds that are | ||
| 1499 | being added to any process that runs in this group. Because this number can | ||
| 1500 | grow quite large if there is a lot of swapping or metadata IO occurring we | ||
| 1501 | limit the individual delay events to 1 second at a time. | ||
| 1502 | |||
| 1503 | Once the victimized group starts meeting its latency target again it will start | ||
| 1504 | unthrottling any peer groups that were throttled previously. If the victimized | ||
| 1505 | group simply stops doing IO the global counter will unthrottle appropriately. | ||
| 1506 | |||
| 1507 | IO Latency Interface Files | ||
| 1508 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
| 1509 | |||
| 1510 | io.latency | ||
| 1511 | This takes a similar format as the other controllers. | ||
| 1512 | |||
| 1513 | "MAJOR:MINOR target=<target time in microseconds" | ||
| 1514 | |||
| 1515 | io.stat | ||
| 1516 | If the controller is enabled you will see extra stats in io.stat in | ||
| 1517 | addition to the normal ones. | ||
| 1518 | |||
| 1519 | depth | ||
| 1520 | This is the current queue depth for the group. | ||
| 1521 | |||
| 1522 | avg_lat | ||
| 1523 | This is an exponential moving average with a decay rate of 1/exp | ||
| 1524 | bound by the sampling interval. The decay rate interval can be | ||
| 1525 | calculated by multiplying the win value in io.stat by the | ||
| 1526 | corresponding number of samples based on the win value. | ||
| 1527 | |||
| 1528 | win | ||
| 1529 | The sampling window size in milliseconds. This is the minimum | ||
| 1530 | duration of time between evaluation events. Windows only elapse | ||
| 1531 | with IO activity. Idle periods extend the most recent window. | ||
| 1532 | |||
| 1449 | PID | 1533 | PID |
| 1450 | --- | 1534 | --- |
| 1451 | 1535 | ||
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 07f147381f32..ea2dafe49ae8 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt | |||
| @@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0 | |||
| 85 | 0: Tag set is not shared. | 85 | 0: Tag set is not shared. |
| 86 | 1: Tag set shared between devices for blk-mq. Only makes sense with | 86 | 1: Tag set shared between devices for blk-mq. Only makes sense with |
| 87 | nr_devices > 1, otherwise there's no tag set to share. | 87 | nr_devices > 1, otherwise there's no tag set to share. |
| 88 | |||
| 89 | zoned=[0/1]: Default: 0 | ||
| 90 | 0: Block device is exposed as a random-access block device. | ||
| 91 | 1: Block device is exposed as a host-managed zoned block device. | ||
| 92 | |||
| 93 | zone_size=[MB]: Default: 256 | ||
| 94 | Per zone size when exposed as a zoned block device. Must be a power of two. | ||
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt index 0dbc946de2ea..0aace9cc536c 100644 --- a/Documentation/block/stat.txt +++ b/Documentation/block/stat.txt | |||
| @@ -31,28 +31,32 @@ write ticks milliseconds total wait time for write requests | |||
| 31 | in_flight requests number of I/Os currently in flight | 31 | in_flight requests number of I/Os currently in flight |
| 32 | io_ticks milliseconds total time this block device has been active | 32 | io_ticks milliseconds total time this block device has been active |
| 33 | time_in_queue milliseconds total wait time for all requests | 33 | time_in_queue milliseconds total wait time for all requests |
| 34 | discard I/Os requests number of discard I/Os processed | ||
| 35 | discard merges requests number of discard I/Os merged with in-queue I/O | ||
| 36 | discard sectors sectors number of sectors discarded | ||
| 37 | discard ticks milliseconds total wait time for discard requests | ||
| 34 | 38 | ||
| 35 | read I/Os, write I/Os | 39 | read I/Os, write I/Os, discard I/0s |
| 36 | ===================== | 40 | =================================== |
| 37 | 41 | ||
| 38 | These values increment when an I/O request completes. | 42 | These values increment when an I/O request completes. |
| 39 | 43 | ||
| 40 | read merges, write merges | 44 | read merges, write merges, discard merges |
| 41 | ========================= | 45 | ========================================= |
| 42 | 46 | ||
| 43 | These values increment when an I/O request is merged with an | 47 | These values increment when an I/O request is merged with an |
| 44 | already-queued I/O request. | 48 | already-queued I/O request. |
| 45 | 49 | ||
| 46 | read sectors, write sectors | 50 | read sectors, write sectors, discard_sectors |
| 47 | =========================== | 51 | ============================================ |
| 48 | 52 | ||
| 49 | These values count the number of sectors read from or written to this | 53 | These values count the number of sectors read from, written to, or |
| 50 | block device. The "sectors" in question are the standard UNIX 512-byte | 54 | discarded from this block device. The "sectors" in question are the |
| 51 | sectors, not any device- or filesystem-specific block size. The | 55 | standard UNIX 512-byte sectors, not any device- or filesystem-specific |
| 52 | counters are incremented when the I/O completes. | 56 | block size. The counters are incremented when the I/O completes. |
| 53 | 57 | ||
| 54 | read ticks, write ticks | 58 | read ticks, write ticks, discard ticks |
| 55 | ======================= | 59 | ====================================== |
| 56 | 60 | ||
| 57 | These values count the number of milliseconds that I/O requests have | 61 | These values count the number of milliseconds that I/O requests have |
| 58 | waited on this block device. If there are multiple I/O requests waiting, | 62 | waited on this block device. If there are multiple I/O requests waiting, |
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt index 04d394a2e06c..49df45f90e8a 100644 --- a/Documentation/iostats.txt +++ b/Documentation/iostats.txt | |||
| @@ -31,6 +31,9 @@ Here are examples of these different formats:: | |||
| 31 | 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 | 31 | 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 |
| 32 | 3 1 hda1 35486 38030 38030 38030 | 32 | 3 1 hda1 35486 38030 38030 38030 |
| 33 | 33 | ||
| 34 | 4.18+ diskstats: | ||
| 35 | 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 | ||
| 36 | |||
| 34 | On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have | 37 | On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have |
| 35 | a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. | 38 | a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. |
| 36 | 39 | ||
| @@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os | |||
| 101 | last update of this field. This can provide an easy measure of both | 104 | last update of this field. This can provide an easy measure of both |
| 102 | I/O completion time and the backlog that may be accumulating. | 105 | I/O completion time and the backlog that may be accumulating. |
| 103 | 106 | ||
| 107 | Field 12 -- # of discards completed | ||
| 108 | This is the total number of discards completed successfully. | ||
| 109 | |||
| 110 | Field 13 -- # of discards merged | ||
| 111 | See the description of field 2 | ||
| 112 | |||
| 113 | Field 14 -- # of sectors discarded | ||
| 114 | This is the total number of sectors discarded successfully. | ||
| 115 | |||
| 116 | Field 15 -- # of milliseconds spent discarding | ||
| 117 | This is the total number of milliseconds spent by all discards (as | ||
| 118 | measured from __make_request() to end_that_request_last()). | ||
| 104 | 119 | ||
| 105 | To avoid introducing performance bottlenecks, no locks are held while | 120 | To avoid introducing performance bottlenecks, no locks are held while |
| 106 | modifying these counters. This implies that minor inaccuracies may be | 121 | modifying these counters. This implies that minor inaccuracies may be |
diff --git a/block/Kconfig b/block/Kconfig index eb50fd4977c2..1f2469a0123c 100644 --- a/block/Kconfig +++ b/block/Kconfig | |||
| @@ -149,6 +149,18 @@ config BLK_WBT | |||
| 149 | dynamically on an algorithm loosely based on CoDel, factoring in | 149 | dynamically on an algorithm loosely based on CoDel, factoring in |
| 150 | the realtime performance of the disk. | 150 | the realtime performance of the disk. |
| 151 | 151 | ||
| 152 | config BLK_CGROUP_IOLATENCY | ||
| 153 | bool "Enable support for latency based cgroup IO protection" | ||
| 154 | depends on BLK_CGROUP=y | ||
| 155 | default n | ||
| 156 | ---help--- | ||
| 157 | Enabling this option enables the .latency interface for IO throttling. | ||
| 158 | The IO controller will attempt to maintain average IO latencies below | ||
| 159 | the configured latency target, throttling anybody with a higher latency | ||
| 160 | target than the victimized group. | ||
| 161 | |||
| 162 | Note, this is an experimental interface and could be changed someday. | ||
| 163 | |||
| 152 | config BLK_WBT_SQ | 164 | config BLK_WBT_SQ |
| 153 | bool "Single queue writeback throttling" | 165 | bool "Single queue writeback throttling" |
| 154 | default n | 166 | default n |
| @@ -177,6 +189,10 @@ config BLK_DEBUG_FS | |||
| 177 | Unless you are building a kernel for a tiny system, you should | 189 | Unless you are building a kernel for a tiny system, you should |
| 178 | say Y here. | 190 | say Y here. |
| 179 | 191 | ||
| 192 | config BLK_DEBUG_FS_ZONED | ||
| 193 | bool | ||
| 194 | default BLK_DEBUG_FS && BLK_DEV_ZONED | ||
| 195 | |||
| 180 | config BLK_SED_OPAL | 196 | config BLK_SED_OPAL |
| 181 | bool "Logic for interfacing with Opal enabled SEDs" | 197 | bool "Logic for interfacing with Opal enabled SEDs" |
| 182 | ---help--- | 198 | ---help--- |
diff --git a/block/Makefile b/block/Makefile index 6a56303b9925..572b33f32c07 100644 --- a/block/Makefile +++ b/block/Makefile | |||
| @@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | |||
| 9 | blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ | 9 | blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ |
| 10 | blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ | 10 | blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ |
| 11 | genhd.o partition-generic.o ioprio.o \ | 11 | genhd.o partition-generic.o ioprio.o \ |
| 12 | badblocks.o partitions/ | 12 | badblocks.o partitions/ blk-rq-qos.o |
| 13 | 13 | ||
| 14 | obj-$(CONFIG_BOUNCE) += bounce.o | 14 | obj-$(CONFIG_BOUNCE) += bounce.o |
| 15 | obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o | 15 | obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o |
| @@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o | |||
| 17 | obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o | 17 | obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o |
| 18 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o | 18 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
| 19 | obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o | 19 | obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o |
| 20 | obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o | ||
| 20 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o | 21 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
| 21 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o | 22 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
| 22 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o | 23 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
| @@ -34,4 +35,5 @@ obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o | |||
| 34 | obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o | 35 | obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o |
| 35 | obj-$(CONFIG_BLK_WBT) += blk-wbt.o | 36 | obj-$(CONFIG_BLK_WBT) += blk-wbt.o |
| 36 | obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o | 37 | obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o |
| 38 | obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o | ||
| 37 | obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o | 39 | obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o |
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 495b9ddb3355..41d9036b1822 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c | |||
| @@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) | |||
| 634 | * The following function returns true if every queue must receive the | 634 | * The following function returns true if every queue must receive the |
| 635 | * same share of the throughput (this condition is used when deciding | 635 | * same share of the throughput (this condition is used when deciding |
| 636 | * whether idling may be disabled, see the comments in the function | 636 | * whether idling may be disabled, see the comments in the function |
| 637 | * bfq_bfqq_may_idle()). | 637 | * bfq_better_to_idle()). |
| 638 | * | 638 | * |
| 639 | * Such a scenario occurs when: | 639 | * Such a scenario occurs when: |
| 640 | * 1) all active queues have the same weight, | 640 | * 1) all active queues have the same weight, |
| @@ -742,8 +742,9 @@ inc_counter: | |||
| 742 | * See the comments to the function bfq_weights_tree_add() for considerations | 742 | * See the comments to the function bfq_weights_tree_add() for considerations |
| 743 | * about overhead. | 743 | * about overhead. |
| 744 | */ | 744 | */ |
| 745 | void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, | 745 | void __bfq_weights_tree_remove(struct bfq_data *bfqd, |
| 746 | struct rb_root *root) | 746 | struct bfq_entity *entity, |
| 747 | struct rb_root *root) | ||
| 747 | { | 748 | { |
| 748 | if (!entity->weight_counter) | 749 | if (!entity->weight_counter) |
| 749 | return; | 750 | return; |
| @@ -760,6 +761,43 @@ reset_entity_pointer: | |||
| 760 | } | 761 | } |
| 761 | 762 | ||
| 762 | /* | 763 | /* |
| 764 | * Invoke __bfq_weights_tree_remove on bfqq and all its inactive | ||
| 765 | * parent entities. | ||
| 766 | */ | ||
| 767 | void bfq_weights_tree_remove(struct bfq_data *bfqd, | ||
| 768 | struct bfq_queue *bfqq) | ||
| 769 | { | ||
| 770 | struct bfq_entity *entity = bfqq->entity.parent; | ||
| 771 | |||
| 772 | __bfq_weights_tree_remove(bfqd, &bfqq->entity, | ||
| 773 | &bfqd->queue_weights_tree); | ||
| 774 | |||
| 775 | for_each_entity(entity) { | ||
| 776 | struct bfq_sched_data *sd = entity->my_sched_data; | ||
| 777 | |||
| 778 | if (sd->next_in_service || sd->in_service_entity) { | ||
| 779 | /* | ||
| 780 | * entity is still active, because either | ||
| 781 | * next_in_service or in_service_entity is not | ||
| 782 | * NULL (see the comments on the definition of | ||
| 783 | * next_in_service for details on why | ||
| 784 | * in_service_entity must be checked too). | ||
| 785 | * | ||
| 786 | * As a consequence, the weight of entity is | ||
| 787 | * not to be removed. In addition, if entity | ||
| 788 | * is active, then its parent entities are | ||
| 789 | * active as well, and thus their weights are | ||
| 790 | * not to be removed either. In the end, this | ||
| 791 | * loop must stop here. | ||
| 792 | */ | ||
| 793 | break; | ||
| 794 | } | ||
| 795 | __bfq_weights_tree_remove(bfqd, entity, | ||
| 796 | &bfqd->group_weights_tree); | ||
| 797 | } | ||
| 798 | } | ||
| 799 | |||
| 800 | /* | ||
| 763 | * Return expired entry, or NULL to just start from scratch in rbtree. | 801 | * Return expired entry, or NULL to just start from scratch in rbtree. |
| 764 | */ | 802 | */ |
| 765 | static struct request *bfq_check_fifo(struct bfq_queue *bfqq, | 803 | static struct request *bfq_check_fifo(struct bfq_queue *bfqq, |
| @@ -1344,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, | |||
| 1344 | * remain unchanged after such an expiration, and the | 1382 | * remain unchanged after such an expiration, and the |
| 1345 | * following statement therefore assigns to | 1383 | * following statement therefore assigns to |
| 1346 | * entity->budget the remaining budget on such an | 1384 | * entity->budget the remaining budget on such an |
| 1347 | * expiration. For clarity, entity->service is not | 1385 | * expiration. |
| 1348 | * updated on expiration in any case, and, in normal | ||
| 1349 | * operation, is reset only when bfqq is selected for | ||
| 1350 | * service (see bfq_get_next_queue). | ||
| 1351 | */ | 1386 | */ |
| 1352 | entity->budget = min_t(unsigned long, | 1387 | entity->budget = min_t(unsigned long, |
| 1353 | bfq_bfqq_budget_left(bfqq), | 1388 | bfq_bfqq_budget_left(bfqq), |
| 1354 | bfqq->max_budget); | 1389 | bfqq->max_budget); |
| 1355 | 1390 | ||
| 1391 | /* | ||
| 1392 | * At this point, we have used entity->service to get | ||
| 1393 | * the budget left (needed for updating | ||
| 1394 | * entity->budget). Thus we finally can, and have to, | ||
| 1395 | * reset entity->service. The latter must be reset | ||
| 1396 | * because bfqq would otherwise be charged again for | ||
| 1397 | * the service it has received during its previous | ||
| 1398 | * service slot(s). | ||
| 1399 | */ | ||
| 1400 | entity->service = 0; | ||
| 1401 | |||
| 1356 | return true; | 1402 | return true; |
| 1357 | } | 1403 | } |
| 1358 | 1404 | ||
| 1405 | /* | ||
| 1406 | * We can finally complete expiration, by setting service to 0. | ||
| 1407 | */ | ||
| 1408 | entity->service = 0; | ||
| 1359 | entity->budget = max_t(unsigned long, bfqq->max_budget, | 1409 | entity->budget = max_t(unsigned long, bfqq->max_budget, |
| 1360 | bfq_serv_to_charge(bfqq->next_rq, bfqq)); | 1410 | bfq_serv_to_charge(bfqq->next_rq, bfqq)); |
| 1361 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); | 1411 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); |
| @@ -3233,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, | |||
| 3233 | ref = bfqq->ref; | 3283 | ref = bfqq->ref; |
| 3234 | __bfq_bfqq_expire(bfqd, bfqq); | 3284 | __bfq_bfqq_expire(bfqd, bfqq); |
| 3235 | 3285 | ||
| 3286 | if (ref == 1) /* bfqq is gone, no more actions on it */ | ||
| 3287 | return; | ||
| 3288 | |||
| 3236 | /* mark bfqq as waiting a request only if a bic still points to it */ | 3289 | /* mark bfqq as waiting a request only if a bic still points to it */ |
| 3237 | if (ref > 1 && !bfq_bfqq_busy(bfqq) && | 3290 | if (!bfq_bfqq_busy(bfqq) && |
| 3238 | reason != BFQQE_BUDGET_TIMEOUT && | 3291 | reason != BFQQE_BUDGET_TIMEOUT && |
| 3239 | reason != BFQQE_BUDGET_EXHAUSTED) | 3292 | reason != BFQQE_BUDGET_EXHAUSTED) { |
| 3240 | bfq_mark_bfqq_non_blocking_wait_rq(bfqq); | 3293 | bfq_mark_bfqq_non_blocking_wait_rq(bfqq); |
| 3294 | /* | ||
| 3295 | * Not setting service to 0, because, if the next rq | ||
| 3296 | * arrives in time, the queue will go on receiving | ||
| 3297 | * service with this same budget (as if it never expired) | ||
| 3298 | */ | ||
| 3299 | } else | ||
| 3300 | entity->service = 0; | ||
| 3241 | } | 3301 | } |
| 3242 | 3302 | ||
| 3243 | /* | 3303 | /* |
| @@ -3295,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) | |||
| 3295 | * issues taken into account are not trivial. We discuss these issues | 3355 | * issues taken into account are not trivial. We discuss these issues |
| 3296 | * individually while introducing the variables. | 3356 | * individually while introducing the variables. |
| 3297 | */ | 3357 | */ |
| 3298 | static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) | 3358 | static bool bfq_better_to_idle(struct bfq_queue *bfqq) |
| 3299 | { | 3359 | { |
| 3300 | struct bfq_data *bfqd = bfqq->bfqd; | 3360 | struct bfq_data *bfqd = bfqq->bfqd; |
| 3301 | bool rot_without_queueing = | 3361 | bool rot_without_queueing = |
| @@ -3528,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) | |||
| 3528 | } | 3588 | } |
| 3529 | 3589 | ||
| 3530 | /* | 3590 | /* |
| 3531 | * If the in-service queue is empty but the function bfq_bfqq_may_idle | 3591 | * If the in-service queue is empty but the function bfq_better_to_idle |
| 3532 | * returns true, then: | 3592 | * returns true, then: |
| 3533 | * 1) the queue must remain in service and cannot be expired, and | 3593 | * 1) the queue must remain in service and cannot be expired, and |
| 3534 | * 2) the device must be idled to wait for the possible arrival of a new | 3594 | * 2) the device must be idled to wait for the possible arrival of a new |
| 3535 | * request for the queue. | 3595 | * request for the queue. |
| 3536 | * See the comments on the function bfq_bfqq_may_idle for the reasons | 3596 | * See the comments on the function bfq_better_to_idle for the reasons |
| 3537 | * why performing device idling is the best choice to boost the throughput | 3597 | * why performing device idling is the best choice to boost the throughput |
| 3538 | * and preserve service guarantees when bfq_bfqq_may_idle itself | 3598 | * and preserve service guarantees when bfq_better_to_idle itself |
| 3539 | * returns true. | 3599 | * returns true. |
| 3540 | */ | 3600 | */ |
| 3541 | static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) | 3601 | static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
| 3542 | { | 3602 | { |
| 3543 | return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); | 3603 | return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); |
| 3544 | } | 3604 | } |
| 3545 | 3605 | ||
| 3546 | /* | 3606 | /* |
| @@ -3559,8 +3619,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) | |||
| 3559 | 3619 | ||
| 3560 | bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); | 3620 | bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
| 3561 | 3621 | ||
| 3622 | /* | ||
| 3623 | * Do not expire bfqq for budget timeout if bfqq may be about | ||
| 3624 | * to enjoy device idling. The reason why, in this case, we | ||
| 3625 | * prevent bfqq from expiring is the same as in the comments | ||
| 3626 | * on the case where bfq_bfqq_must_idle() returns true, in | ||
| 3627 | * bfq_completed_request(). | ||
| 3628 | */ | ||
| 3562 | if (bfq_may_expire_for_budg_timeout(bfqq) && | 3629 | if (bfq_may_expire_for_budg_timeout(bfqq) && |
| 3563 | !bfq_bfqq_wait_request(bfqq) && | ||
| 3564 | !bfq_bfqq_must_idle(bfqq)) | 3630 | !bfq_bfqq_must_idle(bfqq)) |
| 3565 | goto expire; | 3631 | goto expire; |
| 3566 | 3632 | ||
| @@ -3620,7 +3686,7 @@ check_queue: | |||
| 3620 | * may idle after their completion, then keep it anyway. | 3686 | * may idle after their completion, then keep it anyway. |
| 3621 | */ | 3687 | */ |
| 3622 | if (bfq_bfqq_wait_request(bfqq) || | 3688 | if (bfq_bfqq_wait_request(bfqq) || |
| 3623 | (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { | 3689 | (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { |
| 3624 | bfqq = NULL; | 3690 | bfqq = NULL; |
| 3625 | goto keep_queue; | 3691 | goto keep_queue; |
| 3626 | } | 3692 | } |
| @@ -4582,8 +4648,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) | |||
| 4582 | */ | 4648 | */ |
| 4583 | bfqq->budget_timeout = jiffies; | 4649 | bfqq->budget_timeout = jiffies; |
| 4584 | 4650 | ||
| 4585 | bfq_weights_tree_remove(bfqd, &bfqq->entity, | 4651 | bfq_weights_tree_remove(bfqd, bfqq); |
| 4586 | &bfqd->queue_weights_tree); | ||
| 4587 | } | 4652 | } |
| 4588 | 4653 | ||
| 4589 | now_ns = ktime_get_ns(); | 4654 | now_ns = ktime_get_ns(); |
| @@ -4637,15 +4702,39 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) | |||
| 4637 | * or if we want to idle in case it has no pending requests. | 4702 | * or if we want to idle in case it has no pending requests. |
| 4638 | */ | 4703 | */ |
| 4639 | if (bfqd->in_service_queue == bfqq) { | 4704 | if (bfqd->in_service_queue == bfqq) { |
| 4640 | if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { | 4705 | if (bfq_bfqq_must_idle(bfqq)) { |
| 4641 | bfq_arm_slice_timer(bfqd); | 4706 | if (bfqq->dispatched == 0) |
| 4707 | bfq_arm_slice_timer(bfqd); | ||
| 4708 | /* | ||
| 4709 | * If we get here, we do not expire bfqq, even | ||
| 4710 | * if bfqq was in budget timeout or had no | ||
| 4711 | * more requests (as controlled in the next | ||
| 4712 | * conditional instructions). The reason for | ||
| 4713 | * not expiring bfqq is as follows. | ||
| 4714 | * | ||
| 4715 | * Here bfqq->dispatched > 0 holds, but | ||
| 4716 | * bfq_bfqq_must_idle() returned true. This | ||
| 4717 | * implies that, even if no request arrives | ||
| 4718 | * for bfqq before bfqq->dispatched reaches 0, | ||
| 4719 | * bfqq will, however, not be expired on the | ||
| 4720 | * completion event that causes bfqq->dispatch | ||
| 4721 | * to reach zero. In contrast, on this event, | ||
| 4722 | * bfqq will start enjoying device idling | ||
| 4723 | * (I/O-dispatch plugging). | ||
| 4724 | * | ||
| 4725 | * But, if we expired bfqq here, bfqq would | ||
| 4726 | * not have the chance to enjoy device idling | ||
| 4727 | * when bfqq->dispatched finally reaches | ||
| 4728 | * zero. This would expose bfqq to violation | ||
| 4729 | * of its reserved service guarantees. | ||
| 4730 | */ | ||
| 4642 | return; | 4731 | return; |
| 4643 | } else if (bfq_may_expire_for_budg_timeout(bfqq)) | 4732 | } else if (bfq_may_expire_for_budg_timeout(bfqq)) |
| 4644 | bfq_bfqq_expire(bfqd, bfqq, false, | 4733 | bfq_bfqq_expire(bfqd, bfqq, false, |
| 4645 | BFQQE_BUDGET_TIMEOUT); | 4734 | BFQQE_BUDGET_TIMEOUT); |
| 4646 | else if (RB_EMPTY_ROOT(&bfqq->sort_list) && | 4735 | else if (RB_EMPTY_ROOT(&bfqq->sort_list) && |
| 4647 | (bfqq->dispatched == 0 || | 4736 | (bfqq->dispatched == 0 || |
| 4648 | !bfq_bfqq_may_idle(bfqq))) | 4737 | !bfq_better_to_idle(bfqq))) |
| 4649 | bfq_bfqq_expire(bfqd, bfqq, false, | 4738 | bfq_bfqq_expire(bfqd, bfqq, false, |
| 4650 | BFQQE_NO_MORE_REQUESTS); | 4739 | BFQQE_NO_MORE_REQUESTS); |
| 4651 | } | 4740 | } |
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 0f712e03b035..a8a2e5aca4d4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h | |||
| @@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); | |||
| 827 | void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); | 827 | void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
| 828 | void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, | 828 | void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, |
| 829 | struct rb_root *root); | 829 | struct rb_root *root); |
| 830 | void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, | 830 | void __bfq_weights_tree_remove(struct bfq_data *bfqd, |
| 831 | struct rb_root *root); | 831 | struct bfq_entity *entity, |
| 832 | struct rb_root *root); | ||
| 833 | void bfq_weights_tree_remove(struct bfq_data *bfqd, | ||
| 834 | struct bfq_queue *bfqq); | ||
| 832 | void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, | 835 | void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
| 833 | bool compensate, enum bfqq_expiration reason); | 836 | bool compensate, enum bfqq_expiration reason); |
| 834 | void bfq_put_queue(struct bfq_queue *bfqq); | 837 | void bfq_put_queue(struct bfq_queue *bfqq); |
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4498c43245e2..dbc07b456059 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c | |||
| @@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, | |||
| 499 | if (bfqq) | 499 | if (bfqq) |
| 500 | list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); | 500 | list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); |
| 501 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | 501 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
| 502 | else /* bfq_group */ | ||
| 503 | bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); | ||
| 504 | |||
| 505 | if (bfqg != bfqd->root_group) | 502 | if (bfqg != bfqd->root_group) |
| 506 | bfqg->active_entities++; | 503 | bfqg->active_entities++; |
| 507 | #endif | 504 | #endif |
| @@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, | |||
| 601 | if (bfqq) | 598 | if (bfqq) |
| 602 | list_del(&bfqq->bfqq_list); | 599 | list_del(&bfqq->bfqq_list); |
| 603 | #ifdef CONFIG_BFQ_GROUP_IOSCHED | 600 | #ifdef CONFIG_BFQ_GROUP_IOSCHED |
| 604 | else /* bfq_group */ | ||
| 605 | bfq_weights_tree_remove(bfqd, entity, | ||
| 606 | &bfqd->group_weights_tree); | ||
| 607 | |||
| 608 | if (bfqg != bfqd->root_group) | 601 | if (bfqg != bfqd->root_group) |
| 609 | bfqg->active_entities--; | 602 | bfqg->active_entities--; |
| 610 | #endif | 603 | #endif |
| @@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, | |||
| 799 | if (prev_weight != new_weight) { | 792 | if (prev_weight != new_weight) { |
| 800 | root = bfqq ? &bfqd->queue_weights_tree : | 793 | root = bfqq ? &bfqd->queue_weights_tree : |
| 801 | &bfqd->group_weights_tree; | 794 | &bfqd->group_weights_tree; |
| 802 | bfq_weights_tree_remove(bfqd, entity, root); | 795 | __bfq_weights_tree_remove(bfqd, entity, root); |
| 803 | } | 796 | } |
| 804 | entity->weight = new_weight; | 797 | entity->weight = new_weight; |
| 805 | /* | 798 | /* |
| @@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, | |||
| 971 | * one of its children receives a new request. | 964 | * one of its children receives a new request. |
| 972 | * | 965 | * |
| 973 | * Basically, this function updates the timestamps of entity and | 966 | * Basically, this function updates the timestamps of entity and |
| 974 | * inserts entity into its active tree, ater possibly extracting it | 967 | * inserts entity into its active tree, after possibly extracting it |
| 975 | * from its idle tree. | 968 | * from its idle tree. |
| 976 | */ | 969 | */ |
| 977 | static void __bfq_activate_entity(struct bfq_entity *entity, | 970 | static void __bfq_activate_entity(struct bfq_entity *entity, |
| @@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity, | |||
| 1015 | entity->on_st = true; | 1008 | entity->on_st = true; |
| 1016 | } | 1009 | } |
| 1017 | 1010 | ||
| 1011 | #ifdef BFQ_GROUP_IOSCHED_ENABLED | ||
| 1012 | if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ | ||
| 1013 | struct bfq_group *bfqg = | ||
| 1014 | container_of(entity, struct bfq_group, entity); | ||
| 1015 | |||
| 1016 | bfq_weights_tree_add(bfqg->bfqd, entity, | ||
| 1017 | &bfqd->group_weights_tree); | ||
| 1018 | } | ||
| 1019 | #endif | ||
| 1020 | |||
| 1018 | bfq_update_fin_time_enqueue(entity, st, backshifted); | 1021 | bfq_update_fin_time_enqueue(entity, st, backshifted); |
| 1019 | } | 1022 | } |
| 1020 | 1023 | ||
| @@ -1542,12 +1545,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) | |||
| 1542 | sd->in_service_entity = entity; | 1545 | sd->in_service_entity = entity; |
| 1543 | 1546 | ||
| 1544 | /* | 1547 | /* |
| 1545 | * Reset the accumulator of the amount of service that | ||
| 1546 | * the entity is about to receive. | ||
| 1547 | */ | ||
| 1548 | entity->service = 0; | ||
| 1549 | |||
| 1550 | /* | ||
| 1551 | * If entity is no longer a candidate for next | 1548 | * If entity is no longer a candidate for next |
| 1552 | * service, then it must be extracted from its active | 1549 | * service, then it must be extracted from its active |
| 1553 | * tree, so as to make sure that it won't be | 1550 | * tree, so as to make sure that it won't be |
| @@ -1664,8 +1661,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, | |||
| 1664 | bfqd->busy_queues--; | 1661 | bfqd->busy_queues--; |
| 1665 | 1662 | ||
| 1666 | if (!bfqq->dispatched) | 1663 | if (!bfqq->dispatched) |
| 1667 | bfq_weights_tree_remove(bfqd, &bfqq->entity, | 1664 | bfq_weights_tree_remove(bfqd, bfqq); |
| 1668 | &bfqd->queue_weights_tree); | ||
| 1669 | 1665 | ||
| 1670 | if (bfqq->wr_coeff > 1) | 1666 | if (bfqq->wr_coeff > 1) |
| 1671 | bfqd->wr_busy_queues--; | 1667 | bfqd->wr_busy_queues--; |
diff --git a/block/bio-integrity.c b/block/bio-integrity.c index add7c7c85335..67b5fb861a51 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c | |||
| @@ -160,28 +160,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, | |||
| 160 | EXPORT_SYMBOL(bio_integrity_add_page); | 160 | EXPORT_SYMBOL(bio_integrity_add_page); |
| 161 | 161 | ||
| 162 | /** | 162 | /** |
| 163 | * bio_integrity_intervals - Return number of integrity intervals for a bio | ||
| 164 | * @bi: blk_integrity profile for device | ||
| 165 | * @sectors: Size of the bio in 512-byte sectors | ||
| 166 | * | ||
| 167 | * Description: The block layer calculates everything in 512 byte | ||
| 168 | * sectors but integrity metadata is done in terms of the data integrity | ||
| 169 | * interval size of the storage device. Convert the block layer sectors | ||
| 170 | * to the appropriate number of integrity intervals. | ||
| 171 | */ | ||
| 172 | static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, | ||
| 173 | unsigned int sectors) | ||
| 174 | { | ||
| 175 | return sectors >> (bi->interval_exp - 9); | ||
| 176 | } | ||
| 177 | |||
| 178 | static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, | ||
| 179 | unsigned int sectors) | ||
| 180 | { | ||
| 181 | return bio_integrity_intervals(bi, sectors) * bi->tuple_size; | ||
| 182 | } | ||
| 183 | |||
| 184 | /** | ||
| 185 | * bio_integrity_process - Process integrity metadata for a bio | 163 | * bio_integrity_process - Process integrity metadata for a bio |
| 186 | * @bio: bio to generate/verify integrity metadata for | 164 | * @bio: bio to generate/verify integrity metadata for |
| 187 | * @proc_iter: iterator to process | 165 | * @proc_iter: iterator to process |
diff --git a/block/bio.c b/block/bio.c index 047c5dca6d90..b12966e415d3 100644 --- a/block/bio.c +++ b/block/bio.c | |||
| @@ -28,9 +28,11 @@ | |||
| 28 | #include <linux/mempool.h> | 28 | #include <linux/mempool.h> |
| 29 | #include <linux/workqueue.h> | 29 | #include <linux/workqueue.h> |
| 30 | #include <linux/cgroup.h> | 30 | #include <linux/cgroup.h> |
| 31 | #include <linux/blk-cgroup.h> | ||
| 31 | 32 | ||
| 32 | #include <trace/events/block.h> | 33 | #include <trace/events/block.h> |
| 33 | #include "blk.h" | 34 | #include "blk.h" |
| 35 | #include "blk-rq-qos.h" | ||
| 34 | 36 | ||
| 35 | /* | 37 | /* |
| 36 | * Test patch to inline a certain number of bi_io_vec's inside the bio | 38 | * Test patch to inline a certain number of bi_io_vec's inside the bio |
| @@ -156,7 +158,7 @@ out: | |||
| 156 | 158 | ||
| 157 | unsigned int bvec_nr_vecs(unsigned short idx) | 159 | unsigned int bvec_nr_vecs(unsigned short idx) |
| 158 | { | 160 | { |
| 159 | return bvec_slabs[idx].nr_vecs; | 161 | return bvec_slabs[--idx].nr_vecs; |
| 160 | } | 162 | } |
| 161 | 163 | ||
| 162 | void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) | 164 | void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) |
| @@ -645,83 +647,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) | |||
| 645 | EXPORT_SYMBOL(bio_clone_fast); | 647 | EXPORT_SYMBOL(bio_clone_fast); |
| 646 | 648 | ||
| 647 | /** | 649 | /** |
| 648 | * bio_clone_bioset - clone a bio | ||
| 649 | * @bio_src: bio to clone | ||
| 650 | * @gfp_mask: allocation priority | ||
| 651 | * @bs: bio_set to allocate from | ||
| 652 | * | ||
| 653 | * Clone bio. Caller will own the returned bio, but not the actual data it | ||
| 654 | * points to. Reference count of returned bio will be one. | ||
| 655 | */ | ||
| 656 | struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, | ||
| 657 | struct bio_set *bs) | ||
| 658 | { | ||
| 659 | struct bvec_iter iter; | ||
| 660 | struct bio_vec bv; | ||
| 661 | struct bio *bio; | ||
| 662 | |||
| 663 | /* | ||
| 664 | * Pre immutable biovecs, __bio_clone() used to just do a memcpy from | ||
| 665 | * bio_src->bi_io_vec to bio->bi_io_vec. | ||
| 666 | * | ||
| 667 | * We can't do that anymore, because: | ||
| 668 | * | ||
| 669 | * - The point of cloning the biovec is to produce a bio with a biovec | ||
| 670 | * the caller can modify: bi_idx and bi_bvec_done should be 0. | ||
| 671 | * | ||
| 672 | * - The original bio could've had more than BIO_MAX_PAGES biovecs; if | ||
| 673 | * we tried to clone the whole thing bio_alloc_bioset() would fail. | ||
| 674 | * But the clone should succeed as long as the number of biovecs we | ||
| 675 | * actually need to allocate is fewer than BIO_MAX_PAGES. | ||
| 676 | * | ||
| 677 | * - Lastly, bi_vcnt should not be looked at or relied upon by code | ||
| 678 | * that does not own the bio - reason being drivers don't use it for | ||
| 679 | * iterating over the biovec anymore, so expecting it to be kept up | ||
| 680 | * to date (i.e. for clones that share the parent biovec) is just | ||
| 681 | * asking for trouble and would force extra work on | ||
| 682 | * __bio_clone_fast() anyways. | ||
| 683 | */ | ||
| 684 | |||
| 685 | bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); | ||
| 686 | if (!bio) | ||
| 687 | return NULL; | ||
| 688 | bio->bi_disk = bio_src->bi_disk; | ||
| 689 | bio->bi_opf = bio_src->bi_opf; | ||
| 690 | bio->bi_write_hint = bio_src->bi_write_hint; | ||
| 691 | bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; | ||
| 692 | bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; | ||
| 693 | |||
| 694 | switch (bio_op(bio)) { | ||
| 695 | case REQ_OP_DISCARD: | ||
| 696 | case REQ_OP_SECURE_ERASE: | ||
| 697 | case REQ_OP_WRITE_ZEROES: | ||
| 698 | break; | ||
| 699 | case REQ_OP_WRITE_SAME: | ||
| 700 | bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; | ||
| 701 | break; | ||
| 702 | default: | ||
| 703 | bio_for_each_segment(bv, bio_src, iter) | ||
| 704 | bio->bi_io_vec[bio->bi_vcnt++] = bv; | ||
| 705 | break; | ||
| 706 | } | ||
| 707 | |||
| 708 | if (bio_integrity(bio_src)) { | ||
| 709 | int ret; | ||
| 710 | |||
| 711 | ret = bio_integrity_clone(bio, bio_src, gfp_mask); | ||
| 712 | if (ret < 0) { | ||
| 713 | bio_put(bio); | ||
| 714 | return NULL; | ||
| 715 | } | ||
| 716 | } | ||
| 717 | |||
| 718 | bio_clone_blkcg_association(bio, bio_src); | ||
| 719 | |||
| 720 | return bio; | ||
| 721 | } | ||
| 722 | EXPORT_SYMBOL(bio_clone_bioset); | ||
| 723 | |||
| 724 | /** | ||
| 725 | * bio_add_pc_page - attempt to add page to bio | 650 | * bio_add_pc_page - attempt to add page to bio |
| 726 | * @q: the target queue | 651 | * @q: the target queue |
| 727 | * @bio: destination bio | 652 | * @bio: destination bio |
| @@ -1661,10 +1586,8 @@ void bio_set_pages_dirty(struct bio *bio) | |||
| 1661 | int i; | 1586 | int i; |
| 1662 | 1587 | ||
| 1663 | bio_for_each_segment_all(bvec, bio, i) { | 1588 | bio_for_each_segment_all(bvec, bio, i) { |
| 1664 | struct page *page = bvec->bv_page; | 1589 | if (!PageCompound(bvec->bv_page)) |
| 1665 | 1590 | set_page_dirty_lock(bvec->bv_page); | |
| 1666 | if (page && !PageCompound(page)) | ||
| 1667 | set_page_dirty_lock(page); | ||
| 1668 | } | 1591 | } |
| 1669 | } | 1592 | } |
| 1670 | EXPORT_SYMBOL_GPL(bio_set_pages_dirty); | 1593 | EXPORT_SYMBOL_GPL(bio_set_pages_dirty); |
| @@ -1674,19 +1597,15 @@ static void bio_release_pages(struct bio *bio) | |||
| 1674 | struct bio_vec *bvec; | 1597 | struct bio_vec *bvec; |
| 1675 | int i; | 1598 | int i; |
| 1676 | 1599 | ||
| 1677 | bio_for_each_segment_all(bvec, bio, i) { | 1600 | bio_for_each_segment_all(bvec, bio, i) |
| 1678 | struct page *page = bvec->bv_page; | 1601 | put_page(bvec->bv_page); |
| 1679 | |||
| 1680 | if (page) | ||
| 1681 | put_page(page); | ||
| 1682 | } | ||
| 1683 | } | 1602 | } |
| 1684 | 1603 | ||
| 1685 | /* | 1604 | /* |
| 1686 | * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. | 1605 | * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. |
| 1687 | * If they are, then fine. If, however, some pages are clean then they must | 1606 | * If they are, then fine. If, however, some pages are clean then they must |
| 1688 | * have been written out during the direct-IO read. So we take another ref on | 1607 | * have been written out during the direct-IO read. So we take another ref on |
| 1689 | * the BIO and the offending pages and re-dirty the pages in process context. | 1608 | * the BIO and re-dirty the pages in process context. |
| 1690 | * | 1609 | * |
| 1691 | * It is expected that bio_check_pages_dirty() will wholly own the BIO from | 1610 | * It is expected that bio_check_pages_dirty() will wholly own the BIO from |
| 1692 | * here on. It will run one put_page() against each page and will run one | 1611 | * here on. It will run one put_page() against each page and will run one |
| @@ -1704,78 +1623,70 @@ static struct bio *bio_dirty_list; | |||
| 1704 | */ | 1623 | */ |
| 1705 | static void bio_dirty_fn(struct work_struct *work) | 1624 | static void bio_dirty_fn(struct work_struct *work) |
| 1706 | { | 1625 | { |
| 1707 | unsigned long flags; | 1626 | struct bio *bio, *next; |
| 1708 | struct bio *bio; | ||
| 1709 | 1627 | ||
| 1710 | spin_lock_irqsave(&bio_dirty_lock, flags); | 1628 | spin_lock_irq(&bio_dirty_lock); |
| 1711 | bio = bio_dirty_list; | 1629 | next = bio_dirty_list; |
| 1712 | bio_dirty_list = NULL; | 1630 | bio_dirty_list = NULL; |
| 1713 | spin_unlock_irqrestore(&bio_dirty_lock, flags); | 1631 | spin_unlock_irq(&bio_dirty_lock); |
| 1714 | 1632 | ||
| 1715 | while (bio) { | 1633 | while ((bio = next) != NULL) { |
| 1716 | struct bio *next = bio->bi_private; | 1634 | next = bio->bi_private; |
| 1717 | 1635 | ||
| 1718 | bio_set_pages_dirty(bio); | 1636 | bio_set_pages_dirty(bio); |
| 1719 | bio_release_pages(bio); | 1637 | bio_release_pages(bio); |
| 1720 | bio_put(bio); | 1638 | bio_put(bio); |
| 1721 | bio = next; | ||
| 1722 | } | 1639 | } |
| 1723 | } | 1640 | } |
| 1724 | 1641 | ||
| 1725 | void bio_check_pages_dirty(struct bio *bio) | 1642 | void bio_check_pages_dirty(struct bio *bio) |
| 1726 | { | 1643 | { |
| 1727 | struct bio_vec *bvec; | 1644 | struct bio_vec *bvec; |
| 1728 | int nr_clean_pages = 0; | 1645 | unsigned long flags; |
| 1729 | int i; | 1646 | int i; |
| 1730 | 1647 | ||
| 1731 | bio_for_each_segment_all(bvec, bio, i) { | 1648 | bio_for_each_segment_all(bvec, bio, i) { |
| 1732 | struct page *page = bvec->bv_page; | 1649 | if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) |
| 1733 | 1650 | goto defer; | |
| 1734 | if (PageDirty(page) || PageCompound(page)) { | ||
| 1735 | put_page(page); | ||
| 1736 | bvec->bv_page = NULL; | ||
| 1737 | } else { | ||
| 1738 | nr_clean_pages++; | ||
| 1739 | } | ||
| 1740 | } | 1651 | } |
| 1741 | 1652 | ||
| 1742 | if (nr_clean_pages) { | 1653 | bio_release_pages(bio); |
| 1743 | unsigned long flags; | 1654 | bio_put(bio); |
| 1744 | 1655 | return; | |
| 1745 | spin_lock_irqsave(&bio_dirty_lock, flags); | 1656 | defer: |
| 1746 | bio->bi_private = bio_dirty_list; | 1657 | spin_lock_irqsave(&bio_dirty_lock, flags); |
| 1747 | bio_dirty_list = bio; | 1658 | bio->bi_private = bio_dirty_list; |
| 1748 | spin_unlock_irqrestore(&bio_dirty_lock, flags); | 1659 | bio_dirty_list = bio; |
| 1749 | schedule_work(&bio_dirty_work); | 1660 | spin_unlock_irqrestore(&bio_dirty_lock, flags); |
| 1750 | } else { | 1661 | schedule_work(&bio_dirty_work); |
| 1751 | bio_put(bio); | ||
| 1752 | } | ||
| 1753 | } | 1662 | } |
| 1754 | EXPORT_SYMBOL_GPL(bio_check_pages_dirty); | 1663 | EXPORT_SYMBOL_GPL(bio_check_pages_dirty); |
| 1755 | 1664 | ||
| 1756 | void generic_start_io_acct(struct request_queue *q, int rw, | 1665 | void generic_start_io_acct(struct request_queue *q, int op, |
| 1757 | unsigned long sectors, struct hd_struct *part) | 1666 | unsigned long sectors, struct hd_struct *part) |
| 1758 | { | 1667 | { |
| 1668 | const int sgrp = op_stat_group(op); | ||
| 1759 | int cpu = part_stat_lock(); | 1669 | int cpu = part_stat_lock(); |
| 1760 | 1670 | ||
| 1761 | part_round_stats(q, cpu, part); | 1671 | part_round_stats(q, cpu, part); |
| 1762 | part_stat_inc(cpu, part, ios[rw]); | 1672 | part_stat_inc(cpu, part, ios[sgrp]); |
| 1763 | part_stat_add(cpu, part, sectors[rw], sectors); | 1673 | part_stat_add(cpu, part, sectors[sgrp], sectors); |
| 1764 | part_inc_in_flight(q, part, rw); | 1674 | part_inc_in_flight(q, part, op_is_write(op)); |
| 1765 | 1675 | ||
| 1766 | part_stat_unlock(); | 1676 | part_stat_unlock(); |
| 1767 | } | 1677 | } |
| 1768 | EXPORT_SYMBOL(generic_start_io_acct); | 1678 | EXPORT_SYMBOL(generic_start_io_acct); |
| 1769 | 1679 | ||
| 1770 | void generic_end_io_acct(struct request_queue *q, int rw, | 1680 | void generic_end_io_acct(struct request_queue *q, int req_op, |
| 1771 | struct hd_struct *part, unsigned long start_time) | 1681 | struct hd_struct *part, unsigned long start_time) |
| 1772 | { | 1682 | { |
| 1773 | unsigned long duration = jiffies - start_time; | 1683 | unsigned long duration = jiffies - start_time; |
| 1684 | const int sgrp = op_stat_group(req_op); | ||
| 1774 | int cpu = part_stat_lock(); | 1685 | int cpu = part_stat_lock(); |
| 1775 | 1686 | ||
| 1776 | part_stat_add(cpu, part, ticks[rw], duration); | 1687 | part_stat_add(cpu, part, ticks[sgrp], duration); |
| 1777 | part_round_stats(q, cpu, part); | 1688 | part_round_stats(q, cpu, part); |
| 1778 | part_dec_in_flight(q, part, rw); | 1689 | part_dec_in_flight(q, part, op_is_write(req_op)); |
| 1779 | 1690 | ||
| 1780 | part_stat_unlock(); | 1691 | part_stat_unlock(); |
| 1781 | } | 1692 | } |
| @@ -1834,6 +1745,9 @@ again: | |||
| 1834 | if (!bio_integrity_endio(bio)) | 1745 | if (!bio_integrity_endio(bio)) |
| 1835 | return; | 1746 | return; |
| 1836 | 1747 | ||
| 1748 | if (bio->bi_disk) | ||
| 1749 | rq_qos_done_bio(bio->bi_disk->queue, bio); | ||
| 1750 | |||
| 1837 | /* | 1751 | /* |
| 1838 | * Need to have a real endio function for chained bios, otherwise | 1752 | * Need to have a real endio function for chained bios, otherwise |
| 1839 | * various corner cases will break (like stacking block devices that | 1753 | * various corner cases will break (like stacking block devices that |
| @@ -2042,6 +1956,30 @@ EXPORT_SYMBOL(bioset_init_from_src); | |||
| 2042 | 1956 | ||
| 2043 | #ifdef CONFIG_BLK_CGROUP | 1957 | #ifdef CONFIG_BLK_CGROUP |
| 2044 | 1958 | ||
| 1959 | #ifdef CONFIG_MEMCG | ||
| 1960 | /** | ||
| 1961 | * bio_associate_blkcg_from_page - associate a bio with the page's blkcg | ||
| 1962 | * @bio: target bio | ||
| 1963 | * @page: the page to lookup the blkcg from | ||
| 1964 | * | ||
| 1965 | * Associate @bio with the blkcg from @page's owning memcg. This works like | ||
| 1966 | * every other associate function wrt references. | ||
| 1967 | */ | ||
| 1968 | int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) | ||
| 1969 | { | ||
| 1970 | struct cgroup_subsys_state *blkcg_css; | ||
| 1971 | |||
| 1972 | if (unlikely(bio->bi_css)) | ||
| 1973 | return -EBUSY; | ||
| 1974 | if (!page->mem_cgroup) | ||
| 1975 | return 0; | ||
| 1976 | blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, | ||
| 1977 | &io_cgrp_subsys); | ||
| 1978 | bio->bi_css = blkcg_css; | ||
| 1979 | return 0; | ||
| 1980 | } | ||
| 1981 | #endif /* CONFIG_MEMCG */ | ||
| 1982 | |||
| 2045 | /** | 1983 | /** |
| 2046 | * bio_associate_blkcg - associate a bio with the specified blkcg | 1984 | * bio_associate_blkcg - associate a bio with the specified blkcg |
| 2047 | * @bio: target bio | 1985 | * @bio: target bio |
| @@ -2065,6 +2003,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) | |||
| 2065 | EXPORT_SYMBOL_GPL(bio_associate_blkcg); | 2003 | EXPORT_SYMBOL_GPL(bio_associate_blkcg); |
| 2066 | 2004 | ||
| 2067 | /** | 2005 | /** |
| 2006 | * bio_associate_blkg - associate a bio with the specified blkg | ||
| 2007 | * @bio: target bio | ||
| 2008 | * @blkg: the blkg to associate | ||
| 2009 | * | ||
| 2010 | * Associate @bio with the blkg specified by @blkg. This is the queue specific | ||
| 2011 | * blkcg information associated with the @bio, a reference will be taken on the | ||
| 2012 | * @blkg and will be freed when the bio is freed. | ||
| 2013 | */ | ||
| 2014 | int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) | ||
| 2015 | { | ||
| 2016 | if (unlikely(bio->bi_blkg)) | ||
| 2017 | return -EBUSY; | ||
| 2018 | blkg_get(blkg); | ||
| 2019 | bio->bi_blkg = blkg; | ||
| 2020 | return 0; | ||
| 2021 | } | ||
| 2022 | |||
| 2023 | /** | ||
| 2068 | * bio_disassociate_task - undo bio_associate_current() | 2024 | * bio_disassociate_task - undo bio_associate_current() |
| 2069 | * @bio: target bio | 2025 | * @bio: target bio |
| 2070 | */ | 2026 | */ |
| @@ -2078,6 +2034,10 @@ void bio_disassociate_task(struct bio *bio) | |||
| 2078 | css_put(bio->bi_css); | 2034 | css_put(bio->bi_css); |
| 2079 | bio->bi_css = NULL; | 2035 | bio->bi_css = NULL; |
| 2080 | } | 2036 | } |
| 2037 | if (bio->bi_blkg) { | ||
| 2038 | blkg_put(bio->bi_blkg); | ||
| 2039 | bio->bi_blkg = NULL; | ||
| 2040 | } | ||
| 2081 | } | 2041 | } |
| 2082 | 2042 | ||
| 2083 | /** | 2043 | /** |
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index eb85cb87c40f..694595b29b8f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/atomic.h> | 27 | #include <linux/atomic.h> |
| 28 | #include <linux/ctype.h> | 28 | #include <linux/ctype.h> |
| 29 | #include <linux/blk-cgroup.h> | 29 | #include <linux/blk-cgroup.h> |
| 30 | #include <linux/tracehook.h> | ||
| 30 | #include "blk.h" | 31 | #include "blk.h" |
| 31 | 32 | ||
| 32 | #define MAX_KEY_LEN 100 | 33 | #define MAX_KEY_LEN 100 |
| @@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; | |||
| 50 | 51 | ||
| 51 | static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ | 52 | static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ |
| 52 | 53 | ||
| 54 | static bool blkcg_debug_stats = false; | ||
| 55 | |||
| 53 | static bool blkcg_policy_enabled(struct request_queue *q, | 56 | static bool blkcg_policy_enabled(struct request_queue *q, |
| 54 | const struct blkcg_policy *pol) | 57 | const struct blkcg_policy *pol) |
| 55 | { | 58 | { |
| @@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
| 564 | [BLKG_RWSTAT_WRITE] = "Write", | 567 | [BLKG_RWSTAT_WRITE] = "Write", |
| 565 | [BLKG_RWSTAT_SYNC] = "Sync", | 568 | [BLKG_RWSTAT_SYNC] = "Sync", |
| 566 | [BLKG_RWSTAT_ASYNC] = "Async", | 569 | [BLKG_RWSTAT_ASYNC] = "Async", |
| 570 | [BLKG_RWSTAT_DISCARD] = "Discard", | ||
| 567 | }; | 571 | }; |
| 568 | const char *dname = blkg_dev_name(pd->blkg); | 572 | const char *dname = blkg_dev_name(pd->blkg); |
| 569 | u64 v; | 573 | u64 v; |
| @@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
| 577 | (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); | 581 | (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); |
| 578 | 582 | ||
| 579 | v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + | 583 | v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + |
| 580 | atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); | 584 | atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) + |
| 585 | atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]); | ||
| 581 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); | 586 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); |
| 582 | return v; | 587 | return v; |
| 583 | } | 588 | } |
| @@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) | |||
| 954 | 959 | ||
| 955 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { | 960 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { |
| 956 | const char *dname; | 961 | const char *dname; |
| 962 | char *buf; | ||
| 957 | struct blkg_rwstat rwstat; | 963 | struct blkg_rwstat rwstat; |
| 958 | u64 rbytes, wbytes, rios, wios; | 964 | u64 rbytes, wbytes, rios, wios, dbytes, dios; |
| 965 | size_t size = seq_get_buf(sf, &buf), off = 0; | ||
| 966 | int i; | ||
| 967 | bool has_stats = false; | ||
| 959 | 968 | ||
| 960 | dname = blkg_dev_name(blkg); | 969 | dname = blkg_dev_name(blkg); |
| 961 | if (!dname) | 970 | if (!dname) |
| 962 | continue; | 971 | continue; |
| 963 | 972 | ||
| 973 | /* | ||
| 974 | * Hooray string manipulation, count is the size written NOT | ||
| 975 | * INCLUDING THE \0, so size is now count+1 less than what we | ||
| 976 | * had before, but we want to start writing the next bit from | ||
| 977 | * the \0 so we only add count to buf. | ||
| 978 | */ | ||
| 979 | off += scnprintf(buf+off, size-off, "%s ", dname); | ||
| 980 | |||
| 964 | spin_lock_irq(blkg->q->queue_lock); | 981 | spin_lock_irq(blkg->q->queue_lock); |
| 965 | 982 | ||
| 966 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, | 983 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, |
| 967 | offsetof(struct blkcg_gq, stat_bytes)); | 984 | offsetof(struct blkcg_gq, stat_bytes)); |
| 968 | rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); | 985 | rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); |
| 969 | wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); | 986 | wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); |
| 987 | dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); | ||
| 970 | 988 | ||
| 971 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, | 989 | rwstat = blkg_rwstat_recursive_sum(blkg, NULL, |
| 972 | offsetof(struct blkcg_gq, stat_ios)); | 990 | offsetof(struct blkcg_gq, stat_ios)); |
| 973 | rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); | 991 | rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); |
| 974 | wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); | 992 | wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); |
| 993 | dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); | ||
| 975 | 994 | ||
| 976 | spin_unlock_irq(blkg->q->queue_lock); | 995 | spin_unlock_irq(blkg->q->queue_lock); |
| 977 | 996 | ||
| 978 | if (rbytes || wbytes || rios || wios) | 997 | if (rbytes || wbytes || rios || wios) { |
| 979 | seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", | 998 | has_stats = true; |
| 980 | dname, rbytes, wbytes, rios, wios); | 999 | off += scnprintf(buf+off, size-off, |
| 1000 | "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", | ||
| 1001 | rbytes, wbytes, rios, wios, | ||
| 1002 | dbytes, dios); | ||
| 1003 | } | ||
| 1004 | |||
| 1005 | if (!blkcg_debug_stats) | ||
| 1006 | goto next; | ||
| 1007 | |||
| 1008 | if (atomic_read(&blkg->use_delay)) { | ||
| 1009 | has_stats = true; | ||
| 1010 | off += scnprintf(buf+off, size-off, | ||
| 1011 | " use_delay=%d delay_nsec=%llu", | ||
| 1012 | atomic_read(&blkg->use_delay), | ||
| 1013 | (unsigned long long)atomic64_read(&blkg->delay_nsec)); | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | ||
| 1017 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
| 1018 | size_t written; | ||
| 1019 | |||
| 1020 | if (!blkg->pd[i] || !pol->pd_stat_fn) | ||
| 1021 | continue; | ||
| 1022 | |||
| 1023 | written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); | ||
| 1024 | if (written) | ||
| 1025 | has_stats = true; | ||
| 1026 | off += written; | ||
| 1027 | } | ||
| 1028 | next: | ||
| 1029 | if (has_stats) { | ||
| 1030 | off += scnprintf(buf+off, size-off, "\n"); | ||
| 1031 | seq_commit(sf, off); | ||
| 1032 | } | ||
| 981 | } | 1033 | } |
| 982 | 1034 | ||
| 983 | rcu_read_unlock(); | 1035 | rcu_read_unlock(); |
| @@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q) | |||
| 1191 | if (preloaded) | 1243 | if (preloaded) |
| 1192 | radix_tree_preload_end(); | 1244 | radix_tree_preload_end(); |
| 1193 | 1245 | ||
| 1246 | ret = blk_iolatency_init(q); | ||
| 1247 | if (ret) { | ||
| 1248 | spin_lock_irq(q->queue_lock); | ||
| 1249 | blkg_destroy_all(q); | ||
| 1250 | spin_unlock_irq(q->queue_lock); | ||
| 1251 | return ret; | ||
| 1252 | } | ||
| 1253 | |||
| 1194 | ret = blk_throtl_init(q); | 1254 | ret = blk_throtl_init(q); |
| 1195 | if (ret) { | 1255 | if (ret) { |
| 1196 | spin_lock_irq(q->queue_lock); | 1256 | spin_lock_irq(q->queue_lock); |
| @@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css) | |||
| 1288 | mutex_unlock(&blkcg_pol_mutex); | 1348 | mutex_unlock(&blkcg_pol_mutex); |
| 1289 | } | 1349 | } |
| 1290 | 1350 | ||
| 1351 | static void blkcg_exit(struct task_struct *tsk) | ||
| 1352 | { | ||
| 1353 | if (tsk->throttle_queue) | ||
| 1354 | blk_put_queue(tsk->throttle_queue); | ||
| 1355 | tsk->throttle_queue = NULL; | ||
| 1356 | } | ||
| 1357 | |||
| 1291 | struct cgroup_subsys io_cgrp_subsys = { | 1358 | struct cgroup_subsys io_cgrp_subsys = { |
| 1292 | .css_alloc = blkcg_css_alloc, | 1359 | .css_alloc = blkcg_css_alloc, |
| 1293 | .css_offline = blkcg_css_offline, | 1360 | .css_offline = blkcg_css_offline, |
| @@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = { | |||
| 1297 | .dfl_cftypes = blkcg_files, | 1364 | .dfl_cftypes = blkcg_files, |
| 1298 | .legacy_cftypes = blkcg_legacy_files, | 1365 | .legacy_cftypes = blkcg_legacy_files, |
| 1299 | .legacy_name = "blkio", | 1366 | .legacy_name = "blkio", |
| 1367 | .exit = blkcg_exit, | ||
| 1300 | #ifdef CONFIG_MEMCG | 1368 | #ifdef CONFIG_MEMCG |
| 1301 | /* | 1369 | /* |
| 1302 | * This ensures that, if available, memcg is automatically enabled | 1370 | * This ensures that, if available, memcg is automatically enabled |
| @@ -1547,3 +1615,209 @@ out_unlock: | |||
| 1547 | mutex_unlock(&blkcg_pol_register_mutex); | 1615 | mutex_unlock(&blkcg_pol_register_mutex); |
| 1548 | } | 1616 | } |
| 1549 | EXPORT_SYMBOL_GPL(blkcg_policy_unregister); | 1617 | EXPORT_SYMBOL_GPL(blkcg_policy_unregister); |
| 1618 | |||
| 1619 | /* | ||
| 1620 | * Scale the accumulated delay based on how long it has been since we updated | ||
| 1621 | * the delay. We only call this when we are adding delay, in case it's been a | ||
| 1622 | * while since we added delay, and when we are checking to see if we need to | ||
| 1623 | * delay a task, to account for any delays that may have occurred. | ||
| 1624 | */ | ||
| 1625 | static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) | ||
| 1626 | { | ||
| 1627 | u64 old = atomic64_read(&blkg->delay_start); | ||
| 1628 | |||
| 1629 | /* | ||
| 1630 | * We only want to scale down every second. The idea here is that we | ||
| 1631 | * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain | ||
| 1632 | * time window. We only want to throttle tasks for recent delay that | ||
| 1633 | * has occurred, in 1 second time windows since that's the maximum | ||
| 1634 | * things can be throttled. We save the current delay window in | ||
| 1635 | * blkg->last_delay so we know what amount is still left to be charged | ||
| 1636 | * to the blkg from this point onward. blkg->last_use keeps track of | ||
| 1637 | * the use_delay counter. The idea is if we're unthrottling the blkg we | ||
| 1638 | * are ok with whatever is happening now, and we can take away more of | ||
| 1639 | * the accumulated delay as we've already throttled enough that | ||
| 1640 | * everybody is happy with their IO latencies. | ||
| 1641 | */ | ||
| 1642 | if (time_before64(old + NSEC_PER_SEC, now) && | ||
| 1643 | atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { | ||
| 1644 | u64 cur = atomic64_read(&blkg->delay_nsec); | ||
| 1645 | u64 sub = min_t(u64, blkg->last_delay, now - old); | ||
| 1646 | int cur_use = atomic_read(&blkg->use_delay); | ||
| 1647 | |||
| 1648 | /* | ||
| 1649 | * We've been unthrottled, subtract a larger chunk of our | ||
| 1650 | * accumulated delay. | ||
| 1651 | */ | ||
| 1652 | if (cur_use < blkg->last_use) | ||
| 1653 | sub = max_t(u64, sub, blkg->last_delay >> 1); | ||
| 1654 | |||
| 1655 | /* | ||
| 1656 | * This shouldn't happen, but handle it anyway. Our delay_nsec | ||
| 1657 | * should only ever be growing except here where we subtract out | ||
| 1658 | * min(last_delay, 1 second), but lord knows bugs happen and I'd | ||
| 1659 | * rather not end up with negative numbers. | ||
| 1660 | */ | ||
| 1661 | if (unlikely(cur < sub)) { | ||
| 1662 | atomic64_set(&blkg->delay_nsec, 0); | ||
| 1663 | blkg->last_delay = 0; | ||
| 1664 | } else { | ||
| 1665 | atomic64_sub(sub, &blkg->delay_nsec); | ||
| 1666 | blkg->last_delay = cur - sub; | ||
| 1667 | } | ||
| 1668 | blkg->last_use = cur_use; | ||
| 1669 | } | ||
| 1670 | } | ||
| 1671 | |||
| 1672 | /* | ||
| 1673 | * This is called when we want to actually walk up the hierarchy and check to | ||
| 1674 | * see if we need to throttle, and then actually throttle if there is some | ||
| 1675 | * accumulated delay. This should only be called upon return to user space so | ||
| 1676 | * we're not holding some lock that would induce a priority inversion. | ||
| 1677 | */ | ||
| 1678 | static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) | ||
| 1679 | { | ||
| 1680 | u64 now = ktime_to_ns(ktime_get()); | ||
| 1681 | u64 exp; | ||
| 1682 | u64 delay_nsec = 0; | ||
| 1683 | int tok; | ||
| 1684 | |||
| 1685 | while (blkg->parent) { | ||
| 1686 | if (atomic_read(&blkg->use_delay)) { | ||
| 1687 | blkcg_scale_delay(blkg, now); | ||
| 1688 | delay_nsec = max_t(u64, delay_nsec, | ||
| 1689 | atomic64_read(&blkg->delay_nsec)); | ||
| 1690 | } | ||
| 1691 | blkg = blkg->parent; | ||
| 1692 | } | ||
| 1693 | |||
| 1694 | if (!delay_nsec) | ||
| 1695 | return; | ||
| 1696 | |||
| 1697 | /* | ||
| 1698 | * Let's not sleep for all eternity if we've amassed a huge delay. | ||
| 1699 | * Swapping or metadata IO can accumulate 10's of seconds worth of | ||
| 1700 | * delay, and we want userspace to be able to do _something_ so cap the | ||
| 1701 | * delays at 1 second. If there's 10's of seconds worth of delay then | ||
| 1702 | * the tasks will be delayed for 1 second for every syscall. | ||
| 1703 | */ | ||
| 1704 | delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); | ||
| 1705 | |||
| 1706 | /* | ||
| 1707 | * TODO: the use_memdelay flag is going to be for the upcoming psi stuff | ||
| 1708 | * that hasn't landed upstream yet. Once that stuff is in place we need | ||
| 1709 | * to do a psi_memstall_enter/leave if memdelay is set. | ||
| 1710 | */ | ||
| 1711 | |||
| 1712 | exp = ktime_add_ns(now, delay_nsec); | ||
| 1713 | tok = io_schedule_prepare(); | ||
| 1714 | do { | ||
| 1715 | __set_current_state(TASK_KILLABLE); | ||
| 1716 | if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) | ||
| 1717 | break; | ||
| 1718 | } while (!fatal_signal_pending(current)); | ||
| 1719 | io_schedule_finish(tok); | ||
| 1720 | } | ||
| 1721 | |||
| 1722 | /** | ||
| 1723 | * blkcg_maybe_throttle_current - throttle the current task if it has been marked | ||
| 1724 | * | ||
| 1725 | * This is only called if we've been marked with set_notify_resume(). Obviously | ||
| 1726 | * we can be set_notify_resume() for reasons other than blkcg throttling, so we | ||
| 1727 | * check to see if current->throttle_queue is set and if not this doesn't do | ||
| 1728 | * anything. This should only ever be called by the resume code, it's not meant | ||
| 1729 | * to be called by people willy-nilly as it will actually do the work to | ||
| 1730 | * throttle the task if it is setup for throttling. | ||
| 1731 | */ | ||
| 1732 | void blkcg_maybe_throttle_current(void) | ||
| 1733 | { | ||
| 1734 | struct request_queue *q = current->throttle_queue; | ||
| 1735 | struct cgroup_subsys_state *css; | ||
| 1736 | struct blkcg *blkcg; | ||
| 1737 | struct blkcg_gq *blkg; | ||
| 1738 | bool use_memdelay = current->use_memdelay; | ||
| 1739 | |||
| 1740 | if (!q) | ||
| 1741 | return; | ||
| 1742 | |||
| 1743 | current->throttle_queue = NULL; | ||
| 1744 | current->use_memdelay = false; | ||
| 1745 | |||
| 1746 | rcu_read_lock(); | ||
| 1747 | css = kthread_blkcg(); | ||
| 1748 | if (css) | ||
| 1749 | blkcg = css_to_blkcg(css); | ||
| 1750 | else | ||
| 1751 | blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); | ||
| 1752 | |||
| 1753 | if (!blkcg) | ||
| 1754 | goto out; | ||
| 1755 | blkg = blkg_lookup(blkcg, q); | ||
| 1756 | if (!blkg) | ||
| 1757 | goto out; | ||
| 1758 | blkg = blkg_try_get(blkg); | ||
| 1759 | if (!blkg) | ||
| 1760 | goto out; | ||
| 1761 | rcu_read_unlock(); | ||
| 1762 | |||
| 1763 | blkcg_maybe_throttle_blkg(blkg, use_memdelay); | ||
| 1764 | blkg_put(blkg); | ||
| 1765 | blk_put_queue(q); | ||
| 1766 | return; | ||
| 1767 | out: | ||
| 1768 | rcu_read_unlock(); | ||
| 1769 | blk_put_queue(q); | ||
| 1770 | } | ||
| 1771 | EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current); | ||
| 1772 | |||
| 1773 | /** | ||
| 1774 | * blkcg_schedule_throttle - this task needs to check for throttling | ||
| 1775 | * @q - the request queue IO was submitted on | ||
| 1776 | * @use_memdelay - do we charge this to memory delay for PSI | ||
| 1777 | * | ||
| 1778 | * This is called by the IO controller when we know there's delay accumulated | ||
| 1779 | * for the blkg for this task. We do not pass the blkg because there are places | ||
| 1780 | * we call this that may not have that information, the swapping code for | ||
| 1781 | * instance will only have a request_queue at that point. This set's the | ||
| 1782 | * notify_resume for the task to check and see if it requires throttling before | ||
| 1783 | * returning to user space. | ||
| 1784 | * | ||
| 1785 | * We will only schedule once per syscall. You can call this over and over | ||
| 1786 | * again and it will only do the check once upon return to user space, and only | ||
| 1787 | * throttle once. If the task needs to be throttled again it'll need to be | ||
| 1788 | * re-set at the next time we see the task. | ||
| 1789 | */ | ||
| 1790 | void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) | ||
| 1791 | { | ||
| 1792 | if (unlikely(current->flags & PF_KTHREAD)) | ||
| 1793 | return; | ||
| 1794 | |||
| 1795 | if (!blk_get_queue(q)) | ||
| 1796 | return; | ||
| 1797 | |||
| 1798 | if (current->throttle_queue) | ||
| 1799 | blk_put_queue(current->throttle_queue); | ||
| 1800 | current->throttle_queue = q; | ||
| 1801 | if (use_memdelay) | ||
| 1802 | current->use_memdelay = use_memdelay; | ||
| 1803 | set_notify_resume(current); | ||
| 1804 | } | ||
| 1805 | EXPORT_SYMBOL_GPL(blkcg_schedule_throttle); | ||
| 1806 | |||
| 1807 | /** | ||
| 1808 | * blkcg_add_delay - add delay to this blkg | ||
| 1809 | * @now - the current time in nanoseconds | ||
| 1810 | * @delta - how many nanoseconds of delay to add | ||
| 1811 | * | ||
| 1812 | * Charge @delta to the blkg's current delay accumulation. This is used to | ||
| 1813 | * throttle tasks if an IO controller thinks we need more throttling. | ||
| 1814 | */ | ||
| 1815 | void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) | ||
| 1816 | { | ||
| 1817 | blkcg_scale_delay(blkg, now); | ||
| 1818 | atomic64_add(delta, &blkg->delay_nsec); | ||
| 1819 | } | ||
| 1820 | EXPORT_SYMBOL_GPL(blkcg_add_delay); | ||
| 1821 | |||
| 1822 | module_param(blkcg_debug_stats, bool, 0644); | ||
| 1823 | MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); | ||
diff --git a/block/blk-core.c b/block/blk-core.c index ee33590f54eb..12550340418d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
| @@ -42,7 +42,7 @@ | |||
| 42 | #include "blk.h" | 42 | #include "blk.h" |
| 43 | #include "blk-mq.h" | 43 | #include "blk-mq.h" |
| 44 | #include "blk-mq-sched.h" | 44 | #include "blk-mq-sched.h" |
| 45 | #include "blk-wbt.h" | 45 | #include "blk-rq-qos.h" |
| 46 | 46 | ||
| 47 | #ifdef CONFIG_DEBUG_FS | 47 | #ifdef CONFIG_DEBUG_FS |
| 48 | struct dentry *blk_debugfs_root; | 48 | struct dentry *blk_debugfs_root; |
| @@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q) | |||
| 715 | } | 715 | } |
| 716 | EXPORT_SYMBOL_GPL(blk_set_queue_dying); | 716 | EXPORT_SYMBOL_GPL(blk_set_queue_dying); |
| 717 | 717 | ||
| 718 | /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ | ||
| 719 | void blk_exit_queue(struct request_queue *q) | ||
| 720 | { | ||
| 721 | /* | ||
| 722 | * Since the I/O scheduler exit code may access cgroup information, | ||
| 723 | * perform I/O scheduler exit before disassociating from the block | ||
| 724 | * cgroup controller. | ||
| 725 | */ | ||
| 726 | if (q->elevator) { | ||
| 727 | ioc_clear_queue(q); | ||
| 728 | elevator_exit(q, q->elevator); | ||
| 729 | q->elevator = NULL; | ||
| 730 | } | ||
| 731 | |||
| 732 | /* | ||
| 733 | * Remove all references to @q from the block cgroup controller before | ||
| 734 | * restoring @q->queue_lock to avoid that restoring this pointer causes | ||
| 735 | * e.g. blkcg_print_blkgs() to crash. | ||
| 736 | */ | ||
| 737 | blkcg_exit_queue(q); | ||
| 738 | |||
| 739 | /* | ||
| 740 | * Since the cgroup code may dereference the @q->backing_dev_info | ||
| 741 | * pointer, only decrease its reference count after having removed the | ||
| 742 | * association with the block cgroup controller. | ||
| 743 | */ | ||
| 744 | bdi_put(q->backing_dev_info); | ||
| 745 | } | ||
| 746 | |||
| 718 | /** | 747 | /** |
| 719 | * blk_cleanup_queue - shutdown a request queue | 748 | * blk_cleanup_queue - shutdown a request queue |
| 720 | * @q: request queue to shutdown | 749 | * @q: request queue to shutdown |
| @@ -762,9 +791,13 @@ void blk_cleanup_queue(struct request_queue *q) | |||
| 762 | * make sure all in-progress dispatch are completed because | 791 | * make sure all in-progress dispatch are completed because |
| 763 | * blk_freeze_queue() can only complete all requests, and | 792 | * blk_freeze_queue() can only complete all requests, and |
| 764 | * dispatch may still be in-progress since we dispatch requests | 793 | * dispatch may still be in-progress since we dispatch requests |
| 765 | * from more than one contexts | 794 | * from more than one contexts. |
| 795 | * | ||
| 796 | * No need to quiesce queue if it isn't initialized yet since | ||
| 797 | * blk_freeze_queue() should be enough for cases of passthrough | ||
| 798 | * request. | ||
| 766 | */ | 799 | */ |
| 767 | if (q->mq_ops) | 800 | if (q->mq_ops && blk_queue_init_done(q)) |
| 768 | blk_mq_quiesce_queue(q); | 801 | blk_mq_quiesce_queue(q); |
| 769 | 802 | ||
| 770 | /* for synchronous bio-based driver finish in-flight integrity i/o */ | 803 | /* for synchronous bio-based driver finish in-flight integrity i/o */ |
| @@ -780,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q) | |||
| 780 | */ | 813 | */ |
| 781 | WARN_ON_ONCE(q->kobj.state_in_sysfs); | 814 | WARN_ON_ONCE(q->kobj.state_in_sysfs); |
| 782 | 815 | ||
| 783 | /* | 816 | blk_exit_queue(q); |
| 784 | * Since the I/O scheduler exit code may access cgroup information, | ||
| 785 | * perform I/O scheduler exit before disassociating from the block | ||
| 786 | * cgroup controller. | ||
| 787 | */ | ||
| 788 | if (q->elevator) { | ||
| 789 | ioc_clear_queue(q); | ||
| 790 | elevator_exit(q, q->elevator); | ||
| 791 | q->elevator = NULL; | ||
| 792 | } | ||
| 793 | |||
| 794 | /* | ||
| 795 | * Remove all references to @q from the block cgroup controller before | ||
| 796 | * restoring @q->queue_lock to avoid that restoring this pointer causes | ||
| 797 | * e.g. blkcg_print_blkgs() to crash. | ||
| 798 | */ | ||
| 799 | blkcg_exit_queue(q); | ||
| 800 | |||
| 801 | /* | ||
| 802 | * Since the cgroup code may dereference the @q->backing_dev_info | ||
| 803 | * pointer, only decrease its reference count after having removed the | ||
| 804 | * association with the block cgroup controller. | ||
| 805 | */ | ||
| 806 | bdi_put(q->backing_dev_info); | ||
| 807 | 817 | ||
| 808 | if (q->mq_ops) | 818 | if (q->mq_ops) |
| 809 | blk_mq_free_queue(q); | 819 | blk_mq_free_queue(q); |
| @@ -1180,6 +1190,7 @@ out_exit_flush_rq: | |||
| 1180 | q->exit_rq_fn(q, q->fq->flush_rq); | 1190 | q->exit_rq_fn(q, q->fq->flush_rq); |
| 1181 | out_free_flush_queue: | 1191 | out_free_flush_queue: |
| 1182 | blk_free_flush_queue(q->fq); | 1192 | blk_free_flush_queue(q->fq); |
| 1193 | q->fq = NULL; | ||
| 1183 | return -ENOMEM; | 1194 | return -ENOMEM; |
| 1184 | } | 1195 | } |
| 1185 | EXPORT_SYMBOL(blk_init_allocated_queue); | 1196 | EXPORT_SYMBOL(blk_init_allocated_queue); |
| @@ -1641,7 +1652,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) | |||
| 1641 | blk_delete_timer(rq); | 1652 | blk_delete_timer(rq); |
| 1642 | blk_clear_rq_complete(rq); | 1653 | blk_clear_rq_complete(rq); |
| 1643 | trace_block_rq_requeue(q, rq); | 1654 | trace_block_rq_requeue(q, rq); |
| 1644 | wbt_requeue(q->rq_wb, rq); | 1655 | rq_qos_requeue(q, rq); |
| 1645 | 1656 | ||
| 1646 | if (rq->rq_flags & RQF_QUEUED) | 1657 | if (rq->rq_flags & RQF_QUEUED) |
| 1647 | blk_queue_end_tag(q, rq); | 1658 | blk_queue_end_tag(q, rq); |
| @@ -1748,7 +1759,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) | |||
| 1748 | /* this is a bio leak */ | 1759 | /* this is a bio leak */ |
| 1749 | WARN_ON(req->bio != NULL); | 1760 | WARN_ON(req->bio != NULL); |
| 1750 | 1761 | ||
| 1751 | wbt_done(q->rq_wb, req); | 1762 | rq_qos_done(q, req); |
| 1752 | 1763 | ||
| 1753 | /* | 1764 | /* |
| 1754 | * Request may not have originated from ll_rw_blk. if not, | 1765 | * Request may not have originated from ll_rw_blk. if not, |
| @@ -1982,7 +1993,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) | |||
| 1982 | int where = ELEVATOR_INSERT_SORT; | 1993 | int where = ELEVATOR_INSERT_SORT; |
| 1983 | struct request *req, *free; | 1994 | struct request *req, *free; |
| 1984 | unsigned int request_count = 0; | 1995 | unsigned int request_count = 0; |
| 1985 | unsigned int wb_acct; | ||
| 1986 | 1996 | ||
| 1987 | /* | 1997 | /* |
| 1988 | * low level driver can indicate that it wants pages above a | 1998 | * low level driver can indicate that it wants pages above a |
| @@ -2040,7 +2050,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) | |||
| 2040 | } | 2050 | } |
| 2041 | 2051 | ||
| 2042 | get_rq: | 2052 | get_rq: |
| 2043 | wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); | 2053 | rq_qos_throttle(q, bio, q->queue_lock); |
| 2044 | 2054 | ||
| 2045 | /* | 2055 | /* |
| 2046 | * Grab a free request. This is might sleep but can not fail. | 2056 | * Grab a free request. This is might sleep but can not fail. |
| @@ -2050,7 +2060,7 @@ get_rq: | |||
| 2050 | req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); | 2060 | req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); |
| 2051 | if (IS_ERR(req)) { | 2061 | if (IS_ERR(req)) { |
| 2052 | blk_queue_exit(q); | 2062 | blk_queue_exit(q); |
| 2053 | __wbt_done(q->rq_wb, wb_acct); | 2063 | rq_qos_cleanup(q, bio); |
| 2054 | if (PTR_ERR(req) == -ENOMEM) | 2064 | if (PTR_ERR(req) == -ENOMEM) |
| 2055 | bio->bi_status = BLK_STS_RESOURCE; | 2065 | bio->bi_status = BLK_STS_RESOURCE; |
| 2056 | else | 2066 | else |
| @@ -2059,7 +2069,7 @@ get_rq: | |||
| 2059 | goto out_unlock; | 2069 | goto out_unlock; |
| 2060 | } | 2070 | } |
| 2061 | 2071 | ||
| 2062 | wbt_track(req, wb_acct); | 2072 | rq_qos_track(q, req, bio); |
| 2063 | 2073 | ||
| 2064 | /* | 2074 | /* |
| 2065 | * After dropping the lock and possibly sleeping here, our request | 2075 | * After dropping the lock and possibly sleeping here, our request |
| @@ -2700,13 +2710,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes); | |||
| 2700 | void blk_account_io_completion(struct request *req, unsigned int bytes) | 2710 | void blk_account_io_completion(struct request *req, unsigned int bytes) |
| 2701 | { | 2711 | { |
| 2702 | if (blk_do_io_stat(req)) { | 2712 | if (blk_do_io_stat(req)) { |
| 2703 | const int rw = rq_data_dir(req); | 2713 | const int sgrp = op_stat_group(req_op(req)); |
| 2704 | struct hd_struct *part; | 2714 | struct hd_struct *part; |
| 2705 | int cpu; | 2715 | int cpu; |
| 2706 | 2716 | ||
| 2707 | cpu = part_stat_lock(); | 2717 | cpu = part_stat_lock(); |
| 2708 | part = req->part; | 2718 | part = req->part; |
| 2709 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); | 2719 | part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); |
| 2710 | part_stat_unlock(); | 2720 | part_stat_unlock(); |
| 2711 | } | 2721 | } |
| 2712 | } | 2722 | } |
| @@ -2720,7 +2730,7 @@ void blk_account_io_done(struct request *req, u64 now) | |||
| 2720 | */ | 2730 | */ |
| 2721 | if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { | 2731 | if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { |
| 2722 | unsigned long duration; | 2732 | unsigned long duration; |
| 2723 | const int rw = rq_data_dir(req); | 2733 | const int sgrp = op_stat_group(req_op(req)); |
| 2724 | struct hd_struct *part; | 2734 | struct hd_struct *part; |
| 2725 | int cpu; | 2735 | int cpu; |
| 2726 | 2736 | ||
| @@ -2728,10 +2738,10 @@ void blk_account_io_done(struct request *req, u64 now) | |||
| 2728 | cpu = part_stat_lock(); | 2738 | cpu = part_stat_lock(); |
| 2729 | part = req->part; | 2739 | part = req->part; |
| 2730 | 2740 | ||
| 2731 | part_stat_inc(cpu, part, ios[rw]); | 2741 | part_stat_inc(cpu, part, ios[sgrp]); |
| 2732 | part_stat_add(cpu, part, ticks[rw], duration); | 2742 | part_stat_add(cpu, part, ticks[sgrp], duration); |
| 2733 | part_round_stats(req->q, cpu, part); | 2743 | part_round_stats(req->q, cpu, part); |
| 2734 | part_dec_in_flight(req->q, part, rw); | 2744 | part_dec_in_flight(req->q, part, rq_data_dir(req)); |
| 2735 | 2745 | ||
| 2736 | hd_struct_put(part); | 2746 | hd_struct_put(part); |
| 2737 | part_stat_unlock(); | 2747 | part_stat_unlock(); |
| @@ -2751,9 +2761,9 @@ static bool blk_pm_allow_request(struct request *rq) | |||
| 2751 | return rq->rq_flags & RQF_PM; | 2761 | return rq->rq_flags & RQF_PM; |
| 2752 | case RPM_SUSPENDED: | 2762 | case RPM_SUSPENDED: |
| 2753 | return false; | 2763 | return false; |
| 2764 | default: | ||
| 2765 | return true; | ||
| 2754 | } | 2766 | } |
| 2755 | |||
| 2756 | return true; | ||
| 2757 | } | 2767 | } |
| 2758 | #else | 2768 | #else |
| 2759 | static bool blk_pm_allow_request(struct request *rq) | 2769 | static bool blk_pm_allow_request(struct request *rq) |
| @@ -2980,7 +2990,7 @@ void blk_start_request(struct request *req) | |||
| 2980 | req->throtl_size = blk_rq_sectors(req); | 2990 | req->throtl_size = blk_rq_sectors(req); |
| 2981 | #endif | 2991 | #endif |
| 2982 | req->rq_flags |= RQF_STATS; | 2992 | req->rq_flags |= RQF_STATS; |
| 2983 | wbt_issue(req->q->rq_wb, req); | 2993 | rq_qos_issue(req->q, req); |
| 2984 | } | 2994 | } |
| 2985 | 2995 | ||
| 2986 | BUG_ON(blk_rq_is_complete(req)); | 2996 | BUG_ON(blk_rq_is_complete(req)); |
| @@ -3053,6 +3063,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); | |||
| 3053 | * Passing the result of blk_rq_bytes() as @nr_bytes guarantees | 3063 | * Passing the result of blk_rq_bytes() as @nr_bytes guarantees |
| 3054 | * %false return from this function. | 3064 | * %false return from this function. |
| 3055 | * | 3065 | * |
| 3066 | * Note: | ||
| 3067 | * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both | ||
| 3068 | * blk_rq_bytes() and in blk_update_request(). | ||
| 3069 | * | ||
| 3056 | * Return: | 3070 | * Return: |
| 3057 | * %false - this request doesn't have any more data | 3071 | * %false - this request doesn't have any more data |
| 3058 | * %true - this request has more data | 3072 | * %true - this request has more data |
| @@ -3200,7 +3214,7 @@ void blk_finish_request(struct request *req, blk_status_t error) | |||
| 3200 | blk_account_io_done(req, now); | 3214 | blk_account_io_done(req, now); |
| 3201 | 3215 | ||
| 3202 | if (req->end_io) { | 3216 | if (req->end_io) { |
| 3203 | wbt_done(req->q->rq_wb, req); | 3217 | rq_qos_done(q, req); |
| 3204 | req->end_io(req, error); | 3218 | req->end_io(req, error); |
| 3205 | } else { | 3219 | } else { |
| 3206 | if (blk_bidi_rq(req)) | 3220 | if (blk_bidi_rq(req)) |
| @@ -3763,9 +3777,11 @@ EXPORT_SYMBOL(blk_finish_plug); | |||
| 3763 | */ | 3777 | */ |
| 3764 | void blk_pm_runtime_init(struct request_queue *q, struct device *dev) | 3778 | void blk_pm_runtime_init(struct request_queue *q, struct device *dev) |
| 3765 | { | 3779 | { |
| 3766 | /* not support for RQF_PM and ->rpm_status in blk-mq yet */ | 3780 | /* Don't enable runtime PM for blk-mq until it is ready */ |
| 3767 | if (q->mq_ops) | 3781 | if (q->mq_ops) { |
| 3782 | pm_runtime_disable(dev); | ||
| 3768 | return; | 3783 | return; |
| 3784 | } | ||
| 3769 | 3785 | ||
| 3770 | q->dev = dev; | 3786 | q->dev = dev; |
| 3771 | q->rpm_status = RPM_ACTIVE; | 3787 | q->rpm_status = RPM_ACTIVE; |
diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f23311e4b201..01580f88fcb3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c | |||
| @@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) | |||
| 278 | atomic_set(&ioc->nr_tasks, 1); | 278 | atomic_set(&ioc->nr_tasks, 1); |
| 279 | atomic_set(&ioc->active_ref, 1); | 279 | atomic_set(&ioc->active_ref, 1); |
| 280 | spin_lock_init(&ioc->lock); | 280 | spin_lock_init(&ioc->lock); |
| 281 | INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); | 281 | INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); |
| 282 | INIT_HLIST_HEAD(&ioc->icq_list); | 282 | INIT_HLIST_HEAD(&ioc->icq_list); |
| 283 | INIT_WORK(&ioc->release_work, ioc_release_fn); | 283 | INIT_WORK(&ioc->release_work, ioc_release_fn); |
| 284 | 284 | ||
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c new file mode 100644 index 000000000000..19923f8a029d --- /dev/null +++ b/block/blk-iolatency.c | |||
| @@ -0,0 +1,955 @@ | |||
| 1 | /* | ||
| 2 | * Block rq-qos base io controller | ||
| 3 | * | ||
| 4 | * This works similar to wbt with a few exceptions | ||
| 5 | * | ||
| 6 | * - It's bio based, so the latency covers the whole block layer in addition to | ||
| 7 | * the actual io. | ||
| 8 | * - We will throttle all IO that comes in here if we need to. | ||
| 9 | * - We use the mean latency over the 100ms window. This is because writes can | ||
| 10 | * be particularly fast, which could give us a false sense of the impact of | ||
| 11 | * other workloads on our protected workload. | ||
| 12 | * - By default there's no throttling, we set the queue_depth to UINT_MAX so | ||
| 13 | * that we can have as many outstanding bio's as we're allowed to. Only at | ||
| 14 | * throttle time do we pay attention to the actual queue depth. | ||
| 15 | * | ||
| 16 | * The hierarchy works like the cpu controller does, we track the latency at | ||
| 17 | * every configured node, and each configured node has it's own independent | ||
| 18 | * queue depth. This means that we only care about our latency targets at the | ||
| 19 | * peer level. Some group at the bottom of the hierarchy isn't going to affect | ||
| 20 | * a group at the end of some other path if we're only configred at leaf level. | ||
| 21 | * | ||
| 22 | * Consider the following | ||
| 23 | * | ||
| 24 | * root blkg | ||
| 25 | * / \ | ||
| 26 | * fast (target=5ms) slow (target=10ms) | ||
| 27 | * / \ / \ | ||
| 28 | * a b normal(15ms) unloved | ||
| 29 | * | ||
| 30 | * "a" and "b" have no target, but their combined io under "fast" cannot exceed | ||
| 31 | * an average latency of 5ms. If it does then we will throttle the "slow" | ||
| 32 | * group. In the case of "normal", if it exceeds its 15ms target, we will | ||
| 33 | * throttle "unloved", but nobody else. | ||
| 34 | * | ||
| 35 | * In this example "fast", "slow", and "normal" will be the only groups actually | ||
| 36 | * accounting their io latencies. We have to walk up the heirarchy to the root | ||
| 37 | * on every submit and complete so we can do the appropriate stat recording and | ||
| 38 | * adjust the queue depth of ourselves if needed. | ||
| 39 | * | ||
| 40 | * There are 2 ways we throttle IO. | ||
| 41 | * | ||
| 42 | * 1) Queue depth throttling. As we throttle down we will adjust the maximum | ||
| 43 | * number of IO's we're allowed to have in flight. This starts at (u64)-1 down | ||
| 44 | * to 1. If the group is only ever submitting IO for itself then this is the | ||
| 45 | * only way we throttle. | ||
| 46 | * | ||
| 47 | * 2) Induced delay throttling. This is for the case that a group is generating | ||
| 48 | * IO that has to be issued by the root cg to avoid priority inversion. So think | ||
| 49 | * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot | ||
| 50 | * of work done for us on behalf of the root cg and are being asked to scale | ||
| 51 | * down more then we induce a latency at userspace return. We accumulate the | ||
| 52 | * total amount of time we need to be punished by doing | ||
| 53 | * | ||
| 54 | * total_time += min_lat_nsec - actual_io_completion | ||
| 55 | * | ||
| 56 | * and then at throttle time will do | ||
| 57 | * | ||
| 58 | * throttle_time = min(total_time, NSEC_PER_SEC) | ||
| 59 | * | ||
| 60 | * This induced delay will throttle back the activity that is generating the | ||
| 61 | * root cg issued io's, wethere that's some metadata intensive operation or the | ||
| 62 | * group is using so much memory that it is pushing us into swap. | ||
| 63 | * | ||
| 64 | * Copyright (C) 2018 Josef Bacik | ||
| 65 | */ | ||
| 66 | #include <linux/kernel.h> | ||
| 67 | #include <linux/blk_types.h> | ||
| 68 | #include <linux/backing-dev.h> | ||
| 69 | #include <linux/module.h> | ||
| 70 | #include <linux/timer.h> | ||
| 71 | #include <linux/memcontrol.h> | ||
| 72 | #include <linux/sched/loadavg.h> | ||
| 73 | #include <linux/sched/signal.h> | ||
| 74 | #include <trace/events/block.h> | ||
| 75 | #include "blk-rq-qos.h" | ||
| 76 | #include "blk-stat.h" | ||
| 77 | |||
| 78 | #define DEFAULT_SCALE_COOKIE 1000000U | ||
| 79 | |||
| 80 | static struct blkcg_policy blkcg_policy_iolatency; | ||
| 81 | struct iolatency_grp; | ||
| 82 | |||
| 83 | struct blk_iolatency { | ||
| 84 | struct rq_qos rqos; | ||
| 85 | struct timer_list timer; | ||
| 86 | atomic_t enabled; | ||
| 87 | }; | ||
| 88 | |||
| 89 | static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) | ||
| 90 | { | ||
| 91 | return container_of(rqos, struct blk_iolatency, rqos); | ||
| 92 | } | ||
| 93 | |||
| 94 | static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat) | ||
| 95 | { | ||
| 96 | return atomic_read(&blkiolat->enabled) > 0; | ||
| 97 | } | ||
| 98 | |||
| 99 | struct child_latency_info { | ||
| 100 | spinlock_t lock; | ||
| 101 | |||
| 102 | /* Last time we adjusted the scale of everybody. */ | ||
| 103 | u64 last_scale_event; | ||
| 104 | |||
| 105 | /* The latency that we missed. */ | ||
| 106 | u64 scale_lat; | ||
| 107 | |||
| 108 | /* Total io's from all of our children for the last summation. */ | ||
| 109 | u64 nr_samples; | ||
| 110 | |||
| 111 | /* The guy who actually changed the latency numbers. */ | ||
| 112 | struct iolatency_grp *scale_grp; | ||
| 113 | |||
| 114 | /* Cookie to tell if we need to scale up or down. */ | ||
| 115 | atomic_t scale_cookie; | ||
| 116 | }; | ||
| 117 | |||
| 118 | struct iolatency_grp { | ||
| 119 | struct blkg_policy_data pd; | ||
| 120 | struct blk_rq_stat __percpu *stats; | ||
| 121 | struct blk_iolatency *blkiolat; | ||
| 122 | struct rq_depth rq_depth; | ||
| 123 | struct rq_wait rq_wait; | ||
| 124 | atomic64_t window_start; | ||
| 125 | atomic_t scale_cookie; | ||
| 126 | u64 min_lat_nsec; | ||
| 127 | u64 cur_win_nsec; | ||
| 128 | |||
| 129 | /* total running average of our io latency. */ | ||
| 130 | u64 lat_avg; | ||
| 131 | |||
| 132 | /* Our current number of IO's for the last summation. */ | ||
| 133 | u64 nr_samples; | ||
| 134 | |||
| 135 | struct child_latency_info child_lat; | ||
| 136 | }; | ||
| 137 | |||
| 138 | #define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC) | ||
| 139 | #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC | ||
| 140 | /* | ||
| 141 | * These are the constants used to fake the fixed-point moving average | ||
| 142 | * calculation just like load average. The call to CALC_LOAD folds | ||
| 143 | * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling | ||
| 144 | * window size is bucketed to try to approximately calculate average | ||
| 145 | * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows | ||
| 146 | * elapse immediately. Note, windows only elapse with IO activity. Idle | ||
| 147 | * periods extend the most recent window. | ||
| 148 | */ | ||
| 149 | #define BLKIOLATENCY_NR_EXP_FACTORS 5 | ||
| 150 | #define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \ | ||
| 151 | (BLKIOLATENCY_NR_EXP_FACTORS - 1)) | ||
| 152 | static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = { | ||
| 153 | 2045, // exp(1/600) - 600 samples | ||
| 154 | 2039, // exp(1/240) - 240 samples | ||
| 155 | 2031, // exp(1/120) - 120 samples | ||
| 156 | 2023, // exp(1/80) - 80 samples | ||
| 157 | 2014, // exp(1/60) - 60 samples | ||
| 158 | }; | ||
| 159 | |||
| 160 | static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) | ||
| 161 | { | ||
| 162 | return pd ? container_of(pd, struct iolatency_grp, pd) : NULL; | ||
| 163 | } | ||
| 164 | |||
| 165 | static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg) | ||
| 166 | { | ||
| 167 | return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency)); | ||
| 168 | } | ||
| 169 | |||
| 170 | static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) | ||
| 171 | { | ||
| 172 | return pd_to_blkg(&iolat->pd); | ||
| 173 | } | ||
| 174 | |||
/*
 * Try to claim an inflight slot under the group's max_depth.
 * @first_block: true while this bio has not yet blocked on the waitqueue;
 * in that case it must not jump ahead of waiters already queued
 * (presumably to keep wakeup ordering fair — see the head check below).
 */
static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
				       wait_queue_entry_t *wait,
				       bool first_block)
{
	struct rq_wait *rqw = &iolat->rq_wait;

	/* Others are waiting and our entry isn't at the head: don't queue-jump. */
	if (first_block && waitqueue_active(&rqw->wait) &&
	    rqw->wait.head.next != &wait->entry)
		return false;
	return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
}
| 186 | |||
/*
 * Throttle the current task against @iolat's queue depth, sleeping
 * uninterruptibly (and dropping @lock, if non-NULL, across each sleep)
 * until an inflight slot is available.
 */
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
				       struct iolatency_grp *iolat,
				       spinlock_t *lock, bool issue_as_root,
				       bool use_memdelay)
	__releases(lock)
	__acquires(lock)
{
	struct rq_wait *rqw = &iolat->rq_wait;
	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
	DEFINE_WAIT(wait);
	bool first_block = true;

	/* The blkg carries accumulated delay; schedule this task for throttling. */
	if (use_delay)
		blkcg_schedule_throttle(rqos->q, use_memdelay);

	/*
	 * To avoid priority inversions we want to just take a slot if we are
	 * issuing as root.  If we're being killed off there's no point in
	 * delaying things, we may have been killed by OOM so throttling may
	 * make recovery take even longer, so just let the IO's through so the
	 * task can go away.
	 */
	if (issue_as_root || fatal_signal_pending(current)) {
		atomic_inc(&rqw->inflight);
		return;
	}

	/* Fast path: got a slot without ever touching the waitqueue. */
	if (iolatency_may_queue(iolat, &wait, first_block))
		return;

	do {
		prepare_to_wait_exclusive(&rqw->wait, &wait,
					  TASK_UNINTERRUPTIBLE);

		if (iolatency_may_queue(iolat, &wait, first_block))
			break;
		first_block = false;

		/* Release the caller's queue lock while we sleep, if we hold one. */
		if (lock) {
			spin_unlock_irq(lock);
			io_schedule();
			spin_lock_irq(lock);
		} else {
			io_schedule();
		}
	} while (1);

	finish_wait(&rqw->wait, &wait);
}
| 236 | |||
| 237 | #define SCALE_DOWN_FACTOR 2 | ||
| 238 | #define SCALE_UP_FACTOR 4 | ||
| 239 | |||
| 240 | static inline unsigned long scale_amount(unsigned long qd, bool up) | ||
| 241 | { | ||
| 242 | return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL); | ||
| 243 | } | ||
| 244 | |||
/*
 * We scale the qd down faster than we scale up, so we need to use this helper
 * to adjust the scale_cookie accordingly so we don't prematurely get
 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 *
 * Each group has their own local copy of the last scale cookie they saw, so if
 * the global scale cookie goes up or down they know which way they need to go
 * based on their last knowledge of it.
 */
static void scale_cookie_change(struct blk_iolatency *blkiolat,
				struct child_latency_info *lat_info,
				bool up)
{
	unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = atomic_read(&lat_info->scale_cookie);
	unsigned long max_scale = qd << 1;
	unsigned long diff = 0;

	/* diff: how far below the default (unthrottled) cookie we currently sit. */
	if (old < DEFAULT_SCALE_COOKIE)
		diff = DEFAULT_SCALE_COOKIE - old;

	if (up) {
		/* Never overshoot the default cookie. */
		if (scale + old > DEFAULT_SCALE_COOKIE)
			atomic_set(&lat_info->scale_cookie,
				   DEFAULT_SCALE_COOKIE);
		else if (diff > qd)
			/* Deep in the hole: climb out one step at a time. */
			atomic_inc(&lat_info->scale_cookie);
		else
			atomic_add(scale, &lat_info->scale_cookie);
	} else {
		/*
		 * We don't want to dig a hole so deep that it takes us hours to
		 * dig out of it.  Just enough that we don't throttle/unthrottle
		 * with jagged workloads but can still unthrottle once pressure
		 * has sufficiently dissipated.
		 */
		if (diff > qd) {
			if (diff < max_scale)
				atomic_dec(&lat_info->scale_cookie);
		} else {
			atomic_sub(scale, &lat_info->scale_cookie);
		}
	}
}
| 290 | |||
| 291 | /* | ||
| 292 | * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the | ||
| 293 | * queue depth at a time so we don't get wild swings and hopefully dial in to | ||
| 294 | * fairer distribution of the overall queue depth. | ||
| 295 | */ | ||
| 296 | static void scale_change(struct iolatency_grp *iolat, bool up) | ||
| 297 | { | ||
| 298 | unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q); | ||
| 299 | unsigned long scale = scale_amount(qd, up); | ||
| 300 | unsigned long old = iolat->rq_depth.max_depth; | ||
| 301 | bool changed = false; | ||
| 302 | |||
| 303 | if (old > qd) | ||
| 304 | old = qd; | ||
| 305 | |||
| 306 | if (up) { | ||
| 307 | if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat))) | ||
| 308 | return; | ||
| 309 | |||
| 310 | if (old < qd) { | ||
| 311 | changed = true; | ||
| 312 | old += scale; | ||
| 313 | old = min(old, qd); | ||
| 314 | iolat->rq_depth.max_depth = old; | ||
| 315 | wake_up_all(&iolat->rq_wait.wait); | ||
| 316 | } | ||
| 317 | } else if (old > 1) { | ||
| 318 | old >>= 1; | ||
| 319 | changed = true; | ||
| 320 | iolat->rq_depth.max_depth = max(old, 1UL); | ||
| 321 | } | ||
| 322 | } | ||
| 323 | |||
/* Check our parent and see if the scale cookie has changed. */
static void check_scale_change(struct iolatency_grp *iolat)
{
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	unsigned int cur_cookie;
	unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
	u64 scale_lat;
	unsigned int old;
	int direction = 0;

	/* The root group has no parent to compare cookies with. */
	if (lat_to_blkg(iolat)->parent == NULL)
		return;

	parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;
	cur_cookie = atomic_read(&lat_info->scale_cookie);
	scale_lat = READ_ONCE(lat_info->scale_lat);

	/* Lower cookie => throttle harder; higher => loosen up. */
	if (cur_cookie < our_cookie)
		direction = -1;
	else if (cur_cookie > our_cookie)
		direction = 1;
	else
		return;

	old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);

	/* Somebody beat us to the punch, just bail. */
	if (old != our_cookie)
		return;

	if (direction < 0 && iolat->min_lat_nsec) {
		u64 samples_thresh;

		/* Skip scaling down when our target is at or below scale_lat. */
		if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
			return;

		/*
		 * Sometimes high priority groups are their own worst enemy, so
		 * instead of taking it out on some poor other group that did 5%
		 * or less of the IO's for the last summation just skip this
		 * scale down event.
		 */
		samples_thresh = lat_info->nr_samples * 5;
		samples_thresh = div64_u64(samples_thresh, 100);
		if (iolat->nr_samples <= samples_thresh)
			return;
	}

	/* We're as low as we can go. */
	if (iolat->rq_depth.max_depth == 1 && direction < 0) {
		blkcg_use_delay(lat_to_blkg(iolat));
		return;
	}

	/* We're back to the default cookie, unthrottle all the things. */
	if (cur_cookie == DEFAULT_SCALE_COOKIE) {
		blkcg_clear_delay(lat_to_blkg(iolat));
		iolat->rq_depth.max_depth = UINT_MAX;
		wake_up_all(&iolat->rq_wait.wait);
		return;
	}

	scale_change(iolat, direction > 0);
}
| 393 | |||
/*
 * rq_qos ->throttle hook: associate @bio with its blkg, stamp its issue
 * time, then charge it against every ancestor group up to (excluding)
 * the root, throttling at each level as needed.
 */
static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
				     spinlock_t *lock)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	struct request_queue *q = rqos->q;
	bool issue_as_root = bio_issue_as_root_blkg(bio);

	/* Fast exit while no group has a latency target configured. */
	if (!blk_iolatency_enabled(blkiolat))
		return;

	rcu_read_lock();
	blkcg = bio_blkcg(bio);
	bio_associate_blkcg(bio, &blkcg->css);
	blkg = blkg_lookup(blkcg, q);
	if (unlikely(!blkg)) {
		/* Take the queue lock for create only if the caller didn't pass it. */
		if (!lock)
			spin_lock_irq(q->queue_lock);
		blkg = blkg_lookup_create(blkcg, q);
		if (IS_ERR(blkg))
			blkg = NULL;
		if (!lock)
			spin_unlock_irq(q->queue_lock);
	}
	if (!blkg)
		goto out;

	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
	bio_associate_blkg(bio, blkg);
out:
	rcu_read_unlock();
	/* Walk from the bio's group up to, but excluding, the root blkg. */
	while (blkg && blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}

		check_scale_change(iolat);
		__blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
				     (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
		blkg = blkg->parent;
	}
	/* Arm the periodic scaling timer if it isn't already pending. */
	if (!timer_pending(&blkiolat->timer))
		mod_timer(&blkiolat->timer, jiffies + HZ);
}
| 441 | |||
| 442 | static void iolatency_record_time(struct iolatency_grp *iolat, | ||
| 443 | struct bio_issue *issue, u64 now, | ||
| 444 | bool issue_as_root) | ||
| 445 | { | ||
| 446 | struct blk_rq_stat *rq_stat; | ||
| 447 | u64 start = bio_issue_time(issue); | ||
| 448 | u64 req_time; | ||
| 449 | |||
| 450 | /* | ||
| 451 | * Have to do this so we are truncated to the correct time that our | ||
| 452 | * issue is truncated to. | ||
| 453 | */ | ||
| 454 | now = __bio_issue_time(now); | ||
| 455 | |||
| 456 | if (now <= start) | ||
| 457 | return; | ||
| 458 | |||
| 459 | req_time = now - start; | ||
| 460 | |||
| 461 | /* | ||
| 462 | * We don't want to count issue_as_root bio's in the cgroups latency | ||
| 463 | * statistics as it could skew the numbers downwards. | ||
| 464 | */ | ||
| 465 | if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { | ||
| 466 | u64 sub = iolat->min_lat_nsec; | ||
| 467 | if (req_time < sub) | ||
| 468 | blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); | ||
| 469 | return; | ||
| 470 | } | ||
| 471 | |||
| 472 | rq_stat = get_cpu_ptr(iolat->stats); | ||
| 473 | blk_rq_stat_add(rq_stat, req_time); | ||
| 474 | put_cpu_ptr(rq_stat); | ||
| 475 | } | ||
| 476 | |||
/* Minimum spacing between scale events, and samples needed to trust a window. */
#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5

/*
 * Fold this group's per-cpu latency stats into its moving average and,
 * comparing the window's mean against min_lat_nsec, drive the parent's
 * scale cookie up (we're meeting our target) or down (we're missing it).
 */
static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
{
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	struct blk_rq_stat stat;
	unsigned long flags;
	int cpu, exp_idx;

	/* Drain every online cpu's bucket into one summary and reset them. */
	blk_rq_stat_init(&stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct blk_rq_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		blk_rq_stat_sum(&stat, s);
		blk_rq_stat_init(s);
	}
	preempt_enable();

	parent = blkg_to_lat(blkg->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;

	/*
	 * CALC_LOAD takes in a number stored in fixed point representation.
	 * Because we are using this for IO time in ns, the values stored
	 * are significantly larger than the FIXED_1 denominator (2048).
	 * Therefore, rounding errors in the calculation are negligible and
	 * can be ignored.
	 */
	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
			div64_u64(iolat->cur_win_nsec,
				  BLKIOLATENCY_EXP_BUCKET_SIZE));
	CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);

	/* Everything is ok and we don't need to adjust the scale. */
	if (stat.mean <= iolat->min_lat_nsec &&
	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
		return;

	/* Somebody beat us to the punch, just bail. */
	spin_lock_irqsave(&lat_info->lock, flags);
	/* Replace our old contribution to the parent's sample count. */
	lat_info->nr_samples -= iolat->nr_samples;
	lat_info->nr_samples += stat.nr_samples;
	iolat->nr_samples = stat.nr_samples;

	/* Rate-limit scale events, unless a stricter target is asking. */
	if ((lat_info->last_scale_event >= now ||
	    now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
	    lat_info->scale_lat <= iolat->min_lat_nsec)
		goto out;

	if (stat.mean <= iolat->min_lat_nsec &&
	    stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
		/* Only the group that triggered the scale down may scale back up. */
		if (lat_info->scale_grp == iolat) {
			lat_info->last_scale_event = now;
			scale_cookie_change(iolat->blkiolat, lat_info, true);
		}
	} else if (stat.mean > iolat->min_lat_nsec) {
		lat_info->last_scale_event = now;
		/* Record the strictest (smallest) violated target and its group. */
		if (!lat_info->scale_grp ||
		    lat_info->scale_lat > iolat->min_lat_nsec) {
			WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
			lat_info->scale_grp = iolat;
		}
		scale_cookie_change(iolat->blkiolat, lat_info, false);
	}
out:
	spin_unlock_irqrestore(&lat_info->lock, flags);
}
| 551 | |||
/*
 * rq_qos ->done_bio hook: for each ancestor group release the inflight
 * slot, record the completion latency, and — if the sampling window has
 * elapsed — evaluate the window and start a new one.
 */
static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct rq_wait *rqw;
	struct iolatency_grp *iolat;
	u64 window_start;
	u64 now = ktime_to_ns(ktime_get());
	bool issue_as_root = bio_issue_as_root_blkg(bio);
	bool enabled = false;

	blkg = bio->bi_blkg;
	if (!blkg)
		return;

	iolat = blkg_to_lat(bio->bi_blkg);
	if (!iolat)
		return;

	enabled = blk_iolatency_enabled(iolat->blkiolat);
	while (blkg && blkg->parent) {
		iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}
		rqw = &iolat->rq_wait;

		atomic_dec(&rqw->inflight);
		/* Only gather stats when the policy is active and a target is set. */
		if (!enabled || iolat->min_lat_nsec == 0)
			goto next;
		iolatency_record_time(iolat, &bio->bi_issue, now,
				      issue_as_root);
		window_start = atomic64_read(&iolat->window_start);
		/*
		 * Window elapsed: race via cmpxchg so exactly one completion
		 * checks the latencies and restarts the window.
		 */
		if (now > window_start &&
		    (now - window_start) >= iolat->cur_win_nsec) {
			if (atomic64_cmpxchg(&iolat->window_start,
					window_start, now) == window_start)
				iolatency_check_latencies(iolat, now);
		}
next:
		wake_up(&rqw->wait);
		blkg = blkg->parent;
	}
}
| 596 | |||
| 597 | static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio) | ||
| 598 | { | ||
| 599 | struct blkcg_gq *blkg; | ||
| 600 | |||
| 601 | blkg = bio->bi_blkg; | ||
| 602 | while (blkg && blkg->parent) { | ||
| 603 | struct rq_wait *rqw; | ||
| 604 | struct iolatency_grp *iolat; | ||
| 605 | |||
| 606 | iolat = blkg_to_lat(blkg); | ||
| 607 | if (!iolat) | ||
| 608 | goto next; | ||
| 609 | |||
| 610 | rqw = &iolat->rq_wait; | ||
| 611 | atomic_dec(&rqw->inflight); | ||
| 612 | wake_up(&rqw->wait); | ||
| 613 | next: | ||
| 614 | blkg = blkg->parent; | ||
| 615 | } | ||
| 616 | } | ||
| 617 | |||
| 618 | static void blkcg_iolatency_exit(struct rq_qos *rqos) | ||
| 619 | { | ||
| 620 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); | ||
| 621 | |||
| 622 | del_timer_sync(&blkiolat->timer); | ||
| 623 | blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); | ||
| 624 | kfree(blkiolat); | ||
| 625 | } | ||
| 626 | |||
/* rq_qos callbacks wiring blk-iolatency into the block layer QoS framework. */
static struct rq_qos_ops blkcg_iolatency_ops = {
	.throttle = blkcg_iolatency_throttle,
	.cleanup = blkcg_iolatency_cleanup,
	.done_bio = blkcg_iolatency_done_bio,
	.exit = blkcg_iolatency_exit,
};
| 633 | |||
/*
 * Periodic (1s, re-armed from the throttle path) worker: for every group
 * still scaled below the default cookie, either nudge the cookie back up
 * (no scale_grp recorded) or clear a stale scale_grp so an idle group
 * can't pin the hierarchy in a throttled state.
 */
static void blkiolatency_timer_fn(struct timer_list *t)
{
	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	u64 now = ktime_to_ns(ktime_get());

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css,
				     blkiolat->rqos.q->root_blkg) {
		struct iolatency_grp *iolat;
		struct child_latency_info *lat_info;
		unsigned long flags;
		u64 cookie;

		/*
		 * We could be exiting, don't access the pd unless we have a
		 * ref on the blkg.
		 */
		if (!blkg_try_get(blkg))
			continue;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		lat_info = &iolat->child_lat;
		cookie = atomic_read(&lat_info->scale_cookie);

		/* At or above the default cookie there is nothing to unwind. */
		if (cookie >= DEFAULT_SCALE_COOKIE)
			goto next;

		spin_lock_irqsave(&lat_info->lock, flags);
		if (lat_info->last_scale_event >= now)
			goto next_lock;

		/*
		 * We scaled down but don't have a scale_grp, scale up and carry
		 * on.
		 */
		if (lat_info->scale_grp == NULL) {
			scale_cookie_change(iolat->blkiolat, lat_info, true);
			goto next_lock;
		}

		/*
		 * It's been 5 seconds since our last scale event, clear the
		 * scale grp in case the group that needed the scale down isn't
		 * doing any IO currently.
		 */
		if (now - lat_info->last_scale_event >=
		    ((u64)NSEC_PER_SEC * 5))
			lat_info->scale_grp = NULL;
next_lock:
		spin_unlock_irqrestore(&lat_info->lock, flags);
next:
		blkg_put(blkg);
	}
	rcu_read_unlock();
}
| 694 | |||
| 695 | int blk_iolatency_init(struct request_queue *q) | ||
| 696 | { | ||
| 697 | struct blk_iolatency *blkiolat; | ||
| 698 | struct rq_qos *rqos; | ||
| 699 | int ret; | ||
| 700 | |||
| 701 | blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); | ||
| 702 | if (!blkiolat) | ||
| 703 | return -ENOMEM; | ||
| 704 | |||
| 705 | rqos = &blkiolat->rqos; | ||
| 706 | rqos->id = RQ_QOS_CGROUP; | ||
| 707 | rqos->ops = &blkcg_iolatency_ops; | ||
| 708 | rqos->q = q; | ||
| 709 | |||
| 710 | rq_qos_add(q, rqos); | ||
| 711 | |||
| 712 | ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); | ||
| 713 | if (ret) { | ||
| 714 | rq_qos_del(q, rqos); | ||
| 715 | kfree(blkiolat); | ||
| 716 | return ret; | ||
| 717 | } | ||
| 718 | |||
| 719 | timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); | ||
| 720 | |||
| 721 | return 0; | ||
| 722 | } | ||
| 723 | |||
| 724 | static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) | ||
| 725 | { | ||
| 726 | struct iolatency_grp *iolat = blkg_to_lat(blkg); | ||
| 727 | struct blk_iolatency *blkiolat = iolat->blkiolat; | ||
| 728 | u64 oldval = iolat->min_lat_nsec; | ||
| 729 | |||
| 730 | iolat->min_lat_nsec = val; | ||
| 731 | iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE); | ||
| 732 | iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, | ||
| 733 | BLKIOLATENCY_MAX_WIN_SIZE); | ||
| 734 | |||
| 735 | if (!oldval && val) | ||
| 736 | atomic_inc(&blkiolat->enabled); | ||
| 737 | if (oldval && !val) | ||
| 738 | atomic_dec(&blkiolat->enabled); | ||
| 739 | } | ||
| 740 | |||
| 741 | static void iolatency_clear_scaling(struct blkcg_gq *blkg) | ||
| 742 | { | ||
| 743 | if (blkg->parent) { | ||
| 744 | struct iolatency_grp *iolat = blkg_to_lat(blkg->parent); | ||
| 745 | struct child_latency_info *lat_info; | ||
| 746 | if (!iolat) | ||
| 747 | return; | ||
| 748 | |||
| 749 | lat_info = &iolat->child_lat; | ||
| 750 | spin_lock(&lat_info->lock); | ||
| 751 | atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE); | ||
| 752 | lat_info->last_scale_event = 0; | ||
| 753 | lat_info->scale_grp = NULL; | ||
| 754 | lat_info->scale_lat = 0; | ||
| 755 | spin_unlock(&lat_info->lock); | ||
| 756 | } | ||
| 757 | } | ||
| 758 | |||
| 759 | static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, | ||
| 760 | size_t nbytes, loff_t off) | ||
| 761 | { | ||
| 762 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); | ||
| 763 | struct blkcg_gq *blkg; | ||
| 764 | struct blk_iolatency *blkiolat; | ||
| 765 | struct blkg_conf_ctx ctx; | ||
| 766 | struct iolatency_grp *iolat; | ||
| 767 | char *p, *tok; | ||
| 768 | u64 lat_val = 0; | ||
| 769 | u64 oldval; | ||
| 770 | int ret; | ||
| 771 | |||
| 772 | ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); | ||
| 773 | if (ret) | ||
| 774 | return ret; | ||
| 775 | |||
| 776 | iolat = blkg_to_lat(ctx.blkg); | ||
| 777 | blkiolat = iolat->blkiolat; | ||
| 778 | p = ctx.body; | ||
| 779 | |||
| 780 | ret = -EINVAL; | ||
| 781 | while ((tok = strsep(&p, " "))) { | ||
| 782 | char key[16]; | ||
| 783 | char val[21]; /* 18446744073709551616 */ | ||
| 784 | |||
| 785 | if (sscanf(tok, "%15[^=]=%20s", key, val) != 2) | ||
| 786 | goto out; | ||
| 787 | |||
| 788 | if (!strcmp(key, "target")) { | ||
| 789 | u64 v; | ||
| 790 | |||
| 791 | if (!strcmp(val, "max")) | ||
| 792 | lat_val = 0; | ||
| 793 | else if (sscanf(val, "%llu", &v) == 1) | ||
| 794 | lat_val = v * NSEC_PER_USEC; | ||
| 795 | else | ||
| 796 | goto out; | ||
| 797 | } else { | ||
| 798 | goto out; | ||
| 799 | } | ||
| 800 | } | ||
| 801 | |||
| 802 | /* Walk up the tree to see if our new val is lower than it should be. */ | ||
| 803 | blkg = ctx.blkg; | ||
| 804 | oldval = iolat->min_lat_nsec; | ||
| 805 | |||
| 806 | iolatency_set_min_lat_nsec(blkg, lat_val); | ||
| 807 | if (oldval != iolat->min_lat_nsec) { | ||
| 808 | iolatency_clear_scaling(blkg); | ||
| 809 | } | ||
| 810 | |||
| 811 | ret = 0; | ||
| 812 | out: | ||
| 813 | blkg_conf_finish(&ctx); | ||
| 814 | return ret ?: nbytes; | ||
| 815 | } | ||
| 816 | |||
| 817 | static u64 iolatency_prfill_limit(struct seq_file *sf, | ||
| 818 | struct blkg_policy_data *pd, int off) | ||
| 819 | { | ||
| 820 | struct iolatency_grp *iolat = pd_to_lat(pd); | ||
| 821 | const char *dname = blkg_dev_name(pd->blkg); | ||
| 822 | |||
| 823 | if (!dname || !iolat->min_lat_nsec) | ||
| 824 | return 0; | ||
| 825 | seq_printf(sf, "%s target=%llu\n", | ||
| 826 | dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC)); | ||
| 827 | return 0; | ||
| 828 | } | ||
| 829 | |||
/* seq_show handler for io.latency: print every device's configured target. */
static int iolatency_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  iolatency_prfill_limit,
			  &blkcg_policy_iolatency, seq_cft(sf)->private, false);
	return 0;
}
| 837 | |||
| 838 | static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, | ||
| 839 | size_t size) | ||
| 840 | { | ||
| 841 | struct iolatency_grp *iolat = pd_to_lat(pd); | ||
| 842 | unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); | ||
| 843 | unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); | ||
| 844 | |||
| 845 | if (iolat->rq_depth.max_depth == UINT_MAX) | ||
| 846 | return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", | ||
| 847 | avg_lat, cur_win); | ||
| 848 | |||
| 849 | return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", | ||
| 850 | iolat->rq_depth.max_depth, avg_lat, cur_win); | ||
| 851 | } | ||
| 852 | |||
| 853 | |||
| 854 | static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) | ||
| 855 | { | ||
| 856 | struct iolatency_grp *iolat; | ||
| 857 | |||
| 858 | iolat = kzalloc_node(sizeof(*iolat), gfp, node); | ||
| 859 | if (!iolat) | ||
| 860 | return NULL; | ||
| 861 | iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat), | ||
| 862 | __alignof__(struct blk_rq_stat), gfp); | ||
| 863 | if (!iolat->stats) { | ||
| 864 | kfree(iolat); | ||
| 865 | return NULL; | ||
| 866 | } | ||
| 867 | return &iolat->pd; | ||
| 868 | } | ||
| 869 | |||
/* Initialize a freshly allocated iolatency_grp when its blkg comes online. */
static void iolatency_pd_init(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	u64 now = ktime_to_ns(ktime_get());
	int cpu;

	for_each_possible_cpu(cpu) {
		struct blk_rq_stat *stat;
		stat = per_cpu_ptr(iolat->stats, cpu);
		blk_rq_stat_init(stat);
	}

	rq_wait_init(&iolat->rq_wait);
	spin_lock_init(&iolat->child_lat.lock);
	iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
	iolat->rq_depth.max_depth = UINT_MAX;	/* start fully unthrottled */
	iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
	iolat->blkiolat = blkiolat;
	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
	atomic64_set(&iolat->window_start, now);

	/*
	 * We init things in list order, so the pd for the parent may not be
	 * init'ed yet for whatever reason.
	 */
	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
		/* Inherit the parent's current scale cookie. */
		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
		atomic_set(&iolat->scale_cookie,
			   atomic_read(&parent->child_lat.scale_cookie));
	} else {
		atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
	}

	/* Our own children start from the default cookie. */
	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
}
| 908 | |||
/*
 * Group going offline: zero its target (which also drops the global
 * enabled count) and reset scaling state recorded in the parent.
 */
static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);

	iolatency_set_min_lat_nsec(blkg, 0);
	iolatency_clear_scaling(blkg);
}
| 917 | |||
/* Free a group's per-cpu stats and then the group itself. */
static void iolatency_pd_free(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	free_percpu(iolat->stats);
	kfree(iolat);
}
| 924 | |||
/* cgroup v2 interface files exposed by this policy ("io.latency"). */
static struct cftype iolatency_files[] = {
	{
		.name = "latency",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = iolatency_print_limit,
		.write = iolatency_set_limit,
	},
	{}	/* sentinel */
};
| 934 | |||
/* blkcg policy definition: per-group (pd_*) lifecycle and stat callbacks. */
static struct blkcg_policy blkcg_policy_iolatency = {
	.dfl_cftypes	= iolatency_files,
	.pd_alloc_fn	= iolatency_pd_alloc,
	.pd_init_fn	= iolatency_pd_init,
	.pd_offline_fn	= iolatency_pd_offline,
	.pd_free_fn	= iolatency_pd_free,
	.pd_stat_fn	= iolatency_pd_stat,
};
| 943 | |||
/* Module init: register the io.latency blkcg policy. */
static int __init iolatency_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iolatency);
}
| 948 | |||
| 949 | static void __exit iolatency_exit(void) | ||
| 950 | { | ||
| 951 | return blkcg_policy_unregister(&blkcg_policy_iolatency); | ||
| 952 | } | ||
| 953 | |||
/* Hook the policy's register/unregister into module load/unload. */
module_init(iolatency_init);
module_exit(iolatency_exit);
diff --git a/block/blk-lib.c b/block/blk-lib.c index 8faa70f26fcd..d1b9dd03da25 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
| @@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
| 68 | */ | 68 | */ |
| 69 | req_sects = min_t(sector_t, nr_sects, | 69 | req_sects = min_t(sector_t, nr_sects, |
| 70 | q->limits.max_discard_sectors); | 70 | q->limits.max_discard_sectors); |
| 71 | if (!req_sects) | ||
| 72 | goto fail; | ||
| 71 | if (req_sects > UINT_MAX >> 9) | 73 | if (req_sects > UINT_MAX >> 9) |
| 72 | req_sects = UINT_MAX >> 9; | 74 | req_sects = UINT_MAX >> 9; |
| 73 | 75 | ||
| @@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
| 105 | 107 | ||
| 106 | *biop = bio; | 108 | *biop = bio; |
| 107 | return 0; | 109 | return 0; |
| 110 | |||
| 111 | fail: | ||
| 112 | if (bio) { | ||
| 113 | submit_bio_wait(bio); | ||
| 114 | bio_put(bio); | ||
| 115 | } | ||
| 116 | *biop = NULL; | ||
| 117 | return -EOPNOTSUPP; | ||
| 108 | } | 118 | } |
| 109 | EXPORT_SYMBOL(__blkdev_issue_discard); | 119 | EXPORT_SYMBOL(__blkdev_issue_discard); |
| 110 | 120 | ||
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c new file mode 100644 index 000000000000..fb2c82c351e4 --- /dev/null +++ b/block/blk-mq-debugfs-zoned.c | |||
| @@ -0,0 +1,24 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* | ||
| 3 | * Copyright (C) 2017 Western Digital Corporation or its affiliates. | ||
| 4 | * | ||
| 5 | * This file is released under the GPL. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/blkdev.h> | ||
| 9 | #include "blk-mq-debugfs.h" | ||
| 10 | |||
/*
 * Debugfs helper: print, one per line, the index of every zone whose
 * write lock bit is currently set in the queue's seq_zones_wlock bitmap.
 */
int queue_zone_wlock_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	unsigned int i;

	/* Queues without sequential-write zones have no wlock bitmap. */
	if (!q->seq_zones_wlock)
		return 0;

	for (i = 0; i < q->nr_zones; i++)
		if (test_bit(i, q->seq_zones_wlock))
			seq_printf(m, "%u\n", i);

	return 0;
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1c4532e92938..cb1e6cf7ac48 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c | |||
| @@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, | |||
| 206 | return count; | 206 | return count; |
| 207 | } | 207 | } |
| 208 | 208 | ||
| 209 | static int queue_zone_wlock_show(void *data, struct seq_file *m) | ||
| 210 | { | ||
| 211 | struct request_queue *q = data; | ||
| 212 | unsigned int i; | ||
| 213 | |||
| 214 | if (!q->seq_zones_wlock) | ||
| 215 | return 0; | ||
| 216 | |||
| 217 | for (i = 0; i < blk_queue_nr_zones(q); i++) | ||
| 218 | if (test_bit(i, q->seq_zones_wlock)) | ||
| 219 | seq_printf(m, "%u\n", i); | ||
| 220 | |||
| 221 | return 0; | ||
| 222 | } | ||
| 223 | |||
| 224 | static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { | 209 | static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { |
| 225 | { "poll_stat", 0400, queue_poll_stat_show }, | 210 | { "poll_stat", 0400, queue_poll_stat_show }, |
| 226 | { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, | 211 | { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, |
| @@ -637,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m) | |||
| 637 | return 0; | 622 | return 0; |
| 638 | } | 623 | } |
| 639 | 624 | ||
| 625 | static int hctx_dispatch_busy_show(void *data, struct seq_file *m) | ||
| 626 | { | ||
| 627 | struct blk_mq_hw_ctx *hctx = data; | ||
| 628 | |||
| 629 | seq_printf(m, "%u\n", hctx->dispatch_busy); | ||
| 630 | return 0; | ||
| 631 | } | ||
| 632 | |||
| 640 | static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) | 633 | static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) |
| 641 | __acquires(&ctx->lock) | 634 | __acquires(&ctx->lock) |
| 642 | { | 635 | { |
| @@ -798,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { | |||
| 798 | {"queued", 0600, hctx_queued_show, hctx_queued_write}, | 791 | {"queued", 0600, hctx_queued_show, hctx_queued_write}, |
| 799 | {"run", 0600, hctx_run_show, hctx_run_write}, | 792 | {"run", 0600, hctx_run_show, hctx_run_write}, |
| 800 | {"active", 0400, hctx_active_show}, | 793 | {"active", 0400, hctx_active_show}, |
| 794 | {"dispatch_busy", 0400, hctx_dispatch_busy_show}, | ||
| 801 | {}, | 795 | {}, |
| 802 | }; | 796 | }; |
| 803 | 797 | ||
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index b9d366e57097..a9160be12be0 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h | |||
| @@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc | |||
| 80 | } | 80 | } |
| 81 | #endif | 81 | #endif |
| 82 | 82 | ||
| 83 | #ifdef CONFIG_BLK_DEBUG_FS_ZONED | ||
| 84 | int queue_zone_wlock_show(void *data, struct seq_file *m); | ||
| 85 | #else | ||
| 86 | static inline int queue_zone_wlock_show(void *data, struct seq_file *m) | ||
| 87 | { | ||
| 88 | return 0; | ||
| 89 | } | ||
| 90 | #endif | ||
| 91 | |||
| 83 | #endif | 92 | #endif |
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index e233996bb76f..db644ec624f5 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c | |||
| @@ -17,6 +17,8 @@ | |||
| 17 | #include <linux/pci.h> | 17 | #include <linux/pci.h> |
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 19 | 19 | ||
| 20 | #include "blk-mq.h" | ||
| 21 | |||
| 20 | /** | 22 | /** |
| 21 | * blk_mq_pci_map_queues - provide a default queue mapping for PCI device | 23 | * blk_mq_pci_map_queues - provide a default queue mapping for PCI device |
| 22 | * @set: tagset to provide the mapping for | 24 | * @set: tagset to provide the mapping for |
| @@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, | |||
| 48 | 50 | ||
| 49 | fallback: | 51 | fallback: |
| 50 | WARN_ON_ONCE(set->nr_hw_queues > 1); | 52 | WARN_ON_ONCE(set->nr_hw_queues > 1); |
| 51 | for_each_possible_cpu(cpu) | 53 | blk_mq_clear_mq_map(set); |
| 52 | set->mq_map[cpu] = 0; | ||
| 53 | return 0; | 54 | return 0; |
| 54 | } | 55 | } |
| 55 | EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); | 56 | EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); |
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 56c493c6cd90..cf9c66c6d35a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c | |||
| @@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) | |||
| 59 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | 59 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) |
| 60 | return; | 60 | return; |
| 61 | 61 | ||
| 62 | if (hctx->flags & BLK_MQ_F_TAG_SHARED) { | 62 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); |
| 63 | struct request_queue *q = hctx->queue; | ||
| 64 | |||
| 65 | if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | ||
| 66 | atomic_inc(&q->shared_hctx_restart); | ||
| 67 | } else | ||
| 68 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | ||
| 69 | } | 63 | } |
| 70 | 64 | ||
| 71 | static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) | 65 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) |
| 72 | { | 66 | { |
| 73 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | 67 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) |
| 74 | return false; | 68 | return; |
| 75 | 69 | clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | |
| 76 | if (hctx->flags & BLK_MQ_F_TAG_SHARED) { | ||
| 77 | struct request_queue *q = hctx->queue; | ||
| 78 | |||
| 79 | if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | ||
| 80 | atomic_dec(&q->shared_hctx_restart); | ||
| 81 | } else | ||
| 82 | clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | ||
| 83 | 70 | ||
| 84 | return blk_mq_run_hw_queue(hctx, true); | 71 | blk_mq_run_hw_queue(hctx, true); |
| 85 | } | 72 | } |
| 86 | 73 | ||
| 87 | /* | 74 | /* |
| @@ -219,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | |||
| 219 | } | 206 | } |
| 220 | } else if (has_sched_dispatch) { | 207 | } else if (has_sched_dispatch) { |
| 221 | blk_mq_do_dispatch_sched(hctx); | 208 | blk_mq_do_dispatch_sched(hctx); |
| 222 | } else if (q->mq_ops->get_budget) { | 209 | } else if (hctx->dispatch_busy) { |
| 223 | /* | 210 | /* dequeue request one by one from sw queue if queue is busy */ |
| 224 | * If we need to get budget before queuing request, we | ||
| 225 | * dequeue request one by one from sw queue for avoiding | ||
| 226 | * to mess up I/O merge when dispatch runs out of resource. | ||
| 227 | * | ||
| 228 | * TODO: get more budgets, and dequeue more requests in | ||
| 229 | * one time. | ||
| 230 | */ | ||
| 231 | blk_mq_do_dispatch_ctx(hctx); | 211 | blk_mq_do_dispatch_ctx(hctx); |
| 232 | } else { | 212 | } else { |
| 233 | blk_mq_flush_busy_ctxs(hctx, &rq_list); | 213 | blk_mq_flush_busy_ctxs(hctx, &rq_list); |
| @@ -339,7 +319,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) | |||
| 339 | return e->type->ops.mq.bio_merge(hctx, bio); | 319 | return e->type->ops.mq.bio_merge(hctx, bio); |
| 340 | } | 320 | } |
| 341 | 321 | ||
| 342 | if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) { | 322 | if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && |
| 323 | !list_empty_careful(&ctx->rq_list)) { | ||
| 343 | /* default per sw-queue merge */ | 324 | /* default per sw-queue merge */ |
| 344 | spin_lock(&ctx->lock); | 325 | spin_lock(&ctx->lock); |
| 345 | ret = blk_mq_attempt_merge(q, ctx, bio); | 326 | ret = blk_mq_attempt_merge(q, ctx, bio); |
| @@ -380,68 +361,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, | |||
| 380 | return false; | 361 | return false; |
| 381 | } | 362 | } |
| 382 | 363 | ||
| 383 | /** | ||
| 384 | * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list | ||
| 385 | * @pos: loop cursor. | ||
| 386 | * @skip: the list element that will not be examined. Iteration starts at | ||
| 387 | * @skip->next. | ||
| 388 | * @head: head of the list to examine. This list must have at least one | ||
| 389 | * element, namely @skip. | ||
| 390 | * @member: name of the list_head structure within typeof(*pos). | ||
| 391 | */ | ||
| 392 | #define list_for_each_entry_rcu_rr(pos, skip, head, member) \ | ||
| 393 | for ((pos) = (skip); \ | ||
| 394 | (pos = (pos)->member.next != (head) ? list_entry_rcu( \ | ||
| 395 | (pos)->member.next, typeof(*pos), member) : \ | ||
| 396 | list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ | ||
| 397 | (pos) != (skip); ) | ||
| 398 | |||
| 399 | /* | ||
| 400 | * Called after a driver tag has been freed to check whether a hctx needs to | ||
| 401 | * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware | ||
| 402 | * queues in a round-robin fashion if the tag set of @hctx is shared with other | ||
| 403 | * hardware queues. | ||
| 404 | */ | ||
| 405 | void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) | ||
| 406 | { | ||
| 407 | struct blk_mq_tags *const tags = hctx->tags; | ||
| 408 | struct blk_mq_tag_set *const set = hctx->queue->tag_set; | ||
| 409 | struct request_queue *const queue = hctx->queue, *q; | ||
| 410 | struct blk_mq_hw_ctx *hctx2; | ||
| 411 | unsigned int i, j; | ||
| 412 | |||
| 413 | if (set->flags & BLK_MQ_F_TAG_SHARED) { | ||
| 414 | /* | ||
| 415 | * If this is 0, then we know that no hardware queues | ||
| 416 | * have RESTART marked. We're done. | ||
| 417 | */ | ||
| 418 | if (!atomic_read(&queue->shared_hctx_restart)) | ||
| 419 | return; | ||
| 420 | |||
| 421 | rcu_read_lock(); | ||
| 422 | list_for_each_entry_rcu_rr(q, queue, &set->tag_list, | ||
| 423 | tag_set_list) { | ||
| 424 | queue_for_each_hw_ctx(q, hctx2, i) | ||
| 425 | if (hctx2->tags == tags && | ||
| 426 | blk_mq_sched_restart_hctx(hctx2)) | ||
| 427 | goto done; | ||
| 428 | } | ||
| 429 | j = hctx->queue_num + 1; | ||
| 430 | for (i = 0; i < queue->nr_hw_queues; i++, j++) { | ||
| 431 | if (j == queue->nr_hw_queues) | ||
| 432 | j = 0; | ||
| 433 | hctx2 = queue->queue_hw_ctx[j]; | ||
| 434 | if (hctx2->tags == tags && | ||
| 435 | blk_mq_sched_restart_hctx(hctx2)) | ||
| 436 | break; | ||
| 437 | } | ||
| 438 | done: | ||
| 439 | rcu_read_unlock(); | ||
| 440 | } else { | ||
| 441 | blk_mq_sched_restart_hctx(hctx); | ||
| 442 | } | ||
| 443 | } | ||
| 444 | |||
| 445 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, | 364 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, |
| 446 | bool run_queue, bool async) | 365 | bool run_queue, bool async) |
| 447 | { | 366 | { |
| @@ -486,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q, | |||
| 486 | 405 | ||
| 487 | if (e && e->type->ops.mq.insert_requests) | 406 | if (e && e->type->ops.mq.insert_requests) |
| 488 | e->type->ops.mq.insert_requests(hctx, list, false); | 407 | e->type->ops.mq.insert_requests(hctx, list, false); |
| 489 | else | 408 | else { |
| 409 | /* | ||
| 410 | * try to issue requests directly if the hw queue isn't | ||
| 411 | * busy in case of 'none' scheduler, and this way may save | ||
| 412 | * us one extra enqueue & dequeue to sw queue. | ||
| 413 | */ | ||
| 414 | if (!hctx->dispatch_busy && !e && !run_queue_async) { | ||
| 415 | blk_mq_try_issue_list_directly(hctx, list); | ||
| 416 | if (list_empty(list)) | ||
| 417 | return; | ||
| 418 | } | ||
| 490 | blk_mq_insert_requests(hctx, ctx, list); | 419 | blk_mq_insert_requests(hctx, ctx, list); |
| 420 | } | ||
| 491 | 421 | ||
| 492 | blk_mq_run_hw_queue(hctx, run_queue_async); | 422 | blk_mq_run_hw_queue(hctx, run_queue_async); |
| 493 | } | 423 | } |
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 3de0836163c2..816923bf874d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c | |||
| @@ -23,6 +23,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) | |||
| 23 | 23 | ||
| 24 | /* | 24 | /* |
| 25 | * If a previously inactive queue goes active, bump the active user count. | 25 | * If a previously inactive queue goes active, bump the active user count. |
| 26 | * We need to do this before try to allocate driver tag, then even if fail | ||
| 27 | * to get tag when first time, the other shared-tag users could reserve | ||
| 28 | * budget for it. | ||
| 26 | */ | 29 | */ |
| 27 | bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) | 30 | bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) |
| 28 | { | 31 | { |
| @@ -399,8 +402,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, | |||
| 399 | if (tdepth <= tags->nr_reserved_tags) | 402 | if (tdepth <= tags->nr_reserved_tags) |
| 400 | return -EINVAL; | 403 | return -EINVAL; |
| 401 | 404 | ||
| 402 | tdepth -= tags->nr_reserved_tags; | ||
| 403 | |||
| 404 | /* | 405 | /* |
| 405 | * If we are allowed to grow beyond the original size, allocate | 406 | * If we are allowed to grow beyond the original size, allocate |
| 406 | * a new set of tags before freeing the old one. | 407 | * a new set of tags before freeing the old one. |
| @@ -420,7 +421,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, | |||
| 420 | if (tdepth > 16 * BLKDEV_MAX_RQ) | 421 | if (tdepth > 16 * BLKDEV_MAX_RQ) |
| 421 | return -EINVAL; | 422 | return -EINVAL; |
| 422 | 423 | ||
| 423 | new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); | 424 | new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, |
| 425 | tags->nr_reserved_tags); | ||
| 424 | if (!new) | 426 | if (!new) |
| 425 | return -ENOMEM; | 427 | return -ENOMEM; |
| 426 | ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); | 428 | ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); |
| @@ -437,7 +439,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, | |||
| 437 | * Don't need (or can't) update reserved tags here, they | 439 | * Don't need (or can't) update reserved tags here, they |
| 438 | * remain static and should never need resizing. | 440 | * remain static and should never need resizing. |
| 439 | */ | 441 | */ |
| 440 | sbitmap_queue_resize(&tags->bitmap_tags, tdepth); | 442 | sbitmap_queue_resize(&tags->bitmap_tags, |
| 443 | tdepth - tags->nr_reserved_tags); | ||
| 441 | } | 444 | } |
| 442 | 445 | ||
| 443 | return 0; | 446 | return 0; |
diff --git a/block/blk-mq.c b/block/blk-mq.c index 654b0dc7e001..72a0033ccee9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c | |||
| @@ -34,8 +34,8 @@ | |||
| 34 | #include "blk-mq-debugfs.h" | 34 | #include "blk-mq-debugfs.h" |
| 35 | #include "blk-mq-tag.h" | 35 | #include "blk-mq-tag.h" |
| 36 | #include "blk-stat.h" | 36 | #include "blk-stat.h" |
| 37 | #include "blk-wbt.h" | ||
| 38 | #include "blk-mq-sched.h" | 37 | #include "blk-mq-sched.h" |
| 38 | #include "blk-rq-qos.h" | ||
| 39 | 39 | ||
| 40 | static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); | 40 | static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); |
| 41 | static void blk_mq_poll_stats_start(struct request_queue *q); | 41 | static void blk_mq_poll_stats_start(struct request_queue *q); |
| @@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |||
| 285 | rq->tag = -1; | 285 | rq->tag = -1; |
| 286 | rq->internal_tag = tag; | 286 | rq->internal_tag = tag; |
| 287 | } else { | 287 | } else { |
| 288 | if (blk_mq_tag_busy(data->hctx)) { | 288 | if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { |
| 289 | rq_flags = RQF_MQ_INFLIGHT; | 289 | rq_flags = RQF_MQ_INFLIGHT; |
| 290 | atomic_inc(&data->hctx->nr_active); | 290 | atomic_inc(&data->hctx->nr_active); |
| 291 | } | 291 | } |
| @@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, | |||
| 367 | if (!op_is_flush(op) && e->type->ops.mq.limit_depth && | 367 | if (!op_is_flush(op) && e->type->ops.mq.limit_depth && |
| 368 | !(data->flags & BLK_MQ_REQ_RESERVED)) | 368 | !(data->flags & BLK_MQ_REQ_RESERVED)) |
| 369 | e->type->ops.mq.limit_depth(op, data); | 369 | e->type->ops.mq.limit_depth(op, data); |
| 370 | } else { | ||
| 371 | blk_mq_tag_busy(data->hctx); | ||
| 370 | } | 372 | } |
| 371 | 373 | ||
| 372 | tag = blk_mq_get_tag(data); | 374 | tag = blk_mq_get_tag(data); |
| @@ -504,7 +506,7 @@ void blk_mq_free_request(struct request *rq) | |||
| 504 | if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) | 506 | if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) |
| 505 | laptop_io_completion(q->backing_dev_info); | 507 | laptop_io_completion(q->backing_dev_info); |
| 506 | 508 | ||
| 507 | wbt_done(q->rq_wb, rq); | 509 | rq_qos_done(q, rq); |
| 508 | 510 | ||
| 509 | if (blk_rq_rl(rq)) | 511 | if (blk_rq_rl(rq)) |
| 510 | blk_put_rl(blk_rq_rl(rq)); | 512 | blk_put_rl(blk_rq_rl(rq)); |
| @@ -527,7 +529,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) | |||
| 527 | blk_account_io_done(rq, now); | 529 | blk_account_io_done(rq, now); |
| 528 | 530 | ||
| 529 | if (rq->end_io) { | 531 | if (rq->end_io) { |
| 530 | wbt_done(rq->q->rq_wb, rq); | 532 | rq_qos_done(rq->q, rq); |
| 531 | rq->end_io(rq, error); | 533 | rq->end_io(rq, error); |
| 532 | } else { | 534 | } else { |
| 533 | if (unlikely(blk_bidi_rq(rq))) | 535 | if (unlikely(blk_bidi_rq(rq))) |
| @@ -639,7 +641,7 @@ void blk_mq_start_request(struct request *rq) | |||
| 639 | rq->throtl_size = blk_rq_sectors(rq); | 641 | rq->throtl_size = blk_rq_sectors(rq); |
| 640 | #endif | 642 | #endif |
| 641 | rq->rq_flags |= RQF_STATS; | 643 | rq->rq_flags |= RQF_STATS; |
| 642 | wbt_issue(q->rq_wb, rq); | 644 | rq_qos_issue(q, rq); |
| 643 | } | 645 | } |
| 644 | 646 | ||
| 645 | WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); | 647 | WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); |
| @@ -665,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq) | |||
| 665 | blk_mq_put_driver_tag(rq); | 667 | blk_mq_put_driver_tag(rq); |
| 666 | 668 | ||
| 667 | trace_block_rq_requeue(q, rq); | 669 | trace_block_rq_requeue(q, rq); |
| 668 | wbt_requeue(q->rq_wb, rq); | 670 | rq_qos_requeue(q, rq); |
| 669 | 671 | ||
| 670 | if (blk_mq_request_started(rq)) { | 672 | if (blk_mq_request_started(rq)) { |
| 671 | WRITE_ONCE(rq->state, MQ_RQ_IDLE); | 673 | WRITE_ONCE(rq->state, MQ_RQ_IDLE); |
| @@ -962,16 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued) | |||
| 962 | return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); | 964 | return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); |
| 963 | } | 965 | } |
| 964 | 966 | ||
| 965 | bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, | 967 | bool blk_mq_get_driver_tag(struct request *rq) |
| 966 | bool wait) | ||
| 967 | { | 968 | { |
| 968 | struct blk_mq_alloc_data data = { | 969 | struct blk_mq_alloc_data data = { |
| 969 | .q = rq->q, | 970 | .q = rq->q, |
| 970 | .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), | 971 | .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), |
| 971 | .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, | 972 | .flags = BLK_MQ_REQ_NOWAIT, |
| 972 | }; | 973 | }; |
| 973 | 974 | bool shared; | |
| 974 | might_sleep_if(wait); | ||
| 975 | 975 | ||
| 976 | if (rq->tag != -1) | 976 | if (rq->tag != -1) |
| 977 | goto done; | 977 | goto done; |
| @@ -979,9 +979,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, | |||
| 979 | if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) | 979 | if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) |
| 980 | data.flags |= BLK_MQ_REQ_RESERVED; | 980 | data.flags |= BLK_MQ_REQ_RESERVED; |
| 981 | 981 | ||
| 982 | shared = blk_mq_tag_busy(data.hctx); | ||
| 982 | rq->tag = blk_mq_get_tag(&data); | 983 | rq->tag = blk_mq_get_tag(&data); |
| 983 | if (rq->tag >= 0) { | 984 | if (rq->tag >= 0) { |
| 984 | if (blk_mq_tag_busy(data.hctx)) { | 985 | if (shared) { |
| 985 | rq->rq_flags |= RQF_MQ_INFLIGHT; | 986 | rq->rq_flags |= RQF_MQ_INFLIGHT; |
| 986 | atomic_inc(&data.hctx->nr_active); | 987 | atomic_inc(&data.hctx->nr_active); |
| 987 | } | 988 | } |
| @@ -989,8 +990,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, | |||
| 989 | } | 990 | } |
| 990 | 991 | ||
| 991 | done: | 992 | done: |
| 992 | if (hctx) | ||
| 993 | *hctx = data.hctx; | ||
| 994 | return rq->tag != -1; | 993 | return rq->tag != -1; |
| 995 | } | 994 | } |
| 996 | 995 | ||
| @@ -1001,7 +1000,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, | |||
| 1001 | 1000 | ||
| 1002 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); | 1001 | hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); |
| 1003 | 1002 | ||
| 1003 | spin_lock(&hctx->dispatch_wait_lock); | ||
| 1004 | list_del_init(&wait->entry); | 1004 | list_del_init(&wait->entry); |
| 1005 | spin_unlock(&hctx->dispatch_wait_lock); | ||
| 1006 | |||
| 1005 | blk_mq_run_hw_queue(hctx, true); | 1007 | blk_mq_run_hw_queue(hctx, true); |
| 1006 | return 1; | 1008 | return 1; |
| 1007 | } | 1009 | } |
| @@ -1012,17 +1014,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, | |||
| 1012 | * restart. For both cases, take care to check the condition again after | 1014 | * restart. For both cases, take care to check the condition again after |
| 1013 | * marking us as waiting. | 1015 | * marking us as waiting. |
| 1014 | */ | 1016 | */ |
| 1015 | static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, | 1017 | static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, |
| 1016 | struct request *rq) | 1018 | struct request *rq) |
| 1017 | { | 1019 | { |
| 1018 | struct blk_mq_hw_ctx *this_hctx = *hctx; | 1020 | struct wait_queue_head *wq; |
| 1019 | struct sbq_wait_state *ws; | ||
| 1020 | wait_queue_entry_t *wait; | 1021 | wait_queue_entry_t *wait; |
| 1021 | bool ret; | 1022 | bool ret; |
| 1022 | 1023 | ||
| 1023 | if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { | 1024 | if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { |
| 1024 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) | 1025 | if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) |
| 1025 | set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); | 1026 | set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); |
| 1026 | 1027 | ||
| 1027 | /* | 1028 | /* |
| 1028 | * It's possible that a tag was freed in the window between the | 1029 | * It's possible that a tag was freed in the window between the |
| @@ -1032,30 +1033,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, | |||
| 1032 | * Don't clear RESTART here, someone else could have set it. | 1033 | * Don't clear RESTART here, someone else could have set it. |
| 1033 | * At most this will cost an extra queue run. | 1034 | * At most this will cost an extra queue run. |
| 1034 | */ | 1035 | */ |
| 1035 | return blk_mq_get_driver_tag(rq, hctx, false); | 1036 | return blk_mq_get_driver_tag(rq); |
| 1036 | } | 1037 | } |
| 1037 | 1038 | ||
| 1038 | wait = &this_hctx->dispatch_wait; | 1039 | wait = &hctx->dispatch_wait; |
| 1039 | if (!list_empty_careful(&wait->entry)) | 1040 | if (!list_empty_careful(&wait->entry)) |
| 1040 | return false; | 1041 | return false; |
| 1041 | 1042 | ||
| 1042 | spin_lock(&this_hctx->lock); | 1043 | wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; |
| 1044 | |||
| 1045 | spin_lock_irq(&wq->lock); | ||
| 1046 | spin_lock(&hctx->dispatch_wait_lock); | ||
| 1043 | if (!list_empty(&wait->entry)) { | 1047 | if (!list_empty(&wait->entry)) { |
| 1044 | spin_unlock(&this_hctx->lock); | 1048 | spin_unlock(&hctx->dispatch_wait_lock); |
| 1049 | spin_unlock_irq(&wq->lock); | ||
| 1045 | return false; | 1050 | return false; |
| 1046 | } | 1051 | } |
| 1047 | 1052 | ||
| 1048 | ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); | 1053 | wait->flags &= ~WQ_FLAG_EXCLUSIVE; |
| 1049 | add_wait_queue(&ws->wait, wait); | 1054 | __add_wait_queue(wq, wait); |
| 1050 | 1055 | ||
| 1051 | /* | 1056 | /* |
| 1052 | * It's possible that a tag was freed in the window between the | 1057 | * It's possible that a tag was freed in the window between the |
| 1053 | * allocation failure and adding the hardware queue to the wait | 1058 | * allocation failure and adding the hardware queue to the wait |
| 1054 | * queue. | 1059 | * queue. |
| 1055 | */ | 1060 | */ |
| 1056 | ret = blk_mq_get_driver_tag(rq, hctx, false); | 1061 | ret = blk_mq_get_driver_tag(rq); |
| 1057 | if (!ret) { | 1062 | if (!ret) { |
| 1058 | spin_unlock(&this_hctx->lock); | 1063 | spin_unlock(&hctx->dispatch_wait_lock); |
| 1064 | spin_unlock_irq(&wq->lock); | ||
| 1059 | return false; | 1065 | return false; |
| 1060 | } | 1066 | } |
| 1061 | 1067 | ||
| @@ -1063,14 +1069,42 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, | |||
| 1063 | * We got a tag, remove ourselves from the wait queue to ensure | 1069 | * We got a tag, remove ourselves from the wait queue to ensure |
| 1064 | * someone else gets the wakeup. | 1070 | * someone else gets the wakeup. |
| 1065 | */ | 1071 | */ |
| 1066 | spin_lock_irq(&ws->wait.lock); | ||
| 1067 | list_del_init(&wait->entry); | 1072 | list_del_init(&wait->entry); |
| 1068 | spin_unlock_irq(&ws->wait.lock); | 1073 | spin_unlock(&hctx->dispatch_wait_lock); |
| 1069 | spin_unlock(&this_hctx->lock); | 1074 | spin_unlock_irq(&wq->lock); |
| 1070 | 1075 | ||
| 1071 | return true; | 1076 | return true; |
| 1072 | } | 1077 | } |
| 1073 | 1078 | ||
| 1079 | #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 | ||
| 1080 | #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 | ||
| 1081 | /* | ||
| 1082 | * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): | ||
| 1083 | * - EWMA is one simple way to compute running average value | ||
| 1084 | * - weight(7/8 and 1/8) is applied so that it can decrease exponentially | ||
| 1085 | * - take 4 as factor for avoiding to get too small(0) result, and this | ||
| 1086 | * factor doesn't matter because EWMA decreases exponentially | ||
| 1087 | */ | ||
| 1088 | static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) | ||
| 1089 | { | ||
| 1090 | unsigned int ewma; | ||
| 1091 | |||
| 1092 | if (hctx->queue->elevator) | ||
| 1093 | return; | ||
| 1094 | |||
| 1095 | ewma = hctx->dispatch_busy; | ||
| 1096 | |||
| 1097 | if (!ewma && !busy) | ||
| 1098 | return; | ||
| 1099 | |||
| 1100 | ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; | ||
| 1101 | if (busy) | ||
| 1102 | ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; | ||
| 1103 | ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; | ||
| 1104 | |||
| 1105 | hctx->dispatch_busy = ewma; | ||
| 1106 | } | ||
| 1107 | |||
| 1074 | #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ | 1108 | #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ |
| 1075 | 1109 | ||
| 1076 | /* | 1110 | /* |
| @@ -1103,7 +1137,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
| 1103 | if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) | 1137 | if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) |
| 1104 | break; | 1138 | break; |
| 1105 | 1139 | ||
| 1106 | if (!blk_mq_get_driver_tag(rq, NULL, false)) { | 1140 | if (!blk_mq_get_driver_tag(rq)) { |
| 1107 | /* | 1141 | /* |
| 1108 | * The initial allocation attempt failed, so we need to | 1142 | * The initial allocation attempt failed, so we need to |
| 1109 | * rerun the hardware queue when a tag is freed. The | 1143 | * rerun the hardware queue when a tag is freed. The |
| @@ -1111,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
| 1111 | * before we add this entry back on the dispatch list, | 1145 | * before we add this entry back on the dispatch list, |
| 1112 | * we'll re-run it below. | 1146 | * we'll re-run it below. |
| 1113 | */ | 1147 | */ |
| 1114 | if (!blk_mq_mark_tag_wait(&hctx, rq)) { | 1148 | if (!blk_mq_mark_tag_wait(hctx, rq)) { |
| 1115 | blk_mq_put_dispatch_budget(hctx); | 1149 | blk_mq_put_dispatch_budget(hctx); |
| 1116 | /* | 1150 | /* |
| 1117 | * For non-shared tags, the RESTART check | 1151 | * For non-shared tags, the RESTART check |
| @@ -1135,7 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
| 1135 | bd.last = true; | 1169 | bd.last = true; |
| 1136 | else { | 1170 | else { |
| 1137 | nxt = list_first_entry(list, struct request, queuelist); | 1171 | nxt = list_first_entry(list, struct request, queuelist); |
| 1138 | bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); | 1172 | bd.last = !blk_mq_get_driver_tag(nxt); |
| 1139 | } | 1173 | } |
| 1140 | 1174 | ||
| 1141 | ret = q->mq_ops->queue_rq(hctx, &bd); | 1175 | ret = q->mq_ops->queue_rq(hctx, &bd); |
| @@ -1207,8 +1241,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, | |||
| 1207 | else if (needs_restart && (ret == BLK_STS_RESOURCE)) | 1241 | else if (needs_restart && (ret == BLK_STS_RESOURCE)) |
| 1208 | blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); | 1242 | blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); |
| 1209 | 1243 | ||
| 1244 | blk_mq_update_dispatch_busy(hctx, true); | ||
| 1210 | return false; | 1245 | return false; |
| 1211 | } | 1246 | } else |
| 1247 | blk_mq_update_dispatch_busy(hctx, false); | ||
| 1212 | 1248 | ||
| 1213 | /* | 1249 | /* |
| 1214 | * If the host/device is unable to accept more work, inform the | 1250 | * If the host/device is unable to accept more work, inform the |
| @@ -1542,19 +1578,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | |||
| 1542 | struct list_head *list) | 1578 | struct list_head *list) |
| 1543 | 1579 | ||
| 1544 | { | 1580 | { |
| 1581 | struct request *rq; | ||
| 1582 | |||
| 1545 | /* | 1583 | /* |
| 1546 | * preemption doesn't flush plug list, so it's possible ctx->cpu is | 1584 | * preemption doesn't flush plug list, so it's possible ctx->cpu is |
| 1547 | * offline now | 1585 | * offline now |
| 1548 | */ | 1586 | */ |
| 1549 | spin_lock(&ctx->lock); | 1587 | list_for_each_entry(rq, list, queuelist) { |
| 1550 | while (!list_empty(list)) { | ||
| 1551 | struct request *rq; | ||
| 1552 | |||
| 1553 | rq = list_first_entry(list, struct request, queuelist); | ||
| 1554 | BUG_ON(rq->mq_ctx != ctx); | 1588 | BUG_ON(rq->mq_ctx != ctx); |
| 1555 | list_del_init(&rq->queuelist); | 1589 | trace_block_rq_insert(hctx->queue, rq); |
| 1556 | __blk_mq_insert_req_list(hctx, rq, false); | ||
| 1557 | } | 1590 | } |
| 1591 | |||
| 1592 | spin_lock(&ctx->lock); | ||
| 1593 | list_splice_tail_init(list, &ctx->rq_list); | ||
| 1558 | blk_mq_hctx_mark_pending(hctx, ctx); | 1594 | blk_mq_hctx_mark_pending(hctx, ctx); |
| 1559 | spin_unlock(&ctx->lock); | 1595 | spin_unlock(&ctx->lock); |
| 1560 | } | 1596 | } |
| @@ -1657,13 +1693,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, | |||
| 1657 | ret = q->mq_ops->queue_rq(hctx, &bd); | 1693 | ret = q->mq_ops->queue_rq(hctx, &bd); |
| 1658 | switch (ret) { | 1694 | switch (ret) { |
| 1659 | case BLK_STS_OK: | 1695 | case BLK_STS_OK: |
| 1696 | blk_mq_update_dispatch_busy(hctx, false); | ||
| 1660 | *cookie = new_cookie; | 1697 | *cookie = new_cookie; |
| 1661 | break; | 1698 | break; |
| 1662 | case BLK_STS_RESOURCE: | 1699 | case BLK_STS_RESOURCE: |
| 1663 | case BLK_STS_DEV_RESOURCE: | 1700 | case BLK_STS_DEV_RESOURCE: |
| 1701 | blk_mq_update_dispatch_busy(hctx, true); | ||
| 1664 | __blk_mq_requeue_request(rq); | 1702 | __blk_mq_requeue_request(rq); |
| 1665 | break; | 1703 | break; |
| 1666 | default: | 1704 | default: |
| 1705 | blk_mq_update_dispatch_busy(hctx, false); | ||
| 1667 | *cookie = BLK_QC_T_NONE; | 1706 | *cookie = BLK_QC_T_NONE; |
| 1668 | break; | 1707 | break; |
| 1669 | } | 1708 | } |
| @@ -1698,7 +1737,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, | |||
| 1698 | if (!blk_mq_get_dispatch_budget(hctx)) | 1737 | if (!blk_mq_get_dispatch_budget(hctx)) |
| 1699 | goto insert; | 1738 | goto insert; |
| 1700 | 1739 | ||
| 1701 | if (!blk_mq_get_driver_tag(rq, NULL, false)) { | 1740 | if (!blk_mq_get_driver_tag(rq)) { |
| 1702 | blk_mq_put_dispatch_budget(hctx); | 1741 | blk_mq_put_dispatch_budget(hctx); |
| 1703 | goto insert; | 1742 | goto insert; |
| 1704 | } | 1743 | } |
| @@ -1746,6 +1785,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) | |||
| 1746 | return ret; | 1785 | return ret; |
| 1747 | } | 1786 | } |
| 1748 | 1787 | ||
| 1788 | void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, | ||
| 1789 | struct list_head *list) | ||
| 1790 | { | ||
| 1791 | while (!list_empty(list)) { | ||
| 1792 | blk_status_t ret; | ||
| 1793 | struct request *rq = list_first_entry(list, struct request, | ||
| 1794 | queuelist); | ||
| 1795 | |||
| 1796 | list_del_init(&rq->queuelist); | ||
| 1797 | ret = blk_mq_request_issue_directly(rq); | ||
| 1798 | if (ret != BLK_STS_OK) { | ||
| 1799 | if (ret == BLK_STS_RESOURCE || | ||
| 1800 | ret == BLK_STS_DEV_RESOURCE) { | ||
| 1801 | list_add(&rq->queuelist, list); | ||
| 1802 | break; | ||
| 1803 | } | ||
| 1804 | blk_mq_end_request(rq, ret); | ||
| 1805 | } | ||
| 1806 | } | ||
| 1807 | } | ||
| 1808 | |||
| 1749 | static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | 1809 | static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) |
| 1750 | { | 1810 | { |
| 1751 | const int is_sync = op_is_sync(bio->bi_opf); | 1811 | const int is_sync = op_is_sync(bio->bi_opf); |
| @@ -1756,7 +1816,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
| 1756 | struct blk_plug *plug; | 1816 | struct blk_plug *plug; |
| 1757 | struct request *same_queue_rq = NULL; | 1817 | struct request *same_queue_rq = NULL; |
| 1758 | blk_qc_t cookie; | 1818 | blk_qc_t cookie; |
| 1759 | unsigned int wb_acct; | ||
| 1760 | 1819 | ||
| 1761 | blk_queue_bounce(q, &bio); | 1820 | blk_queue_bounce(q, &bio); |
| 1762 | 1821 | ||
| @@ -1772,19 +1831,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
| 1772 | if (blk_mq_sched_bio_merge(q, bio)) | 1831 | if (blk_mq_sched_bio_merge(q, bio)) |
| 1773 | return BLK_QC_T_NONE; | 1832 | return BLK_QC_T_NONE; |
| 1774 | 1833 | ||
| 1775 | wb_acct = wbt_wait(q->rq_wb, bio, NULL); | 1834 | rq_qos_throttle(q, bio, NULL); |
| 1776 | 1835 | ||
| 1777 | trace_block_getrq(q, bio, bio->bi_opf); | 1836 | trace_block_getrq(q, bio, bio->bi_opf); |
| 1778 | 1837 | ||
| 1779 | rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); | 1838 | rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); |
| 1780 | if (unlikely(!rq)) { | 1839 | if (unlikely(!rq)) { |
| 1781 | __wbt_done(q->rq_wb, wb_acct); | 1840 | rq_qos_cleanup(q, bio); |
| 1782 | if (bio->bi_opf & REQ_NOWAIT) | 1841 | if (bio->bi_opf & REQ_NOWAIT) |
| 1783 | bio_wouldblock_error(bio); | 1842 | bio_wouldblock_error(bio); |
| 1784 | return BLK_QC_T_NONE; | 1843 | return BLK_QC_T_NONE; |
| 1785 | } | 1844 | } |
| 1786 | 1845 | ||
| 1787 | wbt_track(rq, wb_acct); | 1846 | rq_qos_track(q, rq, bio); |
| 1788 | 1847 | ||
| 1789 | cookie = request_to_qc_t(data.hctx, rq); | 1848 | cookie = request_to_qc_t(data.hctx, rq); |
| 1790 | 1849 | ||
| @@ -1847,7 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) | |||
| 1847 | blk_mq_try_issue_directly(data.hctx, same_queue_rq, | 1906 | blk_mq_try_issue_directly(data.hctx, same_queue_rq, |
| 1848 | &cookie); | 1907 | &cookie); |
| 1849 | } | 1908 | } |
| 1850 | } else if (q->nr_hw_queues > 1 && is_sync) { | 1909 | } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && |
| 1910 | !data.hctx->dispatch_busy)) { | ||
| 1851 | blk_mq_put_ctx(data.ctx); | 1911 | blk_mq_put_ctx(data.ctx); |
| 1852 | blk_mq_bio_to_request(rq, bio); | 1912 | blk_mq_bio_to_request(rq, bio); |
| 1853 | blk_mq_try_issue_directly(data.hctx, rq, &cookie); | 1913 | blk_mq_try_issue_directly(data.hctx, rq, &cookie); |
| @@ -2146,6 +2206,7 @@ static int blk_mq_init_hctx(struct request_queue *q, | |||
| 2146 | 2206 | ||
| 2147 | hctx->nr_ctx = 0; | 2207 | hctx->nr_ctx = 0; |
| 2148 | 2208 | ||
| 2209 | spin_lock_init(&hctx->dispatch_wait_lock); | ||
| 2149 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); | 2210 | init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); |
| 2150 | INIT_LIST_HEAD(&hctx->dispatch_wait.entry); | 2211 | INIT_LIST_HEAD(&hctx->dispatch_wait.entry); |
| 2151 | 2212 | ||
| @@ -2331,15 +2392,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) | |||
| 2331 | int i; | 2392 | int i; |
| 2332 | 2393 | ||
| 2333 | queue_for_each_hw_ctx(q, hctx, i) { | 2394 | queue_for_each_hw_ctx(q, hctx, i) { |
| 2334 | if (shared) { | 2395 | if (shared) |
| 2335 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | ||
| 2336 | atomic_inc(&q->shared_hctx_restart); | ||
| 2337 | hctx->flags |= BLK_MQ_F_TAG_SHARED; | 2396 | hctx->flags |= BLK_MQ_F_TAG_SHARED; |
| 2338 | } else { | 2397 | else |
| 2339 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) | ||
| 2340 | atomic_dec(&q->shared_hctx_restart); | ||
| 2341 | hctx->flags &= ~BLK_MQ_F_TAG_SHARED; | 2398 | hctx->flags &= ~BLK_MQ_F_TAG_SHARED; |
| 2342 | } | ||
| 2343 | } | 2399 | } |
| 2344 | } | 2400 | } |
| 2345 | 2401 | ||
| @@ -2370,7 +2426,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) | |||
| 2370 | blk_mq_update_tag_set_depth(set, false); | 2426 | blk_mq_update_tag_set_depth(set, false); |
| 2371 | } | 2427 | } |
| 2372 | mutex_unlock(&set->tag_list_lock); | 2428 | mutex_unlock(&set->tag_list_lock); |
| 2373 | synchronize_rcu(); | ||
| 2374 | INIT_LIST_HEAD(&q->tag_set_list); | 2429 | INIT_LIST_HEAD(&q->tag_set_list); |
| 2375 | } | 2430 | } |
| 2376 | 2431 | ||
| @@ -2685,7 +2740,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) | |||
| 2685 | static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | 2740 | static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) |
| 2686 | { | 2741 | { |
| 2687 | if (set->ops->map_queues) { | 2742 | if (set->ops->map_queues) { |
| 2688 | int cpu; | ||
| 2689 | /* | 2743 | /* |
| 2690 | * transport .map_queues is usually done in the following | 2744 | * transport .map_queues is usually done in the following |
| 2691 | * way: | 2745 | * way: |
| @@ -2700,8 +2754,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | |||
| 2700 | * killing stale mapping since one CPU may not be mapped | 2754 | * killing stale mapping since one CPU may not be mapped |
| 2701 | * to any hw queue. | 2755 | * to any hw queue. |
| 2702 | */ | 2756 | */ |
| 2703 | for_each_possible_cpu(cpu) | 2757 | blk_mq_clear_mq_map(set); |
| 2704 | set->mq_map[cpu] = 0; | ||
| 2705 | 2758 | ||
| 2706 | return set->ops->map_queues(set); | 2759 | return set->ops->map_queues(set); |
| 2707 | } else | 2760 | } else |
| @@ -2711,7 +2764,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) | |||
| 2711 | /* | 2764 | /* |
| 2712 | * Alloc a tag set to be associated with one or more request queues. | 2765 | * Alloc a tag set to be associated with one or more request queues. |
| 2713 | * May fail with EINVAL for various error conditions. May adjust the | 2766 | * May fail with EINVAL for various error conditions. May adjust the |
| 2714 | * requested depth down, if if it too large. In that case, the set | 2767 | * requested depth down, if it's too large. In that case, the set |
| 2715 | * value will be stored in set->queue_depth. | 2768 | * value will be stored in set->queue_depth. |
| 2716 | */ | 2769 | */ |
| 2717 | int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | 2770 | int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) |
diff --git a/block/blk-mq.h b/block/blk-mq.h index 89231e439b2f..9497b47e2526 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h | |||
| @@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); | |||
| 36 | void blk_mq_wake_waiters(struct request_queue *q); | 36 | void blk_mq_wake_waiters(struct request_queue *q); |
| 37 | bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); | 37 | bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); |
| 38 | void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); | 38 | void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); |
| 39 | bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, | 39 | bool blk_mq_get_driver_tag(struct request *rq); |
| 40 | bool wait); | ||
| 41 | struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, | 40 | struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, |
| 42 | struct blk_mq_ctx *start); | 41 | struct blk_mq_ctx *start); |
| 43 | 42 | ||
| @@ -65,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, | |||
| 65 | 64 | ||
| 66 | /* Used by blk_insert_cloned_request() to issue request directly */ | 65 | /* Used by blk_insert_cloned_request() to issue request directly */ |
| 67 | blk_status_t blk_mq_request_issue_directly(struct request *rq); | 66 | blk_status_t blk_mq_request_issue_directly(struct request *rq); |
| 67 | void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, | ||
| 68 | struct list_head *list); | ||
| 68 | 69 | ||
| 69 | /* | 70 | /* |
| 70 | * CPU -> queue mappings | 71 | * CPU -> queue mappings |
| @@ -203,4 +204,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq) | |||
| 203 | __blk_mq_put_driver_tag(hctx, rq); | 204 | __blk_mq_put_driver_tag(hctx, rq); |
| 204 | } | 205 | } |
| 205 | 206 | ||
| 207 | static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) | ||
| 208 | { | ||
| 209 | int cpu; | ||
| 210 | |||
| 211 | for_each_possible_cpu(cpu) | ||
| 212 | set->mq_map[cpu] = 0; | ||
| 213 | } | ||
| 214 | |||
| 206 | #endif | 215 | #endif |
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c new file mode 100644 index 000000000000..0005dfd568dd --- /dev/null +++ b/block/blk-rq-qos.c | |||
| @@ -0,0 +1,194 @@ | |||
| 1 | #include "blk-rq-qos.h" | ||
| 2 | |||
| 3 | /* | ||
| 4 | * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, | ||
| 5 | * false if 'v' + 1 would be bigger than 'below'. | ||
| 6 | */ | ||
| 7 | static bool atomic_inc_below(atomic_t *v, unsigned int below) | ||
| 8 | { | ||
| 9 | unsigned int cur = atomic_read(v); | ||
| 10 | |||
| 11 | for (;;) { | ||
| 12 | unsigned int old; | ||
| 13 | |||
| 14 | if (cur >= below) | ||
| 15 | return false; | ||
| 16 | old = atomic_cmpxchg(v, cur, cur + 1); | ||
| 17 | if (old == cur) | ||
| 18 | break; | ||
| 19 | cur = old; | ||
| 20 | } | ||
| 21 | |||
| 22 | return true; | ||
| 23 | } | ||
| 24 | |||
| 25 | bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) | ||
| 26 | { | ||
| 27 | return atomic_inc_below(&rq_wait->inflight, limit); | ||
| 28 | } | ||
| 29 | |||
| 30 | void rq_qos_cleanup(struct request_queue *q, struct bio *bio) | ||
| 31 | { | ||
| 32 | struct rq_qos *rqos; | ||
| 33 | |||
| 34 | for (rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 35 | if (rqos->ops->cleanup) | ||
| 36 | rqos->ops->cleanup(rqos, bio); | ||
| 37 | } | ||
| 38 | } | ||
| 39 | |||
| 40 | void rq_qos_done(struct request_queue *q, struct request *rq) | ||
| 41 | { | ||
| 42 | struct rq_qos *rqos; | ||
| 43 | |||
| 44 | for (rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 45 | if (rqos->ops->done) | ||
| 46 | rqos->ops->done(rqos, rq); | ||
| 47 | } | ||
| 48 | } | ||
| 49 | |||
| 50 | void rq_qos_issue(struct request_queue *q, struct request *rq) | ||
| 51 | { | ||
| 52 | struct rq_qos *rqos; | ||
| 53 | |||
| 54 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 55 | if (rqos->ops->issue) | ||
| 56 | rqos->ops->issue(rqos, rq); | ||
| 57 | } | ||
| 58 | } | ||
| 59 | |||
| 60 | void rq_qos_requeue(struct request_queue *q, struct request *rq) | ||
| 61 | { | ||
| 62 | struct rq_qos *rqos; | ||
| 63 | |||
| 64 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 65 | if (rqos->ops->requeue) | ||
| 66 | rqos->ops->requeue(rqos, rq); | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | void rq_qos_throttle(struct request_queue *q, struct bio *bio, | ||
| 71 | spinlock_t *lock) | ||
| 72 | { | ||
| 73 | struct rq_qos *rqos; | ||
| 74 | |||
| 75 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 76 | if (rqos->ops->throttle) | ||
| 77 | rqos->ops->throttle(rqos, bio, lock); | ||
| 78 | } | ||
| 79 | } | ||
| 80 | |||
| 81 | void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) | ||
| 82 | { | ||
| 83 | struct rq_qos *rqos; | ||
| 84 | |||
| 85 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 86 | if (rqos->ops->track) | ||
| 87 | rqos->ops->track(rqos, rq, bio); | ||
| 88 | } | ||
| 89 | } | ||
| 90 | |||
| 91 | void rq_qos_done_bio(struct request_queue *q, struct bio *bio) | ||
| 92 | { | ||
| 93 | struct rq_qos *rqos; | ||
| 94 | |||
| 95 | for(rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 96 | if (rqos->ops->done_bio) | ||
| 97 | rqos->ops->done_bio(rqos, bio); | ||
| 98 | } | ||
| 99 | } | ||
| 100 | |||
| 101 | /* | ||
| 102 | * Return true, if we can't increase the depth further by scaling | ||
| 103 | */ | ||
| 104 | bool rq_depth_calc_max_depth(struct rq_depth *rqd) | ||
| 105 | { | ||
| 106 | unsigned int depth; | ||
| 107 | bool ret = false; | ||
| 108 | |||
| 109 | /* | ||
| 110 | * For QD=1 devices, this is a special case. It's important for those | ||
| 111 | * to have one request ready when one completes, so force a depth of | ||
| 112 | * 2 for those devices. On the backend, it'll be a depth of 1 anyway, | ||
| 113 | * since the device can't have more than that in flight. If we're | ||
| 114 | * scaling down, then keep a setting of 1/1/1. | ||
| 115 | */ | ||
| 116 | if (rqd->queue_depth == 1) { | ||
| 117 | if (rqd->scale_step > 0) | ||
| 118 | rqd->max_depth = 1; | ||
| 119 | else { | ||
| 120 | rqd->max_depth = 2; | ||
| 121 | ret = true; | ||
| 122 | } | ||
| 123 | } else { | ||
| 124 | /* | ||
| 125 | * scale_step == 0 is our default state. If we have suffered | ||
| 126 | * latency spikes, step will be > 0, and we shrink the | ||
| 127 | * allowed write depths. If step is < 0, we're only doing | ||
| 128 | * writes, and we allow a temporarily higher depth to | ||
| 129 | * increase performance. | ||
| 130 | */ | ||
| 131 | depth = min_t(unsigned int, rqd->default_depth, | ||
| 132 | rqd->queue_depth); | ||
| 133 | if (rqd->scale_step > 0) | ||
| 134 | depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); | ||
| 135 | else if (rqd->scale_step < 0) { | ||
| 136 | unsigned int maxd = 3 * rqd->queue_depth / 4; | ||
| 137 | |||
| 138 | depth = 1 + ((depth - 1) << -rqd->scale_step); | ||
| 139 | if (depth > maxd) { | ||
| 140 | depth = maxd; | ||
| 141 | ret = true; | ||
| 142 | } | ||
| 143 | } | ||
| 144 | |||
| 145 | rqd->max_depth = depth; | ||
| 146 | } | ||
| 147 | |||
| 148 | return ret; | ||
| 149 | } | ||
| 150 | |||
| 151 | void rq_depth_scale_up(struct rq_depth *rqd) | ||
| 152 | { | ||
| 153 | /* | ||
| 154 | * Hit max in previous round, stop here | ||
| 155 | */ | ||
| 156 | if (rqd->scaled_max) | ||
| 157 | return; | ||
| 158 | |||
| 159 | rqd->scale_step--; | ||
| 160 | |||
| 161 | rqd->scaled_max = rq_depth_calc_max_depth(rqd); | ||
| 162 | } | ||
| 163 | |||
| 164 | /* | ||
| 165 | * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we | ||
| 166 | * had a latency violation. | ||
| 167 | */ | ||
| 168 | void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) | ||
| 169 | { | ||
| 170 | /* | ||
| 171 | * Stop scaling down when we've hit the limit. This also prevents | ||
| 172 | * ->scale_step from going to crazy values, if the device can't | ||
| 173 | * keep up. | ||
| 174 | */ | ||
| 175 | if (rqd->max_depth == 1) | ||
| 176 | return; | ||
| 177 | |||
| 178 | if (rqd->scale_step < 0 && hard_throttle) | ||
| 179 | rqd->scale_step = 0; | ||
| 180 | else | ||
| 181 | rqd->scale_step++; | ||
| 182 | |||
| 183 | rqd->scaled_max = false; | ||
| 184 | rq_depth_calc_max_depth(rqd); | ||
| 185 | } | ||
| 186 | |||
| 187 | void rq_qos_exit(struct request_queue *q) | ||
| 188 | { | ||
| 189 | while (q->rq_qos) { | ||
| 190 | struct rq_qos *rqos = q->rq_qos; | ||
| 191 | q->rq_qos = rqos->next; | ||
| 192 | rqos->ops->exit(rqos); | ||
| 193 | } | ||
| 194 | } | ||
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h new file mode 100644 index 000000000000..32b02efbfa66 --- /dev/null +++ b/block/blk-rq-qos.h | |||
| @@ -0,0 +1,109 @@ | |||
| 1 | #ifndef RQ_QOS_H | ||
| 2 | #define RQ_QOS_H | ||
| 3 | |||
| 4 | #include <linux/kernel.h> | ||
| 5 | #include <linux/blkdev.h> | ||
| 6 | #include <linux/blk_types.h> | ||
| 7 | #include <linux/atomic.h> | ||
| 8 | #include <linux/wait.h> | ||
| 9 | |||
| 10 | enum rq_qos_id { | ||
| 11 | RQ_QOS_WBT, | ||
| 12 | RQ_QOS_CGROUP, | ||
| 13 | }; | ||
| 14 | |||
| 15 | struct rq_wait { | ||
| 16 | wait_queue_head_t wait; | ||
| 17 | atomic_t inflight; | ||
| 18 | }; | ||
| 19 | |||
| 20 | struct rq_qos { | ||
| 21 | struct rq_qos_ops *ops; | ||
| 22 | struct request_queue *q; | ||
| 23 | enum rq_qos_id id; | ||
| 24 | struct rq_qos *next; | ||
| 25 | }; | ||
| 26 | |||
| 27 | struct rq_qos_ops { | ||
| 28 | void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); | ||
| 29 | void (*track)(struct rq_qos *, struct request *, struct bio *); | ||
| 30 | void (*issue)(struct rq_qos *, struct request *); | ||
| 31 | void (*requeue)(struct rq_qos *, struct request *); | ||
| 32 | void (*done)(struct rq_qos *, struct request *); | ||
| 33 | void (*done_bio)(struct rq_qos *, struct bio *); | ||
| 34 | void (*cleanup)(struct rq_qos *, struct bio *); | ||
| 35 | void (*exit)(struct rq_qos *); | ||
| 36 | }; | ||
| 37 | |||
| 38 | struct rq_depth { | ||
| 39 | unsigned int max_depth; | ||
| 40 | |||
| 41 | int scale_step; | ||
| 42 | bool scaled_max; | ||
| 43 | |||
| 44 | unsigned int queue_depth; | ||
| 45 | unsigned int default_depth; | ||
| 46 | }; | ||
| 47 | |||
| 48 | static inline struct rq_qos *rq_qos_id(struct request_queue *q, | ||
| 49 | enum rq_qos_id id) | ||
| 50 | { | ||
| 51 | struct rq_qos *rqos; | ||
| 52 | for (rqos = q->rq_qos; rqos; rqos = rqos->next) { | ||
| 53 | if (rqos->id == id) | ||
| 54 | break; | ||
| 55 | } | ||
| 56 | return rqos; | ||
| 57 | } | ||
| 58 | |||
| 59 | static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) | ||
| 60 | { | ||
| 61 | return rq_qos_id(q, RQ_QOS_WBT); | ||
| 62 | } | ||
| 63 | |||
| 64 | static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) | ||
| 65 | { | ||
| 66 | return rq_qos_id(q, RQ_QOS_CGROUP); | ||
| 67 | } | ||
| 68 | |||
| 69 | static inline void rq_wait_init(struct rq_wait *rq_wait) | ||
| 70 | { | ||
| 71 | atomic_set(&rq_wait->inflight, 0); | ||
| 72 | init_waitqueue_head(&rq_wait->wait); | ||
| 73 | } | ||
| 74 | |||
| 75 | static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) | ||
| 76 | { | ||
| 77 | rqos->next = q->rq_qos; | ||
| 78 | q->rq_qos = rqos; | ||
| 79 | } | ||
| 80 | |||
| 81 | static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) | ||
| 82 | { | ||
| 83 | struct rq_qos *cur, *prev = NULL; | ||
| 84 | for (cur = q->rq_qos; cur; cur = cur->next) { | ||
| 85 | if (cur == rqos) { | ||
| 86 | if (prev) | ||
| 87 | prev->next = rqos->next; | ||
| 88 | else | ||
| 89 | q->rq_qos = cur; | ||
| 90 | break; | ||
| 91 | } | ||
| 92 | prev = cur; | ||
| 93 | } | ||
| 94 | } | ||
| 95 | |||
| 96 | bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); | ||
| 97 | void rq_depth_scale_up(struct rq_depth *rqd); | ||
| 98 | void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); | ||
| 99 | bool rq_depth_calc_max_depth(struct rq_depth *rqd); | ||
| 100 | |||
| 101 | void rq_qos_cleanup(struct request_queue *, struct bio *); | ||
| 102 | void rq_qos_done(struct request_queue *, struct request *); | ||
| 103 | void rq_qos_issue(struct request_queue *, struct request *); | ||
| 104 | void rq_qos_requeue(struct request_queue *, struct request *); | ||
| 105 | void rq_qos_done_bio(struct request_queue *q, struct bio *bio); | ||
| 106 | void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); | ||
| 107 | void rq_qos_track(struct request_queue *q, struct request *, struct bio *); | ||
| 108 | void rq_qos_exit(struct request_queue *); | ||
| 109 | #endif | ||
diff --git a/block/blk-settings.c b/block/blk-settings.c index d1de71124656..ffd459969689 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
| @@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) | |||
| 128 | 128 | ||
| 129 | /* Inherit limits from component devices */ | 129 | /* Inherit limits from component devices */ |
| 130 | lim->max_segments = USHRT_MAX; | 130 | lim->max_segments = USHRT_MAX; |
| 131 | lim->max_discard_segments = 1; | 131 | lim->max_discard_segments = USHRT_MAX; |
| 132 | lim->max_hw_sectors = UINT_MAX; | 132 | lim->max_hw_sectors = UINT_MAX; |
| 133 | lim->max_segment_size = UINT_MAX; | 133 | lim->max_segment_size = UINT_MAX; |
| 134 | lim->max_sectors = UINT_MAX; | 134 | lim->max_sectors = UINT_MAX; |
| @@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); | |||
| 875 | void blk_set_queue_depth(struct request_queue *q, unsigned int depth) | 875 | void blk_set_queue_depth(struct request_queue *q, unsigned int depth) |
| 876 | { | 876 | { |
| 877 | q->queue_depth = depth; | 877 | q->queue_depth = depth; |
| 878 | wbt_set_queue_depth(q->rq_wb, depth); | 878 | wbt_set_queue_depth(q, depth); |
| 879 | } | 879 | } |
| 880 | EXPORT_SYMBOL(blk_set_queue_depth); | 880 | EXPORT_SYMBOL(blk_set_queue_depth); |
| 881 | 881 | ||
| @@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) | |||
| 900 | queue_flag_clear(QUEUE_FLAG_FUA, q); | 900 | queue_flag_clear(QUEUE_FLAG_FUA, q); |
| 901 | spin_unlock_irq(q->queue_lock); | 901 | spin_unlock_irq(q->queue_lock); |
| 902 | 902 | ||
| 903 | wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); | 903 | wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); |
| 904 | } | 904 | } |
| 905 | EXPORT_SYMBOL_GPL(blk_queue_write_cache); | 905 | EXPORT_SYMBOL_GPL(blk_queue_write_cache); |
| 906 | 906 | ||
diff --git a/block/blk-stat.c b/block/blk-stat.c index 175c143ac5b9..7587b1c3caaf 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c | |||
| @@ -17,7 +17,7 @@ struct blk_queue_stats { | |||
| 17 | bool enable_accounting; | 17 | bool enable_accounting; |
| 18 | }; | 18 | }; |
| 19 | 19 | ||
| 20 | static void blk_stat_init(struct blk_rq_stat *stat) | 20 | void blk_rq_stat_init(struct blk_rq_stat *stat) |
| 21 | { | 21 | { |
| 22 | stat->min = -1ULL; | 22 | stat->min = -1ULL; |
| 23 | stat->max = stat->nr_samples = stat->mean = 0; | 23 | stat->max = stat->nr_samples = stat->mean = 0; |
| @@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat) | |||
| 25 | } | 25 | } |
| 26 | 26 | ||
| 27 | /* src is a per-cpu stat, mean isn't initialized */ | 27 | /* src is a per-cpu stat, mean isn't initialized */ |
| 28 | static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) | 28 | void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) |
| 29 | { | 29 | { |
| 30 | if (!src->nr_samples) | 30 | if (!src->nr_samples) |
| 31 | return; | 31 | return; |
| @@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) | |||
| 39 | dst->nr_samples += src->nr_samples; | 39 | dst->nr_samples += src->nr_samples; |
| 40 | } | 40 | } |
| 41 | 41 | ||
| 42 | static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) | 42 | void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) |
| 43 | { | 43 | { |
| 44 | stat->min = min(stat->min, value); | 44 | stat->min = min(stat->min, value); |
| 45 | stat->max = max(stat->max, value); | 45 | stat->max = max(stat->max, value); |
| @@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now) | |||
| 69 | continue; | 69 | continue; |
| 70 | 70 | ||
| 71 | stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; | 71 | stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; |
| 72 | __blk_stat_add(stat, value); | 72 | blk_rq_stat_add(stat, value); |
| 73 | put_cpu_ptr(cb->cpu_stat); | 73 | put_cpu_ptr(cb->cpu_stat); |
| 74 | } | 74 | } |
| 75 | rcu_read_unlock(); | 75 | rcu_read_unlock(); |
| @@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t) | |||
| 82 | int cpu; | 82 | int cpu; |
| 83 | 83 | ||
| 84 | for (bucket = 0; bucket < cb->buckets; bucket++) | 84 | for (bucket = 0; bucket < cb->buckets; bucket++) |
| 85 | blk_stat_init(&cb->stat[bucket]); | 85 | blk_rq_stat_init(&cb->stat[bucket]); |
| 86 | 86 | ||
| 87 | for_each_online_cpu(cpu) { | 87 | for_each_online_cpu(cpu) { |
| 88 | struct blk_rq_stat *cpu_stat; | 88 | struct blk_rq_stat *cpu_stat; |
| 89 | 89 | ||
| 90 | cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); | 90 | cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); |
| 91 | for (bucket = 0; bucket < cb->buckets; bucket++) { | 91 | for (bucket = 0; bucket < cb->buckets; bucket++) { |
| 92 | blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); | 92 | blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); |
| 93 | blk_stat_init(&cpu_stat[bucket]); | 93 | blk_rq_stat_init(&cpu_stat[bucket]); |
| 94 | } | 94 | } |
| 95 | } | 95 | } |
| 96 | 96 | ||
| @@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q, | |||
| 143 | 143 | ||
| 144 | cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); | 144 | cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); |
| 145 | for (bucket = 0; bucket < cb->buckets; bucket++) | 145 | for (bucket = 0; bucket < cb->buckets; bucket++) |
| 146 | blk_stat_init(&cpu_stat[bucket]); | 146 | blk_rq_stat_init(&cpu_stat[bucket]); |
| 147 | } | 147 | } |
| 148 | 148 | ||
| 149 | spin_lock(&q->stats->lock); | 149 | spin_lock(&q->stats->lock); |
diff --git a/block/blk-stat.h b/block/blk-stat.h index 78399cdde9c9..f4a1568e81a4 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h | |||
| @@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, | |||
| 159 | mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); | 159 | mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); |
| 160 | } | 160 | } |
| 161 | 161 | ||
| 162 | void blk_rq_stat_add(struct blk_rq_stat *, u64); | ||
| 163 | void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); | ||
| 164 | void blk_rq_stat_init(struct blk_rq_stat *); | ||
| 165 | |||
| 162 | #endif | 166 | #endif |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 94987b1f69e1..bb109bb0a055 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
| @@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, | |||
| 422 | 422 | ||
| 423 | static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) | 423 | static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) |
| 424 | { | 424 | { |
| 425 | if (!q->rq_wb) | 425 | if (!wbt_rq_qos(q)) |
| 426 | return -EINVAL; | 426 | return -EINVAL; |
| 427 | 427 | ||
| 428 | return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); | 428 | return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); |
| 429 | } | 429 | } |
| 430 | 430 | ||
| 431 | static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, | 431 | static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, |
| 432 | size_t count) | 432 | size_t count) |
| 433 | { | 433 | { |
| 434 | struct rq_wb *rwb; | 434 | struct rq_qos *rqos; |
| 435 | ssize_t ret; | 435 | ssize_t ret; |
| 436 | s64 val; | 436 | s64 val; |
| 437 | 437 | ||
| @@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, | |||
| 441 | if (val < -1) | 441 | if (val < -1) |
| 442 | return -EINVAL; | 442 | return -EINVAL; |
| 443 | 443 | ||
| 444 | rwb = q->rq_wb; | 444 | rqos = wbt_rq_qos(q); |
| 445 | if (!rwb) { | 445 | if (!rqos) { |
| 446 | ret = wbt_init(q); | 446 | ret = wbt_init(q); |
| 447 | if (ret) | 447 | if (ret) |
| 448 | return ret; | 448 | return ret; |
| 449 | } | 449 | } |
| 450 | 450 | ||
| 451 | rwb = q->rq_wb; | ||
| 452 | if (val == -1) | 451 | if (val == -1) |
| 453 | rwb->min_lat_nsec = wbt_default_latency_nsec(q); | 452 | val = wbt_default_latency_nsec(q); |
| 454 | else if (val >= 0) | 453 | else if (val >= 0) |
| 455 | rwb->min_lat_nsec = val * 1000ULL; | 454 | val *= 1000ULL; |
| 456 | 455 | ||
| 457 | if (rwb->enable_state == WBT_STATE_ON_DEFAULT) | 456 | wbt_set_min_lat(q, val); |
| 458 | rwb->enable_state = WBT_STATE_ON_MANUAL; | ||
| 459 | 457 | ||
| 460 | wbt_update_limits(rwb); | 458 | wbt_update_limits(q); |
| 461 | return count; | 459 | return count; |
| 462 | } | 460 | } |
| 463 | 461 | ||
| @@ -804,6 +802,21 @@ static void __blk_release_queue(struct work_struct *work) | |||
| 804 | blk_stat_remove_callback(q, q->poll_cb); | 802 | blk_stat_remove_callback(q, q->poll_cb); |
| 805 | blk_stat_free_callback(q->poll_cb); | 803 | blk_stat_free_callback(q->poll_cb); |
| 806 | 804 | ||
| 805 | if (!blk_queue_dead(q)) { | ||
| 806 | /* | ||
| 807 | * Last reference was dropped without having called | ||
| 808 | * blk_cleanup_queue(). | ||
| 809 | */ | ||
| 810 | WARN_ONCE(blk_queue_init_done(q), | ||
| 811 | "request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n", | ||
| 812 | q); | ||
| 813 | blk_exit_queue(q); | ||
| 814 | } | ||
| 815 | |||
| 816 | WARN(blk_queue_root_blkg(q), | ||
| 817 | "request queue %p is being released but it has not yet been removed from the blkcg controller\n", | ||
| 818 | q); | ||
| 819 | |||
| 807 | blk_free_queue_stats(q->stats); | 820 | blk_free_queue_stats(q->stats); |
| 808 | 821 | ||
| 809 | blk_exit_rl(q, &q->root_rl); | 822 | blk_exit_rl(q, &q->root_rl); |
| @@ -964,7 +977,7 @@ void blk_unregister_queue(struct gendisk *disk) | |||
| 964 | kobject_del(&q->kobj); | 977 | kobject_del(&q->kobj); |
| 965 | blk_trace_remove_sysfs(disk_to_dev(disk)); | 978 | blk_trace_remove_sysfs(disk_to_dev(disk)); |
| 966 | 979 | ||
| 967 | wbt_exit(q); | 980 | rq_qos_exit(q); |
| 968 | 981 | ||
| 969 | mutex_lock(&q->sysfs_lock); | 982 | mutex_lock(&q->sysfs_lock); |
| 970 | if (q->request_fn || (q->mq_ops && q->elevator)) | 983 | if (q->request_fn || (q->mq_ops && q->elevator)) |
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 82282e6fdcf8..a3eede00d302 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
| @@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) | |||
| 579 | struct throtl_grp *tg = blkg_to_tg(blkg); | 579 | struct throtl_grp *tg = blkg_to_tg(blkg); |
| 580 | 580 | ||
| 581 | if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || | 581 | if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || |
| 582 | tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) | 582 | tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { |
| 583 | low_valid = true; | 583 | low_valid = true; |
| 584 | break; | ||
| 585 | } | ||
| 584 | } | 586 | } |
| 585 | rcu_read_unlock(); | 587 | rcu_read_unlock(); |
| 586 | 588 | ||
| @@ -920,12 +922,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, | |||
| 920 | } | 922 | } |
| 921 | 923 | ||
| 922 | /* Calc approx time to dispatch */ | 924 | /* Calc approx time to dispatch */ |
| 923 | jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1; | 925 | jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; |
| 924 | |||
| 925 | if (jiffy_wait > jiffy_elapsed) | ||
| 926 | jiffy_wait = jiffy_wait - jiffy_elapsed; | ||
| 927 | else | ||
| 928 | jiffy_wait = 1; | ||
| 929 | 926 | ||
| 930 | if (wait) | 927 | if (wait) |
| 931 | *wait = jiffy_wait; | 928 | *wait = jiffy_wait; |
| @@ -2132,12 +2129,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) | |||
| 2132 | static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) | 2129 | static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) |
| 2133 | { | 2130 | { |
| 2134 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | 2131 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW |
| 2135 | if (bio->bi_css) { | 2132 | if (bio->bi_css) |
| 2136 | if (bio->bi_cg_private) | 2133 | bio_associate_blkg(bio, tg_to_blkg(tg)); |
| 2137 | blkg_put(tg_to_blkg(bio->bi_cg_private)); | ||
| 2138 | bio->bi_cg_private = tg; | ||
| 2139 | blkg_get(tg_to_blkg(tg)); | ||
| 2140 | } | ||
| 2141 | bio_issue_init(&bio->bi_issue, bio_sectors(bio)); | 2134 | bio_issue_init(&bio->bi_issue, bio_sectors(bio)); |
| 2142 | #endif | 2135 | #endif |
| 2143 | } | 2136 | } |
| @@ -2285,6 +2278,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns) | |||
| 2285 | 2278 | ||
| 2286 | void blk_throtl_bio_endio(struct bio *bio) | 2279 | void blk_throtl_bio_endio(struct bio *bio) |
| 2287 | { | 2280 | { |
| 2281 | struct blkcg_gq *blkg; | ||
| 2288 | struct throtl_grp *tg; | 2282 | struct throtl_grp *tg; |
| 2289 | u64 finish_time_ns; | 2283 | u64 finish_time_ns; |
| 2290 | unsigned long finish_time; | 2284 | unsigned long finish_time; |
| @@ -2292,20 +2286,18 @@ void blk_throtl_bio_endio(struct bio *bio) | |||
| 2292 | unsigned long lat; | 2286 | unsigned long lat; |
| 2293 | int rw = bio_data_dir(bio); | 2287 | int rw = bio_data_dir(bio); |
| 2294 | 2288 | ||
| 2295 | tg = bio->bi_cg_private; | 2289 | blkg = bio->bi_blkg; |
| 2296 | if (!tg) | 2290 | if (!blkg) |
| 2297 | return; | 2291 | return; |
| 2298 | bio->bi_cg_private = NULL; | 2292 | tg = blkg_to_tg(blkg); |
| 2299 | 2293 | ||
| 2300 | finish_time_ns = ktime_get_ns(); | 2294 | finish_time_ns = ktime_get_ns(); |
| 2301 | tg->last_finish_time = finish_time_ns >> 10; | 2295 | tg->last_finish_time = finish_time_ns >> 10; |
| 2302 | 2296 | ||
| 2303 | start_time = bio_issue_time(&bio->bi_issue) >> 10; | 2297 | start_time = bio_issue_time(&bio->bi_issue) >> 10; |
| 2304 | finish_time = __bio_issue_time(finish_time_ns) >> 10; | 2298 | finish_time = __bio_issue_time(finish_time_ns) >> 10; |
| 2305 | if (!start_time || finish_time <= start_time) { | 2299 | if (!start_time || finish_time <= start_time) |
| 2306 | blkg_put(tg_to_blkg(tg)); | ||
| 2307 | return; | 2300 | return; |
| 2308 | } | ||
| 2309 | 2301 | ||
| 2310 | lat = finish_time - start_time; | 2302 | lat = finish_time - start_time; |
| 2311 | /* this is only for bio based driver */ | 2303 | /* this is only for bio based driver */ |
| @@ -2334,8 +2326,6 @@ void blk_throtl_bio_endio(struct bio *bio) | |||
| 2334 | tg->bio_cnt /= 2; | 2326 | tg->bio_cnt /= 2; |
| 2335 | tg->bad_bio_cnt /= 2; | 2327 | tg->bad_bio_cnt /= 2; |
| 2336 | } | 2328 | } |
| 2337 | |||
| 2338 | blkg_put(tg_to_blkg(tg)); | ||
| 2339 | } | 2329 | } |
| 2340 | #endif | 2330 | #endif |
| 2341 | 2331 | ||
diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4f89b28fa652..1d94a20374fc 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/swap.h> | 25 | #include <linux/swap.h> |
| 26 | 26 | ||
| 27 | #include "blk-wbt.h" | 27 | #include "blk-wbt.h" |
| 28 | #include "blk-rq-qos.h" | ||
| 28 | 29 | ||
| 29 | #define CREATE_TRACE_POINTS | 30 | #define CREATE_TRACE_POINTS |
| 30 | #include <trace/events/wbt.h> | 31 | #include <trace/events/wbt.h> |
| @@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb) | |||
| 78 | return rwb && rwb->wb_normal != 0; | 79 | return rwb && rwb->wb_normal != 0; |
| 79 | } | 80 | } |
| 80 | 81 | ||
| 81 | /* | ||
| 82 | * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, | ||
| 83 | * false if 'v' + 1 would be bigger than 'below'. | ||
| 84 | */ | ||
| 85 | static bool atomic_inc_below(atomic_t *v, int below) | ||
| 86 | { | ||
| 87 | int cur = atomic_read(v); | ||
| 88 | |||
| 89 | for (;;) { | ||
| 90 | int old; | ||
| 91 | |||
| 92 | if (cur >= below) | ||
| 93 | return false; | ||
| 94 | old = atomic_cmpxchg(v, cur, cur + 1); | ||
| 95 | if (old == cur) | ||
| 96 | break; | ||
| 97 | cur = old; | ||
| 98 | } | ||
| 99 | |||
| 100 | return true; | ||
| 101 | } | ||
| 102 | |||
| 103 | static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) | 82 | static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) |
| 104 | { | 83 | { |
| 105 | if (rwb_enabled(rwb)) { | 84 | if (rwb_enabled(rwb)) { |
| @@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) | |||
| 116 | */ | 95 | */ |
| 117 | static bool wb_recent_wait(struct rq_wb *rwb) | 96 | static bool wb_recent_wait(struct rq_wb *rwb) |
| 118 | { | 97 | { |
| 119 | struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; | 98 | struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; |
| 120 | 99 | ||
| 121 | return time_before(jiffies, wb->dirty_sleep + HZ); | 100 | return time_before(jiffies, wb->dirty_sleep + HZ); |
| 122 | } | 101 | } |
| @@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb) | |||
| 144 | } | 123 | } |
| 145 | } | 124 | } |
| 146 | 125 | ||
| 147 | void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) | 126 | static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) |
| 148 | { | 127 | { |
| 128 | struct rq_wb *rwb = RQWB(rqos); | ||
| 149 | struct rq_wait *rqw; | 129 | struct rq_wait *rqw; |
| 150 | int inflight, limit; | 130 | int inflight, limit; |
| 151 | 131 | ||
| @@ -186,7 +166,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) | |||
| 186 | int diff = limit - inflight; | 166 | int diff = limit - inflight; |
| 187 | 167 | ||
| 188 | if (!inflight || diff >= rwb->wb_background / 2) | 168 | if (!inflight || diff >= rwb->wb_background / 2) |
| 189 | wake_up_all(&rqw->wait); | 169 | wake_up(&rqw->wait); |
| 190 | } | 170 | } |
| 191 | } | 171 | } |
| 192 | 172 | ||
| @@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) | |||
| 194 | * Called on completion of a request. Note that it's also called when | 174 | * Called on completion of a request. Note that it's also called when |
| 195 | * a request is merged, when the request gets freed. | 175 | * a request is merged, when the request gets freed. |
| 196 | */ | 176 | */ |
| 197 | void wbt_done(struct rq_wb *rwb, struct request *rq) | 177 | static void wbt_done(struct rq_qos *rqos, struct request *rq) |
| 198 | { | 178 | { |
| 199 | if (!rwb) | 179 | struct rq_wb *rwb = RQWB(rqos); |
| 200 | return; | ||
| 201 | 180 | ||
| 202 | if (!wbt_is_tracked(rq)) { | 181 | if (!wbt_is_tracked(rq)) { |
| 203 | if (rwb->sync_cookie == rq) { | 182 | if (rwb->sync_cookie == rq) { |
| @@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq) | |||
| 209 | wb_timestamp(rwb, &rwb->last_comp); | 188 | wb_timestamp(rwb, &rwb->last_comp); |
| 210 | } else { | 189 | } else { |
| 211 | WARN_ON_ONCE(rq == rwb->sync_cookie); | 190 | WARN_ON_ONCE(rq == rwb->sync_cookie); |
| 212 | __wbt_done(rwb, wbt_flags(rq)); | 191 | __wbt_done(rqos, wbt_flags(rq)); |
| 213 | } | 192 | } |
| 214 | wbt_clear_state(rq); | 193 | wbt_clear_state(rq); |
| 215 | } | 194 | } |
| 216 | 195 | ||
| 217 | /* | ||
| 218 | * Return true, if we can't increase the depth further by scaling | ||
| 219 | */ | ||
| 220 | static bool calc_wb_limits(struct rq_wb *rwb) | ||
| 221 | { | ||
| 222 | unsigned int depth; | ||
| 223 | bool ret = false; | ||
| 224 | |||
| 225 | if (!rwb->min_lat_nsec) { | ||
| 226 | rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; | ||
| 227 | return false; | ||
| 228 | } | ||
| 229 | |||
| 230 | /* | ||
| 231 | * For QD=1 devices, this is a special case. It's important for those | ||
| 232 | * to have one request ready when one completes, so force a depth of | ||
| 233 | * 2 for those devices. On the backend, it'll be a depth of 1 anyway, | ||
| 234 | * since the device can't have more than that in flight. If we're | ||
| 235 | * scaling down, then keep a setting of 1/1/1. | ||
| 236 | */ | ||
| 237 | if (rwb->queue_depth == 1) { | ||
| 238 | if (rwb->scale_step > 0) | ||
| 239 | rwb->wb_max = rwb->wb_normal = 1; | ||
| 240 | else { | ||
| 241 | rwb->wb_max = rwb->wb_normal = 2; | ||
| 242 | ret = true; | ||
| 243 | } | ||
| 244 | rwb->wb_background = 1; | ||
| 245 | } else { | ||
| 246 | /* | ||
| 247 | * scale_step == 0 is our default state. If we have suffered | ||
| 248 | * latency spikes, step will be > 0, and we shrink the | ||
| 249 | * allowed write depths. If step is < 0, we're only doing | ||
| 250 | * writes, and we allow a temporarily higher depth to | ||
| 251 | * increase performance. | ||
| 252 | */ | ||
| 253 | depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); | ||
| 254 | if (rwb->scale_step > 0) | ||
| 255 | depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); | ||
| 256 | else if (rwb->scale_step < 0) { | ||
| 257 | unsigned int maxd = 3 * rwb->queue_depth / 4; | ||
| 258 | |||
| 259 | depth = 1 + ((depth - 1) << -rwb->scale_step); | ||
| 260 | if (depth > maxd) { | ||
| 261 | depth = maxd; | ||
| 262 | ret = true; | ||
| 263 | } | ||
| 264 | } | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Set our max/normal/bg queue depths based on how far | ||
| 268 | * we have scaled down (->scale_step). | ||
| 269 | */ | ||
| 270 | rwb->wb_max = depth; | ||
| 271 | rwb->wb_normal = (rwb->wb_max + 1) / 2; | ||
| 272 | rwb->wb_background = (rwb->wb_max + 3) / 4; | ||
| 273 | } | ||
| 274 | |||
| 275 | return ret; | ||
| 276 | } | ||
| 277 | |||
| 278 | static inline bool stat_sample_valid(struct blk_rq_stat *stat) | 196 | static inline bool stat_sample_valid(struct blk_rq_stat *stat) |
| 279 | { | 197 | { |
| 280 | /* | 198 | /* |
| @@ -307,7 +225,8 @@ enum { | |||
| 307 | 225 | ||
| 308 | static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | 226 | static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) |
| 309 | { | 227 | { |
| 310 | struct backing_dev_info *bdi = rwb->queue->backing_dev_info; | 228 | struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; |
| 229 | struct rq_depth *rqd = &rwb->rq_depth; | ||
| 311 | u64 thislat; | 230 | u64 thislat; |
| 312 | 231 | ||
| 313 | /* | 232 | /* |
| @@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | |||
| 351 | return LAT_EXCEEDED; | 270 | return LAT_EXCEEDED; |
| 352 | } | 271 | } |
| 353 | 272 | ||
| 354 | if (rwb->scale_step) | 273 | if (rqd->scale_step) |
| 355 | trace_wbt_stat(bdi, stat); | 274 | trace_wbt_stat(bdi, stat); |
| 356 | 275 | ||
| 357 | return LAT_OK; | 276 | return LAT_OK; |
| @@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) | |||
| 359 | 278 | ||
| 360 | static void rwb_trace_step(struct rq_wb *rwb, const char *msg) | 279 | static void rwb_trace_step(struct rq_wb *rwb, const char *msg) |
| 361 | { | 280 | { |
| 362 | struct backing_dev_info *bdi = rwb->queue->backing_dev_info; | 281 | struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; |
| 282 | struct rq_depth *rqd = &rwb->rq_depth; | ||
| 363 | 283 | ||
| 364 | trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, | 284 | trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, |
| 365 | rwb->wb_background, rwb->wb_normal, rwb->wb_max); | 285 | rwb->wb_background, rwb->wb_normal, rqd->max_depth); |
| 366 | } | 286 | } |
| 367 | 287 | ||
| 368 | static void scale_up(struct rq_wb *rwb) | 288 | static void calc_wb_limits(struct rq_wb *rwb) |
| 369 | { | 289 | { |
| 370 | /* | 290 | if (rwb->min_lat_nsec == 0) { |
| 371 | * Hit max in previous round, stop here | 291 | rwb->wb_normal = rwb->wb_background = 0; |
| 372 | */ | 292 | } else if (rwb->rq_depth.max_depth <= 2) { |
| 373 | if (rwb->scaled_max) | 293 | rwb->wb_normal = rwb->rq_depth.max_depth; |
| 374 | return; | 294 | rwb->wb_background = 1; |
| 295 | } else { | ||
| 296 | rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; | ||
| 297 | rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; | ||
| 298 | } | ||
| 299 | } | ||
| 375 | 300 | ||
| 376 | rwb->scale_step--; | 301 | static void scale_up(struct rq_wb *rwb) |
| 302 | { | ||
| 303 | rq_depth_scale_up(&rwb->rq_depth); | ||
| 304 | calc_wb_limits(rwb); | ||
| 377 | rwb->unknown_cnt = 0; | 305 | rwb->unknown_cnt = 0; |
| 378 | 306 | rwb_trace_step(rwb, "scale up"); | |
| 379 | rwb->scaled_max = calc_wb_limits(rwb); | ||
| 380 | |||
| 381 | rwb_wake_all(rwb); | ||
| 382 | |||
| 383 | rwb_trace_step(rwb, "step up"); | ||
| 384 | } | 307 | } |
| 385 | 308 | ||
| 386 | /* | ||
| 387 | * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we | ||
| 388 | * had a latency violation. | ||
| 389 | */ | ||
| 390 | static void scale_down(struct rq_wb *rwb, bool hard_throttle) | 309 | static void scale_down(struct rq_wb *rwb, bool hard_throttle) |
| 391 | { | 310 | { |
| 392 | /* | 311 | rq_depth_scale_down(&rwb->rq_depth, hard_throttle); |
| 393 | * Stop scaling down when we've hit the limit. This also prevents | ||
| 394 | * ->scale_step from going to crazy values, if the device can't | ||
| 395 | * keep up. | ||
| 396 | */ | ||
| 397 | if (rwb->wb_max == 1) | ||
| 398 | return; | ||
| 399 | |||
| 400 | if (rwb->scale_step < 0 && hard_throttle) | ||
| 401 | rwb->scale_step = 0; | ||
| 402 | else | ||
| 403 | rwb->scale_step++; | ||
| 404 | |||
| 405 | rwb->scaled_max = false; | ||
| 406 | rwb->unknown_cnt = 0; | ||
| 407 | calc_wb_limits(rwb); | 312 | calc_wb_limits(rwb); |
| 408 | rwb_trace_step(rwb, "step down"); | 313 | rwb->unknown_cnt = 0; |
| 314 | rwb_wake_all(rwb); | ||
| 315 | rwb_trace_step(rwb, "scale down"); | ||
| 409 | } | 316 | } |
| 410 | 317 | ||
| 411 | static void rwb_arm_timer(struct rq_wb *rwb) | 318 | static void rwb_arm_timer(struct rq_wb *rwb) |
| 412 | { | 319 | { |
| 413 | if (rwb->scale_step > 0) { | 320 | struct rq_depth *rqd = &rwb->rq_depth; |
| 321 | |||
| 322 | if (rqd->scale_step > 0) { | ||
| 414 | /* | 323 | /* |
| 415 | * We should speed this up, using some variant of a fast | 324 | * We should speed this up, using some variant of a fast |
| 416 | * integer inverse square root calculation. Since we only do | 325 | * integer inverse square root calculation. Since we only do |
| @@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb) | |||
| 418 | * though. | 327 | * though. |
| 419 | */ | 328 | */ |
| 420 | rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, | 329 | rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, |
| 421 | int_sqrt((rwb->scale_step + 1) << 8)); | 330 | int_sqrt((rqd->scale_step + 1) << 8)); |
| 422 | } else { | 331 | } else { |
| 423 | /* | 332 | /* |
| 424 | * For step < 0, we don't want to increase/decrease the | 333 | * For step < 0, we don't want to increase/decrease the |
| @@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb) | |||
| 433 | static void wb_timer_fn(struct blk_stat_callback *cb) | 342 | static void wb_timer_fn(struct blk_stat_callback *cb) |
| 434 | { | 343 | { |
| 435 | struct rq_wb *rwb = cb->data; | 344 | struct rq_wb *rwb = cb->data; |
| 345 | struct rq_depth *rqd = &rwb->rq_depth; | ||
| 436 | unsigned int inflight = wbt_inflight(rwb); | 346 | unsigned int inflight = wbt_inflight(rwb); |
| 437 | int status; | 347 | int status; |
| 438 | 348 | ||
| 439 | status = latency_exceeded(rwb, cb->stat); | 349 | status = latency_exceeded(rwb, cb->stat); |
| 440 | 350 | ||
| 441 | trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, | 351 | trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, |
| 442 | inflight); | 352 | inflight); |
| 443 | 353 | ||
| 444 | /* | 354 | /* |
| @@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) | |||
| 469 | * currently don't have a valid read/write sample. For that | 379 | * currently don't have a valid read/write sample. For that |
| 470 | * case, slowly return to center state (step == 0). | 380 | * case, slowly return to center state (step == 0). |
| 471 | */ | 381 | */ |
| 472 | if (rwb->scale_step > 0) | 382 | if (rqd->scale_step > 0) |
| 473 | scale_up(rwb); | 383 | scale_up(rwb); |
| 474 | else if (rwb->scale_step < 0) | 384 | else if (rqd->scale_step < 0) |
| 475 | scale_down(rwb, false); | 385 | scale_down(rwb, false); |
| 476 | break; | 386 | break; |
| 477 | default: | 387 | default: |
| @@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb) | |||
| 481 | /* | 391 | /* |
| 482 | * Re-arm timer, if we have IO in flight | 392 | * Re-arm timer, if we have IO in flight |
| 483 | */ | 393 | */ |
| 484 | if (rwb->scale_step || inflight) | 394 | if (rqd->scale_step || inflight) |
| 485 | rwb_arm_timer(rwb); | 395 | rwb_arm_timer(rwb); |
| 486 | } | 396 | } |
| 487 | 397 | ||
| 488 | void wbt_update_limits(struct rq_wb *rwb) | 398 | static void __wbt_update_limits(struct rq_wb *rwb) |
| 489 | { | 399 | { |
| 490 | rwb->scale_step = 0; | 400 | struct rq_depth *rqd = &rwb->rq_depth; |
| 491 | rwb->scaled_max = false; | 401 | |
| 402 | rqd->scale_step = 0; | ||
| 403 | rqd->scaled_max = false; | ||
| 404 | |||
| 405 | rq_depth_calc_max_depth(rqd); | ||
| 492 | calc_wb_limits(rwb); | 406 | calc_wb_limits(rwb); |
| 493 | 407 | ||
| 494 | rwb_wake_all(rwb); | 408 | rwb_wake_all(rwb); |
| 495 | } | 409 | } |
| 496 | 410 | ||
| 411 | void wbt_update_limits(struct request_queue *q) | ||
| 412 | { | ||
| 413 | struct rq_qos *rqos = wbt_rq_qos(q); | ||
| 414 | if (!rqos) | ||
| 415 | return; | ||
| 416 | __wbt_update_limits(RQWB(rqos)); | ||
| 417 | } | ||
| 418 | |||
| 419 | u64 wbt_get_min_lat(struct request_queue *q) | ||
| 420 | { | ||
| 421 | struct rq_qos *rqos = wbt_rq_qos(q); | ||
| 422 | if (!rqos) | ||
| 423 | return 0; | ||
| 424 | return RQWB(rqos)->min_lat_nsec; | ||
| 425 | } | ||
| 426 | |||
| 427 | void wbt_set_min_lat(struct request_queue *q, u64 val) | ||
| 428 | { | ||
| 429 | struct rq_qos *rqos = wbt_rq_qos(q); | ||
| 430 | if (!rqos) | ||
| 431 | return; | ||
| 432 | RQWB(rqos)->min_lat_nsec = val; | ||
| 433 | RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; | ||
| 434 | __wbt_update_limits(RQWB(rqos)); | ||
| 435 | } | ||
| 436 | |||
| 437 | |||
| 497 | static bool close_io(struct rq_wb *rwb) | 438 | static bool close_io(struct rq_wb *rwb) |
| 498 | { | 439 | { |
| 499 | const unsigned long now = jiffies; | 440 | const unsigned long now = jiffies; |
| @@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) | |||
| 520 | * IO for a bit. | 461 | * IO for a bit. |
| 521 | */ | 462 | */ |
| 522 | if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) | 463 | if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) |
| 523 | limit = rwb->wb_max; | 464 | limit = rwb->rq_depth.max_depth; |
| 524 | else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { | 465 | else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { |
| 525 | /* | 466 | /* |
| 526 | * If less than 100ms since we completed unrelated IO, | 467 | * If less than 100ms since we completed unrelated IO, |
| @@ -533,30 +474,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) | |||
| 533 | return limit; | 474 | return limit; |
| 534 | } | 475 | } |
| 535 | 476 | ||
| 536 | static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, | ||
| 537 | wait_queue_entry_t *wait, unsigned long rw) | ||
| 538 | { | ||
| 539 | /* | ||
| 540 | * inc it here even if disabled, since we'll dec it at completion. | ||
| 541 | * this only happens if the task was sleeping in __wbt_wait(), | ||
| 542 | * and someone turned it off at the same time. | ||
| 543 | */ | ||
| 544 | if (!rwb_enabled(rwb)) { | ||
| 545 | atomic_inc(&rqw->inflight); | ||
| 546 | return true; | ||
| 547 | } | ||
| 548 | |||
| 549 | /* | ||
| 550 | * If the waitqueue is already active and we are not the next | ||
| 551 | * in line to be woken up, wait for our turn. | ||
| 552 | */ | ||
| 553 | if (waitqueue_active(&rqw->wait) && | ||
| 554 | rqw->wait.head.next != &wait->entry) | ||
| 555 | return false; | ||
| 556 | |||
| 557 | return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); | ||
| 558 | } | ||
| 559 | |||
| 560 | /* | 477 | /* |
| 561 | * Block if we will exceed our limit, or if we are currently waiting for | 478 | * Block if we will exceed our limit, or if we are currently waiting for |
| 562 | * the timer to kick off queuing again. | 479 | * the timer to kick off queuing again. |
| @@ -567,16 +484,32 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, | |||
| 567 | __acquires(lock) | 484 | __acquires(lock) |
| 568 | { | 485 | { |
| 569 | struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); | 486 | struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); |
| 570 | DEFINE_WAIT(wait); | 487 | DECLARE_WAITQUEUE(wait, current); |
| 571 | 488 | ||
| 572 | if (may_queue(rwb, rqw, &wait, rw)) | 489 | /* |
| 490 | * inc it here even if disabled, since we'll dec it at completion. | ||
| 491 | * this only happens if the task was sleeping in __wbt_wait(), | ||
| 492 | * and someone turned it off at the same time. | ||
| 493 | */ | ||
| 494 | if (!rwb_enabled(rwb)) { | ||
| 495 | atomic_inc(&rqw->inflight); | ||
| 573 | return; | 496 | return; |
| 497 | } | ||
| 574 | 498 | ||
| 499 | if (!waitqueue_active(&rqw->wait) | ||
| 500 | && rq_wait_inc_below(rqw, get_limit(rwb, rw))) | ||
| 501 | return; | ||
| 502 | |||
| 503 | add_wait_queue_exclusive(&rqw->wait, &wait); | ||
| 575 | do { | 504 | do { |
| 576 | prepare_to_wait_exclusive(&rqw->wait, &wait, | 505 | set_current_state(TASK_UNINTERRUPTIBLE); |
| 577 | TASK_UNINTERRUPTIBLE); | ||
| 578 | 506 | ||
| 579 | if (may_queue(rwb, rqw, &wait, rw)) | 507 | if (!rwb_enabled(rwb)) { |
| 508 | atomic_inc(&rqw->inflight); | ||
| 509 | break; | ||
| 510 | } | ||
| 511 | |||
| 512 | if (rq_wait_inc_below(rqw, get_limit(rwb, rw))) | ||
| 580 | break; | 513 | break; |
| 581 | 514 | ||
| 582 | if (lock) { | 515 | if (lock) { |
| @@ -587,7 +520,8 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, | |||
| 587 | io_schedule(); | 520 | io_schedule(); |
| 588 | } while (1); | 521 | } while (1); |
| 589 | 522 | ||
| 590 | finish_wait(&rqw->wait, &wait); | 523 | __set_current_state(TASK_RUNNING); |
| 524 | remove_wait_queue(&rqw->wait, &wait); | ||
| 591 | } | 525 | } |
| 592 | 526 | ||
| 593 | static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) | 527 | static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) |
| @@ -608,43 +542,72 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) | |||
| 608 | } | 542 | } |
| 609 | } | 543 | } |
| 610 | 544 | ||
| 545 | static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) | ||
| 546 | { | ||
| 547 | enum wbt_flags flags = 0; | ||
| 548 | |||
| 549 | if (bio_op(bio) == REQ_OP_READ) { | ||
| 550 | flags = WBT_READ; | ||
| 551 | } else if (wbt_should_throttle(rwb, bio)) { | ||
| 552 | if (current_is_kswapd()) | ||
| 553 | flags |= WBT_KSWAPD; | ||
| 554 | if (bio_op(bio) == REQ_OP_DISCARD) | ||
| 555 | flags |= WBT_DISCARD; | ||
| 556 | flags |= WBT_TRACKED; | ||
| 557 | } | ||
| 558 | return flags; | ||
| 559 | } | ||
| 560 | |||
| 561 | static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) | ||
| 562 | { | ||
| 563 | struct rq_wb *rwb = RQWB(rqos); | ||
| 564 | enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); | ||
| 565 | __wbt_done(rqos, flags); | ||
| 566 | } | ||
| 567 | |||
| 611 | /* | 568 | /* |
| 612 | * Returns true if the IO request should be accounted, false if not. | 569 | * Returns true if the IO request should be accounted, false if not. |
| 613 | * May sleep, if we have exceeded the writeback limits. Caller can pass | 570 | * May sleep, if we have exceeded the writeback limits. Caller can pass |
| 614 | * in an irq held spinlock, if it holds one when calling this function. | 571 | * in an irq held spinlock, if it holds one when calling this function. |
| 615 | * If we do sleep, we'll release and re-grab it. | 572 | * If we do sleep, we'll release and re-grab it. |
| 616 | */ | 573 | */ |
| 617 | enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) | 574 | static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) |
| 618 | { | 575 | { |
| 619 | enum wbt_flags ret = 0; | 576 | struct rq_wb *rwb = RQWB(rqos); |
| 577 | enum wbt_flags flags; | ||
| 620 | 578 | ||
| 621 | if (!rwb_enabled(rwb)) | 579 | if (!rwb_enabled(rwb)) |
| 622 | return 0; | 580 | return; |
| 623 | 581 | ||
| 624 | if (bio_op(bio) == REQ_OP_READ) | 582 | flags = bio_to_wbt_flags(rwb, bio); |
| 625 | ret = WBT_READ; | ||
| 626 | 583 | ||
| 627 | if (!wbt_should_throttle(rwb, bio)) { | 584 | if (!wbt_should_throttle(rwb, bio)) { |
| 628 | if (ret & WBT_READ) | 585 | if (flags & WBT_READ) |
| 629 | wb_timestamp(rwb, &rwb->last_issue); | 586 | wb_timestamp(rwb, &rwb->last_issue); |
| 630 | return ret; | 587 | return; |
| 631 | } | 588 | } |
| 632 | 589 | ||
| 633 | if (current_is_kswapd()) | 590 | if (current_is_kswapd()) |
| 634 | ret |= WBT_KSWAPD; | 591 | flags |= WBT_KSWAPD; |
| 635 | if (bio_op(bio) == REQ_OP_DISCARD) | 592 | if (bio_op(bio) == REQ_OP_DISCARD) |
| 636 | ret |= WBT_DISCARD; | 593 | flags |= WBT_DISCARD; |
| 637 | 594 | ||
| 638 | __wbt_wait(rwb, ret, bio->bi_opf, lock); | 595 | __wbt_wait(rwb, flags, bio->bi_opf, lock); |
| 639 | 596 | ||
| 640 | if (!blk_stat_is_active(rwb->cb)) | 597 | if (!blk_stat_is_active(rwb->cb)) |
| 641 | rwb_arm_timer(rwb); | 598 | rwb_arm_timer(rwb); |
| 599 | } | ||
| 642 | 600 | ||
| 643 | return ret | WBT_TRACKED; | 601 | static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) |
| 602 | { | ||
| 603 | struct rq_wb *rwb = RQWB(rqos); | ||
| 604 | rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); | ||
| 644 | } | 605 | } |
| 645 | 606 | ||
| 646 | void wbt_issue(struct rq_wb *rwb, struct request *rq) | 607 | void wbt_issue(struct rq_qos *rqos, struct request *rq) |
| 647 | { | 608 | { |
| 609 | struct rq_wb *rwb = RQWB(rqos); | ||
| 610 | |||
| 648 | if (!rwb_enabled(rwb)) | 611 | if (!rwb_enabled(rwb)) |
| 649 | return; | 612 | return; |
| 650 | 613 | ||
| @@ -661,8 +624,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq) | |||
| 661 | } | 624 | } |
| 662 | } | 625 | } |
| 663 | 626 | ||
| 664 | void wbt_requeue(struct rq_wb *rwb, struct request *rq) | 627 | void wbt_requeue(struct rq_qos *rqos, struct request *rq) |
| 665 | { | 628 | { |
| 629 | struct rq_wb *rwb = RQWB(rqos); | ||
| 666 | if (!rwb_enabled(rwb)) | 630 | if (!rwb_enabled(rwb)) |
| 667 | return; | 631 | return; |
| 668 | if (rq == rwb->sync_cookie) { | 632 | if (rq == rwb->sync_cookie) { |
| @@ -671,39 +635,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq) | |||
| 671 | } | 635 | } |
| 672 | } | 636 | } |
| 673 | 637 | ||
| 674 | void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) | 638 | void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) |
| 675 | { | 639 | { |
| 676 | if (rwb) { | 640 | struct rq_qos *rqos = wbt_rq_qos(q); |
| 677 | rwb->queue_depth = depth; | 641 | if (rqos) { |
| 678 | wbt_update_limits(rwb); | 642 | RQWB(rqos)->rq_depth.queue_depth = depth; |
| 643 | __wbt_update_limits(RQWB(rqos)); | ||
| 679 | } | 644 | } |
| 680 | } | 645 | } |
| 681 | 646 | ||
| 682 | void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) | 647 | void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) |
| 683 | { | 648 | { |
| 684 | if (rwb) | 649 | struct rq_qos *rqos = wbt_rq_qos(q); |
| 685 | rwb->wc = write_cache_on; | 650 | if (rqos) |
| 651 | RQWB(rqos)->wc = write_cache_on; | ||
| 686 | } | 652 | } |
| 687 | 653 | ||
| 688 | /* | 654 | /* |
| 689 | * Disable wbt, if enabled by default. | ||
| 690 | */ | ||
| 691 | void wbt_disable_default(struct request_queue *q) | ||
| 692 | { | ||
| 693 | struct rq_wb *rwb = q->rq_wb; | ||
| 694 | |||
| 695 | if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) | ||
| 696 | wbt_exit(q); | ||
| 697 | } | ||
| 698 | EXPORT_SYMBOL_GPL(wbt_disable_default); | ||
| 699 | |||
| 700 | /* | ||
| 701 | * Enable wbt if defaults are configured that way | 655 | * Enable wbt if defaults are configured that way |
| 702 | */ | 656 | */ |
| 703 | void wbt_enable_default(struct request_queue *q) | 657 | void wbt_enable_default(struct request_queue *q) |
| 704 | { | 658 | { |
| 659 | struct rq_qos *rqos = wbt_rq_qos(q); | ||
| 705 | /* Throttling already enabled? */ | 660 | /* Throttling already enabled? */ |
| 706 | if (q->rq_wb) | 661 | if (rqos) |
| 707 | return; | 662 | return; |
| 708 | 663 | ||
| 709 | /* Queue not registered? Maybe shutting down... */ | 664 | /* Queue not registered? Maybe shutting down... */ |
| @@ -741,6 +696,42 @@ static int wbt_data_dir(const struct request *rq) | |||
| 741 | return -1; | 696 | return -1; |
| 742 | } | 697 | } |
| 743 | 698 | ||
| 699 | static void wbt_exit(struct rq_qos *rqos) | ||
| 700 | { | ||
| 701 | struct rq_wb *rwb = RQWB(rqos); | ||
| 702 | struct request_queue *q = rqos->q; | ||
| 703 | |||
| 704 | blk_stat_remove_callback(q, rwb->cb); | ||
| 705 | blk_stat_free_callback(rwb->cb); | ||
| 706 | kfree(rwb); | ||
| 707 | } | ||
| 708 | |||
| 709 | /* | ||
| 710 | * Disable wbt, if enabled by default. | ||
| 711 | */ | ||
| 712 | void wbt_disable_default(struct request_queue *q) | ||
| 713 | { | ||
| 714 | struct rq_qos *rqos = wbt_rq_qos(q); | ||
| 715 | struct rq_wb *rwb; | ||
| 716 | if (!rqos) | ||
| 717 | return; | ||
| 718 | rwb = RQWB(rqos); | ||
| 719 | if (rwb->enable_state == WBT_STATE_ON_DEFAULT) | ||
| 720 | rwb->wb_normal = 0; | ||
| 721 | } | ||
| 722 | EXPORT_SYMBOL_GPL(wbt_disable_default); | ||
| 723 | |||
| 724 | |||
| 725 | static struct rq_qos_ops wbt_rqos_ops = { | ||
| 726 | .throttle = wbt_wait, | ||
| 727 | .issue = wbt_issue, | ||
| 728 | .track = wbt_track, | ||
| 729 | .requeue = wbt_requeue, | ||
| 730 | .done = wbt_done, | ||
| 731 | .cleanup = wbt_cleanup, | ||
| 732 | .exit = wbt_exit, | ||
| 733 | }; | ||
| 734 | |||
| 744 | int wbt_init(struct request_queue *q) | 735 | int wbt_init(struct request_queue *q) |
| 745 | { | 736 | { |
| 746 | struct rq_wb *rwb; | 737 | struct rq_wb *rwb; |
| @@ -756,39 +747,29 @@ int wbt_init(struct request_queue *q) | |||
| 756 | return -ENOMEM; | 747 | return -ENOMEM; |
| 757 | } | 748 | } |
| 758 | 749 | ||
| 759 | for (i = 0; i < WBT_NUM_RWQ; i++) { | 750 | for (i = 0; i < WBT_NUM_RWQ; i++) |
| 760 | atomic_set(&rwb->rq_wait[i].inflight, 0); | 751 | rq_wait_init(&rwb->rq_wait[i]); |
| 761 | init_waitqueue_head(&rwb->rq_wait[i].wait); | ||
| 762 | } | ||
| 763 | 752 | ||
| 753 | rwb->rqos.id = RQ_QOS_WBT; | ||
| 754 | rwb->rqos.ops = &wbt_rqos_ops; | ||
| 755 | rwb->rqos.q = q; | ||
| 764 | rwb->last_comp = rwb->last_issue = jiffies; | 756 | rwb->last_comp = rwb->last_issue = jiffies; |
| 765 | rwb->queue = q; | ||
| 766 | rwb->win_nsec = RWB_WINDOW_NSEC; | 757 | rwb->win_nsec = RWB_WINDOW_NSEC; |
| 767 | rwb->enable_state = WBT_STATE_ON_DEFAULT; | 758 | rwb->enable_state = WBT_STATE_ON_DEFAULT; |
| 768 | wbt_update_limits(rwb); | 759 | rwb->wc = 1; |
| 760 | rwb->rq_depth.default_depth = RWB_DEF_DEPTH; | ||
| 761 | __wbt_update_limits(rwb); | ||
| 769 | 762 | ||
| 770 | /* | 763 | /* |
| 771 | * Assign rwb and add the stats callback. | 764 | * Assign rwb and add the stats callback. |
| 772 | */ | 765 | */ |
| 773 | q->rq_wb = rwb; | 766 | rq_qos_add(q, &rwb->rqos); |
| 774 | blk_stat_add_callback(q, rwb->cb); | 767 | blk_stat_add_callback(q, rwb->cb); |
| 775 | 768 | ||
| 776 | rwb->min_lat_nsec = wbt_default_latency_nsec(q); | 769 | rwb->min_lat_nsec = wbt_default_latency_nsec(q); |
| 777 | 770 | ||
| 778 | wbt_set_queue_depth(rwb, blk_queue_depth(q)); | 771 | wbt_set_queue_depth(q, blk_queue_depth(q)); |
| 779 | wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); | 772 | wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); |
| 780 | 773 | ||
| 781 | return 0; | 774 | return 0; |
| 782 | } | 775 | } |
| 783 | |||
| 784 | void wbt_exit(struct request_queue *q) | ||
| 785 | { | ||
| 786 | struct rq_wb *rwb = q->rq_wb; | ||
| 787 | |||
| 788 | if (rwb) { | ||
| 789 | blk_stat_remove_callback(q, rwb->cb); | ||
| 790 | blk_stat_free_callback(rwb->cb); | ||
| 791 | q->rq_wb = NULL; | ||
| 792 | kfree(rwb); | ||
| 793 | } | ||
| 794 | } | ||
diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 300df531d0a6..f47218d5b3b2 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/ktime.h> | 9 | #include <linux/ktime.h> |
| 10 | 10 | ||
| 11 | #include "blk-stat.h" | 11 | #include "blk-stat.h" |
| 12 | #include "blk-rq-qos.h" | ||
| 12 | 13 | ||
| 13 | enum wbt_flags { | 14 | enum wbt_flags { |
| 14 | WBT_TRACKED = 1, /* write, tracked for throttling */ | 15 | WBT_TRACKED = 1, /* write, tracked for throttling */ |
| @@ -35,20 +36,12 @@ enum { | |||
| 35 | WBT_STATE_ON_MANUAL = 2, | 36 | WBT_STATE_ON_MANUAL = 2, |
| 36 | }; | 37 | }; |
| 37 | 38 | ||
| 38 | struct rq_wait { | ||
| 39 | wait_queue_head_t wait; | ||
| 40 | atomic_t inflight; | ||
| 41 | }; | ||
| 42 | |||
| 43 | struct rq_wb { | 39 | struct rq_wb { |
| 44 | /* | 40 | /* |
| 45 | * Settings that govern how we throttle | 41 | * Settings that govern how we throttle |
| 46 | */ | 42 | */ |
| 47 | unsigned int wb_background; /* background writeback */ | 43 | unsigned int wb_background; /* background writeback */ |
| 48 | unsigned int wb_normal; /* normal writeback */ | 44 | unsigned int wb_normal; /* normal writeback */ |
| 49 | unsigned int wb_max; /* max throughput writeback */ | ||
| 50 | int scale_step; | ||
| 51 | bool scaled_max; | ||
| 52 | 45 | ||
| 53 | short enable_state; /* WBT_STATE_* */ | 46 | short enable_state; /* WBT_STATE_* */ |
| 54 | 47 | ||
| @@ -67,15 +60,20 @@ struct rq_wb { | |||
| 67 | void *sync_cookie; | 60 | void *sync_cookie; |
| 68 | 61 | ||
| 69 | unsigned int wc; | 62 | unsigned int wc; |
| 70 | unsigned int queue_depth; | ||
| 71 | 63 | ||
| 72 | unsigned long last_issue; /* last non-throttled issue */ | 64 | unsigned long last_issue; /* last non-throttled issue */ |
| 73 | unsigned long last_comp; /* last non-throttled comp */ | 65 | unsigned long last_comp; /* last non-throttled comp */ |
| 74 | unsigned long min_lat_nsec; | 66 | unsigned long min_lat_nsec; |
| 75 | struct request_queue *queue; | 67 | struct rq_qos rqos; |
| 76 | struct rq_wait rq_wait[WBT_NUM_RWQ]; | 68 | struct rq_wait rq_wait[WBT_NUM_RWQ]; |
| 69 | struct rq_depth rq_depth; | ||
| 77 | }; | 70 | }; |
| 78 | 71 | ||
| 72 | static inline struct rq_wb *RQWB(struct rq_qos *rqos) | ||
| 73 | { | ||
| 74 | return container_of(rqos, struct rq_wb, rqos); | ||
| 75 | } | ||
| 76 | |||
| 79 | static inline unsigned int wbt_inflight(struct rq_wb *rwb) | 77 | static inline unsigned int wbt_inflight(struct rq_wb *rwb) |
| 80 | { | 78 | { |
| 81 | unsigned int i, ret = 0; | 79 | unsigned int i, ret = 0; |
| @@ -86,26 +84,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) | |||
| 86 | return ret; | 84 | return ret; |
| 87 | } | 85 | } |
| 88 | 86 | ||
| 89 | #ifdef CONFIG_BLK_WBT | ||
| 90 | 87 | ||
| 91 | static inline void wbt_track(struct request *rq, enum wbt_flags flags) | 88 | #ifdef CONFIG_BLK_WBT |
| 92 | { | ||
| 93 | rq->wbt_flags |= flags; | ||
| 94 | } | ||
| 95 | 89 | ||
| 96 | void __wbt_done(struct rq_wb *, enum wbt_flags); | ||
| 97 | void wbt_done(struct rq_wb *, struct request *); | ||
| 98 | enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); | ||
| 99 | int wbt_init(struct request_queue *); | 90 | int wbt_init(struct request_queue *); |
| 100 | void wbt_exit(struct request_queue *); | 91 | void wbt_update_limits(struct request_queue *); |
| 101 | void wbt_update_limits(struct rq_wb *); | ||
| 102 | void wbt_requeue(struct rq_wb *, struct request *); | ||
| 103 | void wbt_issue(struct rq_wb *, struct request *); | ||
| 104 | void wbt_disable_default(struct request_queue *); | 92 | void wbt_disable_default(struct request_queue *); |
| 105 | void wbt_enable_default(struct request_queue *); | 93 | void wbt_enable_default(struct request_queue *); |
| 106 | 94 | ||
| 107 | void wbt_set_queue_depth(struct rq_wb *, unsigned int); | 95 | u64 wbt_get_min_lat(struct request_queue *q); |
| 108 | void wbt_set_write_cache(struct rq_wb *, bool); | 96 | void wbt_set_min_lat(struct request_queue *q, u64 val); |
| 97 | |||
| 98 | void wbt_set_queue_depth(struct request_queue *, unsigned int); | ||
| 99 | void wbt_set_write_cache(struct request_queue *, bool); | ||
| 109 | 100 | ||
| 110 | u64 wbt_default_latency_nsec(struct request_queue *); | 101 | u64 wbt_default_latency_nsec(struct request_queue *); |
| 111 | 102 | ||
| @@ -114,43 +105,30 @@ u64 wbt_default_latency_nsec(struct request_queue *); | |||
| 114 | static inline void wbt_track(struct request *rq, enum wbt_flags flags) | 105 | static inline void wbt_track(struct request *rq, enum wbt_flags flags) |
| 115 | { | 106 | { |
| 116 | } | 107 | } |
| 117 | static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) | ||
| 118 | { | ||
| 119 | } | ||
| 120 | static inline void wbt_done(struct rq_wb *rwb, struct request *rq) | ||
| 121 | { | ||
| 122 | } | ||
| 123 | static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, | ||
| 124 | spinlock_t *lock) | ||
| 125 | { | ||
| 126 | return 0; | ||
| 127 | } | ||
| 128 | static inline int wbt_init(struct request_queue *q) | 108 | static inline int wbt_init(struct request_queue *q) |
| 129 | { | 109 | { |
| 130 | return -EINVAL; | 110 | return -EINVAL; |
| 131 | } | 111 | } |
| 132 | static inline void wbt_exit(struct request_queue *q) | 112 | static inline void wbt_update_limits(struct request_queue *q) |
| 133 | { | ||
| 134 | } | ||
| 135 | static inline void wbt_update_limits(struct rq_wb *rwb) | ||
| 136 | { | 113 | { |
| 137 | } | 114 | } |
| 138 | static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq) | 115 | static inline void wbt_disable_default(struct request_queue *q) |
| 139 | { | 116 | { |
| 140 | } | 117 | } |
| 141 | static inline void wbt_issue(struct rq_wb *rwb, struct request *rq) | 118 | static inline void wbt_enable_default(struct request_queue *q) |
| 142 | { | 119 | { |
| 143 | } | 120 | } |
| 144 | static inline void wbt_disable_default(struct request_queue *q) | 121 | static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) |
| 145 | { | 122 | { |
| 146 | } | 123 | } |
| 147 | static inline void wbt_enable_default(struct request_queue *q) | 124 | static inline void wbt_set_write_cache(struct request_queue *q, bool wc) |
| 148 | { | 125 | { |
| 149 | } | 126 | } |
| 150 | static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) | 127 | static inline u64 wbt_get_min_lat(struct request_queue *q) |
| 151 | { | 128 | { |
| 129 | return 0; | ||
| 152 | } | 130 | } |
| 153 | static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) | 131 | static inline void wbt_set_min_lat(struct request_queue *q, u64 val) |
| 154 | { | 132 | { |
| 155 | } | 133 | } |
| 156 | static inline u64 wbt_default_latency_nsec(struct request_queue *q) | 134 | static inline u64 wbt_default_latency_nsec(struct request_queue *q) |
diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 51000914e23f..c461cf63f1f4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c | |||
| @@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev, | |||
| 200 | /* Get header in the first page */ | 200 | /* Get header in the first page */ |
| 201 | ofst = 0; | 201 | ofst = 0; |
| 202 | if (!nr_rep) { | 202 | if (!nr_rep) { |
| 203 | hdr = (struct blk_zone_report_hdr *) addr; | 203 | hdr = addr; |
| 204 | nr_rep = hdr->nr_zones; | 204 | nr_rep = hdr->nr_zones; |
| 205 | ofst = sizeof(struct blk_zone_report_hdr); | 205 | ofst = sizeof(struct blk_zone_report_hdr); |
| 206 | } | 206 | } |
diff --git a/block/blk.h b/block/blk.h index 8d23aea96ce9..d4d67e948920 100644 --- a/block/blk.h +++ b/block/blk.h | |||
| @@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q); | |||
| 130 | int blk_init_rl(struct request_list *rl, struct request_queue *q, | 130 | int blk_init_rl(struct request_list *rl, struct request_queue *q, |
| 131 | gfp_t gfp_mask); | 131 | gfp_t gfp_mask); |
| 132 | void blk_exit_rl(struct request_queue *q, struct request_list *rl); | 132 | void blk_exit_rl(struct request_queue *q, struct request_list *rl); |
| 133 | void blk_exit_queue(struct request_queue *q); | ||
| 133 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 134 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
| 134 | struct bio *bio); | 135 | struct bio *bio); |
| 135 | void blk_queue_bypass_start(struct request_queue *q); | 136 | void blk_queue_bypass_start(struct request_queue *q); |
| @@ -412,4 +413,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) | |||
| 412 | 413 | ||
| 413 | extern void blk_drain_queue(struct request_queue *q); | 414 | extern void blk_drain_queue(struct request_queue *q); |
| 414 | 415 | ||
| 416 | #ifdef CONFIG_BLK_CGROUP_IOLATENCY | ||
| 417 | extern int blk_iolatency_init(struct request_queue *q); | ||
| 418 | #else | ||
| 419 | static inline int blk_iolatency_init(struct request_queue *q) { return 0; } | ||
| 420 | #endif | ||
| 421 | |||
| 415 | #endif /* BLK_INTERNAL_H */ | 422 | #endif /* BLK_INTERNAL_H */ |
diff --git a/block/bounce.c b/block/bounce.c index fd31347b7836..bc63b3a2d18c 100644 --- a/block/bounce.c +++ b/block/bounce.c | |||
| @@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio) | |||
| 195 | __bounce_end_io_read(bio, &isa_page_pool); | 195 | __bounce_end_io_read(bio, &isa_page_pool); |
| 196 | } | 196 | } |
| 197 | 197 | ||
| 198 | static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, | ||
| 199 | struct bio_set *bs) | ||
| 200 | { | ||
| 201 | struct bvec_iter iter; | ||
| 202 | struct bio_vec bv; | ||
| 203 | struct bio *bio; | ||
| 204 | |||
| 205 | /* | ||
| 206 | * Pre immutable biovecs, __bio_clone() used to just do a memcpy from | ||
| 207 | * bio_src->bi_io_vec to bio->bi_io_vec. | ||
| 208 | * | ||
| 209 | * We can't do that anymore, because: | ||
| 210 | * | ||
| 211 | * - The point of cloning the biovec is to produce a bio with a biovec | ||
| 212 | * the caller can modify: bi_idx and bi_bvec_done should be 0. | ||
| 213 | * | ||
| 214 | * - The original bio could've had more than BIO_MAX_PAGES biovecs; if | ||
| 215 | * we tried to clone the whole thing bio_alloc_bioset() would fail. | ||
| 216 | * But the clone should succeed as long as the number of biovecs we | ||
| 217 | * actually need to allocate is fewer than BIO_MAX_PAGES. | ||
| 218 | * | ||
| 219 | * - Lastly, bi_vcnt should not be looked at or relied upon by code | ||
| 220 | * that does not own the bio - reason being drivers don't use it for | ||
| 221 | * iterating over the biovec anymore, so expecting it to be kept up | ||
| 222 | * to date (i.e. for clones that share the parent biovec) is just | ||
| 223 | * asking for trouble and would force extra work on | ||
| 224 | * __bio_clone_fast() anyways. | ||
| 225 | */ | ||
| 226 | |||
| 227 | bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); | ||
| 228 | if (!bio) | ||
| 229 | return NULL; | ||
| 230 | bio->bi_disk = bio_src->bi_disk; | ||
| 231 | bio->bi_opf = bio_src->bi_opf; | ||
| 232 | bio->bi_write_hint = bio_src->bi_write_hint; | ||
| 233 | bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; | ||
| 234 | bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; | ||
| 235 | |||
| 236 | switch (bio_op(bio)) { | ||
| 237 | case REQ_OP_DISCARD: | ||
| 238 | case REQ_OP_SECURE_ERASE: | ||
| 239 | case REQ_OP_WRITE_ZEROES: | ||
| 240 | break; | ||
| 241 | case REQ_OP_WRITE_SAME: | ||
| 242 | bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; | ||
| 243 | break; | ||
| 244 | default: | ||
| 245 | bio_for_each_segment(bv, bio_src, iter) | ||
| 246 | bio->bi_io_vec[bio->bi_vcnt++] = bv; | ||
| 247 | break; | ||
| 248 | } | ||
| 249 | |||
| 250 | if (bio_integrity(bio_src)) { | ||
| 251 | int ret; | ||
| 252 | |||
| 253 | ret = bio_integrity_clone(bio, bio_src, gfp_mask); | ||
| 254 | if (ret < 0) { | ||
| 255 | bio_put(bio); | ||
| 256 | return NULL; | ||
| 257 | } | ||
| 258 | } | ||
| 259 | |||
| 260 | bio_clone_blkcg_association(bio, bio_src); | ||
| 261 | |||
| 262 | return bio; | ||
| 263 | } | ||
| 264 | |||
| 198 | static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | 265 | static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, |
| 199 | mempool_t *pool) | 266 | mempool_t *pool) |
| 200 | { | 267 | { |
| @@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
| 222 | generic_make_request(*bio_orig); | 289 | generic_make_request(*bio_orig); |
| 223 | *bio_orig = bio; | 290 | *bio_orig = bio; |
| 224 | } | 291 | } |
| 225 | bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : | 292 | bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : |
| 226 | &bounce_bio_set); | 293 | &bounce_bio_set); |
| 227 | 294 | ||
| 228 | bio_for_each_segment_all(to, bio, i) { | 295 | bio_for_each_segment_all(to, bio, i) { |
diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 9419def8c017..f3501cdaf1a6 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c | |||
| @@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, | |||
| 48 | 48 | ||
| 49 | job->request_len = hdr->request_len; | 49 | job->request_len = hdr->request_len; |
| 50 | job->request = memdup_user(uptr64(hdr->request), hdr->request_len); | 50 | job->request = memdup_user(uptr64(hdr->request), hdr->request_len); |
| 51 | if (IS_ERR(job->request)) | 51 | |
| 52 | return PTR_ERR(job->request); | 52 | return PTR_ERR_OR_ZERO(job->request); |
| 53 | return 0; | ||
| 54 | } | 53 | } |
| 55 | 54 | ||
| 56 | static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) | 55 | static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) |
diff --git a/block/bsg.c b/block/bsg.c index 3da540faf673..db588add6ba6 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
| @@ -13,11 +13,9 @@ | |||
| 13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
| 14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
| 15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
| 16 | #include <linux/poll.h> | ||
| 17 | #include <linux/cdev.h> | 16 | #include <linux/cdev.h> |
| 18 | #include <linux/jiffies.h> | 17 | #include <linux/jiffies.h> |
| 19 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
| 20 | #include <linux/uio.h> | ||
| 21 | #include <linux/idr.h> | 19 | #include <linux/idr.h> |
| 22 | #include <linux/bsg.h> | 20 | #include <linux/bsg.h> |
| 23 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
| @@ -38,21 +36,10 @@ | |||
| 38 | struct bsg_device { | 36 | struct bsg_device { |
| 39 | struct request_queue *queue; | 37 | struct request_queue *queue; |
| 40 | spinlock_t lock; | 38 | spinlock_t lock; |
| 41 | struct list_head busy_list; | ||
| 42 | struct list_head done_list; | ||
| 43 | struct hlist_node dev_list; | 39 | struct hlist_node dev_list; |
| 44 | atomic_t ref_count; | 40 | atomic_t ref_count; |
| 45 | int queued_cmds; | ||
| 46 | int done_cmds; | ||
| 47 | wait_queue_head_t wq_done; | ||
| 48 | wait_queue_head_t wq_free; | ||
| 49 | char name[20]; | 41 | char name[20]; |
| 50 | int max_queue; | 42 | int max_queue; |
| 51 | unsigned long flags; | ||
| 52 | }; | ||
| 53 | |||
| 54 | enum { | ||
| 55 | BSG_F_BLOCK = 1, | ||
| 56 | }; | 43 | }; |
| 57 | 44 | ||
| 58 | #define BSG_DEFAULT_CMDS 64 | 45 | #define BSG_DEFAULT_CMDS 64 |
| @@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE]; | |||
| 67 | static struct class *bsg_class; | 54 | static struct class *bsg_class; |
| 68 | static int bsg_major; | 55 | static int bsg_major; |
| 69 | 56 | ||
| 70 | static struct kmem_cache *bsg_cmd_cachep; | ||
| 71 | |||
| 72 | /* | ||
| 73 | * our internal command type | ||
| 74 | */ | ||
| 75 | struct bsg_command { | ||
| 76 | struct bsg_device *bd; | ||
| 77 | struct list_head list; | ||
| 78 | struct request *rq; | ||
| 79 | struct bio *bio; | ||
| 80 | struct bio *bidi_bio; | ||
| 81 | int err; | ||
| 82 | struct sg_io_v4 hdr; | ||
| 83 | }; | ||
| 84 | |||
| 85 | static void bsg_free_command(struct bsg_command *bc) | ||
| 86 | { | ||
| 87 | struct bsg_device *bd = bc->bd; | ||
| 88 | unsigned long flags; | ||
| 89 | |||
| 90 | kmem_cache_free(bsg_cmd_cachep, bc); | ||
| 91 | |||
| 92 | spin_lock_irqsave(&bd->lock, flags); | ||
| 93 | bd->queued_cmds--; | ||
| 94 | spin_unlock_irqrestore(&bd->lock, flags); | ||
| 95 | |||
| 96 | wake_up(&bd->wq_free); | ||
| 97 | } | ||
| 98 | |||
| 99 | static struct bsg_command *bsg_alloc_command(struct bsg_device *bd) | ||
| 100 | { | ||
| 101 | struct bsg_command *bc = ERR_PTR(-EINVAL); | ||
| 102 | |||
| 103 | spin_lock_irq(&bd->lock); | ||
| 104 | |||
| 105 | if (bd->queued_cmds >= bd->max_queue) | ||
| 106 | goto out; | ||
| 107 | |||
| 108 | bd->queued_cmds++; | ||
| 109 | spin_unlock_irq(&bd->lock); | ||
| 110 | |||
| 111 | bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL); | ||
| 112 | if (unlikely(!bc)) { | ||
| 113 | spin_lock_irq(&bd->lock); | ||
| 114 | bd->queued_cmds--; | ||
| 115 | bc = ERR_PTR(-ENOMEM); | ||
| 116 | goto out; | ||
| 117 | } | ||
| 118 | |||
| 119 | bc->bd = bd; | ||
| 120 | INIT_LIST_HEAD(&bc->list); | ||
| 121 | bsg_dbg(bd, "returning free cmd %p\n", bc); | ||
| 122 | return bc; | ||
| 123 | out: | ||
| 124 | spin_unlock_irq(&bd->lock); | ||
| 125 | return bc; | ||
| 126 | } | ||
| 127 | |||
| 128 | static inline struct hlist_head *bsg_dev_idx_hash(int index) | 57 | static inline struct hlist_head *bsg_dev_idx_hash(int index) |
| 129 | { | 58 | { |
| 130 | return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; | 59 | return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; |
| @@ -285,101 +214,6 @@ out: | |||
| 285 | return ERR_PTR(ret); | 214 | return ERR_PTR(ret); |
| 286 | } | 215 | } |
| 287 | 216 | ||
| 288 | /* | ||
| 289 | * async completion call-back from the block layer, when scsi/ide/whatever | ||
| 290 | * calls end_that_request_last() on a request | ||
| 291 | */ | ||
| 292 | static void bsg_rq_end_io(struct request *rq, blk_status_t status) | ||
| 293 | { | ||
| 294 | struct bsg_command *bc = rq->end_io_data; | ||
| 295 | struct bsg_device *bd = bc->bd; | ||
| 296 | unsigned long flags; | ||
| 297 | |||
| 298 | bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", | ||
| 299 | rq, bc, bc->bio); | ||
| 300 | |||
| 301 | bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); | ||
| 302 | |||
| 303 | spin_lock_irqsave(&bd->lock, flags); | ||
| 304 | list_move_tail(&bc->list, &bd->done_list); | ||
| 305 | bd->done_cmds++; | ||
| 306 | spin_unlock_irqrestore(&bd->lock, flags); | ||
| 307 | |||
| 308 | wake_up(&bd->wq_done); | ||
| 309 | } | ||
| 310 | |||
| 311 | /* | ||
| 312 | * do final setup of a 'bc' and submit the matching 'rq' to the block | ||
| 313 | * layer for io | ||
| 314 | */ | ||
| 315 | static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, | ||
| 316 | struct bsg_command *bc, struct request *rq) | ||
| 317 | { | ||
| 318 | int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL)); | ||
| 319 | |||
| 320 | /* | ||
| 321 | * add bc command to busy queue and submit rq for io | ||
| 322 | */ | ||
| 323 | bc->rq = rq; | ||
| 324 | bc->bio = rq->bio; | ||
| 325 | if (rq->next_rq) | ||
| 326 | bc->bidi_bio = rq->next_rq->bio; | ||
| 327 | bc->hdr.duration = jiffies; | ||
| 328 | spin_lock_irq(&bd->lock); | ||
| 329 | list_add_tail(&bc->list, &bd->busy_list); | ||
| 330 | spin_unlock_irq(&bd->lock); | ||
| 331 | |||
| 332 | bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc); | ||
| 333 | |||
| 334 | rq->end_io_data = bc; | ||
| 335 | blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); | ||
| 336 | } | ||
| 337 | |||
| 338 | static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd) | ||
| 339 | { | ||
| 340 | struct bsg_command *bc = NULL; | ||
| 341 | |||
| 342 | spin_lock_irq(&bd->lock); | ||
| 343 | if (bd->done_cmds) { | ||
| 344 | bc = list_first_entry(&bd->done_list, struct bsg_command, list); | ||
| 345 | list_del(&bc->list); | ||
| 346 | bd->done_cmds--; | ||
| 347 | } | ||
| 348 | spin_unlock_irq(&bd->lock); | ||
| 349 | |||
| 350 | return bc; | ||
| 351 | } | ||
| 352 | |||
| 353 | /* | ||
| 354 | * Get a finished command from the done list | ||
| 355 | */ | ||
| 356 | static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) | ||
| 357 | { | ||
| 358 | struct bsg_command *bc; | ||
| 359 | int ret; | ||
| 360 | |||
| 361 | do { | ||
| 362 | bc = bsg_next_done_cmd(bd); | ||
| 363 | if (bc) | ||
| 364 | break; | ||
| 365 | |||
| 366 | if (!test_bit(BSG_F_BLOCK, &bd->flags)) { | ||
| 367 | bc = ERR_PTR(-EAGAIN); | ||
| 368 | break; | ||
| 369 | } | ||
| 370 | |||
| 371 | ret = wait_event_interruptible(bd->wq_done, bd->done_cmds); | ||
| 372 | if (ret) { | ||
| 373 | bc = ERR_PTR(-ERESTARTSYS); | ||
| 374 | break; | ||
| 375 | } | ||
| 376 | } while (1); | ||
| 377 | |||
| 378 | bsg_dbg(bd, "returning done %p\n", bc); | ||
| 379 | |||
| 380 | return bc; | ||
| 381 | } | ||
| 382 | |||
| 383 | static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, | 217 | static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, |
| 384 | struct bio *bio, struct bio *bidi_bio) | 218 | struct bio *bio, struct bio *bidi_bio) |
| 385 | { | 219 | { |
| @@ -398,234 +232,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, | |||
| 398 | return ret; | 232 | return ret; |
| 399 | } | 233 | } |
| 400 | 234 | ||
| 401 | static bool bsg_complete(struct bsg_device *bd) | ||
| 402 | { | ||
| 403 | bool ret = false; | ||
| 404 | bool spin; | ||
| 405 | |||
| 406 | do { | ||
| 407 | spin_lock_irq(&bd->lock); | ||
| 408 | |||
| 409 | BUG_ON(bd->done_cmds > bd->queued_cmds); | ||
| 410 | |||
| 411 | /* | ||
| 412 | * All commands consumed. | ||
| 413 | */ | ||
| 414 | if (bd->done_cmds == bd->queued_cmds) | ||
| 415 | ret = true; | ||
| 416 | |||
| 417 | spin = !test_bit(BSG_F_BLOCK, &bd->flags); | ||
| 418 | |||
| 419 | spin_unlock_irq(&bd->lock); | ||
| 420 | } while (!ret && spin); | ||
| 421 | |||
| 422 | return ret; | ||
| 423 | } | ||
| 424 | |||
| 425 | static int bsg_complete_all_commands(struct bsg_device *bd) | ||
| 426 | { | ||
| 427 | struct bsg_command *bc; | ||
| 428 | int ret, tret; | ||
| 429 | |||
| 430 | bsg_dbg(bd, "entered\n"); | ||
| 431 | |||
| 432 | /* | ||
| 433 | * wait for all commands to complete | ||
| 434 | */ | ||
| 435 | io_wait_event(bd->wq_done, bsg_complete(bd)); | ||
| 436 | |||
| 437 | /* | ||
| 438 | * discard done commands | ||
| 439 | */ | ||
| 440 | ret = 0; | ||
| 441 | do { | ||
| 442 | spin_lock_irq(&bd->lock); | ||
| 443 | if (!bd->queued_cmds) { | ||
| 444 | spin_unlock_irq(&bd->lock); | ||
| 445 | break; | ||
| 446 | } | ||
| 447 | spin_unlock_irq(&bd->lock); | ||
| 448 | |||
| 449 | bc = bsg_get_done_cmd(bd); | ||
| 450 | if (IS_ERR(bc)) | ||
| 451 | break; | ||
| 452 | |||
| 453 | tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio, | ||
| 454 | bc->bidi_bio); | ||
| 455 | if (!ret) | ||
| 456 | ret = tret; | ||
| 457 | |||
| 458 | bsg_free_command(bc); | ||
| 459 | } while (1); | ||
| 460 | |||
| 461 | return ret; | ||
| 462 | } | ||
| 463 | |||
| 464 | static int | ||
| 465 | __bsg_read(char __user *buf, size_t count, struct bsg_device *bd, | ||
| 466 | const struct iovec *iov, ssize_t *bytes_read) | ||
| 467 | { | ||
| 468 | struct bsg_command *bc; | ||
| 469 | int nr_commands, ret; | ||
| 470 | |||
| 471 | if (count % sizeof(struct sg_io_v4)) | ||
| 472 | return -EINVAL; | ||
| 473 | |||
| 474 | ret = 0; | ||
| 475 | nr_commands = count / sizeof(struct sg_io_v4); | ||
| 476 | while (nr_commands) { | ||
| 477 | bc = bsg_get_done_cmd(bd); | ||
| 478 | if (IS_ERR(bc)) { | ||
| 479 | ret = PTR_ERR(bc); | ||
| 480 | break; | ||
| 481 | } | ||
| 482 | |||
| 483 | /* | ||
| 484 | * this is the only case where we need to copy data back | ||
| 485 | * after completing the request. so do that here, | ||
| 486 | * bsg_complete_work() cannot do that for us | ||
| 487 | */ | ||
| 488 | ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio, | ||
| 489 | bc->bidi_bio); | ||
| 490 | |||
| 491 | if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr))) | ||
| 492 | ret = -EFAULT; | ||
| 493 | |||
| 494 | bsg_free_command(bc); | ||
| 495 | |||
| 496 | if (ret) | ||
| 497 | break; | ||
| 498 | |||
| 499 | buf += sizeof(struct sg_io_v4); | ||
| 500 | *bytes_read += sizeof(struct sg_io_v4); | ||
| 501 | nr_commands--; | ||
| 502 | } | ||
| 503 | |||
| 504 | return ret; | ||
| 505 | } | ||
| 506 | |||
| 507 | static inline void bsg_set_block(struct bsg_device *bd, struct file *file) | ||
| 508 | { | ||
| 509 | if (file->f_flags & O_NONBLOCK) | ||
| 510 | clear_bit(BSG_F_BLOCK, &bd->flags); | ||
| 511 | else | ||
| 512 | set_bit(BSG_F_BLOCK, &bd->flags); | ||
| 513 | } | ||
| 514 | |||
| 515 | /* | ||
| 516 | * Check if the error is a "real" error that we should return. | ||
| 517 | */ | ||
| 518 | static inline int err_block_err(int ret) | ||
| 519 | { | ||
| 520 | if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN) | ||
| 521 | return 1; | ||
| 522 | |||
| 523 | return 0; | ||
| 524 | } | ||
| 525 | |||
| 526 | static ssize_t | ||
| 527 | bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
| 528 | { | ||
| 529 | struct bsg_device *bd = file->private_data; | ||
| 530 | int ret; | ||
| 531 | ssize_t bytes_read; | ||
| 532 | |||
| 533 | bsg_dbg(bd, "read %zd bytes\n", count); | ||
| 534 | |||
| 535 | bsg_set_block(bd, file); | ||
| 536 | |||
| 537 | bytes_read = 0; | ||
| 538 | ret = __bsg_read(buf, count, bd, NULL, &bytes_read); | ||
| 539 | *ppos = bytes_read; | ||
| 540 | |||
| 541 | if (!bytes_read || err_block_err(ret)) | ||
| 542 | bytes_read = ret; | ||
| 543 | |||
| 544 | return bytes_read; | ||
| 545 | } | ||
| 546 | |||
| 547 | static int __bsg_write(struct bsg_device *bd, const char __user *buf, | ||
| 548 | size_t count, ssize_t *bytes_written, fmode_t mode) | ||
| 549 | { | ||
| 550 | struct bsg_command *bc; | ||
| 551 | struct request *rq; | ||
| 552 | int ret, nr_commands; | ||
| 553 | |||
| 554 | if (count % sizeof(struct sg_io_v4)) | ||
| 555 | return -EINVAL; | ||
| 556 | |||
| 557 | nr_commands = count / sizeof(struct sg_io_v4); | ||
| 558 | rq = NULL; | ||
| 559 | bc = NULL; | ||
| 560 | ret = 0; | ||
| 561 | while (nr_commands) { | ||
| 562 | struct request_queue *q = bd->queue; | ||
| 563 | |||
| 564 | bc = bsg_alloc_command(bd); | ||
| 565 | if (IS_ERR(bc)) { | ||
| 566 | ret = PTR_ERR(bc); | ||
| 567 | bc = NULL; | ||
| 568 | break; | ||
| 569 | } | ||
| 570 | |||
| 571 | if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) { | ||
| 572 | ret = -EFAULT; | ||
| 573 | break; | ||
| 574 | } | ||
| 575 | |||
| 576 | /* | ||
| 577 | * get a request, fill in the blanks, and add to request queue | ||
| 578 | */ | ||
| 579 | rq = bsg_map_hdr(bd->queue, &bc->hdr, mode); | ||
| 580 | if (IS_ERR(rq)) { | ||
| 581 | ret = PTR_ERR(rq); | ||
| 582 | rq = NULL; | ||
| 583 | break; | ||
| 584 | } | ||
| 585 | |||
| 586 | bsg_add_command(bd, q, bc, rq); | ||
| 587 | bc = NULL; | ||
| 588 | rq = NULL; | ||
| 589 | nr_commands--; | ||
| 590 | buf += sizeof(struct sg_io_v4); | ||
| 591 | *bytes_written += sizeof(struct sg_io_v4); | ||
| 592 | } | ||
| 593 | |||
| 594 | if (bc) | ||
| 595 | bsg_free_command(bc); | ||
| 596 | |||
| 597 | return ret; | ||
| 598 | } | ||
| 599 | |||
| 600 | static ssize_t | ||
| 601 | bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | ||
| 602 | { | ||
| 603 | struct bsg_device *bd = file->private_data; | ||
| 604 | ssize_t bytes_written; | ||
| 605 | int ret; | ||
| 606 | |||
| 607 | bsg_dbg(bd, "write %zd bytes\n", count); | ||
| 608 | |||
| 609 | if (unlikely(uaccess_kernel())) | ||
| 610 | return -EINVAL; | ||
| 611 | |||
| 612 | bsg_set_block(bd, file); | ||
| 613 | |||
| 614 | bytes_written = 0; | ||
| 615 | ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); | ||
| 616 | |||
| 617 | *ppos = bytes_written; | ||
| 618 | |||
| 619 | /* | ||
| 620 | * return bytes written on non-fatal errors | ||
| 621 | */ | ||
| 622 | if (!bytes_written || err_block_err(ret)) | ||
| 623 | bytes_written = ret; | ||
| 624 | |||
| 625 | bsg_dbg(bd, "returning %zd\n", bytes_written); | ||
| 626 | return bytes_written; | ||
| 627 | } | ||
| 628 | |||
| 629 | static struct bsg_device *bsg_alloc_device(void) | 235 | static struct bsg_device *bsg_alloc_device(void) |
| 630 | { | 236 | { |
| 631 | struct bsg_device *bd; | 237 | struct bsg_device *bd; |
| @@ -635,29 +241,20 @@ static struct bsg_device *bsg_alloc_device(void) | |||
| 635 | return NULL; | 241 | return NULL; |
| 636 | 242 | ||
| 637 | spin_lock_init(&bd->lock); | 243 | spin_lock_init(&bd->lock); |
| 638 | |||
| 639 | bd->max_queue = BSG_DEFAULT_CMDS; | 244 | bd->max_queue = BSG_DEFAULT_CMDS; |
| 640 | |||
| 641 | INIT_LIST_HEAD(&bd->busy_list); | ||
| 642 | INIT_LIST_HEAD(&bd->done_list); | ||
| 643 | INIT_HLIST_NODE(&bd->dev_list); | 245 | INIT_HLIST_NODE(&bd->dev_list); |
| 644 | |||
| 645 | init_waitqueue_head(&bd->wq_free); | ||
| 646 | init_waitqueue_head(&bd->wq_done); | ||
| 647 | return bd; | 246 | return bd; |
| 648 | } | 247 | } |
| 649 | 248 | ||
| 650 | static int bsg_put_device(struct bsg_device *bd) | 249 | static int bsg_put_device(struct bsg_device *bd) |
| 651 | { | 250 | { |
| 652 | int ret = 0, do_free; | ||
| 653 | struct request_queue *q = bd->queue; | 251 | struct request_queue *q = bd->queue; |
| 654 | 252 | ||
| 655 | mutex_lock(&bsg_mutex); | 253 | mutex_lock(&bsg_mutex); |
| 656 | 254 | ||
| 657 | do_free = atomic_dec_and_test(&bd->ref_count); | 255 | if (!atomic_dec_and_test(&bd->ref_count)) { |
| 658 | if (!do_free) { | ||
| 659 | mutex_unlock(&bsg_mutex); | 256 | mutex_unlock(&bsg_mutex); |
| 660 | goto out; | 257 | return 0; |
| 661 | } | 258 | } |
| 662 | 259 | ||
| 663 | hlist_del(&bd->dev_list); | 260 | hlist_del(&bd->dev_list); |
| @@ -668,20 +265,9 @@ static int bsg_put_device(struct bsg_device *bd) | |||
| 668 | /* | 265 | /* |
| 669 | * close can always block | 266 | * close can always block |
| 670 | */ | 267 | */ |
| 671 | set_bit(BSG_F_BLOCK, &bd->flags); | ||
| 672 | |||
| 673 | /* | ||
| 674 | * correct error detection baddies here again. it's the responsibility | ||
| 675 | * of the app to properly reap commands before close() if it wants | ||
| 676 | * fool-proof error detection | ||
| 677 | */ | ||
| 678 | ret = bsg_complete_all_commands(bd); | ||
| 679 | |||
| 680 | kfree(bd); | 268 | kfree(bd); |
| 681 | out: | 269 | blk_put_queue(q); |
| 682 | if (do_free) | 270 | return 0; |
| 683 | blk_put_queue(q); | ||
| 684 | return ret; | ||
| 685 | } | 271 | } |
| 686 | 272 | ||
| 687 | static struct bsg_device *bsg_add_device(struct inode *inode, | 273 | static struct bsg_device *bsg_add_device(struct inode *inode, |
| @@ -704,8 +290,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode, | |||
| 704 | 290 | ||
| 705 | bd->queue = rq; | 291 | bd->queue = rq; |
| 706 | 292 | ||
| 707 | bsg_set_block(bd, file); | ||
| 708 | |||
| 709 | atomic_set(&bd->ref_count, 1); | 293 | atomic_set(&bd->ref_count, 1); |
| 710 | hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); | 294 | hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); |
| 711 | 295 | ||
| @@ -779,24 +363,6 @@ static int bsg_release(struct inode *inode, struct file *file) | |||
| 779 | return bsg_put_device(bd); | 363 | return bsg_put_device(bd); |
| 780 | } | 364 | } |
| 781 | 365 | ||
| 782 | static __poll_t bsg_poll(struct file *file, poll_table *wait) | ||
| 783 | { | ||
| 784 | struct bsg_device *bd = file->private_data; | ||
| 785 | __poll_t mask = 0; | ||
| 786 | |||
| 787 | poll_wait(file, &bd->wq_done, wait); | ||
| 788 | poll_wait(file, &bd->wq_free, wait); | ||
| 789 | |||
| 790 | spin_lock_irq(&bd->lock); | ||
| 791 | if (!list_empty(&bd->done_list)) | ||
| 792 | mask |= EPOLLIN | EPOLLRDNORM; | ||
| 793 | if (bd->queued_cmds < bd->max_queue) | ||
| 794 | mask |= EPOLLOUT; | ||
| 795 | spin_unlock_irq(&bd->lock); | ||
| 796 | |||
| 797 | return mask; | ||
| 798 | } | ||
| 799 | |||
| 800 | static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | 366 | static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
| 801 | { | 367 | { |
| 802 | struct bsg_device *bd = file->private_data; | 368 | struct bsg_device *bd = file->private_data; |
| @@ -870,9 +436,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 870 | } | 436 | } |
| 871 | 437 | ||
| 872 | static const struct file_operations bsg_fops = { | 438 | static const struct file_operations bsg_fops = { |
| 873 | .read = bsg_read, | ||
| 874 | .write = bsg_write, | ||
| 875 | .poll = bsg_poll, | ||
| 876 | .open = bsg_open, | 439 | .open = bsg_open, |
| 877 | .release = bsg_release, | 440 | .release = bsg_release, |
| 878 | .unlocked_ioctl = bsg_ioctl, | 441 | .unlocked_ioctl = bsg_ioctl, |
| @@ -977,21 +540,12 @@ static int __init bsg_init(void) | |||
| 977 | int ret, i; | 540 | int ret, i; |
| 978 | dev_t devid; | 541 | dev_t devid; |
| 979 | 542 | ||
| 980 | bsg_cmd_cachep = kmem_cache_create("bsg_cmd", | ||
| 981 | sizeof(struct bsg_command), 0, 0, NULL); | ||
| 982 | if (!bsg_cmd_cachep) { | ||
| 983 | printk(KERN_ERR "bsg: failed creating slab cache\n"); | ||
| 984 | return -ENOMEM; | ||
| 985 | } | ||
| 986 | |||
| 987 | for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) | 543 | for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) |
| 988 | INIT_HLIST_HEAD(&bsg_device_list[i]); | 544 | INIT_HLIST_HEAD(&bsg_device_list[i]); |
| 989 | 545 | ||
| 990 | bsg_class = class_create(THIS_MODULE, "bsg"); | 546 | bsg_class = class_create(THIS_MODULE, "bsg"); |
| 991 | if (IS_ERR(bsg_class)) { | 547 | if (IS_ERR(bsg_class)) |
| 992 | ret = PTR_ERR(bsg_class); | 548 | return PTR_ERR(bsg_class); |
| 993 | goto destroy_kmemcache; | ||
| 994 | } | ||
| 995 | bsg_class->devnode = bsg_devnode; | 549 | bsg_class->devnode = bsg_devnode; |
| 996 | 550 | ||
| 997 | ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); | 551 | ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); |
| @@ -1012,8 +566,6 @@ unregister_chrdev: | |||
| 1012 | unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS); | 566 | unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS); |
| 1013 | destroy_bsg_class: | 567 | destroy_bsg_class: |
| 1014 | class_destroy(bsg_class); | 568 | class_destroy(bsg_class); |
| 1015 | destroy_kmemcache: | ||
| 1016 | kmem_cache_destroy(bsg_cmd_cachep); | ||
| 1017 | return ret; | 569 | return ret; |
| 1018 | } | 570 | } |
| 1019 | 571 | ||
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 82b6c27b3245..2eb87444b157 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
| @@ -3666,6 +3666,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) | |||
| 3666 | switch (ioprio_class) { | 3666 | switch (ioprio_class) { |
| 3667 | default: | 3667 | default: |
| 3668 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); | 3668 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); |
| 3669 | /* fall through */ | ||
| 3669 | case IOPRIO_CLASS_NONE: | 3670 | case IOPRIO_CLASS_NONE: |
| 3670 | /* | 3671 | /* |
| 3671 | * no prio set, inherit CPU scheduling settings | 3672 | * no prio set, inherit CPU scheduling settings |
| @@ -4735,12 +4736,13 @@ USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); | |||
| 4735 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | 4736 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ |
| 4736 | { \ | 4737 | { \ |
| 4737 | struct cfq_data *cfqd = e->elevator_data; \ | 4738 | struct cfq_data *cfqd = e->elevator_data; \ |
| 4738 | unsigned int __data; \ | 4739 | unsigned int __data, __min = (MIN), __max = (MAX); \ |
| 4740 | \ | ||
| 4739 | cfq_var_store(&__data, (page)); \ | 4741 | cfq_var_store(&__data, (page)); \ |
| 4740 | if (__data < (MIN)) \ | 4742 | if (__data < __min) \ |
| 4741 | __data = (MIN); \ | 4743 | __data = __min; \ |
| 4742 | else if (__data > (MAX)) \ | 4744 | else if (__data > __max) \ |
| 4743 | __data = (MAX); \ | 4745 | __data = __max; \ |
| 4744 | if (__CONV) \ | 4746 | if (__CONV) \ |
| 4745 | *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ | 4747 | *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ |
| 4746 | else \ | 4748 | else \ |
| @@ -4769,12 +4771,13 @@ STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, | |||
| 4769 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ | 4771 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ |
| 4770 | { \ | 4772 | { \ |
| 4771 | struct cfq_data *cfqd = e->elevator_data; \ | 4773 | struct cfq_data *cfqd = e->elevator_data; \ |
| 4772 | unsigned int __data; \ | 4774 | unsigned int __data, __min = (MIN), __max = (MAX); \ |
| 4775 | \ | ||
| 4773 | cfq_var_store(&__data, (page)); \ | 4776 | cfq_var_store(&__data, (page)); \ |
| 4774 | if (__data < (MIN)) \ | 4777 | if (__data < __min) \ |
| 4775 | __data = (MIN); \ | 4778 | __data = __min; \ |
| 4776 | else if (__data > (MAX)) \ | 4779 | else if (__data > __max) \ |
| 4777 | __data = (MAX); \ | 4780 | __data = __max; \ |
| 4778 | *(__PTR) = (u64)__data * NSEC_PER_USEC; \ | 4781 | *(__PTR) = (u64)__data * NSEC_PER_USEC; \ |
| 4779 | return count; \ | 4782 | return count; \ |
| 4780 | } | 4783 | } |
diff --git a/block/genhd.c b/block/genhd.c index f1543a45e73b..8cc719a37b32 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
| @@ -1333,21 +1333,28 @@ static int diskstats_show(struct seq_file *seqf, void *v) | |||
| 1333 | part_round_stats(gp->queue, cpu, hd); | 1333 | part_round_stats(gp->queue, cpu, hd); |
| 1334 | part_stat_unlock(); | 1334 | part_stat_unlock(); |
| 1335 | part_in_flight(gp->queue, hd, inflight); | 1335 | part_in_flight(gp->queue, hd, inflight); |
| 1336 | seq_printf(seqf, "%4d %7d %s %lu %lu %lu " | 1336 | seq_printf(seqf, "%4d %7d %s " |
| 1337 | "%u %lu %lu %lu %u %u %u %u\n", | 1337 | "%lu %lu %lu %u " |
| 1338 | "%lu %lu %lu %u " | ||
| 1339 | "%u %u %u " | ||
| 1340 | "%lu %lu %lu %u\n", | ||
| 1338 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), | 1341 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), |
| 1339 | disk_name(gp, hd->partno, buf), | 1342 | disk_name(gp, hd->partno, buf), |
| 1340 | part_stat_read(hd, ios[READ]), | 1343 | part_stat_read(hd, ios[STAT_READ]), |
| 1341 | part_stat_read(hd, merges[READ]), | 1344 | part_stat_read(hd, merges[STAT_READ]), |
| 1342 | part_stat_read(hd, sectors[READ]), | 1345 | part_stat_read(hd, sectors[STAT_READ]), |
| 1343 | jiffies_to_msecs(part_stat_read(hd, ticks[READ])), | 1346 | jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])), |
| 1344 | part_stat_read(hd, ios[WRITE]), | 1347 | part_stat_read(hd, ios[STAT_WRITE]), |
| 1345 | part_stat_read(hd, merges[WRITE]), | 1348 | part_stat_read(hd, merges[STAT_WRITE]), |
| 1346 | part_stat_read(hd, sectors[WRITE]), | 1349 | part_stat_read(hd, sectors[STAT_WRITE]), |
| 1347 | jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), | 1350 | jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), |
| 1348 | inflight[0], | 1351 | inflight[0], |
| 1349 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), | 1352 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), |
| 1350 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)) | 1353 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)), |
| 1354 | part_stat_read(hd, ios[STAT_DISCARD]), | ||
| 1355 | part_stat_read(hd, merges[STAT_DISCARD]), | ||
| 1356 | part_stat_read(hd, sectors[STAT_DISCARD]), | ||
| 1357 | jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD])) | ||
| 1351 | ); | 1358 | ); |
| 1352 | } | 1359 | } |
| 1353 | disk_part_iter_exit(&piter); | 1360 | disk_part_iter_exit(&piter); |
diff --git a/block/partition-generic.c b/block/partition-generic.c index 3dcfd4ec0e11..5a8975a1201c 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c | |||
| @@ -130,19 +130,24 @@ ssize_t part_stat_show(struct device *dev, | |||
| 130 | return sprintf(buf, | 130 | return sprintf(buf, |
| 131 | "%8lu %8lu %8llu %8u " | 131 | "%8lu %8lu %8llu %8u " |
| 132 | "%8lu %8lu %8llu %8u " | 132 | "%8lu %8lu %8llu %8u " |
| 133 | "%8u %8u %8u" | 133 | "%8u %8u %8u " |
| 134 | "%8lu %8lu %8llu %8u" | ||
| 134 | "\n", | 135 | "\n", |
| 135 | part_stat_read(p, ios[READ]), | 136 | part_stat_read(p, ios[STAT_READ]), |
| 136 | part_stat_read(p, merges[READ]), | 137 | part_stat_read(p, merges[STAT_READ]), |
| 137 | (unsigned long long)part_stat_read(p, sectors[READ]), | 138 | (unsigned long long)part_stat_read(p, sectors[STAT_READ]), |
| 138 | jiffies_to_msecs(part_stat_read(p, ticks[READ])), | 139 | jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])), |
| 139 | part_stat_read(p, ios[WRITE]), | 140 | part_stat_read(p, ios[STAT_WRITE]), |
| 140 | part_stat_read(p, merges[WRITE]), | 141 | part_stat_read(p, merges[STAT_WRITE]), |
| 141 | (unsigned long long)part_stat_read(p, sectors[WRITE]), | 142 | (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), |
| 142 | jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), | 143 | jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), |
| 143 | inflight[0], | 144 | inflight[0], |
| 144 | jiffies_to_msecs(part_stat_read(p, io_ticks)), | 145 | jiffies_to_msecs(part_stat_read(p, io_ticks)), |
| 145 | jiffies_to_msecs(part_stat_read(p, time_in_queue))); | 146 | jiffies_to_msecs(part_stat_read(p, time_in_queue)), |
| 147 | part_stat_read(p, ios[STAT_DISCARD]), | ||
| 148 | part_stat_read(p, merges[STAT_DISCARD]), | ||
| 149 | (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), | ||
| 150 | jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD]))); | ||
| 146 | } | 151 | } |
| 147 | 152 | ||
| 148 | ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, | 153 | ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, |
diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 007f95eea0e1..903f3ed175d0 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c | |||
| @@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state) | |||
| 178 | u32 vgda_sector = 0; | 178 | u32 vgda_sector = 0; |
| 179 | u32 vgda_len = 0; | 179 | u32 vgda_len = 0; |
| 180 | int numlvs = 0; | 180 | int numlvs = 0; |
| 181 | struct pvd *pvd; | 181 | struct pvd *pvd = NULL; |
| 182 | struct lv_info { | 182 | struct lv_info { |
| 183 | unsigned short pps_per_lv; | 183 | unsigned short pps_per_lv; |
| 184 | unsigned short pps_found; | 184 | unsigned short pps_found; |
| @@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state) | |||
| 232 | if (lvip[i].pps_per_lv) | 232 | if (lvip[i].pps_per_lv) |
| 233 | foundlvs += 1; | 233 | foundlvs += 1; |
| 234 | } | 234 | } |
| 235 | /* pvd loops depend on n[].name and lvip[].pps_per_lv */ | ||
| 236 | pvd = alloc_pvd(state, vgda_sector + 17); | ||
| 235 | } | 237 | } |
| 236 | put_dev_sector(sect); | 238 | put_dev_sector(sect); |
| 237 | } | 239 | } |
| 238 | pvd = alloc_pvd(state, vgda_sector + 17); | ||
| 239 | if (pvd) { | 240 | if (pvd) { |
| 240 | int numpps = be16_to_cpu(pvd->pp_count); | 241 | int numpps = be16_to_cpu(pvd->pp_count); |
| 241 | int psn_part1 = be32_to_cpu(pvd->psn_part1); | 242 | int psn_part1 = be32_to_cpu(pvd->psn_part1); |
| @@ -282,10 +283,14 @@ int aix_partition(struct parsed_partitions *state) | |||
| 282 | next_lp_ix += 1; | 283 | next_lp_ix += 1; |
| 283 | } | 284 | } |
| 284 | for (i = 0; i < state->limit; i += 1) | 285 | for (i = 0; i < state->limit; i += 1) |
| 285 | if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) | 286 | if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) { |
| 287 | char tmp[sizeof(n[i].name) + 1]; // null char | ||
| 288 | |||
| 289 | snprintf(tmp, sizeof(tmp), "%s", n[i].name); | ||
| 286 | pr_warn("partition %s (%u pp's found) is " | 290 | pr_warn("partition %s (%u pp's found) is " |
| 287 | "not contiguous\n", | 291 | "not contiguous\n", |
| 288 | n[i].name, lvip[i].pps_found); | 292 | tmp, lvip[i].pps_found); |
| 293 | } | ||
| 289 | kfree(pvd); | 294 | kfree(pvd); |
| 290 | } | 295 | } |
| 291 | kfree(n); | 296 | kfree(n); |
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 0417937dfe99..16766f267559 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c | |||
| @@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) | |||
| 830 | { | 830 | { |
| 831 | char buf[64]; | 831 | char buf[64]; |
| 832 | int r_objid, r_name, r_id1, r_id2, len; | 832 | int r_objid, r_name, r_id1, r_id2, len; |
| 833 | struct vblk_dgrp *dgrp; | ||
| 834 | 833 | ||
| 835 | BUG_ON (!buffer || !vb); | 834 | BUG_ON (!buffer || !vb); |
| 836 | 835 | ||
| @@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) | |||
| 853 | if (len != get_unaligned_be32(buffer + 0x14)) | 852 | if (len != get_unaligned_be32(buffer + 0x14)) |
| 854 | return false; | 853 | return false; |
| 855 | 854 | ||
| 856 | dgrp = &vb->vblk.dgrp; | ||
| 857 | |||
| 858 | ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); | 855 | ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); |
| 859 | return true; | 856 | return true; |
| 860 | } | 857 | } |
diff --git a/block/t10-pi.c b/block/t10-pi.c index a98db384048f..62aed77d0bb9 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c | |||
| @@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = { | |||
| 184 | .verify_fn = t10_pi_type3_verify_ip, | 184 | .verify_fn = t10_pi_type3_verify_ip, |
| 185 | }; | 185 | }; |
| 186 | EXPORT_SYMBOL(t10_pi_type3_ip); | 186 | EXPORT_SYMBOL(t10_pi_type3_ip); |
| 187 | |||
| 188 | /** | ||
| 189 | * t10_pi_prepare - prepare PI prior submitting request to device | ||
| 190 | * @rq: request with PI that should be prepared | ||
| 191 | * @protection_type: PI type (Type 1/Type 2/Type 3) | ||
| 192 | * | ||
| 193 | * For Type 1/Type 2, the virtual start sector is the one that was | ||
| 194 | * originally submitted by the block layer for the ref_tag usage. Due to | ||
| 195 | * partitioning, MD/DM cloning, etc. the actual physical start sector is | ||
| 196 | * likely to be different. Remap protection information to match the | ||
| 197 | * physical LBA. | ||
| 198 | * | ||
| 199 | * Type 3 does not have a reference tag so no remapping is required. | ||
| 200 | */ | ||
| 201 | void t10_pi_prepare(struct request *rq, u8 protection_type) | ||
| 202 | { | ||
| 203 | const int tuple_sz = rq->q->integrity.tuple_size; | ||
| 204 | u32 ref_tag = t10_pi_ref_tag(rq); | ||
| 205 | struct bio *bio; | ||
| 206 | |||
| 207 | if (protection_type == T10_PI_TYPE3_PROTECTION) | ||
| 208 | return; | ||
| 209 | |||
| 210 | __rq_for_each_bio(bio, rq) { | ||
| 211 | struct bio_integrity_payload *bip = bio_integrity(bio); | ||
| 212 | u32 virt = bip_get_seed(bip) & 0xffffffff; | ||
| 213 | struct bio_vec iv; | ||
| 214 | struct bvec_iter iter; | ||
| 215 | |||
| 216 | /* Already remapped? */ | ||
| 217 | if (bip->bip_flags & BIP_MAPPED_INTEGRITY) | ||
| 218 | break; | ||
| 219 | |||
| 220 | bip_for_each_vec(iv, bip, iter) { | ||
| 221 | void *p, *pmap; | ||
| 222 | unsigned int j; | ||
| 223 | |||
| 224 | pmap = kmap_atomic(iv.bv_page); | ||
| 225 | p = pmap + iv.bv_offset; | ||
| 226 | for (j = 0; j < iv.bv_len; j += tuple_sz) { | ||
| 227 | struct t10_pi_tuple *pi = p; | ||
| 228 | |||
| 229 | if (be32_to_cpu(pi->ref_tag) == virt) | ||
| 230 | pi->ref_tag = cpu_to_be32(ref_tag); | ||
| 231 | virt++; | ||
| 232 | ref_tag++; | ||
| 233 | p += tuple_sz; | ||
| 234 | } | ||
| 235 | |||
| 236 | kunmap_atomic(pmap); | ||
| 237 | } | ||
| 238 | |||
| 239 | bip->bip_flags |= BIP_MAPPED_INTEGRITY; | ||
| 240 | } | ||
| 241 | } | ||
| 242 | EXPORT_SYMBOL(t10_pi_prepare); | ||
| 243 | |||
| 244 | /** | ||
| 245 | * t10_pi_complete - prepare PI prior returning request to the block layer | ||
| 246 | * @rq: request with PI that should be prepared | ||
| 247 | * @protection_type: PI type (Type 1/Type 2/Type 3) | ||
| 248 | * @intervals: total elements to prepare | ||
| 249 | * | ||
| 250 | * For Type 1/Type 2, the virtual start sector is the one that was | ||
| 251 | * originally submitted by the block layer for the ref_tag usage. Due to | ||
| 252 | * partitioning, MD/DM cloning, etc. the actual physical start sector is | ||
| 253 | * likely to be different. Since the physical start sector was submitted | ||
| 254 | * to the device, we should remap it back to virtual values expected by the | ||
| 255 | * block layer. | ||
| 256 | * | ||
| 257 | * Type 3 does not have a reference tag so no remapping is required. | ||
| 258 | */ | ||
| 259 | void t10_pi_complete(struct request *rq, u8 protection_type, | ||
| 260 | unsigned int intervals) | ||
| 261 | { | ||
| 262 | const int tuple_sz = rq->q->integrity.tuple_size; | ||
| 263 | u32 ref_tag = t10_pi_ref_tag(rq); | ||
| 264 | struct bio *bio; | ||
| 265 | |||
| 266 | if (protection_type == T10_PI_TYPE3_PROTECTION) | ||
| 267 | return; | ||
| 268 | |||
| 269 | __rq_for_each_bio(bio, rq) { | ||
| 270 | struct bio_integrity_payload *bip = bio_integrity(bio); | ||
| 271 | u32 virt = bip_get_seed(bip) & 0xffffffff; | ||
| 272 | struct bio_vec iv; | ||
| 273 | struct bvec_iter iter; | ||
| 274 | |||
| 275 | bip_for_each_vec(iv, bip, iter) { | ||
| 276 | void *p, *pmap; | ||
| 277 | unsigned int j; | ||
| 278 | |||
| 279 | pmap = kmap_atomic(iv.bv_page); | ||
| 280 | p = pmap + iv.bv_offset; | ||
| 281 | for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { | ||
| 282 | struct t10_pi_tuple *pi = p; | ||
| 283 | |||
| 284 | if (be32_to_cpu(pi->ref_tag) == ref_tag) | ||
| 285 | pi->ref_tag = cpu_to_be32(virt); | ||
| 286 | virt++; | ||
| 287 | ref_tag++; | ||
| 288 | intervals--; | ||
| 289 | p += tuple_sz; | ||
| 290 | } | ||
| 291 | |||
| 292 | kunmap_atomic(pmap); | ||
| 293 | } | ||
| 294 | } | ||
| 295 | } | ||
| 296 | EXPORT_SYMBOL(t10_pi_complete); | ||
diff --git a/drivers/Makefile b/drivers/Makefile index 24cd47014657..a6abd7a856c6 100644 --- a/drivers/Makefile +++ b/drivers/Makefile | |||
| @@ -76,7 +76,7 @@ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ | |||
| 76 | obj-$(CONFIG_NUBUS) += nubus/ | 76 | obj-$(CONFIG_NUBUS) += nubus/ |
| 77 | obj-y += macintosh/ | 77 | obj-y += macintosh/ |
| 78 | obj-$(CONFIG_IDE) += ide/ | 78 | obj-$(CONFIG_IDE) += ide/ |
| 79 | obj-$(CONFIG_SCSI) += scsi/ | 79 | obj-y += scsi/ |
| 80 | obj-y += nvme/ | 80 | obj-y += nvme/ |
| 81 | obj-$(CONFIG_ATA) += ata/ | 81 | obj-$(CONFIG_ATA) += ata/ |
| 82 | obj-$(CONFIG_TARGET_CORE) += target/ | 82 | obj-$(CONFIG_TARGET_CORE) += target/ |
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index aad1b01447de..8e270962b2f3 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c | |||
| @@ -597,8 +597,9 @@ static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev, | |||
| 597 | int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) | 597 | int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) |
| 598 | { | 598 | { |
| 599 | int rc = 0; | 599 | int rc = 0; |
| 600 | u8 sensebuf[SCSI_SENSE_BUFFERSIZE]; | ||
| 600 | u8 scsi_cmd[MAX_COMMAND_SIZE]; | 601 | u8 scsi_cmd[MAX_COMMAND_SIZE]; |
| 601 | u8 args[4], *argbuf = NULL, *sensebuf = NULL; | 602 | u8 args[4], *argbuf = NULL; |
| 602 | int argsize = 0; | 603 | int argsize = 0; |
| 603 | enum dma_data_direction data_dir; | 604 | enum dma_data_direction data_dir; |
| 604 | struct scsi_sense_hdr sshdr; | 605 | struct scsi_sense_hdr sshdr; |
| @@ -610,10 +611,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) | |||
| 610 | if (copy_from_user(args, arg, sizeof(args))) | 611 | if (copy_from_user(args, arg, sizeof(args))) |
| 611 | return -EFAULT; | 612 | return -EFAULT; |
| 612 | 613 | ||
| 613 | sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO); | 614 | memset(sensebuf, 0, sizeof(sensebuf)); |
| 614 | if (!sensebuf) | ||
| 615 | return -ENOMEM; | ||
| 616 | |||
| 617 | memset(scsi_cmd, 0, sizeof(scsi_cmd)); | 615 | memset(scsi_cmd, 0, sizeof(scsi_cmd)); |
| 618 | 616 | ||
| 619 | if (args[3]) { | 617 | if (args[3]) { |
| @@ -685,7 +683,6 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) | |||
| 685 | && copy_to_user(arg + sizeof(args), argbuf, argsize)) | 683 | && copy_to_user(arg + sizeof(args), argbuf, argsize)) |
| 686 | rc = -EFAULT; | 684 | rc = -EFAULT; |
| 687 | error: | 685 | error: |
| 688 | kfree(sensebuf); | ||
| 689 | kfree(argbuf); | 686 | kfree(argbuf); |
| 690 | return rc; | 687 | return rc; |
| 691 | } | 688 | } |
| @@ -704,8 +701,9 @@ error: | |||
| 704 | int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) | 701 | int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) |
| 705 | { | 702 | { |
| 706 | int rc = 0; | 703 | int rc = 0; |
| 704 | u8 sensebuf[SCSI_SENSE_BUFFERSIZE]; | ||
| 707 | u8 scsi_cmd[MAX_COMMAND_SIZE]; | 705 | u8 scsi_cmd[MAX_COMMAND_SIZE]; |
| 708 | u8 args[7], *sensebuf = NULL; | 706 | u8 args[7]; |
| 709 | struct scsi_sense_hdr sshdr; | 707 | struct scsi_sense_hdr sshdr; |
| 710 | int cmd_result; | 708 | int cmd_result; |
| 711 | 709 | ||
| @@ -715,10 +713,7 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) | |||
| 715 | if (copy_from_user(args, arg, sizeof(args))) | 713 | if (copy_from_user(args, arg, sizeof(args))) |
| 716 | return -EFAULT; | 714 | return -EFAULT; |
| 717 | 715 | ||
| 718 | sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO); | 716 | memset(sensebuf, 0, sizeof(sensebuf)); |
| 719 | if (!sensebuf) | ||
| 720 | return -ENOMEM; | ||
| 721 | |||
| 722 | memset(scsi_cmd, 0, sizeof(scsi_cmd)); | 717 | memset(scsi_cmd, 0, sizeof(scsi_cmd)); |
| 723 | scsi_cmd[0] = ATA_16; | 718 | scsi_cmd[0] = ATA_16; |
| 724 | scsi_cmd[1] = (3 << 1); /* Non-data */ | 719 | scsi_cmd[1] = (3 << 1); /* Non-data */ |
| @@ -769,7 +764,6 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) | |||
| 769 | } | 764 | } |
| 770 | 765 | ||
| 771 | error: | 766 | error: |
| 772 | kfree(sensebuf); | ||
| 773 | return rc; | 767 | return rc; |
| 774 | } | 768 | } |
| 775 | 769 | ||
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index f6518067aa7d..f99e5c883368 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #define DAC960_DriverDate "21 Aug 2007" | 21 | #define DAC960_DriverDate "21 Aug 2007" |
| 22 | 22 | ||
| 23 | 23 | ||
| 24 | #include <linux/compiler.h> | ||
| 24 | #include <linux/module.h> | 25 | #include <linux/module.h> |
| 25 | #include <linux/types.h> | 26 | #include <linux/types.h> |
| 26 | #include <linux/miscdevice.h> | 27 | #include <linux/miscdevice.h> |
| @@ -6426,7 +6427,7 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller, | |||
| 6426 | return true; | 6427 | return true; |
| 6427 | } | 6428 | } |
| 6428 | 6429 | ||
| 6429 | static int dac960_proc_show(struct seq_file *m, void *v) | 6430 | static int __maybe_unused dac960_proc_show(struct seq_file *m, void *v) |
| 6430 | { | 6431 | { |
| 6431 | unsigned char *StatusMessage = "OK\n"; | 6432 | unsigned char *StatusMessage = "OK\n"; |
| 6432 | int ControllerNumber; | 6433 | int ControllerNumber; |
| @@ -6446,14 +6447,16 @@ static int dac960_proc_show(struct seq_file *m, void *v) | |||
| 6446 | return 0; | 6447 | return 0; |
| 6447 | } | 6448 | } |
| 6448 | 6449 | ||
| 6449 | static int dac960_initial_status_proc_show(struct seq_file *m, void *v) | 6450 | static int __maybe_unused dac960_initial_status_proc_show(struct seq_file *m, |
| 6451 | void *v) | ||
| 6450 | { | 6452 | { |
| 6451 | DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private; | 6453 | DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private; |
| 6452 | seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer); | 6454 | seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer); |
| 6453 | return 0; | 6455 | return 0; |
| 6454 | } | 6456 | } |
| 6455 | 6457 | ||
| 6456 | static int dac960_current_status_proc_show(struct seq_file *m, void *v) | 6458 | static int __maybe_unused dac960_current_status_proc_show(struct seq_file *m, |
| 6459 | void *v) | ||
| 6457 | { | 6460 | { |
| 6458 | DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private; | 6461 | DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private; |
| 6459 | unsigned char *StatusMessage = | 6462 | unsigned char *StatusMessage = |
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ad9b687a236a..d4913516823f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
| @@ -74,12 +74,12 @@ config AMIGA_Z2RAM | |||
| 74 | 74 | ||
| 75 | config CDROM | 75 | config CDROM |
| 76 | tristate | 76 | tristate |
| 77 | select BLK_SCSI_REQUEST | ||
| 77 | 78 | ||
| 78 | config GDROM | 79 | config GDROM |
| 79 | tristate "SEGA Dreamcast GD-ROM drive" | 80 | tristate "SEGA Dreamcast GD-ROM drive" |
| 80 | depends on SH_DREAMCAST | 81 | depends on SH_DREAMCAST |
| 81 | select CDROM | 82 | select CDROM |
| 82 | select BLK_SCSI_REQUEST # only for the generic cdrom code | ||
| 83 | help | 83 | help |
| 84 | A standard SEGA Dreamcast comes with a modified CD ROM drive called a | 84 | A standard SEGA Dreamcast comes with a modified CD ROM drive called a |
| 85 | "GD-ROM" by SEGA to signify it is capable of reading special disks | 85 | "GD-ROM" by SEGA to signify it is capable of reading special disks |
diff --git a/drivers/block/Makefile b/drivers/block/Makefile index dc061158b403..8566b188368b 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile | |||
| @@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o | |||
| 36 | obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ | 36 | obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ |
| 37 | 37 | ||
| 38 | obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ | 38 | obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ |
| 39 | obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o | ||
| 40 | obj-$(CONFIG_ZRAM) += zram/ | 39 | obj-$(CONFIG_ZRAM) += zram/ |
| 41 | 40 | ||
| 41 | obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o | ||
| 42 | null_blk-objs := null_blk_main.o | ||
| 43 | null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o | ||
| 44 | |||
| 42 | skd-y := skd_main.o | 45 | skd-y := skd_main.o |
| 43 | swim_mod-y := swim.o swim_asm.o | 46 | swim_mod-y := swim.o swim_asm.o |
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 096882e54095..136dc507d020 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c | |||
| @@ -1137,6 +1137,7 @@ noskb: if (buf) | |||
| 1137 | break; | 1137 | break; |
| 1138 | } | 1138 | } |
| 1139 | bvcpy(skb, f->buf->bio, f->iter, n); | 1139 | bvcpy(skb, f->buf->bio, f->iter, n); |
| 1140 | /* fall through */ | ||
| 1140 | case ATA_CMD_PIO_WRITE: | 1141 | case ATA_CMD_PIO_WRITE: |
| 1141 | case ATA_CMD_PIO_WRITE_EXT: | 1142 | case ATA_CMD_PIO_WRITE_EXT: |
| 1142 | spin_lock_irq(&d->lock); | 1143 | spin_lock_irq(&d->lock); |
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 697f735b07a4..41060e9cedf2 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c | |||
| @@ -284,8 +284,8 @@ freedev(struct aoedev *d) | |||
| 284 | e = t + d->ntargets; | 284 | e = t + d->ntargets; |
| 285 | for (; t < e && *t; t++) | 285 | for (; t < e && *t; t++) |
| 286 | freetgt(d, *t); | 286 | freetgt(d, *t); |
| 287 | if (d->bufpool) | 287 | |
| 288 | mempool_destroy(d->bufpool); | 288 | mempool_destroy(d->bufpool); |
| 289 | skbpoolfree(d); | 289 | skbpoolfree(d); |
| 290 | minor_free(d->sysminor); | 290 | minor_free(d->sysminor); |
| 291 | 291 | ||
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index bb976598ee43..df8103dd40ac 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c | |||
| @@ -254,20 +254,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd, | |||
| 254 | * Process a single bvec of a bio. | 254 | * Process a single bvec of a bio. |
| 255 | */ | 255 | */ |
| 256 | static int brd_do_bvec(struct brd_device *brd, struct page *page, | 256 | static int brd_do_bvec(struct brd_device *brd, struct page *page, |
| 257 | unsigned int len, unsigned int off, bool is_write, | 257 | unsigned int len, unsigned int off, unsigned int op, |
| 258 | sector_t sector) | 258 | sector_t sector) |
| 259 | { | 259 | { |
| 260 | void *mem; | 260 | void *mem; |
| 261 | int err = 0; | 261 | int err = 0; |
| 262 | 262 | ||
| 263 | if (is_write) { | 263 | if (op_is_write(op)) { |
| 264 | err = copy_to_brd_setup(brd, sector, len); | 264 | err = copy_to_brd_setup(brd, sector, len); |
| 265 | if (err) | 265 | if (err) |
| 266 | goto out; | 266 | goto out; |
| 267 | } | 267 | } |
| 268 | 268 | ||
| 269 | mem = kmap_atomic(page); | 269 | mem = kmap_atomic(page); |
| 270 | if (!is_write) { | 270 | if (!op_is_write(op)) { |
| 271 | copy_from_brd(mem + off, brd, sector, len); | 271 | copy_from_brd(mem + off, brd, sector, len); |
| 272 | flush_dcache_page(page); | 272 | flush_dcache_page(page); |
| 273 | } else { | 273 | } else { |
| @@ -296,7 +296,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) | |||
| 296 | int err; | 296 | int err; |
| 297 | 297 | ||
| 298 | err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, | 298 | err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, |
| 299 | op_is_write(bio_op(bio)), sector); | 299 | bio_op(bio), sector); |
| 300 | if (err) | 300 | if (err) |
| 301 | goto io_error; | 301 | goto io_error; |
| 302 | sector += len >> SECTOR_SHIFT; | 302 | sector += len >> SECTOR_SHIFT; |
| @@ -310,15 +310,15 @@ io_error: | |||
| 310 | } | 310 | } |
| 311 | 311 | ||
| 312 | static int brd_rw_page(struct block_device *bdev, sector_t sector, | 312 | static int brd_rw_page(struct block_device *bdev, sector_t sector, |
| 313 | struct page *page, bool is_write) | 313 | struct page *page, unsigned int op) |
| 314 | { | 314 | { |
| 315 | struct brd_device *brd = bdev->bd_disk->private_data; | 315 | struct brd_device *brd = bdev->bd_disk->private_data; |
| 316 | int err; | 316 | int err; |
| 317 | 317 | ||
| 318 | if (PageTransHuge(page)) | 318 | if (PageTransHuge(page)) |
| 319 | return -ENOTSUPP; | 319 | return -ENOTSUPP; |
| 320 | err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector); | 320 | err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector); |
| 321 | page_endio(page, is_write, err); | 321 | page_endio(page, op_is_write(op), err); |
| 322 | return err; | 322 | return err; |
| 323 | } | 323 | } |
| 324 | 324 | ||
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index bc4ed2ed40a2..e35a234b0a8f 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
| @@ -55,12 +55,10 @@ | |||
| 55 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) | 55 | # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) |
| 56 | # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) | 56 | # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) |
| 57 | # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) | 57 | # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) |
| 58 | # define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) | ||
| 59 | #else | 58 | #else |
| 60 | # define __protected_by(x) | 59 | # define __protected_by(x) |
| 61 | # define __protected_read_by(x) | 60 | # define __protected_read_by(x) |
| 62 | # define __protected_write_by(x) | 61 | # define __protected_write_by(x) |
| 63 | # define __must_hold(x) | ||
| 64 | #endif | 62 | #endif |
| 65 | 63 | ||
| 66 | /* shared module parameters, defined in drbd_main.c */ | 64 | /* shared module parameters, defined in drbd_main.c */ |
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a80809bd3057..ef8212a4b73e 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
| @@ -2103,14 +2103,10 @@ static void drbd_destroy_mempools(void) | |||
| 2103 | mempool_exit(&drbd_md_io_page_pool); | 2103 | mempool_exit(&drbd_md_io_page_pool); |
| 2104 | mempool_exit(&drbd_ee_mempool); | 2104 | mempool_exit(&drbd_ee_mempool); |
| 2105 | mempool_exit(&drbd_request_mempool); | 2105 | mempool_exit(&drbd_request_mempool); |
| 2106 | if (drbd_ee_cache) | 2106 | kmem_cache_destroy(drbd_ee_cache); |
| 2107 | kmem_cache_destroy(drbd_ee_cache); | 2107 | kmem_cache_destroy(drbd_request_cache); |
| 2108 | if (drbd_request_cache) | 2108 | kmem_cache_destroy(drbd_bm_ext_cache); |
| 2109 | kmem_cache_destroy(drbd_request_cache); | 2109 | kmem_cache_destroy(drbd_al_ext_cache); |
| 2110 | if (drbd_bm_ext_cache) | ||
| 2111 | kmem_cache_destroy(drbd_bm_ext_cache); | ||
| 2112 | if (drbd_al_ext_cache) | ||
| 2113 | kmem_cache_destroy(drbd_al_ext_cache); | ||
| 2114 | 2110 | ||
| 2115 | drbd_ee_cache = NULL; | 2111 | drbd_ee_cache = NULL; |
| 2116 | drbd_request_cache = NULL; | 2112 | drbd_request_cache = NULL; |
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index be9450f5ad1c..75f6b47169e6 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
| @@ -2674,8 +2674,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) | |||
| 2674 | if (c_min_rate == 0) | 2674 | if (c_min_rate == 0) |
| 2675 | return false; | 2675 | return false; |
| 2676 | 2676 | ||
| 2677 | curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + | 2677 | curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - |
| 2678 | (int)part_stat_read(&disk->part0, sectors[1]) - | ||
| 2679 | atomic_read(&device->rs_sect_ev); | 2678 | atomic_read(&device->rs_sect_ev); |
| 2680 | 2679 | ||
| 2681 | if (atomic_read(&device->ap_actlog_cnt) | 2680 | if (atomic_read(&device->ap_actlog_cnt) |
| @@ -2790,6 +2789,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet | |||
| 2790 | then we would do something smarter here than reading | 2789 | then we would do something smarter here than reading |
| 2791 | the block... */ | 2790 | the block... */ |
| 2792 | peer_req->flags |= EE_RS_THIN_REQ; | 2791 | peer_req->flags |= EE_RS_THIN_REQ; |
| 2792 | /* fall through */ | ||
| 2793 | case P_RS_DATA_REQUEST: | 2793 | case P_RS_DATA_REQUEST: |
| 2794 | peer_req->w.cb = w_e_end_rsdata_req; | 2794 | peer_req->w.cb = w_e_end_rsdata_req; |
| 2795 | fault_type = DRBD_FAULT_RS_RD; | 2795 | fault_type = DRBD_FAULT_RS_RD; |
| @@ -2968,6 +2968,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold | |||
| 2968 | /* Else fall through to one of the other strategies... */ | 2968 | /* Else fall through to one of the other strategies... */ |
| 2969 | drbd_warn(device, "Discard younger/older primary did not find a decision\n" | 2969 | drbd_warn(device, "Discard younger/older primary did not find a decision\n" |
| 2970 | "Using discard-least-changes instead\n"); | 2970 | "Using discard-least-changes instead\n"); |
| 2971 | /* fall through */ | ||
| 2971 | case ASB_DISCARD_ZERO_CHG: | 2972 | case ASB_DISCARD_ZERO_CHG: |
| 2972 | if (ch_peer == 0 && ch_self == 0) { | 2973 | if (ch_peer == 0 && ch_self == 0) { |
| 2973 | rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) | 2974 | rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) |
| @@ -2979,6 +2980,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold | |||
| 2979 | } | 2980 | } |
| 2980 | if (after_sb_0p == ASB_DISCARD_ZERO_CHG) | 2981 | if (after_sb_0p == ASB_DISCARD_ZERO_CHG) |
| 2981 | break; | 2982 | break; |
| 2983 | /* else: fall through */ | ||
| 2982 | case ASB_DISCARD_LEAST_CHG: | 2984 | case ASB_DISCARD_LEAST_CHG: |
| 2983 | if (ch_self < ch_peer) | 2985 | if (ch_self < ch_peer) |
| 2984 | rv = -1; | 2986 | rv = -1; |
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d146fedc38bb..19cac36e9737 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
| @@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request | |||
| 38 | { | 38 | { |
| 39 | struct request_queue *q = device->rq_queue; | 39 | struct request_queue *q = device->rq_queue; |
| 40 | 40 | ||
| 41 | generic_start_io_acct(q, bio_data_dir(req->master_bio), | 41 | generic_start_io_acct(q, bio_op(req->master_bio), |
| 42 | req->i.size >> 9, &device->vdisk->part0); | 42 | req->i.size >> 9, &device->vdisk->part0); |
| 43 | } | 43 | } |
| 44 | 44 | ||
| @@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r | |||
| 47 | { | 47 | { |
| 48 | struct request_queue *q = device->rq_queue; | 48 | struct request_queue *q = device->rq_queue; |
| 49 | 49 | ||
| 50 | generic_end_io_acct(q, bio_data_dir(req->master_bio), | 50 | generic_end_io_acct(q, bio_op(req->master_bio), |
| 51 | &device->vdisk->part0, req->start_jif); | 51 | &device->vdisk->part0, req->start_jif); |
| 52 | } | 52 | } |
| 53 | 53 | ||
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5e793dd7adfb..b8f77e83d456 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
| @@ -1690,9 +1690,7 @@ void drbd_rs_controller_reset(struct drbd_device *device) | |||
| 1690 | atomic_set(&device->rs_sect_in, 0); | 1690 | atomic_set(&device->rs_sect_in, 0); |
| 1691 | atomic_set(&device->rs_sect_ev, 0); | 1691 | atomic_set(&device->rs_sect_ev, 0); |
| 1692 | device->rs_in_flight = 0; | 1692 | device->rs_in_flight = 0; |
| 1693 | device->rs_last_events = | 1693 | device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors); |
| 1694 | (int)part_stat_read(&disk->part0, sectors[0]) + | ||
| 1695 | (int)part_stat_read(&disk->part0, sectors[1]); | ||
| 1696 | 1694 | ||
| 1697 | /* Updating the RCU protected object in place is necessary since | 1695 | /* Updating the RCU protected object in place is necessary since |
| 1698 | this function gets called from atomic context. | 1696 | this function gets called from atomic context. |
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 8871b5044d9e..48f622728ce6 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c | |||
| @@ -1461,7 +1461,6 @@ static void setup_rw_floppy(void) | |||
| 1461 | int i; | 1461 | int i; |
| 1462 | int r; | 1462 | int r; |
| 1463 | int flags; | 1463 | int flags; |
| 1464 | int dflags; | ||
| 1465 | unsigned long ready_date; | 1464 | unsigned long ready_date; |
| 1466 | void (*function)(void); | 1465 | void (*function)(void); |
| 1467 | 1466 | ||
| @@ -1485,8 +1484,6 @@ static void setup_rw_floppy(void) | |||
| 1485 | if (fd_wait_for_completion(ready_date, function)) | 1484 | if (fd_wait_for_completion(ready_date, function)) |
| 1486 | return; | 1485 | return; |
| 1487 | } | 1486 | } |
| 1488 | dflags = DRS->flags; | ||
| 1489 | |||
| 1490 | if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE)) | 1487 | if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE)) |
| 1491 | setup_DMA(); | 1488 | setup_DMA(); |
| 1492 | 1489 | ||
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4cb1d1be3cfb..ea9debf59b22 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c | |||
| @@ -690,7 +690,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, | |||
| 690 | unsigned int arg) | 690 | unsigned int arg) |
| 691 | { | 691 | { |
| 692 | struct file *file, *old_file; | 692 | struct file *file, *old_file; |
| 693 | struct inode *inode; | ||
| 694 | int error; | 693 | int error; |
| 695 | 694 | ||
| 696 | error = -ENXIO; | 695 | error = -ENXIO; |
| @@ -711,7 +710,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, | |||
| 711 | if (error) | 710 | if (error) |
| 712 | goto out_putf; | 711 | goto out_putf; |
| 713 | 712 | ||
| 714 | inode = file->f_mapping->host; | ||
| 715 | old_file = lo->lo_backing_file; | 713 | old_file = lo->lo_backing_file; |
| 716 | 714 | ||
| 717 | error = -EINVAL; | 715 | error = -EINVAL; |
| @@ -1611,6 +1609,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, | |||
| 1611 | case LOOP_GET_STATUS64: | 1609 | case LOOP_GET_STATUS64: |
| 1612 | case LOOP_SET_STATUS64: | 1610 | case LOOP_SET_STATUS64: |
| 1613 | arg = (unsigned long) compat_ptr(arg); | 1611 | arg = (unsigned long) compat_ptr(arg); |
| 1612 | /* fall through */ | ||
| 1614 | case LOOP_SET_FD: | 1613 | case LOOP_SET_FD: |
| 1615 | case LOOP_CHANGE_FD: | 1614 | case LOOP_CHANGE_FD: |
| 1616 | case LOOP_SET_BLOCK_SIZE: | 1615 | case LOOP_SET_BLOCK_SIZE: |
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c73626decb46..db253cd5b32a 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c | |||
| @@ -2575,8 +2575,7 @@ static int mtip_hw_debugfs_init(struct driver_data *dd) | |||
| 2575 | 2575 | ||
| 2576 | static void mtip_hw_debugfs_exit(struct driver_data *dd) | 2576 | static void mtip_hw_debugfs_exit(struct driver_data *dd) |
| 2577 | { | 2577 | { |
| 2578 | if (dd->dfs_node) | 2578 | debugfs_remove_recursive(dd->dfs_node); |
| 2579 | debugfs_remove_recursive(dd->dfs_node); | ||
| 2580 | } | 2579 | } |
| 2581 | 2580 | ||
| 2582 | /* | 2581 | /* |
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h new file mode 100644 index 000000000000..d81781f22dba --- /dev/null +++ b/drivers/block/null_blk.h | |||
| @@ -0,0 +1,108 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef __BLK_NULL_BLK_H | ||
| 3 | #define __BLK_NULL_BLK_H | ||
| 4 | |||
| 5 | #include <linux/blkdev.h> | ||
| 6 | #include <linux/slab.h> | ||
| 7 | #include <linux/blk-mq.h> | ||
| 8 | #include <linux/hrtimer.h> | ||
| 9 | #include <linux/configfs.h> | ||
| 10 | #include <linux/badblocks.h> | ||
| 11 | #include <linux/fault-inject.h> | ||
| 12 | |||
| 13 | struct nullb_cmd { | ||
| 14 | struct list_head list; | ||
| 15 | struct llist_node ll_list; | ||
| 16 | struct __call_single_data csd; | ||
| 17 | struct request *rq; | ||
| 18 | struct bio *bio; | ||
| 19 | unsigned int tag; | ||
| 20 | blk_status_t error; | ||
| 21 | struct nullb_queue *nq; | ||
| 22 | struct hrtimer timer; | ||
| 23 | }; | ||
| 24 | |||
| 25 | struct nullb_queue { | ||
| 26 | unsigned long *tag_map; | ||
| 27 | wait_queue_head_t wait; | ||
| 28 | unsigned int queue_depth; | ||
| 29 | struct nullb_device *dev; | ||
| 30 | unsigned int requeue_selection; | ||
| 31 | |||
| 32 | struct nullb_cmd *cmds; | ||
| 33 | }; | ||
| 34 | |||
| 35 | struct nullb_device { | ||
| 36 | struct nullb *nullb; | ||
| 37 | struct config_item item; | ||
| 38 | struct radix_tree_root data; /* data stored in the disk */ | ||
| 39 | struct radix_tree_root cache; /* disk cache data */ | ||
| 40 | unsigned long flags; /* device flags */ | ||
| 41 | unsigned int curr_cache; | ||
| 42 | struct badblocks badblocks; | ||
| 43 | |||
| 44 | unsigned int nr_zones; | ||
| 45 | struct blk_zone *zones; | ||
| 46 | sector_t zone_size_sects; | ||
| 47 | |||
| 48 | unsigned long size; /* device size in MB */ | ||
| 49 | unsigned long completion_nsec; /* time in ns to complete a request */ | ||
| 50 | unsigned long cache_size; /* disk cache size in MB */ | ||
| 51 | unsigned long zone_size; /* zone size in MB if device is zoned */ | ||
| 52 | unsigned int submit_queues; /* number of submission queues */ | ||
| 53 | unsigned int home_node; /* home node for the device */ | ||
| 54 | unsigned int queue_mode; /* block interface */ | ||
| 55 | unsigned int blocksize; /* block size */ | ||
| 56 | unsigned int irqmode; /* IRQ completion handler */ | ||
| 57 | unsigned int hw_queue_depth; /* queue depth */ | ||
| 58 | unsigned int index; /* index of the disk, only valid with a disk */ | ||
| 59 | unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ | ||
| 60 | bool blocking; /* blocking blk-mq device */ | ||
| 61 | bool use_per_node_hctx; /* use per-node allocation for hardware context */ | ||
| 62 | bool power; /* power on/off the device */ | ||
| 63 | bool memory_backed; /* if data is stored in memory */ | ||
| 64 | bool discard; /* if support discard */ | ||
| 65 | bool zoned; /* if device is zoned */ | ||
| 66 | }; | ||
| 67 | |||
| 68 | struct nullb { | ||
| 69 | struct nullb_device *dev; | ||
| 70 | struct list_head list; | ||
| 71 | unsigned int index; | ||
| 72 | struct request_queue *q; | ||
| 73 | struct gendisk *disk; | ||
| 74 | struct blk_mq_tag_set *tag_set; | ||
| 75 | struct blk_mq_tag_set __tag_set; | ||
| 76 | unsigned int queue_depth; | ||
| 77 | atomic_long_t cur_bytes; | ||
| 78 | struct hrtimer bw_timer; | ||
| 79 | unsigned long cache_flush_pos; | ||
| 80 | spinlock_t lock; | ||
| 81 | |||
| 82 | struct nullb_queue *queues; | ||
| 83 | unsigned int nr_queues; | ||
| 84 | char disk_name[DISK_NAME_LEN]; | ||
| 85 | }; | ||
| 86 | |||
| 87 | #ifdef CONFIG_BLK_DEV_ZONED | ||
| 88 | int null_zone_init(struct nullb_device *dev); | ||
| 89 | void null_zone_exit(struct nullb_device *dev); | ||
| 90 | blk_status_t null_zone_report(struct nullb *nullb, | ||
| 91 | struct nullb_cmd *cmd); | ||
| 92 | void null_zone_write(struct nullb_cmd *cmd); | ||
| 93 | void null_zone_reset(struct nullb_cmd *cmd); | ||
| 94 | #else | ||
| 95 | static inline int null_zone_init(struct nullb_device *dev) | ||
| 96 | { | ||
| 97 | return -EINVAL; | ||
| 98 | } | ||
| 99 | static inline void null_zone_exit(struct nullb_device *dev) {} | ||
| 100 | static inline blk_status_t null_zone_report(struct nullb *nullb, | ||
| 101 | struct nullb_cmd *cmd) | ||
| 102 | { | ||
| 103 | return BLK_STS_NOTSUPP; | ||
| 104 | } | ||
| 105 | static inline void null_zone_write(struct nullb_cmd *cmd) {} | ||
| 106 | static inline void null_zone_reset(struct nullb_cmd *cmd) {} | ||
| 107 | #endif /* CONFIG_BLK_DEV_ZONED */ | ||
| 108 | #endif /* __NULL_BLK_H */ | ||
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk_main.c index 042c778e5a4e..6127e3ff7b4b 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk_main.c | |||
| @@ -7,14 +7,8 @@ | |||
| 7 | #include <linux/moduleparam.h> | 7 | #include <linux/moduleparam.h> |
| 8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
| 9 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
| 10 | #include <linux/blkdev.h> | ||
| 11 | #include <linux/init.h> | 10 | #include <linux/init.h> |
| 12 | #include <linux/slab.h> | 11 | #include "null_blk.h" |
| 13 | #include <linux/blk-mq.h> | ||
| 14 | #include <linux/hrtimer.h> | ||
| 15 | #include <linux/configfs.h> | ||
| 16 | #include <linux/badblocks.h> | ||
| 17 | #include <linux/fault-inject.h> | ||
| 18 | 12 | ||
| 19 | #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) | 13 | #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) |
| 20 | #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) | 14 | #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) |
| @@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps) | |||
| 35 | return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); | 29 | return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); |
| 36 | } | 30 | } |
| 37 | 31 | ||
| 38 | struct nullb_cmd { | ||
| 39 | struct list_head list; | ||
| 40 | struct llist_node ll_list; | ||
| 41 | struct __call_single_data csd; | ||
| 42 | struct request *rq; | ||
| 43 | struct bio *bio; | ||
| 44 | unsigned int tag; | ||
| 45 | blk_status_t error; | ||
| 46 | struct nullb_queue *nq; | ||
| 47 | struct hrtimer timer; | ||
| 48 | }; | ||
| 49 | |||
| 50 | struct nullb_queue { | ||
| 51 | unsigned long *tag_map; | ||
| 52 | wait_queue_head_t wait; | ||
| 53 | unsigned int queue_depth; | ||
| 54 | struct nullb_device *dev; | ||
| 55 | unsigned int requeue_selection; | ||
| 56 | |||
| 57 | struct nullb_cmd *cmds; | ||
| 58 | }; | ||
| 59 | |||
| 60 | /* | 32 | /* |
| 61 | * Status flags for nullb_device. | 33 | * Status flags for nullb_device. |
| 62 | * | 34 | * |
| @@ -92,52 +64,6 @@ struct nullb_page { | |||
| 92 | #define NULLB_PAGE_LOCK (MAP_SZ - 1) | 64 | #define NULLB_PAGE_LOCK (MAP_SZ - 1) |
| 93 | #define NULLB_PAGE_FREE (MAP_SZ - 2) | 65 | #define NULLB_PAGE_FREE (MAP_SZ - 2) |
| 94 | 66 | ||
| 95 | struct nullb_device { | ||
| 96 | struct nullb *nullb; | ||
| 97 | struct config_item item; | ||
| 98 | struct radix_tree_root data; /* data stored in the disk */ | ||
| 99 | struct radix_tree_root cache; /* disk cache data */ | ||
| 100 | unsigned long flags; /* device flags */ | ||
| 101 | unsigned int curr_cache; | ||
| 102 | struct badblocks badblocks; | ||
| 103 | |||
| 104 | unsigned long size; /* device size in MB */ | ||
| 105 | unsigned long completion_nsec; /* time in ns to complete a request */ | ||
| 106 | unsigned long cache_size; /* disk cache size in MB */ | ||
| 107 | unsigned int submit_queues; /* number of submission queues */ | ||
| 108 | unsigned int home_node; /* home node for the device */ | ||
| 109 | unsigned int queue_mode; /* block interface */ | ||
| 110 | unsigned int blocksize; /* block size */ | ||
| 111 | unsigned int irqmode; /* IRQ completion handler */ | ||
| 112 | unsigned int hw_queue_depth; /* queue depth */ | ||
| 113 | unsigned int index; /* index of the disk, only valid with a disk */ | ||
| 114 | unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ | ||
| 115 | bool blocking; /* blocking blk-mq device */ | ||
| 116 | bool use_per_node_hctx; /* use per-node allocation for hardware context */ | ||
| 117 | bool power; /* power on/off the device */ | ||
| 118 | bool memory_backed; /* if data is stored in memory */ | ||
| 119 | bool discard; /* if support discard */ | ||
| 120 | }; | ||
| 121 | |||
| 122 | struct nullb { | ||
| 123 | struct nullb_device *dev; | ||
| 124 | struct list_head list; | ||
| 125 | unsigned int index; | ||
| 126 | struct request_queue *q; | ||
| 127 | struct gendisk *disk; | ||
| 128 | struct blk_mq_tag_set *tag_set; | ||
| 129 | struct blk_mq_tag_set __tag_set; | ||
| 130 | unsigned int queue_depth; | ||
| 131 | atomic_long_t cur_bytes; | ||
| 132 | struct hrtimer bw_timer; | ||
| 133 | unsigned long cache_flush_pos; | ||
| 134 | spinlock_t lock; | ||
| 135 | |||
| 136 | struct nullb_queue *queues; | ||
| 137 | unsigned int nr_queues; | ||
| 138 | char disk_name[DISK_NAME_LEN]; | ||
| 139 | }; | ||
| 140 | |||
| 141 | static LIST_HEAD(nullb_list); | 67 | static LIST_HEAD(nullb_list); |
| 142 | static struct mutex lock; | 68 | static struct mutex lock; |
| 143 | static int null_major; | 69 | static int null_major; |
| @@ -254,6 +180,14 @@ static bool g_use_per_node_hctx; | |||
| 254 | module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); | 180 | module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); |
| 255 | MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); | 181 | MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); |
| 256 | 182 | ||
| 183 | static bool g_zoned; | ||
| 184 | module_param_named(zoned, g_zoned, bool, S_IRUGO); | ||
| 185 | MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); | ||
| 186 | |||
| 187 | static unsigned long g_zone_size = 256; | ||
| 188 | module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); | ||
| 189 | MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); | ||
| 190 | |||
| 257 | static struct nullb_device *null_alloc_dev(void); | 191 | static struct nullb_device *null_alloc_dev(void); |
| 258 | static void null_free_dev(struct nullb_device *dev); | 192 | static void null_free_dev(struct nullb_device *dev); |
| 259 | static void null_del_dev(struct nullb *nullb); | 193 | static void null_del_dev(struct nullb *nullb); |
| @@ -357,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool); | |||
| 357 | NULLB_DEVICE_ATTR(discard, bool); | 291 | NULLB_DEVICE_ATTR(discard, bool); |
| 358 | NULLB_DEVICE_ATTR(mbps, uint); | 292 | NULLB_DEVICE_ATTR(mbps, uint); |
| 359 | NULLB_DEVICE_ATTR(cache_size, ulong); | 293 | NULLB_DEVICE_ATTR(cache_size, ulong); |
| 294 | NULLB_DEVICE_ATTR(zoned, bool); | ||
| 295 | NULLB_DEVICE_ATTR(zone_size, ulong); | ||
| 360 | 296 | ||
| 361 | static ssize_t nullb_device_power_show(struct config_item *item, char *page) | 297 | static ssize_t nullb_device_power_show(struct config_item *item, char *page) |
| 362 | { | 298 | { |
| @@ -390,6 +326,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, | |||
| 390 | null_del_dev(dev->nullb); | 326 | null_del_dev(dev->nullb); |
| 391 | mutex_unlock(&lock); | 327 | mutex_unlock(&lock); |
| 392 | clear_bit(NULLB_DEV_FL_UP, &dev->flags); | 328 | clear_bit(NULLB_DEV_FL_UP, &dev->flags); |
| 329 | clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); | ||
| 393 | } | 330 | } |
| 394 | 331 | ||
| 395 | return count; | 332 | return count; |
| @@ -468,6 +405,8 @@ static struct configfs_attribute *nullb_device_attrs[] = { | |||
| 468 | &nullb_device_attr_mbps, | 405 | &nullb_device_attr_mbps, |
| 469 | &nullb_device_attr_cache_size, | 406 | &nullb_device_attr_cache_size, |
| 470 | &nullb_device_attr_badblocks, | 407 | &nullb_device_attr_badblocks, |
| 408 | &nullb_device_attr_zoned, | ||
| 409 | &nullb_device_attr_zone_size, | ||
| 471 | NULL, | 410 | NULL, |
| 472 | }; | 411 | }; |
| 473 | 412 | ||
| @@ -520,7 +459,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) | |||
| 520 | 459 | ||
| 521 | static ssize_t memb_group_features_show(struct config_item *item, char *page) | 460 | static ssize_t memb_group_features_show(struct config_item *item, char *page) |
| 522 | { | 461 | { |
| 523 | return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n"); | 462 | return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n"); |
| 524 | } | 463 | } |
| 525 | 464 | ||
| 526 | CONFIGFS_ATTR_RO(memb_group_, features); | 465 | CONFIGFS_ATTR_RO(memb_group_, features); |
| @@ -579,6 +518,8 @@ static struct nullb_device *null_alloc_dev(void) | |||
| 579 | dev->hw_queue_depth = g_hw_queue_depth; | 518 | dev->hw_queue_depth = g_hw_queue_depth; |
| 580 | dev->blocking = g_blocking; | 519 | dev->blocking = g_blocking; |
| 581 | dev->use_per_node_hctx = g_use_per_node_hctx; | 520 | dev->use_per_node_hctx = g_use_per_node_hctx; |
| 521 | dev->zoned = g_zoned; | ||
| 522 | dev->zone_size = g_zone_size; | ||
| 582 | return dev; | 523 | return dev; |
| 583 | } | 524 | } |
| 584 | 525 | ||
| @@ -587,6 +528,7 @@ static void null_free_dev(struct nullb_device *dev) | |||
| 587 | if (!dev) | 528 | if (!dev) |
| 588 | return; | 529 | return; |
| 589 | 530 | ||
| 531 | null_zone_exit(dev); | ||
| 590 | badblocks_exit(&dev->badblocks); | 532 | badblocks_exit(&dev->badblocks); |
| 591 | kfree(dev); | 533 | kfree(dev); |
| 592 | } | 534 | } |
| @@ -862,7 +804,9 @@ static struct nullb_page *null_lookup_page(struct nullb *nullb, | |||
| 862 | } | 804 | } |
| 863 | 805 | ||
| 864 | static struct nullb_page *null_insert_page(struct nullb *nullb, | 806 | static struct nullb_page *null_insert_page(struct nullb *nullb, |
| 865 | sector_t sector, bool ignore_cache) | 807 | sector_t sector, bool ignore_cache) |
| 808 | __releases(&nullb->lock) | ||
| 809 | __acquires(&nullb->lock) | ||
| 866 | { | 810 | { |
| 867 | u64 idx; | 811 | u64 idx; |
| 868 | struct nullb_page *t_page; | 812 | struct nullb_page *t_page; |
| @@ -1219,6 +1163,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) | |||
| 1219 | struct nullb *nullb = dev->nullb; | 1163 | struct nullb *nullb = dev->nullb; |
| 1220 | int err = 0; | 1164 | int err = 0; |
| 1221 | 1165 | ||
| 1166 | if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) { | ||
| 1167 | cmd->error = null_zone_report(nullb, cmd); | ||
| 1168 | goto out; | ||
| 1169 | } | ||
| 1170 | |||
| 1222 | if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { | 1171 | if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { |
| 1223 | struct request *rq = cmd->rq; | 1172 | struct request *rq = cmd->rq; |
| 1224 | 1173 | ||
| @@ -1283,6 +1232,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) | |||
| 1283 | } | 1232 | } |
| 1284 | } | 1233 | } |
| 1285 | cmd->error = errno_to_blk_status(err); | 1234 | cmd->error = errno_to_blk_status(err); |
| 1235 | |||
| 1236 | if (!cmd->error && dev->zoned) { | ||
| 1237 | if (req_op(cmd->rq) == REQ_OP_WRITE) | ||
| 1238 | null_zone_write(cmd); | ||
| 1239 | else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET) | ||
| 1240 | null_zone_reset(cmd); | ||
| 1241 | } | ||
| 1286 | out: | 1242 | out: |
| 1287 | /* Complete IO by inline, softirq or timer */ | 1243 | /* Complete IO by inline, softirq or timer */ |
| 1288 | switch (dev->irqmode) { | 1244 | switch (dev->irqmode) { |
| @@ -1810,6 +1766,15 @@ static int null_add_dev(struct nullb_device *dev) | |||
| 1810 | blk_queue_flush_queueable(nullb->q, true); | 1766 | blk_queue_flush_queueable(nullb->q, true); |
| 1811 | } | 1767 | } |
| 1812 | 1768 | ||
| 1769 | if (dev->zoned) { | ||
| 1770 | rv = null_zone_init(dev); | ||
| 1771 | if (rv) | ||
| 1772 | goto out_cleanup_blk_queue; | ||
| 1773 | |||
| 1774 | blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); | ||
| 1775 | nullb->q->limits.zoned = BLK_ZONED_HM; | ||
| 1776 | } | ||
| 1777 | |||
| 1813 | nullb->q->queuedata = nullb; | 1778 | nullb->q->queuedata = nullb; |
| 1814 | blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); | 1779 | blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); |
| 1815 | blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); | 1780 | blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); |
| @@ -1828,13 +1793,16 @@ static int null_add_dev(struct nullb_device *dev) | |||
| 1828 | 1793 | ||
| 1829 | rv = null_gendisk_register(nullb); | 1794 | rv = null_gendisk_register(nullb); |
| 1830 | if (rv) | 1795 | if (rv) |
| 1831 | goto out_cleanup_blk_queue; | 1796 | goto out_cleanup_zone; |
| 1832 | 1797 | ||
| 1833 | mutex_lock(&lock); | 1798 | mutex_lock(&lock); |
| 1834 | list_add_tail(&nullb->list, &nullb_list); | 1799 | list_add_tail(&nullb->list, &nullb_list); |
| 1835 | mutex_unlock(&lock); | 1800 | mutex_unlock(&lock); |
| 1836 | 1801 | ||
| 1837 | return 0; | 1802 | return 0; |
| 1803 | out_cleanup_zone: | ||
| 1804 | if (dev->zoned) | ||
| 1805 | null_zone_exit(dev); | ||
| 1838 | out_cleanup_blk_queue: | 1806 | out_cleanup_blk_queue: |
| 1839 | blk_cleanup_queue(nullb->q); | 1807 | blk_cleanup_queue(nullb->q); |
| 1840 | out_cleanup_tags: | 1808 | out_cleanup_tags: |
| @@ -1861,6 +1829,11 @@ static int __init null_init(void) | |||
| 1861 | g_bs = PAGE_SIZE; | 1829 | g_bs = PAGE_SIZE; |
| 1862 | } | 1830 | } |
| 1863 | 1831 | ||
| 1832 | if (!is_power_of_2(g_zone_size)) { | ||
| 1833 | pr_err("null_blk: zone_size must be power-of-two\n"); | ||
| 1834 | return -EINVAL; | ||
| 1835 | } | ||
| 1836 | |||
| 1864 | if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { | 1837 | if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { |
| 1865 | if (g_submit_queues != nr_online_nodes) { | 1838 | if (g_submit_queues != nr_online_nodes) { |
| 1866 | pr_warn("null_blk: submit_queues param is set to %u.\n", | 1839 | pr_warn("null_blk: submit_queues param is set to %u.\n", |
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c new file mode 100644 index 000000000000..a979ca00d7be --- /dev/null +++ b/drivers/block/null_blk_zoned.c | |||
| @@ -0,0 +1,149 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | #include <linux/vmalloc.h> | ||
| 3 | #include "null_blk.h" | ||
| 4 | |||
| 5 | /* zone_size in MBs to sectors. */ | ||
| 6 | #define ZONE_SIZE_SHIFT 11 | ||
| 7 | |||
| 8 | static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) | ||
| 9 | { | ||
| 10 | return sect >> ilog2(dev->zone_size_sects); | ||
| 11 | } | ||
| 12 | |||
| 13 | int null_zone_init(struct nullb_device *dev) | ||
| 14 | { | ||
| 15 | sector_t dev_size = (sector_t)dev->size * 1024 * 1024; | ||
| 16 | sector_t sector = 0; | ||
| 17 | unsigned int i; | ||
| 18 | |||
| 19 | if (!is_power_of_2(dev->zone_size)) { | ||
| 20 | pr_err("null_blk: zone_size must be power-of-two\n"); | ||
| 21 | return -EINVAL; | ||
| 22 | } | ||
| 23 | |||
| 24 | dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; | ||
| 25 | dev->nr_zones = dev_size >> | ||
| 26 | (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); | ||
| 27 | dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), | ||
| 28 | GFP_KERNEL | __GFP_ZERO); | ||
| 29 | if (!dev->zones) | ||
| 30 | return -ENOMEM; | ||
| 31 | |||
| 32 | for (i = 0; i < dev->nr_zones; i++) { | ||
| 33 | struct blk_zone *zone = &dev->zones[i]; | ||
| 34 | |||
| 35 | zone->start = zone->wp = sector; | ||
| 36 | zone->len = dev->zone_size_sects; | ||
| 37 | zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; | ||
| 38 | zone->cond = BLK_ZONE_COND_EMPTY; | ||
| 39 | |||
| 40 | sector += dev->zone_size_sects; | ||
| 41 | } | ||
| 42 | |||
| 43 | return 0; | ||
| 44 | } | ||
| 45 | |||
| 46 | void null_zone_exit(struct nullb_device *dev) | ||
| 47 | { | ||
| 48 | kvfree(dev->zones); | ||
| 49 | } | ||
| 50 | |||
| 51 | static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq, | ||
| 52 | unsigned int zno, unsigned int nr_zones) | ||
| 53 | { | ||
| 54 | struct blk_zone_report_hdr *hdr = NULL; | ||
| 55 | struct bio_vec bvec; | ||
| 56 | struct bvec_iter iter; | ||
| 57 | void *addr; | ||
| 58 | unsigned int zones_to_cpy; | ||
| 59 | |||
| 60 | bio_for_each_segment(bvec, rq->bio, iter) { | ||
| 61 | addr = kmap_atomic(bvec.bv_page); | ||
| 62 | |||
| 63 | zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone); | ||
| 64 | |||
| 65 | if (!hdr) { | ||
| 66 | hdr = (struct blk_zone_report_hdr *)addr; | ||
| 67 | hdr->nr_zones = nr_zones; | ||
| 68 | zones_to_cpy--; | ||
| 69 | addr += sizeof(struct blk_zone_report_hdr); | ||
| 70 | } | ||
| 71 | |||
| 72 | zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones); | ||
| 73 | |||
| 74 | memcpy(addr, &dev->zones[zno], | ||
| 75 | zones_to_cpy * sizeof(struct blk_zone)); | ||
| 76 | |||
| 77 | kunmap_atomic(addr); | ||
| 78 | |||
| 79 | nr_zones -= zones_to_cpy; | ||
| 80 | zno += zones_to_cpy; | ||
| 81 | |||
| 82 | if (!nr_zones) | ||
| 83 | break; | ||
| 84 | } | ||
| 85 | } | ||
| 86 | |||
| 87 | blk_status_t null_zone_report(struct nullb *nullb, | ||
| 88 | struct nullb_cmd *cmd) | ||
| 89 | { | ||
| 90 | struct nullb_device *dev = nullb->dev; | ||
| 91 | struct request *rq = cmd->rq; | ||
| 92 | unsigned int zno = null_zone_no(dev, blk_rq_pos(rq)); | ||
| 93 | unsigned int nr_zones = dev->nr_zones - zno; | ||
| 94 | unsigned int max_zones = (blk_rq_bytes(rq) / | ||
| 95 | sizeof(struct blk_zone)) - 1; | ||
| 96 | |||
| 97 | nr_zones = min_t(unsigned int, nr_zones, max_zones); | ||
| 98 | |||
| 99 | null_zone_fill_rq(nullb->dev, rq, zno, nr_zones); | ||
| 100 | |||
| 101 | return BLK_STS_OK; | ||
| 102 | } | ||
| 103 | |||
| 104 | void null_zone_write(struct nullb_cmd *cmd) | ||
| 105 | { | ||
| 106 | struct nullb_device *dev = cmd->nq->dev; | ||
| 107 | struct request *rq = cmd->rq; | ||
| 108 | sector_t sector = blk_rq_pos(rq); | ||
| 109 | unsigned int rq_sectors = blk_rq_sectors(rq); | ||
| 110 | unsigned int zno = null_zone_no(dev, sector); | ||
| 111 | struct blk_zone *zone = &dev->zones[zno]; | ||
| 112 | |||
| 113 | switch (zone->cond) { | ||
| 114 | case BLK_ZONE_COND_FULL: | ||
| 115 | /* Cannot write to a full zone */ | ||
| 116 | cmd->error = BLK_STS_IOERR; | ||
| 117 | break; | ||
| 118 | case BLK_ZONE_COND_EMPTY: | ||
| 119 | case BLK_ZONE_COND_IMP_OPEN: | ||
| 120 | /* Writes must be at the write pointer position */ | ||
| 121 | if (blk_rq_pos(rq) != zone->wp) { | ||
| 122 | cmd->error = BLK_STS_IOERR; | ||
| 123 | break; | ||
| 124 | } | ||
| 125 | |||
| 126 | if (zone->cond == BLK_ZONE_COND_EMPTY) | ||
| 127 | zone->cond = BLK_ZONE_COND_IMP_OPEN; | ||
| 128 | |||
| 129 | zone->wp += rq_sectors; | ||
| 130 | if (zone->wp == zone->start + zone->len) | ||
| 131 | zone->cond = BLK_ZONE_COND_FULL; | ||
| 132 | break; | ||
| 133 | default: | ||
| 134 | /* Invalid zone condition */ | ||
| 135 | cmd->error = BLK_STS_IOERR; | ||
| 136 | break; | ||
| 137 | } | ||
| 138 | } | ||
| 139 | |||
| 140 | void null_zone_reset(struct nullb_cmd *cmd) | ||
| 141 | { | ||
| 142 | struct nullb_device *dev = cmd->nq->dev; | ||
| 143 | struct request *rq = cmd->rq; | ||
| 144 | unsigned int zno = null_zone_no(dev, blk_rq_pos(rq)); | ||
| 145 | struct blk_zone *zone = &dev->zones[zno]; | ||
| 146 | |||
| 147 | zone->cond = BLK_ZONE_COND_EMPTY; | ||
| 148 | zone->wp = zone->start; | ||
| 149 | } | ||
diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c index 4f27e7392e38..f5f63ca2889d 100644 --- a/drivers/block/paride/bpck.c +++ b/drivers/block/paride/bpck.c | |||
| @@ -347,7 +347,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose ) | |||
| 347 | 347 | ||
| 348 | static void bpck_read_eeprom ( PIA *pi, char * buf ) | 348 | static void bpck_read_eeprom ( PIA *pi, char * buf ) |
| 349 | 349 | ||
| 350 | { int i,j,k,n,p,v,f, om, od; | 350 | { int i, j, k, p, v, f, om, od; |
| 351 | 351 | ||
| 352 | bpck_force_spp(pi); | 352 | bpck_force_spp(pi); |
| 353 | 353 | ||
| @@ -356,7 +356,6 @@ static void bpck_read_eeprom ( PIA *pi, char * buf ) | |||
| 356 | 356 | ||
| 357 | bpck_connect(pi); | 357 | bpck_connect(pi); |
| 358 | 358 | ||
| 359 | n = 0; | ||
| 360 | WR(4,0); | 359 | WR(4,0); |
| 361 | for (i=0;i<64;i++) { | 360 | for (i=0;i<64;i++) { |
| 362 | WR(6,8); | 361 | WR(6,8); |
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 8961b190e256..7cf947586fe4 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c | |||
| @@ -426,6 +426,7 @@ static void run_fsm(void) | |||
| 426 | pd_claimed = 1; | 426 | pd_claimed = 1; |
| 427 | if (!pi_schedule_claimed(pi_current, run_fsm)) | 427 | if (!pi_schedule_claimed(pi_current, run_fsm)) |
| 428 | return; | 428 | return; |
| 429 | /* fall through */ | ||
| 429 | case 1: | 430 | case 1: |
| 430 | pd_claimed = 2; | 431 | pd_claimed = 2; |
| 431 | pi_current->proto->connect(pi_current); | 432 | pi_current->proto->connect(pi_current); |
| @@ -445,6 +446,7 @@ static void run_fsm(void) | |||
| 445 | spin_unlock_irqrestore(&pd_lock, saved_flags); | 446 | spin_unlock_irqrestore(&pd_lock, saved_flags); |
| 446 | if (stop) | 447 | if (stop) |
| 447 | return; | 448 | return; |
| 449 | /* fall through */ | ||
| 448 | case Hold: | 450 | case Hold: |
| 449 | schedule_fsm(); | 451 | schedule_fsm(); |
| 450 | return; | 452 | return; |
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b3f83cd96f33..e285413d4a75 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c | |||
| @@ -67,7 +67,7 @@ | |||
| 67 | #include <scsi/scsi.h> | 67 | #include <scsi/scsi.h> |
| 68 | #include <linux/debugfs.h> | 68 | #include <linux/debugfs.h> |
| 69 | #include <linux/device.h> | 69 | #include <linux/device.h> |
| 70 | 70 | #include <linux/nospec.h> | |
| 71 | #include <linux/uaccess.h> | 71 | #include <linux/uaccess.h> |
| 72 | 72 | ||
| 73 | #define DRIVER_NAME "pktcdvd" | 73 | #define DRIVER_NAME "pktcdvd" |
| @@ -748,13 +748,13 @@ static const char *sense_key_string(__u8 index) | |||
| 748 | static void pkt_dump_sense(struct pktcdvd_device *pd, | 748 | static void pkt_dump_sense(struct pktcdvd_device *pd, |
| 749 | struct packet_command *cgc) | 749 | struct packet_command *cgc) |
| 750 | { | 750 | { |
| 751 | struct request_sense *sense = cgc->sense; | 751 | struct scsi_sense_hdr *sshdr = cgc->sshdr; |
| 752 | 752 | ||
| 753 | if (sense) | 753 | if (sshdr) |
| 754 | pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", | 754 | pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", |
| 755 | CDROM_PACKET_SIZE, cgc->cmd, | 755 | CDROM_PACKET_SIZE, cgc->cmd, |
| 756 | sense->sense_key, sense->asc, sense->ascq, | 756 | sshdr->sense_key, sshdr->asc, sshdr->ascq, |
| 757 | sense_key_string(sense->sense_key)); | 757 | sense_key_string(sshdr->sense_key)); |
| 758 | else | 758 | else |
| 759 | pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); | 759 | pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); |
| 760 | } | 760 | } |
| @@ -787,18 +787,19 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, | |||
| 787 | unsigned write_speed, unsigned read_speed) | 787 | unsigned write_speed, unsigned read_speed) |
| 788 | { | 788 | { |
| 789 | struct packet_command cgc; | 789 | struct packet_command cgc; |
| 790 | struct request_sense sense; | 790 | struct scsi_sense_hdr sshdr; |
| 791 | int ret; | 791 | int ret; |
| 792 | 792 | ||
| 793 | init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); | 793 | init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); |
| 794 | cgc.sense = &sense; | 794 | cgc.sshdr = &sshdr; |
| 795 | cgc.cmd[0] = GPCMD_SET_SPEED; | 795 | cgc.cmd[0] = GPCMD_SET_SPEED; |
| 796 | cgc.cmd[2] = (read_speed >> 8) & 0xff; | 796 | cgc.cmd[2] = (read_speed >> 8) & 0xff; |
| 797 | cgc.cmd[3] = read_speed & 0xff; | 797 | cgc.cmd[3] = read_speed & 0xff; |
| 798 | cgc.cmd[4] = (write_speed >> 8) & 0xff; | 798 | cgc.cmd[4] = (write_speed >> 8) & 0xff; |
| 799 | cgc.cmd[5] = write_speed & 0xff; | 799 | cgc.cmd[5] = write_speed & 0xff; |
| 800 | 800 | ||
| 801 | if ((ret = pkt_generic_packet(pd, &cgc))) | 801 | ret = pkt_generic_packet(pd, &cgc); |
| 802 | if (ret) | ||
| 802 | pkt_dump_sense(pd, &cgc); | 803 | pkt_dump_sense(pd, &cgc); |
| 803 | 804 | ||
| 804 | return ret; | 805 | return ret; |
| @@ -1562,7 +1563,8 @@ static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) | |||
| 1562 | cgc.cmd[8] = cgc.buflen = 2; | 1563 | cgc.cmd[8] = cgc.buflen = 2; |
| 1563 | cgc.quiet = 1; | 1564 | cgc.quiet = 1; |
| 1564 | 1565 | ||
| 1565 | if ((ret = pkt_generic_packet(pd, &cgc))) | 1566 | ret = pkt_generic_packet(pd, &cgc); |
| 1567 | if (ret) | ||
| 1566 | return ret; | 1568 | return ret; |
| 1567 | 1569 | ||
| 1568 | /* not all drives have the same disc_info length, so requeue | 1570 | /* not all drives have the same disc_info length, so requeue |
| @@ -1591,7 +1593,8 @@ static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, | |||
| 1591 | cgc.cmd[8] = 8; | 1593 | cgc.cmd[8] = 8; |
| 1592 | cgc.quiet = 1; | 1594 | cgc.quiet = 1; |
| 1593 | 1595 | ||
| 1594 | if ((ret = pkt_generic_packet(pd, &cgc))) | 1596 | ret = pkt_generic_packet(pd, &cgc); |
| 1597 | if (ret) | ||
| 1595 | return ret; | 1598 | return ret; |
| 1596 | 1599 | ||
| 1597 | cgc.buflen = be16_to_cpu(ti->track_information_length) + | 1600 | cgc.buflen = be16_to_cpu(ti->track_information_length) + |
| @@ -1612,17 +1615,20 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, | |||
| 1612 | __u32 last_track; | 1615 | __u32 last_track; |
| 1613 | int ret = -1; | 1616 | int ret = -1; |
| 1614 | 1617 | ||
| 1615 | if ((ret = pkt_get_disc_info(pd, &di))) | 1618 | ret = pkt_get_disc_info(pd, &di); |
| 1619 | if (ret) | ||
| 1616 | return ret; | 1620 | return ret; |
| 1617 | 1621 | ||
| 1618 | last_track = (di.last_track_msb << 8) | di.last_track_lsb; | 1622 | last_track = (di.last_track_msb << 8) | di.last_track_lsb; |
| 1619 | if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) | 1623 | ret = pkt_get_track_info(pd, last_track, 1, &ti); |
| 1624 | if (ret) | ||
| 1620 | return ret; | 1625 | return ret; |
| 1621 | 1626 | ||
| 1622 | /* if this track is blank, try the previous. */ | 1627 | /* if this track is blank, try the previous. */ |
| 1623 | if (ti.blank) { | 1628 | if (ti.blank) { |
| 1624 | last_track--; | 1629 | last_track--; |
| 1625 | if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) | 1630 | ret = pkt_get_track_info(pd, last_track, 1, &ti); |
| 1631 | if (ret) | ||
| 1626 | return ret; | 1632 | return ret; |
| 1627 | } | 1633 | } |
| 1628 | 1634 | ||
| @@ -1645,7 +1651,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, | |||
| 1645 | static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) | 1651 | static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) |
| 1646 | { | 1652 | { |
| 1647 | struct packet_command cgc; | 1653 | struct packet_command cgc; |
| 1648 | struct request_sense sense; | 1654 | struct scsi_sense_hdr sshdr; |
| 1649 | write_param_page *wp; | 1655 | write_param_page *wp; |
| 1650 | char buffer[128]; | 1656 | char buffer[128]; |
| 1651 | int ret, size; | 1657 | int ret, size; |
| @@ -1656,8 +1662,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) | |||
| 1656 | 1662 | ||
| 1657 | memset(buffer, 0, sizeof(buffer)); | 1663 | memset(buffer, 0, sizeof(buffer)); |
| 1658 | init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); | 1664 | init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); |
| 1659 | cgc.sense = &sense; | 1665 | cgc.sshdr = &sshdr; |
| 1660 | if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { | 1666 | ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); |
| 1667 | if (ret) { | ||
| 1661 | pkt_dump_sense(pd, &cgc); | 1668 | pkt_dump_sense(pd, &cgc); |
| 1662 | return ret; | 1669 | return ret; |
| 1663 | } | 1670 | } |
| @@ -1671,8 +1678,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) | |||
| 1671 | * now get it all | 1678 | * now get it all |
| 1672 | */ | 1679 | */ |
| 1673 | init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); | 1680 | init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); |
| 1674 | cgc.sense = &sense; | 1681 | cgc.sshdr = &sshdr; |
| 1675 | if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { | 1682 | ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); |
| 1683 | if (ret) { | ||
| 1676 | pkt_dump_sense(pd, &cgc); | 1684 | pkt_dump_sense(pd, &cgc); |
| 1677 | return ret; | 1685 | return ret; |
| 1678 | } | 1686 | } |
| @@ -1714,7 +1722,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) | |||
| 1714 | wp->packet_size = cpu_to_be32(pd->settings.size >> 2); | 1722 | wp->packet_size = cpu_to_be32(pd->settings.size >> 2); |
| 1715 | 1723 | ||
| 1716 | cgc.buflen = cgc.cmd[8] = size; | 1724 | cgc.buflen = cgc.cmd[8] = size; |
| 1717 | if ((ret = pkt_mode_select(pd, &cgc))) { | 1725 | ret = pkt_mode_select(pd, &cgc); |
| 1726 | if (ret) { | ||
| 1718 | pkt_dump_sense(pd, &cgc); | 1727 | pkt_dump_sense(pd, &cgc); |
| 1719 | return ret; | 1728 | return ret; |
| 1720 | } | 1729 | } |
| @@ -1819,7 +1828,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) | |||
| 1819 | memset(&di, 0, sizeof(disc_information)); | 1828 | memset(&di, 0, sizeof(disc_information)); |
| 1820 | memset(&ti, 0, sizeof(track_information)); | 1829 | memset(&ti, 0, sizeof(track_information)); |
| 1821 | 1830 | ||
| 1822 | if ((ret = pkt_get_disc_info(pd, &di))) { | 1831 | ret = pkt_get_disc_info(pd, &di); |
| 1832 | if (ret) { | ||
| 1823 | pkt_err(pd, "failed get_disc\n"); | 1833 | pkt_err(pd, "failed get_disc\n"); |
| 1824 | return ret; | 1834 | return ret; |
| 1825 | } | 1835 | } |
| @@ -1830,7 +1840,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) | |||
| 1830 | pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; | 1840 | pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; |
| 1831 | 1841 | ||
| 1832 | track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ | 1842 | track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ |
| 1833 | if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { | 1843 | ret = pkt_get_track_info(pd, track, 1, &ti); |
| 1844 | if (ret) { | ||
| 1834 | pkt_err(pd, "failed get_track\n"); | 1845 | pkt_err(pd, "failed get_track\n"); |
| 1835 | return ret; | 1846 | return ret; |
| 1836 | } | 1847 | } |
| @@ -1905,12 +1916,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, | |||
| 1905 | int set) | 1916 | int set) |
| 1906 | { | 1917 | { |
| 1907 | struct packet_command cgc; | 1918 | struct packet_command cgc; |
| 1908 | struct request_sense sense; | 1919 | struct scsi_sense_hdr sshdr; |
| 1909 | unsigned char buf[64]; | 1920 | unsigned char buf[64]; |
| 1910 | int ret; | 1921 | int ret; |
| 1911 | 1922 | ||
| 1912 | init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); | 1923 | init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); |
| 1913 | cgc.sense = &sense; | 1924 | cgc.sshdr = &sshdr; |
| 1914 | cgc.buflen = pd->mode_offset + 12; | 1925 | cgc.buflen = pd->mode_offset + 12; |
| 1915 | 1926 | ||
| 1916 | /* | 1927 | /* |
| @@ -1918,7 +1929,8 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, | |||
| 1918 | */ | 1929 | */ |
| 1919 | cgc.quiet = 1; | 1930 | cgc.quiet = 1; |
| 1920 | 1931 | ||
| 1921 | if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0))) | 1932 | ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0); |
| 1933 | if (ret) | ||
| 1922 | return ret; | 1934 | return ret; |
| 1923 | 1935 | ||
| 1924 | buf[pd->mode_offset + 10] |= (!!set << 2); | 1936 | buf[pd->mode_offset + 10] |= (!!set << 2); |
| @@ -1950,14 +1962,14 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, | |||
| 1950 | unsigned *write_speed) | 1962 | unsigned *write_speed) |
| 1951 | { | 1963 | { |
| 1952 | struct packet_command cgc; | 1964 | struct packet_command cgc; |
| 1953 | struct request_sense sense; | 1965 | struct scsi_sense_hdr sshdr; |
| 1954 | unsigned char buf[256+18]; | 1966 | unsigned char buf[256+18]; |
| 1955 | unsigned char *cap_buf; | 1967 | unsigned char *cap_buf; |
| 1956 | int ret, offset; | 1968 | int ret, offset; |
| 1957 | 1969 | ||
| 1958 | cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; | 1970 | cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; |
| 1959 | init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); | 1971 | init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); |
| 1960 | cgc.sense = &sense; | 1972 | cgc.sshdr = &sshdr; |
| 1961 | 1973 | ||
| 1962 | ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); | 1974 | ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); |
| 1963 | if (ret) { | 1975 | if (ret) { |
| @@ -2011,13 +2023,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, | |||
| 2011 | unsigned *speed) | 2023 | unsigned *speed) |
| 2012 | { | 2024 | { |
| 2013 | struct packet_command cgc; | 2025 | struct packet_command cgc; |
| 2014 | struct request_sense sense; | 2026 | struct scsi_sense_hdr sshdr; |
| 2015 | unsigned char buf[64]; | 2027 | unsigned char buf[64]; |
| 2016 | unsigned int size, st, sp; | 2028 | unsigned int size, st, sp; |
| 2017 | int ret; | 2029 | int ret; |
| 2018 | 2030 | ||
| 2019 | init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); | 2031 | init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); |
| 2020 | cgc.sense = &sense; | 2032 | cgc.sshdr = &sshdr; |
| 2021 | cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; | 2033 | cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; |
| 2022 | cgc.cmd[1] = 2; | 2034 | cgc.cmd[1] = 2; |
| 2023 | cgc.cmd[2] = 4; /* READ ATIP */ | 2035 | cgc.cmd[2] = 4; /* READ ATIP */ |
| @@ -2032,7 +2044,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, | |||
| 2032 | size = sizeof(buf); | 2044 | size = sizeof(buf); |
| 2033 | 2045 | ||
| 2034 | init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); | 2046 | init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); |
| 2035 | cgc.sense = &sense; | 2047 | cgc.sshdr = &sshdr; |
| 2036 | cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; | 2048 | cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; |
| 2037 | cgc.cmd[1] = 2; | 2049 | cgc.cmd[1] = 2; |
| 2038 | cgc.cmd[2] = 4; | 2050 | cgc.cmd[2] = 4; |
| @@ -2083,17 +2095,18 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, | |||
| 2083 | static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) | 2095 | static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) |
| 2084 | { | 2096 | { |
| 2085 | struct packet_command cgc; | 2097 | struct packet_command cgc; |
| 2086 | struct request_sense sense; | 2098 | struct scsi_sense_hdr sshdr; |
| 2087 | int ret; | 2099 | int ret; |
| 2088 | 2100 | ||
| 2089 | pkt_dbg(2, pd, "Performing OPC\n"); | 2101 | pkt_dbg(2, pd, "Performing OPC\n"); |
| 2090 | 2102 | ||
| 2091 | init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); | 2103 | init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); |
| 2092 | cgc.sense = &sense; | 2104 | cgc.sshdr = &sshdr; |
| 2093 | cgc.timeout = 60*HZ; | 2105 | cgc.timeout = 60*HZ; |
| 2094 | cgc.cmd[0] = GPCMD_SEND_OPC; | 2106 | cgc.cmd[0] = GPCMD_SEND_OPC; |
| 2095 | cgc.cmd[1] = 1; | 2107 | cgc.cmd[1] = 1; |
| 2096 | if ((ret = pkt_generic_packet(pd, &cgc))) | 2108 | ret = pkt_generic_packet(pd, &cgc); |
| 2109 | if (ret) | ||
| 2097 | pkt_dump_sense(pd, &cgc); | 2110 | pkt_dump_sense(pd, &cgc); |
| 2098 | return ret; | 2111 | return ret; |
| 2099 | } | 2112 | } |
| @@ -2103,19 +2116,22 @@ static int pkt_open_write(struct pktcdvd_device *pd) | |||
| 2103 | int ret; | 2116 | int ret; |
| 2104 | unsigned int write_speed, media_write_speed, read_speed; | 2117 | unsigned int write_speed, media_write_speed, read_speed; |
| 2105 | 2118 | ||
| 2106 | if ((ret = pkt_probe_settings(pd))) { | 2119 | ret = pkt_probe_settings(pd); |
| 2120 | if (ret) { | ||
| 2107 | pkt_dbg(2, pd, "failed probe\n"); | 2121 | pkt_dbg(2, pd, "failed probe\n"); |
| 2108 | return ret; | 2122 | return ret; |
| 2109 | } | 2123 | } |
| 2110 | 2124 | ||
| 2111 | if ((ret = pkt_set_write_settings(pd))) { | 2125 | ret = pkt_set_write_settings(pd); |
| 2126 | if (ret) { | ||
| 2112 | pkt_dbg(1, pd, "failed saving write settings\n"); | 2127 | pkt_dbg(1, pd, "failed saving write settings\n"); |
| 2113 | return -EIO; | 2128 | return -EIO; |
| 2114 | } | 2129 | } |
| 2115 | 2130 | ||
| 2116 | pkt_write_caching(pd, USE_WCACHING); | 2131 | pkt_write_caching(pd, USE_WCACHING); |
| 2117 | 2132 | ||
| 2118 | if ((ret = pkt_get_max_speed(pd, &write_speed))) | 2133 | ret = pkt_get_max_speed(pd, &write_speed); |
| 2134 | if (ret) | ||
| 2119 | write_speed = 16 * 177; | 2135 | write_speed = 16 * 177; |
| 2120 | switch (pd->mmc3_profile) { | 2136 | switch (pd->mmc3_profile) { |
| 2121 | case 0x13: /* DVD-RW */ | 2137 | case 0x13: /* DVD-RW */ |
| @@ -2124,7 +2140,8 @@ static int pkt_open_write(struct pktcdvd_device *pd) | |||
| 2124 | pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); | 2140 | pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); |
| 2125 | break; | 2141 | break; |
| 2126 | default: | 2142 | default: |
| 2127 | if ((ret = pkt_media_speed(pd, &media_write_speed))) | 2143 | ret = pkt_media_speed(pd, &media_write_speed); |
| 2144 | if (ret) | ||
| 2128 | media_write_speed = 16; | 2145 | media_write_speed = 16; |
| 2129 | write_speed = min(write_speed, media_write_speed * 177); | 2146 | write_speed = min(write_speed, media_write_speed * 177); |
| 2130 | pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); | 2147 | pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); |
| @@ -2132,14 +2149,16 @@ static int pkt_open_write(struct pktcdvd_device *pd) | |||
| 2132 | } | 2149 | } |
| 2133 | read_speed = write_speed; | 2150 | read_speed = write_speed; |
| 2134 | 2151 | ||
| 2135 | if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { | 2152 | ret = pkt_set_speed(pd, write_speed, read_speed); |
| 2153 | if (ret) { | ||
| 2136 | pkt_dbg(1, pd, "couldn't set write speed\n"); | 2154 | pkt_dbg(1, pd, "couldn't set write speed\n"); |
| 2137 | return -EIO; | 2155 | return -EIO; |
| 2138 | } | 2156 | } |
| 2139 | pd->write_speed = write_speed; | 2157 | pd->write_speed = write_speed; |
| 2140 | pd->read_speed = read_speed; | 2158 | pd->read_speed = read_speed; |
| 2141 | 2159 | ||
| 2142 | if ((ret = pkt_perform_opc(pd))) { | 2160 | ret = pkt_perform_opc(pd); |
| 2161 | if (ret) { | ||
| 2143 | pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); | 2162 | pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); |
| 2144 | } | 2163 | } |
| 2145 | 2164 | ||
| @@ -2161,10 +2180,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) | |||
| 2161 | * so bdget() can't fail. | 2180 | * so bdget() can't fail. |
| 2162 | */ | 2181 | */ |
| 2163 | bdget(pd->bdev->bd_dev); | 2182 | bdget(pd->bdev->bd_dev); |
| 2164 | if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd))) | 2183 | ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd); |
| 2184 | if (ret) | ||
| 2165 | goto out; | 2185 | goto out; |
| 2166 | 2186 | ||
| 2167 | if ((ret = pkt_get_last_written(pd, &lba))) { | 2187 | ret = pkt_get_last_written(pd, &lba); |
| 2188 | if (ret) { | ||
| 2168 | pkt_err(pd, "pkt_get_last_written failed\n"); | 2189 | pkt_err(pd, "pkt_get_last_written failed\n"); |
| 2169 | goto out_putdev; | 2190 | goto out_putdev; |
| 2170 | } | 2191 | } |
| @@ -2175,7 +2196,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) | |||
| 2175 | 2196 | ||
| 2176 | q = bdev_get_queue(pd->bdev); | 2197 | q = bdev_get_queue(pd->bdev); |
| 2177 | if (write) { | 2198 | if (write) { |
| 2178 | if ((ret = pkt_open_write(pd))) | 2199 | ret = pkt_open_write(pd); |
| 2200 | if (ret) | ||
| 2179 | goto out_putdev; | 2201 | goto out_putdev; |
| 2180 | /* | 2202 | /* |
| 2181 | * Some CDRW drives can not handle writes larger than one packet, | 2203 | * Some CDRW drives can not handle writes larger than one packet, |
| @@ -2190,7 +2212,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) | |||
| 2190 | clear_bit(PACKET_WRITABLE, &pd->flags); | 2212 | clear_bit(PACKET_WRITABLE, &pd->flags); |
| 2191 | } | 2213 | } |
| 2192 | 2214 | ||
| 2193 | if ((ret = pkt_set_segment_merging(pd, q))) | 2215 | ret = pkt_set_segment_merging(pd, q); |
| 2216 | if (ret) | ||
| 2194 | goto out_putdev; | 2217 | goto out_putdev; |
| 2195 | 2218 | ||
| 2196 | if (write) { | 2219 | if (write) { |
| @@ -2231,6 +2254,8 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) | |||
| 2231 | { | 2254 | { |
| 2232 | if (dev_minor >= MAX_WRITERS) | 2255 | if (dev_minor >= MAX_WRITERS) |
| 2233 | return NULL; | 2256 | return NULL; |
| 2257 | |||
| 2258 | dev_minor = array_index_nospec(dev_minor, MAX_WRITERS); | ||
| 2234 | return pkt_devs[dev_minor]; | 2259 | return pkt_devs[dev_minor]; |
| 2235 | } | 2260 | } |
| 2236 | 2261 | ||
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index dddb3f2490b6..1a92f9e65937 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c | |||
| @@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = { | |||
| 112 | 112 | ||
| 113 | static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) | 113 | static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) |
| 114 | { | 114 | { |
| 115 | generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio), | 115 | generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio), |
| 116 | &card->gendisk->part0); | 116 | &card->gendisk->part0); |
| 117 | } | 117 | } |
| 118 | 118 | ||
| @@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card, | |||
| 120 | struct bio *bio, | 120 | struct bio *bio, |
| 121 | unsigned long start_time) | 121 | unsigned long start_time) |
| 122 | { | 122 | { |
| 123 | generic_end_io_acct(card->queue, bio_data_dir(bio), | 123 | generic_end_io_acct(card->queue, bio_op(bio), |
| 124 | &card->gendisk->part0, start_time); | 124 | &card->gendisk->part0, start_time); |
| 125 | } | 125 | } |
| 126 | 126 | ||
| 127 | static void bio_dma_done_cb(struct rsxx_cardinfo *card, | 127 | static void bio_dma_done_cb(struct rsxx_cardinfo *card, |
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index bc7aea6d7b7c..87b9e7fbf062 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c | |||
| @@ -657,8 +657,8 @@ static bool skd_preop_sg_list(struct skd_device *skdev, | |||
| 657 | 657 | ||
| 658 | if (unlikely(skdev->dbg_level > 1)) { | 658 | if (unlikely(skdev->dbg_level > 1)) { |
| 659 | dev_dbg(&skdev->pdev->dev, | 659 | dev_dbg(&skdev->pdev->dev, |
| 660 | "skreq=%x sksg_list=%p sksg_dma=%llx\n", | 660 | "skreq=%x sksg_list=%p sksg_dma=%pad\n", |
| 661 | skreq->id, skreq->sksg_list, skreq->sksg_dma_address); | 661 | skreq->id, skreq->sksg_list, &skreq->sksg_dma_address); |
| 662 | for (i = 0; i < n_sg; i++) { | 662 | for (i = 0; i < n_sg; i++) { |
| 663 | struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; | 663 | struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; |
| 664 | 664 | ||
| @@ -1190,8 +1190,8 @@ static void skd_send_fitmsg(struct skd_device *skdev, | |||
| 1190 | { | 1190 | { |
| 1191 | u64 qcmd; | 1191 | u64 qcmd; |
| 1192 | 1192 | ||
| 1193 | dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", | 1193 | dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n", |
| 1194 | skmsg->mb_dma_address, skd_in_flight(skdev)); | 1194 | &skmsg->mb_dma_address, skd_in_flight(skdev)); |
| 1195 | dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); | 1195 | dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); |
| 1196 | 1196 | ||
| 1197 | qcmd = skmsg->mb_dma_address; | 1197 | qcmd = skmsg->mb_dma_address; |
| @@ -1250,9 +1250,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, | |||
| 1250 | } | 1250 | } |
| 1251 | 1251 | ||
| 1252 | dev_dbg(&skdev->pdev->dev, | 1252 | dev_dbg(&skdev->pdev->dev, |
| 1253 | "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", | 1253 | "skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n", |
| 1254 | skspcl, skspcl->req.id, skspcl->req.sksg_list, | 1254 | skspcl, skspcl->req.id, skspcl->req.sksg_list, |
| 1255 | skspcl->req.sksg_dma_address); | 1255 | &skspcl->req.sksg_dma_address); |
| 1256 | for (i = 0; i < skspcl->req.n_sg; i++) { | 1256 | for (i = 0; i < skspcl->req.n_sg; i++) { |
| 1257 | struct fit_sg_descriptor *sgd = | 1257 | struct fit_sg_descriptor *sgd = |
| 1258 | &skspcl->req.sksg_list[i]; | 1258 | &skspcl->req.sksg_list[i]; |
| @@ -2685,8 +2685,8 @@ static int skd_cons_skmsg(struct skd_device *skdev) | |||
| 2685 | 2685 | ||
| 2686 | WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) & | 2686 | WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) & |
| 2687 | (FIT_QCMD_ALIGN - 1), | 2687 | (FIT_QCMD_ALIGN - 1), |
| 2688 | "not aligned: msg_buf %p mb_dma_address %#llx\n", | 2688 | "not aligned: msg_buf %p mb_dma_address %pad\n", |
| 2689 | skmsg->msg_buf, skmsg->mb_dma_address); | 2689 | skmsg->msg_buf, &skmsg->mb_dma_address); |
| 2690 | memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); | 2690 | memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); |
| 2691 | } | 2691 | } |
| 2692 | 2692 | ||
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index b5cedccb5d7d..8986adab9bf5 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
| @@ -251,14 +251,9 @@ static DEFINE_SPINLOCK(minor_lock); | |||
| 251 | #define GRANTS_PER_INDIRECT_FRAME \ | 251 | #define GRANTS_PER_INDIRECT_FRAME \ |
| 252 | (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) | 252 | (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) |
| 253 | 253 | ||
| 254 | #define PSEGS_PER_INDIRECT_FRAME \ | ||
| 255 | (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS) | ||
| 256 | |||
| 257 | #define INDIRECT_GREFS(_grants) \ | 254 | #define INDIRECT_GREFS(_grants) \ |
| 258 | DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) | 255 | DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) |
| 259 | 256 | ||
| 260 | #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) | ||
| 261 | |||
| 262 | static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); | 257 | static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); |
| 263 | static void blkfront_gather_backend_features(struct blkfront_info *info); | 258 | static void blkfront_gather_backend_features(struct blkfront_info *info); |
| 264 | static int negotiate_mq(struct blkfront_info *info); | 259 | static int negotiate_mq(struct blkfront_info *info); |
| @@ -1441,7 +1436,7 @@ static bool blkif_completion(unsigned long *id, | |||
| 1441 | 1436 | ||
| 1442 | /* Wait the second response if not yet here. */ | 1437 | /* Wait the second response if not yet here. */ |
| 1443 | if (s2->status == REQ_WAITING) | 1438 | if (s2->status == REQ_WAITING) |
| 1444 | return 0; | 1439 | return false; |
| 1445 | 1440 | ||
| 1446 | bret->status = blkif_get_final_status(s->status, | 1441 | bret->status = blkif_get_final_status(s->status, |
| 1447 | s2->status); | 1442 | s2->status); |
| @@ -1542,7 +1537,7 @@ static bool blkif_completion(unsigned long *id, | |||
| 1542 | } | 1537 | } |
| 1543 | } | 1538 | } |
| 1544 | 1539 | ||
| 1545 | return 1; | 1540 | return true; |
| 1546 | } | 1541 | } |
| 1547 | 1542 | ||
| 1548 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) | 1543 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) |
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a390c6d4f72d..c7acf74253a1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
| @@ -1287,17 +1287,16 @@ static void zram_bio_discard(struct zram *zram, u32 index, | |||
| 1287 | * Returns 1 if IO request was successfully submitted. | 1287 | * Returns 1 if IO request was successfully submitted. |
| 1288 | */ | 1288 | */ |
| 1289 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, | 1289 | static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, |
| 1290 | int offset, bool is_write, struct bio *bio) | 1290 | int offset, unsigned int op, struct bio *bio) |
| 1291 | { | 1291 | { |
| 1292 | unsigned long start_time = jiffies; | 1292 | unsigned long start_time = jiffies; |
| 1293 | int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ; | ||
| 1294 | struct request_queue *q = zram->disk->queue; | 1293 | struct request_queue *q = zram->disk->queue; |
| 1295 | int ret; | 1294 | int ret; |
| 1296 | 1295 | ||
| 1297 | generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT, | 1296 | generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT, |
| 1298 | &zram->disk->part0); | 1297 | &zram->disk->part0); |
| 1299 | 1298 | ||
| 1300 | if (!is_write) { | 1299 | if (!op_is_write(op)) { |
| 1301 | atomic64_inc(&zram->stats.num_reads); | 1300 | atomic64_inc(&zram->stats.num_reads); |
| 1302 | ret = zram_bvec_read(zram, bvec, index, offset, bio); | 1301 | ret = zram_bvec_read(zram, bvec, index, offset, bio); |
| 1303 | flush_dcache_page(bvec->bv_page); | 1302 | flush_dcache_page(bvec->bv_page); |
| @@ -1306,14 +1305,14 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, | |||
| 1306 | ret = zram_bvec_write(zram, bvec, index, offset, bio); | 1305 | ret = zram_bvec_write(zram, bvec, index, offset, bio); |
| 1307 | } | 1306 | } |
| 1308 | 1307 | ||
| 1309 | generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); | 1308 | generic_end_io_acct(q, op, &zram->disk->part0, start_time); |
| 1310 | 1309 | ||
| 1311 | zram_slot_lock(zram, index); | 1310 | zram_slot_lock(zram, index); |
| 1312 | zram_accessed(zram, index); | 1311 | zram_accessed(zram, index); |
| 1313 | zram_slot_unlock(zram, index); | 1312 | zram_slot_unlock(zram, index); |
| 1314 | 1313 | ||
| 1315 | if (unlikely(ret < 0)) { | 1314 | if (unlikely(ret < 0)) { |
| 1316 | if (!is_write) | 1315 | if (!op_is_write(op)) |
| 1317 | atomic64_inc(&zram->stats.failed_reads); | 1316 | atomic64_inc(&zram->stats.failed_reads); |
| 1318 | else | 1317 | else |
| 1319 | atomic64_inc(&zram->stats.failed_writes); | 1318 | atomic64_inc(&zram->stats.failed_writes); |
| @@ -1351,7 +1350,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) | |||
| 1351 | bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, | 1350 | bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, |
| 1352 | unwritten); | 1351 | unwritten); |
| 1353 | if (zram_bvec_rw(zram, &bv, index, offset, | 1352 | if (zram_bvec_rw(zram, &bv, index, offset, |
| 1354 | op_is_write(bio_op(bio)), bio) < 0) | 1353 | bio_op(bio), bio) < 0) |
| 1355 | goto out; | 1354 | goto out; |
| 1356 | 1355 | ||
| 1357 | bv.bv_offset += bv.bv_len; | 1356 | bv.bv_offset += bv.bv_len; |
| @@ -1403,7 +1402,7 @@ static void zram_slot_free_notify(struct block_device *bdev, | |||
| 1403 | } | 1402 | } |
| 1404 | 1403 | ||
| 1405 | static int zram_rw_page(struct block_device *bdev, sector_t sector, | 1404 | static int zram_rw_page(struct block_device *bdev, sector_t sector, |
| 1406 | struct page *page, bool is_write) | 1405 | struct page *page, unsigned int op) |
| 1407 | { | 1406 | { |
| 1408 | int offset, ret; | 1407 | int offset, ret; |
| 1409 | u32 index; | 1408 | u32 index; |
| @@ -1427,7 +1426,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, | |||
| 1427 | bv.bv_len = PAGE_SIZE; | 1426 | bv.bv_len = PAGE_SIZE; |
| 1428 | bv.bv_offset = 0; | 1427 | bv.bv_offset = 0; |
| 1429 | 1428 | ||
| 1430 | ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL); | 1429 | ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); |
| 1431 | out: | 1430 | out: |
| 1432 | /* | 1431 | /* |
| 1433 | * If I/O fails, just return error(ie, non-zero) without | 1432 | * If I/O fails, just return error(ie, non-zero) without |
| @@ -1442,7 +1441,7 @@ out: | |||
| 1442 | 1441 | ||
| 1443 | switch (ret) { | 1442 | switch (ret) { |
| 1444 | case 0: | 1443 | case 0: |
| 1445 | page_endio(page, is_write, 0); | 1444 | page_endio(page, op_is_write(op), 0); |
| 1446 | break; | 1445 | break; |
| 1447 | case 1: | 1446 | case 1: |
| 1448 | ret = 0; | 1447 | ret = 0; |
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index a78b8e7085e9..113fc6edb2b0 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c | |||
| @@ -282,6 +282,7 @@ | |||
| 282 | #include <linux/blkdev.h> | 282 | #include <linux/blkdev.h> |
| 283 | #include <linux/times.h> | 283 | #include <linux/times.h> |
| 284 | #include <linux/uaccess.h> | 284 | #include <linux/uaccess.h> |
| 285 | #include <scsi/scsi_common.h> | ||
| 285 | #include <scsi/scsi_request.h> | 286 | #include <scsi/scsi_request.h> |
| 286 | 287 | ||
| 287 | /* used to tell the module to turn on full debugging messages */ | 288 | /* used to tell the module to turn on full debugging messages */ |
| @@ -345,10 +346,10 @@ static LIST_HEAD(cdrom_list); | |||
| 345 | int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, | 346 | int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, |
| 346 | struct packet_command *cgc) | 347 | struct packet_command *cgc) |
| 347 | { | 348 | { |
| 348 | if (cgc->sense) { | 349 | if (cgc->sshdr) { |
| 349 | cgc->sense->sense_key = 0x05; | 350 | cgc->sshdr->sense_key = 0x05; |
| 350 | cgc->sense->asc = 0x20; | 351 | cgc->sshdr->asc = 0x20; |
| 351 | cgc->sense->ascq = 0x00; | 352 | cgc->sshdr->ascq = 0x00; |
| 352 | } | 353 | } |
| 353 | 354 | ||
| 354 | cgc->stat = -EIO; | 355 | cgc->stat = -EIO; |
| @@ -2222,9 +2223,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, | |||
| 2222 | 2223 | ||
| 2223 | blk_execute_rq(q, cdi->disk, rq, 0); | 2224 | blk_execute_rq(q, cdi->disk, rq, 0); |
| 2224 | if (scsi_req(rq)->result) { | 2225 | if (scsi_req(rq)->result) { |
| 2225 | struct request_sense *s = req->sense; | 2226 | struct scsi_sense_hdr sshdr; |
| 2227 | |||
| 2226 | ret = -EIO; | 2228 | ret = -EIO; |
| 2227 | cdi->last_sense = s->sense_key; | 2229 | scsi_normalize_sense(req->sense, req->sense_len, |
| 2230 | &sshdr); | ||
| 2231 | cdi->last_sense = sshdr.sense_key; | ||
| 2228 | } | 2232 | } |
| 2229 | 2233 | ||
| 2230 | if (blk_rq_unmap_user(bio)) | 2234 | if (blk_rq_unmap_user(bio)) |
| @@ -2943,7 +2947,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, | |||
| 2943 | struct packet_command *cgc, | 2947 | struct packet_command *cgc, |
| 2944 | int cmd) | 2948 | int cmd) |
| 2945 | { | 2949 | { |
| 2946 | struct request_sense sense; | 2950 | struct scsi_sense_hdr sshdr; |
| 2947 | struct cdrom_msf msf; | 2951 | struct cdrom_msf msf; |
| 2948 | int blocksize = 0, format = 0, lba; | 2952 | int blocksize = 0, format = 0, lba; |
| 2949 | int ret; | 2953 | int ret; |
| @@ -2971,13 +2975,13 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, | |||
| 2971 | if (cgc->buffer == NULL) | 2975 | if (cgc->buffer == NULL) |
| 2972 | return -ENOMEM; | 2976 | return -ENOMEM; |
| 2973 | 2977 | ||
| 2974 | memset(&sense, 0, sizeof(sense)); | 2978 | memset(&sshdr, 0, sizeof(sshdr)); |
| 2975 | cgc->sense = &sense; | 2979 | cgc->sshdr = &sshdr; |
| 2976 | cgc->data_direction = CGC_DATA_READ; | 2980 | cgc->data_direction = CGC_DATA_READ; |
| 2977 | ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize); | 2981 | ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize); |
| 2978 | if (ret && sense.sense_key == 0x05 && | 2982 | if (ret && sshdr.sense_key == 0x05 && |
| 2979 | sense.asc == 0x20 && | 2983 | sshdr.asc == 0x20 && |
| 2980 | sense.ascq == 0x00) { | 2984 | sshdr.ascq == 0x00) { |
| 2981 | /* | 2985 | /* |
| 2982 | * SCSI-II devices are not required to support | 2986 | * SCSI-II devices are not required to support |
| 2983 | * READ_CD, so let's try switching block size | 2987 | * READ_CD, so let's try switching block size |
| @@ -2986,7 +2990,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, | |||
| 2986 | ret = cdrom_switch_blocksize(cdi, blocksize); | 2990 | ret = cdrom_switch_blocksize(cdi, blocksize); |
| 2987 | if (ret) | 2991 | if (ret) |
| 2988 | goto out; | 2992 | goto out; |
| 2989 | cgc->sense = NULL; | 2993 | cgc->sshdr = NULL; |
| 2990 | ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1); | 2994 | ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1); |
| 2991 | ret |= cdrom_switch_blocksize(cdi, blocksize); | 2995 | ret |= cdrom_switch_blocksize(cdi, blocksize); |
| 2992 | } | 2996 | } |
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5f178384876f..44a7a255ef74 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c | |||
| @@ -419,10 +419,11 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd) | |||
| 419 | 419 | ||
| 420 | int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, | 420 | int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, |
| 421 | int write, void *buffer, unsigned *bufflen, | 421 | int write, void *buffer, unsigned *bufflen, |
| 422 | struct request_sense *sense, int timeout, | 422 | struct scsi_sense_hdr *sshdr, int timeout, |
| 423 | req_flags_t rq_flags) | 423 | req_flags_t rq_flags) |
| 424 | { | 424 | { |
| 425 | struct cdrom_info *info = drive->driver_data; | 425 | struct cdrom_info *info = drive->driver_data; |
| 426 | struct scsi_sense_hdr local_sshdr; | ||
| 426 | int retries = 10; | 427 | int retries = 10; |
| 427 | bool failed; | 428 | bool failed; |
| 428 | 429 | ||
| @@ -430,6 +431,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, | |||
| 430 | "rq_flags: 0x%x", | 431 | "rq_flags: 0x%x", |
| 431 | cmd[0], write, timeout, rq_flags); | 432 | cmd[0], write, timeout, rq_flags); |
| 432 | 433 | ||
| 434 | if (!sshdr) | ||
| 435 | sshdr = &local_sshdr; | ||
| 436 | |||
| 433 | /* start of retry loop */ | 437 | /* start of retry loop */ |
| 434 | do { | 438 | do { |
| 435 | struct request *rq; | 439 | struct request *rq; |
| @@ -456,8 +460,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, | |||
| 456 | 460 | ||
| 457 | if (buffer) | 461 | if (buffer) |
| 458 | *bufflen = scsi_req(rq)->resid_len; | 462 | *bufflen = scsi_req(rq)->resid_len; |
| 459 | if (sense) | 463 | scsi_normalize_sense(scsi_req(rq)->sense, |
| 460 | memcpy(sense, scsi_req(rq)->sense, sizeof(*sense)); | 464 | scsi_req(rq)->sense_len, sshdr); |
| 461 | 465 | ||
| 462 | /* | 466 | /* |
| 463 | * FIXME: we should probably abort/retry or something in case of | 467 | * FIXME: we should probably abort/retry or something in case of |
| @@ -469,12 +473,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, | |||
| 469 | * The request failed. Retry if it was due to a unit | 473 | * The request failed. Retry if it was due to a unit |
| 470 | * attention status (usually means media was changed). | 474 | * attention status (usually means media was changed). |
| 471 | */ | 475 | */ |
| 472 | struct request_sense *reqbuf = scsi_req(rq)->sense; | 476 | if (sshdr->sense_key == UNIT_ATTENTION) |
| 473 | |||
| 474 | if (reqbuf->sense_key == UNIT_ATTENTION) | ||
| 475 | cdrom_saw_media_change(drive); | 477 | cdrom_saw_media_change(drive); |
| 476 | else if (reqbuf->sense_key == NOT_READY && | 478 | else if (sshdr->sense_key == NOT_READY && |
| 477 | reqbuf->asc == 4 && reqbuf->ascq != 4) { | 479 | sshdr->asc == 4 && sshdr->ascq != 4) { |
| 478 | /* | 480 | /* |
| 479 | * The drive is in the process of loading | 481 | * The drive is in the process of loading |
| 480 | * a disk. Retry, but wait a little to give | 482 | * a disk. Retry, but wait a little to give |
| @@ -864,7 +866,7 @@ static void msf_from_bcd(struct atapi_msf *msf) | |||
| 864 | msf->frame = bcd2bin(msf->frame); | 866 | msf->frame = bcd2bin(msf->frame); |
| 865 | } | 867 | } |
| 866 | 868 | ||
| 867 | int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) | 869 | int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr) |
| 868 | { | 870 | { |
| 869 | struct cdrom_info *info = drive->driver_data; | 871 | struct cdrom_info *info = drive->driver_data; |
| 870 | struct cdrom_device_info *cdi; | 872 | struct cdrom_device_info *cdi; |
| @@ -886,12 +888,11 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) | |||
| 886 | */ | 888 | */ |
| 887 | cmd[7] = cdi->sanyo_slot % 3; | 889 | cmd[7] = cdi->sanyo_slot % 3; |
| 888 | 890 | ||
| 889 | return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET); | 891 | return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET); |
| 890 | } | 892 | } |
| 891 | 893 | ||
| 892 | static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, | 894 | static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, |
| 893 | unsigned long *sectors_per_frame, | 895 | unsigned long *sectors_per_frame) |
| 894 | struct request_sense *sense) | ||
| 895 | { | 896 | { |
| 896 | struct { | 897 | struct { |
| 897 | __be32 lba; | 898 | __be32 lba; |
| @@ -908,7 +909,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, | |||
| 908 | memset(cmd, 0, BLK_MAX_CDB); | 909 | memset(cmd, 0, BLK_MAX_CDB); |
| 909 | cmd[0] = GPCMD_READ_CDVD_CAPACITY; | 910 | cmd[0] = GPCMD_READ_CDVD_CAPACITY; |
| 910 | 911 | ||
| 911 | stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0, | 912 | stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, NULL, 0, |
| 912 | RQF_QUIET); | 913 | RQF_QUIET); |
| 913 | if (stat) | 914 | if (stat) |
| 914 | return stat; | 915 | return stat; |
| @@ -944,8 +945,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, | |||
| 944 | } | 945 | } |
| 945 | 946 | ||
| 946 | static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, | 947 | static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, |
| 947 | int format, char *buf, int buflen, | 948 | int format, char *buf, int buflen) |
| 948 | struct request_sense *sense) | ||
| 949 | { | 949 | { |
| 950 | unsigned char cmd[BLK_MAX_CDB]; | 950 | unsigned char cmd[BLK_MAX_CDB]; |
| 951 | 951 | ||
| @@ -962,11 +962,11 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, | |||
| 962 | if (msf_flag) | 962 | if (msf_flag) |
| 963 | cmd[1] = 2; | 963 | cmd[1] = 2; |
| 964 | 964 | ||
| 965 | return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET); | 965 | return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, NULL, 0, RQF_QUIET); |
| 966 | } | 966 | } |
| 967 | 967 | ||
| 968 | /* Try to read the entire TOC for the disk into our internal buffer. */ | 968 | /* Try to read the entire TOC for the disk into our internal buffer. */ |
| 969 | int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) | 969 | int ide_cd_read_toc(ide_drive_t *drive) |
| 970 | { | 970 | { |
| 971 | int stat, ntracks, i; | 971 | int stat, ntracks, i; |
| 972 | struct cdrom_info *info = drive->driver_data; | 972 | struct cdrom_info *info = drive->driver_data; |
| @@ -996,14 +996,13 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) | |||
| 996 | * Check to see if the existing data is still valid. If it is, | 996 | * Check to see if the existing data is still valid. If it is, |
| 997 | * just return. | 997 | * just return. |
| 998 | */ | 998 | */ |
| 999 | (void) cdrom_check_status(drive, sense); | 999 | (void) cdrom_check_status(drive, NULL); |
| 1000 | 1000 | ||
| 1001 | if (drive->atapi_flags & IDE_AFLAG_TOC_VALID) | 1001 | if (drive->atapi_flags & IDE_AFLAG_TOC_VALID) |
| 1002 | return 0; | 1002 | return 0; |
| 1003 | 1003 | ||
| 1004 | /* try to get the total cdrom capacity and sector size */ | 1004 | /* try to get the total cdrom capacity and sector size */ |
| 1005 | stat = cdrom_read_capacity(drive, &toc->capacity, §ors_per_frame, | 1005 | stat = cdrom_read_capacity(drive, &toc->capacity, §ors_per_frame); |
| 1006 | sense); | ||
| 1007 | if (stat) | 1006 | if (stat) |
| 1008 | toc->capacity = 0x1fffff; | 1007 | toc->capacity = 0x1fffff; |
| 1009 | 1008 | ||
| @@ -1016,7 +1015,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) | |||
| 1016 | 1015 | ||
| 1017 | /* first read just the header, so we know how long the TOC is */ | 1016 | /* first read just the header, so we know how long the TOC is */ |
| 1018 | stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr, | 1017 | stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr, |
| 1019 | sizeof(struct atapi_toc_header), sense); | 1018 | sizeof(struct atapi_toc_header)); |
| 1020 | if (stat) | 1019 | if (stat) |
| 1021 | return stat; | 1020 | return stat; |
| 1022 | 1021 | ||
| @@ -1036,7 +1035,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) | |||
| 1036 | (char *)&toc->hdr, | 1035 | (char *)&toc->hdr, |
| 1037 | sizeof(struct atapi_toc_header) + | 1036 | sizeof(struct atapi_toc_header) + |
| 1038 | (ntracks + 1) * | 1037 | (ntracks + 1) * |
| 1039 | sizeof(struct atapi_toc_entry), sense); | 1038 | sizeof(struct atapi_toc_entry)); |
| 1040 | 1039 | ||
| 1041 | if (stat && toc->hdr.first_track > 1) { | 1040 | if (stat && toc->hdr.first_track > 1) { |
| 1042 | /* | 1041 | /* |
| @@ -1056,8 +1055,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) | |||
| 1056 | (char *)&toc->hdr, | 1055 | (char *)&toc->hdr, |
| 1057 | sizeof(struct atapi_toc_header) + | 1056 | sizeof(struct atapi_toc_header) + |
| 1058 | (ntracks + 1) * | 1057 | (ntracks + 1) * |
| 1059 | sizeof(struct atapi_toc_entry), | 1058 | sizeof(struct atapi_toc_entry)); |
| 1060 | sense); | ||
| 1061 | if (stat) | 1059 | if (stat) |
| 1062 | return stat; | 1060 | return stat; |
| 1063 | 1061 | ||
| @@ -1094,7 +1092,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) | |||
| 1094 | if (toc->hdr.first_track != CDROM_LEADOUT) { | 1092 | if (toc->hdr.first_track != CDROM_LEADOUT) { |
| 1095 | /* read the multisession information */ | 1093 | /* read the multisession information */ |
| 1096 | stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp, | 1094 | stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp, |
| 1097 | sizeof(ms_tmp), sense); | 1095 | sizeof(ms_tmp)); |
| 1098 | if (stat) | 1096 | if (stat) |
| 1099 | return stat; | 1097 | return stat; |
| 1100 | 1098 | ||
| @@ -1108,7 +1106,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) | |||
| 1108 | if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) { | 1106 | if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) { |
| 1109 | /* re-read multisession information using MSF format */ | 1107 | /* re-read multisession information using MSF format */ |
| 1110 | stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp, | 1108 | stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp, |
| 1111 | sizeof(ms_tmp), sense); | 1109 | sizeof(ms_tmp)); |
| 1112 | if (stat) | 1110 | if (stat) |
| 1113 | return stat; | 1111 | return stat; |
| 1114 | 1112 | ||
| @@ -1412,7 +1410,7 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive) | |||
| 1412 | { | 1410 | { |
| 1413 | unsigned long capacity, sectors_per_frame; | 1411 | unsigned long capacity, sectors_per_frame; |
| 1414 | 1412 | ||
| 1415 | if (cdrom_read_capacity(drive, &capacity, §ors_per_frame, NULL)) | 1413 | if (cdrom_read_capacity(drive, &capacity, §ors_per_frame)) |
| 1416 | return 0; | 1414 | return 0; |
| 1417 | 1415 | ||
| 1418 | return capacity * sectors_per_frame; | 1416 | return capacity * sectors_per_frame; |
| @@ -1710,9 +1708,8 @@ static unsigned int idecd_check_events(struct gendisk *disk, | |||
| 1710 | static int idecd_revalidate_disk(struct gendisk *disk) | 1708 | static int idecd_revalidate_disk(struct gendisk *disk) |
| 1711 | { | 1709 | { |
| 1712 | struct cdrom_info *info = ide_drv_g(disk, cdrom_info); | 1710 | struct cdrom_info *info = ide_drv_g(disk, cdrom_info); |
| 1713 | struct request_sense sense; | ||
| 1714 | 1711 | ||
| 1715 | ide_cd_read_toc(info->drive, &sense); | 1712 | ide_cd_read_toc(info->drive); |
| 1716 | 1713 | ||
| 1717 | return 0; | 1714 | return 0; |
| 1718 | } | 1715 | } |
| @@ -1736,7 +1733,6 @@ static int ide_cd_probe(ide_drive_t *drive) | |||
| 1736 | { | 1733 | { |
| 1737 | struct cdrom_info *info; | 1734 | struct cdrom_info *info; |
| 1738 | struct gendisk *g; | 1735 | struct gendisk *g; |
| 1739 | struct request_sense sense; | ||
| 1740 | 1736 | ||
| 1741 | ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x", | 1737 | ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x", |
| 1742 | drive->driver_req, drive->media); | 1738 | drive->driver_req, drive->media); |
| @@ -1785,7 +1781,7 @@ static int ide_cd_probe(ide_drive_t *drive) | |||
| 1785 | goto failed; | 1781 | goto failed; |
| 1786 | } | 1782 | } |
| 1787 | 1783 | ||
| 1788 | ide_cd_read_toc(drive, &sense); | 1784 | ide_cd_read_toc(drive); |
| 1789 | g->fops = &idecd_ops; | 1785 | g->fops = &idecd_ops; |
| 1790 | g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; | 1786 | g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; |
| 1791 | device_add_disk(&drive->gendev, g); | 1787 | device_add_disk(&drive->gendev, g); |
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index 04f0f310a856..a69dc7f61c4d 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h | |||
| @@ -98,11 +98,11 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *); | |||
| 98 | 98 | ||
| 99 | /* ide-cd.c functions used by ide-cd_ioctl.c */ | 99 | /* ide-cd.c functions used by ide-cd_ioctl.c */ |
| 100 | int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *, | 100 | int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *, |
| 101 | unsigned *, struct request_sense *, int, req_flags_t); | 101 | unsigned *, struct scsi_sense_hdr *, int, req_flags_t); |
| 102 | int ide_cd_read_toc(ide_drive_t *, struct request_sense *); | 102 | int ide_cd_read_toc(ide_drive_t *); |
| 103 | int ide_cdrom_get_capabilities(ide_drive_t *, u8 *); | 103 | int ide_cdrom_get_capabilities(ide_drive_t *, u8 *); |
| 104 | void ide_cdrom_update_speed(ide_drive_t *, u8 *); | 104 | void ide_cdrom_update_speed(ide_drive_t *, u8 *); |
| 105 | int cdrom_check_status(ide_drive_t *, struct request_sense *); | 105 | int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *); |
| 106 | 106 | ||
| 107 | /* ide-cd_ioctl.c */ | 107 | /* ide-cd_ioctl.c */ |
| 108 | int ide_cdrom_open_real(struct cdrom_device_info *, int); | 108 | int ide_cdrom_open_real(struct cdrom_device_info *, int); |
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index b1322400887b..4a6e1a413ead 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c | |||
| @@ -43,14 +43,14 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr) | |||
| 43 | { | 43 | { |
| 44 | ide_drive_t *drive = cdi->handle; | 44 | ide_drive_t *drive = cdi->handle; |
| 45 | struct media_event_desc med; | 45 | struct media_event_desc med; |
| 46 | struct request_sense sense; | 46 | struct scsi_sense_hdr sshdr; |
| 47 | int stat; | 47 | int stat; |
| 48 | 48 | ||
| 49 | if (slot_nr != CDSL_CURRENT) | 49 | if (slot_nr != CDSL_CURRENT) |
| 50 | return -EINVAL; | 50 | return -EINVAL; |
| 51 | 51 | ||
| 52 | stat = cdrom_check_status(drive, &sense); | 52 | stat = cdrom_check_status(drive, &sshdr); |
| 53 | if (!stat || sense.sense_key == UNIT_ATTENTION) | 53 | if (!stat || sshdr.sense_key == UNIT_ATTENTION) |
| 54 | return CDS_DISC_OK; | 54 | return CDS_DISC_OK; |
| 55 | 55 | ||
| 56 | if (!cdrom_get_media_event(cdi, &med)) { | 56 | if (!cdrom_get_media_event(cdi, &med)) { |
| @@ -62,8 +62,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr) | |||
| 62 | return CDS_NO_DISC; | 62 | return CDS_NO_DISC; |
| 63 | } | 63 | } |
| 64 | 64 | ||
| 65 | if (sense.sense_key == NOT_READY && sense.asc == 0x04 | 65 | if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04 |
| 66 | && sense.ascq == 0x04) | 66 | && sshdr.ascq == 0x04) |
| 67 | return CDS_DISC_OK; | 67 | return CDS_DISC_OK; |
| 68 | 68 | ||
| 69 | /* | 69 | /* |
| @@ -71,8 +71,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr) | |||
| 71 | * just return TRAY_OPEN since ATAPI doesn't provide | 71 | * just return TRAY_OPEN since ATAPI doesn't provide |
| 72 | * any other way to detect this... | 72 | * any other way to detect this... |
| 73 | */ | 73 | */ |
| 74 | if (sense.sense_key == NOT_READY) { | 74 | if (sshdr.sense_key == NOT_READY) { |
| 75 | if (sense.asc == 0x3a && sense.ascq == 1) | 75 | if (sshdr.asc == 0x3a && sshdr.ascq == 1) |
| 76 | return CDS_NO_DISC; | 76 | return CDS_NO_DISC; |
| 77 | else | 77 | else |
| 78 | return CDS_TRAY_OPEN; | 78 | return CDS_TRAY_OPEN; |
| @@ -105,8 +105,7 @@ unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi, | |||
| 105 | /* Eject the disk if EJECTFLAG is 0. | 105 | /* Eject the disk if EJECTFLAG is 0. |
| 106 | If EJECTFLAG is 1, try to reload the disk. */ | 106 | If EJECTFLAG is 1, try to reload the disk. */ |
| 107 | static | 107 | static |
| 108 | int cdrom_eject(ide_drive_t *drive, int ejectflag, | 108 | int cdrom_eject(ide_drive_t *drive, int ejectflag) |
| 109 | struct request_sense *sense) | ||
| 110 | { | 109 | { |
| 111 | struct cdrom_info *cd = drive->driver_data; | 110 | struct cdrom_info *cd = drive->driver_data; |
| 112 | struct cdrom_device_info *cdi = &cd->devinfo; | 111 | struct cdrom_device_info *cdi = &cd->devinfo; |
| @@ -129,20 +128,16 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag, | |||
| 129 | cmd[0] = GPCMD_START_STOP_UNIT; | 128 | cmd[0] = GPCMD_START_STOP_UNIT; |
| 130 | cmd[4] = loej | (ejectflag != 0); | 129 | cmd[4] = loej | (ejectflag != 0); |
| 131 | 130 | ||
| 132 | return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, 0); | 131 | return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0); |
| 133 | } | 132 | } |
| 134 | 133 | ||
| 135 | /* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */ | 134 | /* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */ |
| 136 | static | 135 | static |
| 137 | int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, | 136 | int ide_cd_lockdoor(ide_drive_t *drive, int lockflag) |
| 138 | struct request_sense *sense) | ||
| 139 | { | 137 | { |
| 140 | struct request_sense my_sense; | 138 | struct scsi_sense_hdr sshdr; |
| 141 | int stat; | 139 | int stat; |
| 142 | 140 | ||
| 143 | if (sense == NULL) | ||
| 144 | sense = &my_sense; | ||
| 145 | |||
| 146 | /* If the drive cannot lock the door, just pretend. */ | 141 | /* If the drive cannot lock the door, just pretend. */ |
| 147 | if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) { | 142 | if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) { |
| 148 | stat = 0; | 143 | stat = 0; |
| @@ -155,14 +150,14 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, | |||
| 155 | cmd[4] = lockflag ? 1 : 0; | 150 | cmd[4] = lockflag ? 1 : 0; |
| 156 | 151 | ||
| 157 | stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, | 152 | stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, |
| 158 | sense, 0, 0); | 153 | &sshdr, 0, 0); |
| 159 | } | 154 | } |
| 160 | 155 | ||
| 161 | /* If we got an illegal field error, the drive | 156 | /* If we got an illegal field error, the drive |
| 162 | probably cannot lock the door. */ | 157 | probably cannot lock the door. */ |
| 163 | if (stat != 0 && | 158 | if (stat != 0 && |
| 164 | sense->sense_key == ILLEGAL_REQUEST && | 159 | sshdr.sense_key == ILLEGAL_REQUEST && |
| 165 | (sense->asc == 0x24 || sense->asc == 0x20)) { | 160 | (sshdr.asc == 0x24 || sshdr.asc == 0x20)) { |
| 166 | printk(KERN_ERR "%s: door locking not supported\n", | 161 | printk(KERN_ERR "%s: door locking not supported\n", |
| 167 | drive->name); | 162 | drive->name); |
| 168 | drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; | 163 | drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; |
| @@ -170,7 +165,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, | |||
| 170 | } | 165 | } |
| 171 | 166 | ||
| 172 | /* no medium, that's alright. */ | 167 | /* no medium, that's alright. */ |
| 173 | if (stat != 0 && sense->sense_key == NOT_READY && sense->asc == 0x3a) | 168 | if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a) |
| 174 | stat = 0; | 169 | stat = 0; |
| 175 | 170 | ||
| 176 | if (stat == 0) { | 171 | if (stat == 0) { |
| @@ -186,23 +181,22 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, | |||
| 186 | int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position) | 181 | int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position) |
| 187 | { | 182 | { |
| 188 | ide_drive_t *drive = cdi->handle; | 183 | ide_drive_t *drive = cdi->handle; |
| 189 | struct request_sense sense; | ||
| 190 | 184 | ||
| 191 | if (position) { | 185 | if (position) { |
| 192 | int stat = ide_cd_lockdoor(drive, 0, &sense); | 186 | int stat = ide_cd_lockdoor(drive, 0); |
| 193 | 187 | ||
| 194 | if (stat) | 188 | if (stat) |
| 195 | return stat; | 189 | return stat; |
| 196 | } | 190 | } |
| 197 | 191 | ||
| 198 | return cdrom_eject(drive, !position, &sense); | 192 | return cdrom_eject(drive, !position); |
| 199 | } | 193 | } |
| 200 | 194 | ||
| 201 | int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock) | 195 | int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock) |
| 202 | { | 196 | { |
| 203 | ide_drive_t *drive = cdi->handle; | 197 | ide_drive_t *drive = cdi->handle; |
| 204 | 198 | ||
| 205 | return ide_cd_lockdoor(drive, lock, NULL); | 199 | return ide_cd_lockdoor(drive, lock); |
| 206 | } | 200 | } |
| 207 | 201 | ||
| 208 | /* | 202 | /* |
| @@ -213,7 +207,6 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed) | |||
| 213 | { | 207 | { |
| 214 | ide_drive_t *drive = cdi->handle; | 208 | ide_drive_t *drive = cdi->handle; |
| 215 | struct cdrom_info *cd = drive->driver_data; | 209 | struct cdrom_info *cd = drive->driver_data; |
| 216 | struct request_sense sense; | ||
| 217 | u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE]; | 210 | u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE]; |
| 218 | int stat; | 211 | int stat; |
| 219 | unsigned char cmd[BLK_MAX_CDB]; | 212 | unsigned char cmd[BLK_MAX_CDB]; |
| @@ -236,7 +229,7 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed) | |||
| 236 | cmd[5] = speed & 0xff; | 229 | cmd[5] = speed & 0xff; |
| 237 | } | 230 | } |
| 238 | 231 | ||
| 239 | stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0); | 232 | stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0); |
| 240 | 233 | ||
| 241 | if (!ide_cdrom_get_capabilities(drive, buf)) { | 234 | if (!ide_cdrom_get_capabilities(drive, buf)) { |
| 242 | ide_cdrom_update_speed(drive, buf); | 235 | ide_cdrom_update_speed(drive, buf); |
| @@ -252,11 +245,10 @@ int ide_cdrom_get_last_session(struct cdrom_device_info *cdi, | |||
| 252 | struct atapi_toc *toc; | 245 | struct atapi_toc *toc; |
| 253 | ide_drive_t *drive = cdi->handle; | 246 | ide_drive_t *drive = cdi->handle; |
| 254 | struct cdrom_info *info = drive->driver_data; | 247 | struct cdrom_info *info = drive->driver_data; |
| 255 | struct request_sense sense; | ||
| 256 | int ret; | 248 | int ret; |
| 257 | 249 | ||
| 258 | if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) { | 250 | if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) { |
| 259 | ret = ide_cd_read_toc(drive, &sense); | 251 | ret = ide_cd_read_toc(drive); |
| 260 | if (ret) | 252 | if (ret) |
| 261 | return ret; | 253 | return ret; |
| 262 | } | 254 | } |
| @@ -300,7 +292,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) | |||
| 300 | { | 292 | { |
| 301 | ide_drive_t *drive = cdi->handle; | 293 | ide_drive_t *drive = cdi->handle; |
| 302 | struct cdrom_info *cd = drive->driver_data; | 294 | struct cdrom_info *cd = drive->driver_data; |
| 303 | struct request_sense sense; | ||
| 304 | struct request *rq; | 295 | struct request *rq; |
| 305 | int ret; | 296 | int ret; |
| 306 | 297 | ||
| @@ -315,7 +306,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) | |||
| 315 | * lock it again. | 306 | * lock it again. |
| 316 | */ | 307 | */ |
| 317 | if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED) | 308 | if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED) |
| 318 | (void)ide_cd_lockdoor(drive, 1, &sense); | 309 | (void)ide_cd_lockdoor(drive, 1); |
| 319 | 310 | ||
| 320 | return ret; | 311 | return ret; |
| 321 | } | 312 | } |
| @@ -355,7 +346,6 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg) | |||
| 355 | struct atapi_toc_entry *first_toc, *last_toc; | 346 | struct atapi_toc_entry *first_toc, *last_toc; |
| 356 | unsigned long lba_start, lba_end; | 347 | unsigned long lba_start, lba_end; |
| 357 | int stat; | 348 | int stat; |
| 358 | struct request_sense sense; | ||
| 359 | unsigned char cmd[BLK_MAX_CDB]; | 349 | unsigned char cmd[BLK_MAX_CDB]; |
| 360 | 350 | ||
| 361 | stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc); | 351 | stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc); |
| @@ -380,7 +370,7 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg) | |||
| 380 | lba_to_msf(lba_start, &cmd[3], &cmd[4], &cmd[5]); | 370 | lba_to_msf(lba_start, &cmd[3], &cmd[4], &cmd[5]); |
| 381 | lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]); | 371 | lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]); |
| 382 | 372 | ||
| 383 | return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0); | 373 | return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0); |
| 384 | } | 374 | } |
| 385 | 375 | ||
| 386 | static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg) | 376 | static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg) |
| @@ -391,7 +381,7 @@ static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg) | |||
| 391 | int stat; | 381 | int stat; |
| 392 | 382 | ||
| 393 | /* Make sure our saved TOC is valid. */ | 383 | /* Make sure our saved TOC is valid. */ |
| 394 | stat = ide_cd_read_toc(drive, NULL); | 384 | stat = ide_cd_read_toc(drive); |
| 395 | if (stat) | 385 | if (stat) |
| 396 | return stat; | 386 | return stat; |
| 397 | 387 | ||
| @@ -461,8 +451,8 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi, | |||
| 461 | layer. the packet must be complete, as we do not | 451 | layer. the packet must be complete, as we do not |
| 462 | touch it at all. */ | 452 | touch it at all. */ |
| 463 | 453 | ||
| 464 | if (cgc->sense) | 454 | if (cgc->sshdr) |
| 465 | memset(cgc->sense, 0, sizeof(struct request_sense)); | 455 | memset(cgc->sshdr, 0, sizeof(*cgc->sshdr)); |
| 466 | 456 | ||
| 467 | if (cgc->quiet) | 457 | if (cgc->quiet) |
| 468 | flags |= RQF_QUIET; | 458 | flags |= RQF_QUIET; |
| @@ -470,7 +460,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi, | |||
| 470 | cgc->stat = ide_cd_queue_pc(drive, cgc->cmd, | 460 | cgc->stat = ide_cd_queue_pc(drive, cgc->cmd, |
| 471 | cgc->data_direction == CGC_DATA_WRITE, | 461 | cgc->data_direction == CGC_DATA_WRITE, |
| 472 | cgc->buffer, &len, | 462 | cgc->buffer, &len, |
| 473 | cgc->sense, cgc->timeout, flags); | 463 | cgc->sshdr, cgc->timeout, flags); |
| 474 | if (!cgc->stat) | 464 | if (!cgc->stat) |
| 475 | cgc->buflen -= len; | 465 | cgc->buflen -= len; |
| 476 | return cgc->stat; | 466 | return cgc->stat; |
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index ca844a926e6a..130bf163f066 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c | |||
| @@ -311,7 +311,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, | |||
| 311 | { | 311 | { |
| 312 | domain->sig_type = IB_SIG_TYPE_T10_DIF; | 312 | domain->sig_type = IB_SIG_TYPE_T10_DIF; |
| 313 | domain->sig.dif.pi_interval = scsi_prot_interval(sc); | 313 | domain->sig.dif.pi_interval = scsi_prot_interval(sc); |
| 314 | domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc); | 314 | domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request); |
| 315 | /* | 315 | /* |
| 316 | * At the moment we hard code those, but in the future | 316 | * At the moment we hard code those, but in the future |
| 317 | * we will take them from sc. | 317 | * we will take them from sc. |
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 9c03f35d9df1..439bf90d084d 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig | |||
| @@ -17,23 +17,25 @@ menuconfig NVM | |||
| 17 | 17 | ||
| 18 | if NVM | 18 | if NVM |
| 19 | 19 | ||
| 20 | config NVM_DEBUG | 20 | config NVM_PBLK |
| 21 | bool "Open-Channel SSD debugging support" | 21 | tristate "Physical Block Device Open-Channel SSD target" |
| 22 | default n | 22 | help |
| 23 | ---help--- | 23 | Allows an open-channel SSD to be exposed as a block device to the |
| 24 | Exposes a debug management interface to create/remove targets at: | 24 | host. The target assumes the device exposes raw flash and must be |
| 25 | explicitly managed by the host. | ||
| 25 | 26 | ||
| 26 | /sys/module/lnvm/parameters/configure_debug | 27 | Please note the disk format is considered EXPERIMENTAL for now. |
| 27 | 28 | ||
| 28 | It is required to create/remove targets without IOCTLs. | 29 | if NVM_PBLK |
| 29 | 30 | ||
| 30 | config NVM_PBLK | 31 | config NVM_PBLK_DEBUG |
| 31 | tristate "Physical Block Device Open-Channel SSD target" | 32 | bool "PBlk Debug Support" |
| 32 | ---help--- | 33 | default n |
| 33 | Allows an open-channel SSD to be exposed as a block device to the | 34 | help |
| 34 | host. The target assumes the device exposes raw flash and must be | 35 | Enables debug support for pblk. This includes extra checks, more |
| 35 | explicitly managed by the host. | 36 | vocal error messages, and extra tracking fields in the pblk sysfs |
| 37 | entries. | ||
| 36 | 38 | ||
| 37 | Please note the disk format is considered EXPERIMENTAL for now. | 39 | endif # NVM_PBLK_DEBUG |
| 38 | 40 | ||
| 39 | endif # NVM | 41 | endif # NVM |
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index b1c6d7eb6115..f565a56b898a 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c | |||
| @@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) | |||
| 27 | int nr_entries = pblk_get_secs(bio); | 27 | int nr_entries = pblk_get_secs(bio); |
| 28 | int i, ret; | 28 | int i, ret; |
| 29 | 29 | ||
| 30 | generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0); | 30 | generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio), |
| 31 | &pblk->disk->part0); | ||
| 31 | 32 | ||
| 32 | /* Update the write buffer head (mem) with the entries that we can | 33 | /* Update the write buffer head (mem) with the entries that we can |
| 33 | * write. The write in itself cannot fail, so there is no need to | 34 | * write. The write in itself cannot fail, so there is no need to |
| @@ -67,7 +68,7 @@ retry: | |||
| 67 | 68 | ||
| 68 | atomic64_add(nr_entries, &pblk->user_wa); | 69 | atomic64_add(nr_entries, &pblk->user_wa); |
| 69 | 70 | ||
| 70 | #ifdef CONFIG_NVM_DEBUG | 71 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 71 | atomic_long_add(nr_entries, &pblk->inflight_writes); | 72 | atomic_long_add(nr_entries, &pblk->inflight_writes); |
| 72 | atomic_long_add(nr_entries, &pblk->req_writes); | 73 | atomic_long_add(nr_entries, &pblk->req_writes); |
| 73 | #endif | 74 | #endif |
| @@ -75,7 +76,7 @@ retry: | |||
| 75 | pblk_rl_inserted(&pblk->rl, nr_entries); | 76 | pblk_rl_inserted(&pblk->rl, nr_entries); |
| 76 | 77 | ||
| 77 | out: | 78 | out: |
| 78 | generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time); | 79 | generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time); |
| 79 | pblk_write_should_kick(pblk); | 80 | pblk_write_should_kick(pblk); |
| 80 | return ret; | 81 | return ret; |
| 81 | } | 82 | } |
| @@ -123,7 +124,7 @@ retry: | |||
| 123 | 124 | ||
| 124 | atomic64_add(valid_entries, &pblk->gc_wa); | 125 | atomic64_add(valid_entries, &pblk->gc_wa); |
| 125 | 126 | ||
| 126 | #ifdef CONFIG_NVM_DEBUG | 127 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 127 | atomic_long_add(valid_entries, &pblk->inflight_writes); | 128 | atomic_long_add(valid_entries, &pblk->inflight_writes); |
| 128 | atomic_long_add(valid_entries, &pblk->recov_gc_writes); | 129 | atomic_long_add(valid_entries, &pblk->recov_gc_writes); |
| 129 | #endif | 130 | #endif |
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index ed9cc977c8b3..00984b486fea 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c | |||
| @@ -35,7 +35,7 @@ static void pblk_line_mark_bb(struct work_struct *work) | |||
| 35 | line = &pblk->lines[pblk_ppa_to_line(*ppa)]; | 35 | line = &pblk->lines[pblk_ppa_to_line(*ppa)]; |
| 36 | pos = pblk_ppa_to_pos(&dev->geo, *ppa); | 36 | pos = pblk_ppa_to_pos(&dev->geo, *ppa); |
| 37 | 37 | ||
| 38 | pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", | 38 | pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n", |
| 39 | line->id, pos); | 39 | line->id, pos); |
| 40 | } | 40 | } |
| 41 | 41 | ||
| @@ -51,12 +51,12 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, | |||
| 51 | struct ppa_addr *ppa; | 51 | struct ppa_addr *ppa; |
| 52 | int pos = pblk_ppa_to_pos(geo, ppa_addr); | 52 | int pos = pblk_ppa_to_pos(geo, ppa_addr); |
| 53 | 53 | ||
| 54 | pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); | 54 | pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos); |
| 55 | atomic_long_inc(&pblk->erase_failed); | 55 | atomic_long_inc(&pblk->erase_failed); |
| 56 | 56 | ||
| 57 | atomic_dec(&line->blk_in_line); | 57 | atomic_dec(&line->blk_in_line); |
| 58 | if (test_and_set_bit(pos, line->blk_bitmap)) | 58 | if (test_and_set_bit(pos, line->blk_bitmap)) |
| 59 | pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", | 59 | pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n", |
| 60 | line->id, pos); | 60 | line->id, pos); |
| 61 | 61 | ||
| 62 | /* Not necessary to mark bad blocks on 2.0 spec. */ | 62 | /* Not necessary to mark bad blocks on 2.0 spec. */ |
| @@ -194,7 +194,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) | |||
| 194 | u64 paddr; | 194 | u64 paddr; |
| 195 | int line_id; | 195 | int line_id; |
| 196 | 196 | ||
| 197 | #ifdef CONFIG_NVM_DEBUG | 197 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 198 | /* Callers must ensure that the ppa points to a device address */ | 198 | /* Callers must ensure that the ppa points to a device address */ |
| 199 | BUG_ON(pblk_addr_in_cache(ppa)); | 199 | BUG_ON(pblk_addr_in_cache(ppa)); |
| 200 | BUG_ON(pblk_ppa_empty(ppa)); | 200 | BUG_ON(pblk_ppa_empty(ppa)); |
| @@ -264,6 +264,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) | |||
| 264 | switch (type) { | 264 | switch (type) { |
| 265 | case PBLK_WRITE: | 265 | case PBLK_WRITE: |
| 266 | kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); | 266 | kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); |
| 267 | /* fall through */ | ||
| 267 | case PBLK_WRITE_INT: | 268 | case PBLK_WRITE_INT: |
| 268 | pool = &pblk->w_rq_pool; | 269 | pool = &pblk->w_rq_pool; |
| 269 | break; | 270 | break; |
| @@ -274,7 +275,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) | |||
| 274 | pool = &pblk->e_rq_pool; | 275 | pool = &pblk->e_rq_pool; |
| 275 | break; | 276 | break; |
| 276 | default: | 277 | default: |
| 277 | pr_err("pblk: trying to free unknown rqd type\n"); | 278 | pblk_err(pblk, "trying to free unknown rqd type\n"); |
| 278 | return; | 279 | return; |
| 279 | } | 280 | } |
| 280 | 281 | ||
| @@ -310,7 +311,7 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, | |||
| 310 | 311 | ||
| 311 | ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); | 312 | ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); |
| 312 | if (ret != PBLK_EXPOSED_PAGE_SIZE) { | 313 | if (ret != PBLK_EXPOSED_PAGE_SIZE) { |
| 313 | pr_err("pblk: could not add page to bio\n"); | 314 | pblk_err(pblk, "could not add page to bio\n"); |
| 314 | mempool_free(page, &pblk->page_bio_pool); | 315 | mempool_free(page, &pblk->page_bio_pool); |
| 315 | goto err; | 316 | goto err; |
| 316 | } | 317 | } |
| @@ -410,7 +411,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) | |||
| 410 | line->state = PBLK_LINESTATE_CORRUPT; | 411 | line->state = PBLK_LINESTATE_CORRUPT; |
| 411 | line->gc_group = PBLK_LINEGC_NONE; | 412 | line->gc_group = PBLK_LINEGC_NONE; |
| 412 | move_list = &l_mg->corrupt_list; | 413 | move_list = &l_mg->corrupt_list; |
| 413 | pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", | 414 | pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", |
| 414 | line->id, vsc, | 415 | line->id, vsc, |
| 415 | line->sec_in_line, | 416 | line->sec_in_line, |
| 416 | lm->high_thrs, lm->mid_thrs); | 417 | lm->high_thrs, lm->mid_thrs); |
| @@ -430,7 +431,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio) | |||
| 430 | void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) | 431 | void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) |
| 431 | { | 432 | { |
| 432 | atomic_long_inc(&pblk->write_failed); | 433 | atomic_long_inc(&pblk->write_failed); |
| 433 | #ifdef CONFIG_NVM_DEBUG | 434 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 434 | pblk_print_failed_rqd(pblk, rqd, rqd->error); | 435 | pblk_print_failed_rqd(pblk, rqd, rqd->error); |
| 435 | #endif | 436 | #endif |
| 436 | } | 437 | } |
| @@ -452,9 +453,9 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 452 | atomic_long_inc(&pblk->read_failed); | 453 | atomic_long_inc(&pblk->read_failed); |
| 453 | break; | 454 | break; |
| 454 | default: | 455 | default: |
| 455 | pr_err("pblk: unknown read error:%d\n", rqd->error); | 456 | pblk_err(pblk, "unknown read error:%d\n", rqd->error); |
| 456 | } | 457 | } |
| 457 | #ifdef CONFIG_NVM_DEBUG | 458 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 458 | pblk_print_failed_rqd(pblk, rqd, rqd->error); | 459 | pblk_print_failed_rqd(pblk, rqd, rqd->error); |
| 459 | #endif | 460 | #endif |
| 460 | } | 461 | } |
| @@ -470,7 +471,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 470 | 471 | ||
| 471 | atomic_inc(&pblk->inflight_io); | 472 | atomic_inc(&pblk->inflight_io); |
| 472 | 473 | ||
| 473 | #ifdef CONFIG_NVM_DEBUG | 474 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 474 | if (pblk_check_io(pblk, rqd)) | 475 | if (pblk_check_io(pblk, rqd)) |
| 475 | return NVM_IO_ERR; | 476 | return NVM_IO_ERR; |
| 476 | #endif | 477 | #endif |
| @@ -484,7 +485,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 484 | 485 | ||
| 485 | atomic_inc(&pblk->inflight_io); | 486 | atomic_inc(&pblk->inflight_io); |
| 486 | 487 | ||
| 487 | #ifdef CONFIG_NVM_DEBUG | 488 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 488 | if (pblk_check_io(pblk, rqd)) | 489 | if (pblk_check_io(pblk, rqd)) |
| 489 | return NVM_IO_ERR; | 490 | return NVM_IO_ERR; |
| 490 | #endif | 491 | #endif |
| @@ -517,7 +518,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, | |||
| 517 | for (i = 0; i < nr_secs; i++) { | 518 | for (i = 0; i < nr_secs; i++) { |
| 518 | page = vmalloc_to_page(kaddr); | 519 | page = vmalloc_to_page(kaddr); |
| 519 | if (!page) { | 520 | if (!page) { |
| 520 | pr_err("pblk: could not map vmalloc bio\n"); | 521 | pblk_err(pblk, "could not map vmalloc bio\n"); |
| 521 | bio_put(bio); | 522 | bio_put(bio); |
| 522 | bio = ERR_PTR(-ENOMEM); | 523 | bio = ERR_PTR(-ENOMEM); |
| 523 | goto out; | 524 | goto out; |
| @@ -525,7 +526,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, | |||
| 525 | 526 | ||
| 526 | ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); | 527 | ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); |
| 527 | if (ret != PAGE_SIZE) { | 528 | if (ret != PAGE_SIZE) { |
| 528 | pr_err("pblk: could not add page to bio\n"); | 529 | pblk_err(pblk, "could not add page to bio\n"); |
| 529 | bio_put(bio); | 530 | bio_put(bio); |
| 530 | bio = ERR_PTR(-ENOMEM); | 531 | bio = ERR_PTR(-ENOMEM); |
| 531 | goto out; | 532 | goto out; |
| @@ -711,7 +712,7 @@ next_rq: | |||
| 711 | while (test_bit(pos, line->blk_bitmap)) { | 712 | while (test_bit(pos, line->blk_bitmap)) { |
| 712 | paddr += min; | 713 | paddr += min; |
| 713 | if (pblk_boundary_paddr_checks(pblk, paddr)) { | 714 | if (pblk_boundary_paddr_checks(pblk, paddr)) { |
| 714 | pr_err("pblk: corrupt emeta line:%d\n", | 715 | pblk_err(pblk, "corrupt emeta line:%d\n", |
| 715 | line->id); | 716 | line->id); |
| 716 | bio_put(bio); | 717 | bio_put(bio); |
| 717 | ret = -EINTR; | 718 | ret = -EINTR; |
| @@ -723,7 +724,7 @@ next_rq: | |||
| 723 | } | 724 | } |
| 724 | 725 | ||
| 725 | if (pblk_boundary_paddr_checks(pblk, paddr + min)) { | 726 | if (pblk_boundary_paddr_checks(pblk, paddr + min)) { |
| 726 | pr_err("pblk: corrupt emeta line:%d\n", | 727 | pblk_err(pblk, "corrupt emeta line:%d\n", |
| 727 | line->id); | 728 | line->id); |
| 728 | bio_put(bio); | 729 | bio_put(bio); |
| 729 | ret = -EINTR; | 730 | ret = -EINTR; |
| @@ -738,7 +739,7 @@ next_rq: | |||
| 738 | 739 | ||
| 739 | ret = pblk_submit_io_sync(pblk, &rqd); | 740 | ret = pblk_submit_io_sync(pblk, &rqd); |
| 740 | if (ret) { | 741 | if (ret) { |
| 741 | pr_err("pblk: emeta I/O submission failed: %d\n", ret); | 742 | pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); |
| 742 | bio_put(bio); | 743 | bio_put(bio); |
| 743 | goto free_rqd_dma; | 744 | goto free_rqd_dma; |
| 744 | } | 745 | } |
| @@ -843,7 +844,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, | |||
| 843 | */ | 844 | */ |
| 844 | ret = pblk_submit_io_sync(pblk, &rqd); | 845 | ret = pblk_submit_io_sync(pblk, &rqd); |
| 845 | if (ret) { | 846 | if (ret) { |
| 846 | pr_err("pblk: smeta I/O submission failed: %d\n", ret); | 847 | pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); |
| 847 | bio_put(bio); | 848 | bio_put(bio); |
| 848 | goto free_ppa_list; | 849 | goto free_ppa_list; |
| 849 | } | 850 | } |
| @@ -905,7 +906,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) | |||
| 905 | struct nvm_tgt_dev *dev = pblk->dev; | 906 | struct nvm_tgt_dev *dev = pblk->dev; |
| 906 | struct nvm_geo *geo = &dev->geo; | 907 | struct nvm_geo *geo = &dev->geo; |
| 907 | 908 | ||
| 908 | pr_err("pblk: could not sync erase line:%d,blk:%d\n", | 909 | pblk_err(pblk, "could not sync erase line:%d,blk:%d\n", |
| 909 | pblk_ppa_to_line(ppa), | 910 | pblk_ppa_to_line(ppa), |
| 910 | pblk_ppa_to_pos(geo, ppa)); | 911 | pblk_ppa_to_pos(geo, ppa)); |
| 911 | 912 | ||
| @@ -945,7 +946,7 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) | |||
| 945 | 946 | ||
| 946 | ret = pblk_blk_erase_sync(pblk, ppa); | 947 | ret = pblk_blk_erase_sync(pblk, ppa); |
| 947 | if (ret) { | 948 | if (ret) { |
| 948 | pr_err("pblk: failed to erase line %d\n", line->id); | 949 | pblk_err(pblk, "failed to erase line %d\n", line->id); |
| 949 | return ret; | 950 | return ret; |
| 950 | } | 951 | } |
| 951 | } while (1); | 952 | } while (1); |
| @@ -1012,7 +1013,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, | |||
| 1012 | list_add_tail(&line->list, &l_mg->bad_list); | 1013 | list_add_tail(&line->list, &l_mg->bad_list); |
| 1013 | spin_unlock(&l_mg->free_lock); | 1014 | spin_unlock(&l_mg->free_lock); |
| 1014 | 1015 | ||
| 1015 | pr_debug("pblk: line %d is bad\n", line->id); | 1016 | pblk_debug(pblk, "line %d is bad\n", line->id); |
| 1016 | 1017 | ||
| 1017 | return 0; | 1018 | return 0; |
| 1018 | } | 1019 | } |
| @@ -1122,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, | |||
| 1122 | line->cur_sec = off + lm->smeta_sec; | 1123 | line->cur_sec = off + lm->smeta_sec; |
| 1123 | 1124 | ||
| 1124 | if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { | 1125 | if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { |
| 1125 | pr_debug("pblk: line smeta I/O failed. Retry\n"); | 1126 | pblk_debug(pblk, "line smeta I/O failed. Retry\n"); |
| 1126 | return 0; | 1127 | return 0; |
| 1127 | } | 1128 | } |
| 1128 | 1129 | ||
| @@ -1154,7 +1155,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, | |||
| 1154 | spin_unlock(&line->lock); | 1155 | spin_unlock(&line->lock); |
| 1155 | 1156 | ||
| 1156 | list_add_tail(&line->list, &l_mg->bad_list); | 1157 | list_add_tail(&line->list, &l_mg->bad_list); |
| 1157 | pr_err("pblk: unexpected line %d is bad\n", line->id); | 1158 | pblk_err(pblk, "unexpected line %d is bad\n", line->id); |
| 1158 | 1159 | ||
| 1159 | return 0; | 1160 | return 0; |
| 1160 | } | 1161 | } |
| @@ -1299,7 +1300,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk) | |||
| 1299 | 1300 | ||
| 1300 | retry: | 1301 | retry: |
| 1301 | if (list_empty(&l_mg->free_list)) { | 1302 | if (list_empty(&l_mg->free_list)) { |
| 1302 | pr_err("pblk: no free lines\n"); | 1303 | pblk_err(pblk, "no free lines\n"); |
| 1303 | return NULL; | 1304 | return NULL; |
| 1304 | } | 1305 | } |
| 1305 | 1306 | ||
| @@ -1315,7 +1316,7 @@ retry: | |||
| 1315 | 1316 | ||
| 1316 | list_add_tail(&line->list, &l_mg->bad_list); | 1317 | list_add_tail(&line->list, &l_mg->bad_list); |
| 1317 | 1318 | ||
| 1318 | pr_debug("pblk: line %d is bad\n", line->id); | 1319 | pblk_debug(pblk, "line %d is bad\n", line->id); |
| 1319 | goto retry; | 1320 | goto retry; |
| 1320 | } | 1321 | } |
| 1321 | 1322 | ||
| @@ -1329,7 +1330,7 @@ retry: | |||
| 1329 | list_add(&line->list, &l_mg->corrupt_list); | 1330 | list_add(&line->list, &l_mg->corrupt_list); |
| 1330 | goto retry; | 1331 | goto retry; |
| 1331 | default: | 1332 | default: |
| 1332 | pr_err("pblk: failed to prepare line %d\n", line->id); | 1333 | pblk_err(pblk, "failed to prepare line %d\n", line->id); |
| 1333 | list_add(&line->list, &l_mg->free_list); | 1334 | list_add(&line->list, &l_mg->free_list); |
| 1334 | l_mg->nr_free_lines++; | 1335 | l_mg->nr_free_lines++; |
| 1335 | return NULL; | 1336 | return NULL; |
| @@ -1477,7 +1478,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk) | |||
| 1477 | 1478 | ||
| 1478 | ret = pblk_submit_meta_io(pblk, line); | 1479 | ret = pblk_submit_meta_io(pblk, line); |
| 1479 | if (ret) { | 1480 | if (ret) { |
| 1480 | pr_err("pblk: sync meta line %d failed (%d)\n", | 1481 | pblk_err(pblk, "sync meta line %d failed (%d)\n", |
| 1481 | line->id, ret); | 1482 | line->id, ret); |
| 1482 | return; | 1483 | return; |
| 1483 | } | 1484 | } |
| @@ -1507,7 +1508,7 @@ void __pblk_pipeline_flush(struct pblk *pblk) | |||
| 1507 | 1508 | ||
| 1508 | ret = pblk_recov_pad(pblk); | 1509 | ret = pblk_recov_pad(pblk); |
| 1509 | if (ret) { | 1510 | if (ret) { |
| 1510 | pr_err("pblk: could not close data on teardown(%d)\n", ret); | 1511 | pblk_err(pblk, "could not close data on teardown(%d)\n", ret); |
| 1511 | return; | 1512 | return; |
| 1512 | } | 1513 | } |
| 1513 | 1514 | ||
| @@ -1687,7 +1688,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) | |||
| 1687 | struct nvm_tgt_dev *dev = pblk->dev; | 1688 | struct nvm_tgt_dev *dev = pblk->dev; |
| 1688 | struct nvm_geo *geo = &dev->geo; | 1689 | struct nvm_geo *geo = &dev->geo; |
| 1689 | 1690 | ||
| 1690 | pr_err("pblk: could not async erase line:%d,blk:%d\n", | 1691 | pblk_err(pblk, "could not async erase line:%d,blk:%d\n", |
| 1691 | pblk_ppa_to_line(ppa), | 1692 | pblk_ppa_to_line(ppa), |
| 1692 | pblk_ppa_to_pos(geo, ppa)); | 1693 | pblk_ppa_to_pos(geo, ppa)); |
| 1693 | } | 1694 | } |
| @@ -1726,7 +1727,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) | |||
| 1726 | struct list_head *move_list; | 1727 | struct list_head *move_list; |
| 1727 | int i; | 1728 | int i; |
| 1728 | 1729 | ||
| 1729 | #ifdef CONFIG_NVM_DEBUG | 1730 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1730 | WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), | 1731 | WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), |
| 1731 | "pblk: corrupt closed line %d\n", line->id); | 1732 | "pblk: corrupt closed line %d\n", line->id); |
| 1732 | #endif | 1733 | #endif |
| @@ -1856,7 +1857,7 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, | |||
| 1856 | * Only send one inflight I/O per LUN. Since we map at a page | 1857 | * Only send one inflight I/O per LUN. Since we map at a page |
| 1857 | * granurality, all ppas in the I/O will map to the same LUN | 1858 | * granurality, all ppas in the I/O will map to the same LUN |
| 1858 | */ | 1859 | */ |
| 1859 | #ifdef CONFIG_NVM_DEBUG | 1860 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1860 | int i; | 1861 | int i; |
| 1861 | 1862 | ||
| 1862 | for (i = 1; i < nr_ppas; i++) | 1863 | for (i = 1; i < nr_ppas; i++) |
| @@ -1866,7 +1867,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, | |||
| 1866 | 1867 | ||
| 1867 | ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); | 1868 | ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); |
| 1868 | if (ret == -ETIME || ret == -EINTR) | 1869 | if (ret == -ETIME || ret == -EINTR) |
| 1869 | pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret); | 1870 | pblk_err(pblk, "taking lun semaphore timed out: err %d\n", |
| 1871 | -ret); | ||
| 1870 | } | 1872 | } |
| 1871 | 1873 | ||
| 1872 | void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) | 1874 | void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) |
| @@ -1901,7 +1903,7 @@ void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) | |||
| 1901 | struct pblk_lun *rlun; | 1903 | struct pblk_lun *rlun; |
| 1902 | int pos = pblk_ppa_to_pos(geo, ppa_list[0]); | 1904 | int pos = pblk_ppa_to_pos(geo, ppa_list[0]); |
| 1903 | 1905 | ||
| 1904 | #ifdef CONFIG_NVM_DEBUG | 1906 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1905 | int i; | 1907 | int i; |
| 1906 | 1908 | ||
| 1907 | for (i = 1; i < nr_ppas; i++) | 1909 | for (i = 1; i < nr_ppas; i++) |
| @@ -1951,7 +1953,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) | |||
| 1951 | void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) | 1953 | void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) |
| 1952 | { | 1954 | { |
| 1953 | 1955 | ||
| 1954 | #ifdef CONFIG_NVM_DEBUG | 1956 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1955 | /* Callers must ensure that the ppa points to a cache address */ | 1957 | /* Callers must ensure that the ppa points to a cache address */ |
| 1956 | BUG_ON(!pblk_addr_in_cache(ppa)); | 1958 | BUG_ON(!pblk_addr_in_cache(ppa)); |
| 1957 | BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); | 1959 | BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); |
| @@ -1966,7 +1968,7 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, | |||
| 1966 | struct ppa_addr ppa_l2p, ppa_gc; | 1968 | struct ppa_addr ppa_l2p, ppa_gc; |
| 1967 | int ret = 1; | 1969 | int ret = 1; |
| 1968 | 1970 | ||
| 1969 | #ifdef CONFIG_NVM_DEBUG | 1971 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1970 | /* Callers must ensure that the ppa points to a cache address */ | 1972 | /* Callers must ensure that the ppa points to a cache address */ |
| 1971 | BUG_ON(!pblk_addr_in_cache(ppa_new)); | 1973 | BUG_ON(!pblk_addr_in_cache(ppa_new)); |
| 1972 | BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); | 1974 | BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); |
| @@ -2003,14 +2005,14 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, | |||
| 2003 | { | 2005 | { |
| 2004 | struct ppa_addr ppa_l2p; | 2006 | struct ppa_addr ppa_l2p; |
| 2005 | 2007 | ||
| 2006 | #ifdef CONFIG_NVM_DEBUG | 2008 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 2007 | /* Callers must ensure that the ppa points to a device address */ | 2009 | /* Callers must ensure that the ppa points to a device address */ |
| 2008 | BUG_ON(pblk_addr_in_cache(ppa_mapped)); | 2010 | BUG_ON(pblk_addr_in_cache(ppa_mapped)); |
| 2009 | #endif | 2011 | #endif |
| 2010 | /* Invalidate and discard padded entries */ | 2012 | /* Invalidate and discard padded entries */ |
| 2011 | if (lba == ADDR_EMPTY) { | 2013 | if (lba == ADDR_EMPTY) { |
| 2012 | atomic64_inc(&pblk->pad_wa); | 2014 | atomic64_inc(&pblk->pad_wa); |
| 2013 | #ifdef CONFIG_NVM_DEBUG | 2015 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 2014 | atomic_long_inc(&pblk->padded_wb); | 2016 | atomic_long_inc(&pblk->padded_wb); |
| 2015 | #endif | 2017 | #endif |
| 2016 | if (!pblk_ppa_empty(ppa_mapped)) | 2018 | if (!pblk_ppa_empty(ppa_mapped)) |
| @@ -2036,7 +2038,7 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, | |||
| 2036 | goto out; | 2038 | goto out; |
| 2037 | } | 2039 | } |
| 2038 | 2040 | ||
| 2039 | #ifdef CONFIG_NVM_DEBUG | 2041 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 2040 | WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); | 2042 | WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); |
| 2041 | #endif | 2043 | #endif |
| 2042 | 2044 | ||
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 080469d90b40..157c2567c9e8 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c | |||
| @@ -90,7 +90,7 @@ static void pblk_gc_line_ws(struct work_struct *work) | |||
| 90 | 90 | ||
| 91 | gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); | 91 | gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); |
| 92 | if (!gc_rq->data) { | 92 | if (!gc_rq->data) { |
| 93 | pr_err("pblk: could not GC line:%d (%d/%d)\n", | 93 | pblk_err(pblk, "could not GC line:%d (%d/%d)\n", |
| 94 | line->id, *line->vsc, gc_rq->nr_secs); | 94 | line->id, *line->vsc, gc_rq->nr_secs); |
| 95 | goto out; | 95 | goto out; |
| 96 | } | 96 | } |
| @@ -98,7 +98,7 @@ static void pblk_gc_line_ws(struct work_struct *work) | |||
| 98 | /* Read from GC victim block */ | 98 | /* Read from GC victim block */ |
| 99 | ret = pblk_submit_read_gc(pblk, gc_rq); | 99 | ret = pblk_submit_read_gc(pblk, gc_rq); |
| 100 | if (ret) { | 100 | if (ret) { |
| 101 | pr_err("pblk: failed GC read in line:%d (err:%d)\n", | 101 | pblk_err(pblk, "failed GC read in line:%d (err:%d)\n", |
| 102 | line->id, ret); | 102 | line->id, ret); |
| 103 | goto out; | 103 | goto out; |
| 104 | } | 104 | } |
| @@ -146,7 +146,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk, | |||
| 146 | 146 | ||
| 147 | ret = pblk_line_read_emeta(pblk, line, emeta_buf); | 147 | ret = pblk_line_read_emeta(pblk, line, emeta_buf); |
| 148 | if (ret) { | 148 | if (ret) { |
| 149 | pr_err("pblk: line %d read emeta failed (%d)\n", | 149 | pblk_err(pblk, "line %d read emeta failed (%d)\n", |
| 150 | line->id, ret); | 150 | line->id, ret); |
| 151 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); | 151 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); |
| 152 | return NULL; | 152 | return NULL; |
| @@ -160,7 +160,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk, | |||
| 160 | 160 | ||
| 161 | ret = pblk_recov_check_emeta(pblk, emeta_buf); | 161 | ret = pblk_recov_check_emeta(pblk, emeta_buf); |
| 162 | if (ret) { | 162 | if (ret) { |
| 163 | pr_err("pblk: inconsistent emeta (line %d)\n", | 163 | pblk_err(pblk, "inconsistent emeta (line %d)\n", |
| 164 | line->id); | 164 | line->id); |
| 165 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); | 165 | pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); |
| 166 | return NULL; | 166 | return NULL; |
| @@ -201,7 +201,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) | |||
| 201 | } else { | 201 | } else { |
| 202 | lba_list = get_lba_list_from_emeta(pblk, line); | 202 | lba_list = get_lba_list_from_emeta(pblk, line); |
| 203 | if (!lba_list) { | 203 | if (!lba_list) { |
| 204 | pr_err("pblk: could not interpret emeta (line %d)\n", | 204 | pblk_err(pblk, "could not interpret emeta (line %d)\n", |
| 205 | line->id); | 205 | line->id); |
| 206 | goto fail_free_invalid_bitmap; | 206 | goto fail_free_invalid_bitmap; |
| 207 | } | 207 | } |
| @@ -213,7 +213,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) | |||
| 213 | spin_unlock(&line->lock); | 213 | spin_unlock(&line->lock); |
| 214 | 214 | ||
| 215 | if (sec_left < 0) { | 215 | if (sec_left < 0) { |
| 216 | pr_err("pblk: corrupted GC line (%d)\n", line->id); | 216 | pblk_err(pblk, "corrupted GC line (%d)\n", line->id); |
| 217 | goto fail_free_lba_list; | 217 | goto fail_free_lba_list; |
| 218 | } | 218 | } |
| 219 | 219 | ||
| @@ -289,7 +289,7 @@ fail_free_ws: | |||
| 289 | kref_put(&line->ref, pblk_line_put); | 289 | kref_put(&line->ref, pblk_line_put); |
| 290 | atomic_dec(&gc->read_inflight_gc); | 290 | atomic_dec(&gc->read_inflight_gc); |
| 291 | 291 | ||
| 292 | pr_err("pblk: Failed to GC line %d\n", line->id); | 292 | pblk_err(pblk, "failed to GC line %d\n", line->id); |
| 293 | } | 293 | } |
| 294 | 294 | ||
| 295 | static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) | 295 | static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) |
| @@ -297,7 +297,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) | |||
| 297 | struct pblk_gc *gc = &pblk->gc; | 297 | struct pblk_gc *gc = &pblk->gc; |
| 298 | struct pblk_line_ws *line_ws; | 298 | struct pblk_line_ws *line_ws; |
| 299 | 299 | ||
| 300 | pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); | 300 | pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id); |
| 301 | 301 | ||
| 302 | line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); | 302 | line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); |
| 303 | if (!line_ws) | 303 | if (!line_ws) |
| @@ -351,7 +351,7 @@ static int pblk_gc_read(struct pblk *pblk) | |||
| 351 | pblk_gc_kick(pblk); | 351 | pblk_gc_kick(pblk); |
| 352 | 352 | ||
| 353 | if (pblk_gc_line(pblk, line)) | 353 | if (pblk_gc_line(pblk, line)) |
| 354 | pr_err("pblk: failed to GC line %d\n", line->id); | 354 | pblk_err(pblk, "failed to GC line %d\n", line->id); |
| 355 | 355 | ||
| 356 | return 0; | 356 | return 0; |
| 357 | } | 357 | } |
| @@ -522,8 +522,8 @@ static int pblk_gc_reader_ts(void *data) | |||
| 522 | io_schedule(); | 522 | io_schedule(); |
| 523 | } | 523 | } |
| 524 | 524 | ||
| 525 | #ifdef CONFIG_NVM_DEBUG | 525 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 526 | pr_info("pblk: flushing gc pipeline, %d lines left\n", | 526 | pblk_info(pblk, "flushing gc pipeline, %d lines left\n", |
| 527 | atomic_read(&gc->pipeline_gc)); | 527 | atomic_read(&gc->pipeline_gc)); |
| 528 | #endif | 528 | #endif |
| 529 | 529 | ||
| @@ -540,7 +540,7 @@ static int pblk_gc_reader_ts(void *data) | |||
| 540 | static void pblk_gc_start(struct pblk *pblk) | 540 | static void pblk_gc_start(struct pblk *pblk) |
| 541 | { | 541 | { |
| 542 | pblk->gc.gc_active = 1; | 542 | pblk->gc.gc_active = 1; |
| 543 | pr_debug("pblk: gc start\n"); | 543 | pblk_debug(pblk, "gc start\n"); |
| 544 | } | 544 | } |
| 545 | 545 | ||
| 546 | void pblk_gc_should_start(struct pblk *pblk) | 546 | void pblk_gc_should_start(struct pblk *pblk) |
| @@ -605,14 +605,14 @@ int pblk_gc_init(struct pblk *pblk) | |||
| 605 | 605 | ||
| 606 | gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); | 606 | gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); |
| 607 | if (IS_ERR(gc->gc_ts)) { | 607 | if (IS_ERR(gc->gc_ts)) { |
| 608 | pr_err("pblk: could not allocate GC main kthread\n"); | 608 | pblk_err(pblk, "could not allocate GC main kthread\n"); |
| 609 | return PTR_ERR(gc->gc_ts); | 609 | return PTR_ERR(gc->gc_ts); |
| 610 | } | 610 | } |
| 611 | 611 | ||
| 612 | gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, | 612 | gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, |
| 613 | "pblk-gc-writer-ts"); | 613 | "pblk-gc-writer-ts"); |
| 614 | if (IS_ERR(gc->gc_writer_ts)) { | 614 | if (IS_ERR(gc->gc_writer_ts)) { |
| 615 | pr_err("pblk: could not allocate GC writer kthread\n"); | 615 | pblk_err(pblk, "could not allocate GC writer kthread\n"); |
| 616 | ret = PTR_ERR(gc->gc_writer_ts); | 616 | ret = PTR_ERR(gc->gc_writer_ts); |
| 617 | goto fail_free_main_kthread; | 617 | goto fail_free_main_kthread; |
| 618 | } | 618 | } |
| @@ -620,7 +620,7 @@ int pblk_gc_init(struct pblk *pblk) | |||
| 620 | gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, | 620 | gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, |
| 621 | "pblk-gc-reader-ts"); | 621 | "pblk-gc-reader-ts"); |
| 622 | if (IS_ERR(gc->gc_reader_ts)) { | 622 | if (IS_ERR(gc->gc_reader_ts)) { |
| 623 | pr_err("pblk: could not allocate GC reader kthread\n"); | 623 | pblk_err(pblk, "could not allocate GC reader kthread\n"); |
| 624 | ret = PTR_ERR(gc->gc_reader_ts); | 624 | ret = PTR_ERR(gc->gc_reader_ts); |
| 625 | goto fail_free_writer_kthread; | 625 | goto fail_free_writer_kthread; |
| 626 | } | 626 | } |
| @@ -641,7 +641,7 @@ int pblk_gc_init(struct pblk *pblk) | |||
| 641 | gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", | 641 | gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", |
| 642 | WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); | 642 | WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); |
| 643 | if (!gc->gc_line_reader_wq) { | 643 | if (!gc->gc_line_reader_wq) { |
| 644 | pr_err("pblk: could not allocate GC line reader workqueue\n"); | 644 | pblk_err(pblk, "could not allocate GC line reader workqueue\n"); |
| 645 | ret = -ENOMEM; | 645 | ret = -ENOMEM; |
| 646 | goto fail_free_reader_kthread; | 646 | goto fail_free_reader_kthread; |
| 647 | } | 647 | } |
| @@ -650,7 +650,7 @@ int pblk_gc_init(struct pblk *pblk) | |||
| 650 | gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", | 650 | gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", |
| 651 | WQ_MEM_RECLAIM | WQ_UNBOUND, 1); | 651 | WQ_MEM_RECLAIM | WQ_UNBOUND, 1); |
| 652 | if (!gc->gc_reader_wq) { | 652 | if (!gc->gc_reader_wq) { |
| 653 | pr_err("pblk: could not allocate GC reader workqueue\n"); | 653 | pblk_err(pblk, "could not allocate GC reader workqueue\n"); |
| 654 | ret = -ENOMEM; | 654 | ret = -ENOMEM; |
| 655 | goto fail_free_reader_line_wq; | 655 | goto fail_free_reader_line_wq; |
| 656 | } | 656 | } |
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index b57f764d6a16..537e98f2b24a 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c | |||
| @@ -91,7 +91,7 @@ static size_t pblk_trans_map_size(struct pblk *pblk) | |||
| 91 | return entry_size * pblk->rl.nr_secs; | 91 | return entry_size * pblk->rl.nr_secs; |
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | #ifdef CONFIG_NVM_DEBUG | 94 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 95 | static u32 pblk_l2p_crc(struct pblk *pblk) | 95 | static u32 pblk_l2p_crc(struct pblk *pblk) |
| 96 | { | 96 | { |
| 97 | size_t map_size; | 97 | size_t map_size; |
| @@ -117,13 +117,13 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) | |||
| 117 | } else { | 117 | } else { |
| 118 | line = pblk_recov_l2p(pblk); | 118 | line = pblk_recov_l2p(pblk); |
| 119 | if (IS_ERR(line)) { | 119 | if (IS_ERR(line)) { |
| 120 | pr_err("pblk: could not recover l2p table\n"); | 120 | pblk_err(pblk, "could not recover l2p table\n"); |
| 121 | return -EFAULT; | 121 | return -EFAULT; |
| 122 | } | 122 | } |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | #ifdef CONFIG_NVM_DEBUG | 125 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 126 | pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); | 126 | pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); |
| 127 | #endif | 127 | #endif |
| 128 | 128 | ||
| 129 | /* Free full lines directly as GC has not been started yet */ | 129 | /* Free full lines directly as GC has not been started yet */ |
| @@ -166,7 +166,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) | |||
| 166 | static void pblk_rwb_free(struct pblk *pblk) | 166 | static void pblk_rwb_free(struct pblk *pblk) |
| 167 | { | 167 | { |
| 168 | if (pblk_rb_tear_down_check(&pblk->rwb)) | 168 | if (pblk_rb_tear_down_check(&pblk->rwb)) |
| 169 | pr_err("pblk: write buffer error on tear down\n"); | 169 | pblk_err(pblk, "write buffer error on tear down\n"); |
| 170 | 170 | ||
| 171 | pblk_rb_data_free(&pblk->rwb); | 171 | pblk_rb_data_free(&pblk->rwb); |
| 172 | vfree(pblk_rb_entries_ref(&pblk->rwb)); | 172 | vfree(pblk_rb_entries_ref(&pblk->rwb)); |
| @@ -179,11 +179,14 @@ static int pblk_rwb_init(struct pblk *pblk) | |||
| 179 | struct pblk_rb_entry *entries; | 179 | struct pblk_rb_entry *entries; |
| 180 | unsigned long nr_entries, buffer_size; | 180 | unsigned long nr_entries, buffer_size; |
| 181 | unsigned int power_size, power_seg_sz; | 181 | unsigned int power_size, power_seg_sz; |
| 182 | int pgs_in_buffer; | ||
| 182 | 183 | ||
| 183 | if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer)) | 184 | pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns; |
| 185 | |||
| 186 | if (write_buffer_size && (write_buffer_size > pgs_in_buffer)) | ||
| 184 | buffer_size = write_buffer_size; | 187 | buffer_size = write_buffer_size; |
| 185 | else | 188 | else |
| 186 | buffer_size = pblk->pgs_in_buffer; | 189 | buffer_size = pgs_in_buffer; |
| 187 | 190 | ||
| 188 | nr_entries = pblk_rb_calculate_size(buffer_size); | 191 | nr_entries = pblk_rb_calculate_size(buffer_size); |
| 189 | 192 | ||
| @@ -200,7 +203,8 @@ static int pblk_rwb_init(struct pblk *pblk) | |||
| 200 | /* Minimum pages needed within a lun */ | 203 | /* Minimum pages needed within a lun */ |
| 201 | #define ADDR_POOL_SIZE 64 | 204 | #define ADDR_POOL_SIZE 64 |
| 202 | 205 | ||
| 203 | static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst) | 206 | static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, |
| 207 | struct nvm_addrf_12 *dst) | ||
| 204 | { | 208 | { |
| 205 | struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; | 209 | struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; |
| 206 | int power_len; | 210 | int power_len; |
| @@ -208,14 +212,14 @@ static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst) | |||
| 208 | /* Re-calculate channel and lun format to adapt to configuration */ | 212 | /* Re-calculate channel and lun format to adapt to configuration */ |
| 209 | power_len = get_count_order(geo->num_ch); | 213 | power_len = get_count_order(geo->num_ch); |
| 210 | if (1 << power_len != geo->num_ch) { | 214 | if (1 << power_len != geo->num_ch) { |
| 211 | pr_err("pblk: supports only power-of-two channel config.\n"); | 215 | pblk_err(pblk, "supports only power-of-two channel config.\n"); |
| 212 | return -EINVAL; | 216 | return -EINVAL; |
| 213 | } | 217 | } |
| 214 | dst->ch_len = power_len; | 218 | dst->ch_len = power_len; |
| 215 | 219 | ||
| 216 | power_len = get_count_order(geo->num_lun); | 220 | power_len = get_count_order(geo->num_lun); |
| 217 | if (1 << power_len != geo->num_lun) { | 221 | if (1 << power_len != geo->num_lun) { |
| 218 | pr_err("pblk: supports only power-of-two LUN config.\n"); | 222 | pblk_err(pblk, "supports only power-of-two LUN config.\n"); |
| 219 | return -EINVAL; | 223 | return -EINVAL; |
| 220 | } | 224 | } |
| 221 | dst->lun_len = power_len; | 225 | dst->lun_len = power_len; |
| @@ -282,18 +286,19 @@ static int pblk_set_addrf(struct pblk *pblk) | |||
| 282 | case NVM_OCSSD_SPEC_12: | 286 | case NVM_OCSSD_SPEC_12: |
| 283 | div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); | 287 | div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); |
| 284 | if (mod) { | 288 | if (mod) { |
| 285 | pr_err("pblk: bad configuration of sectors/pages\n"); | 289 | pblk_err(pblk, "bad configuration of sectors/pages\n"); |
| 286 | return -EINVAL; | 290 | return -EINVAL; |
| 287 | } | 291 | } |
| 288 | 292 | ||
| 289 | pblk->addrf_len = pblk_set_addrf_12(geo, (void *)&pblk->addrf); | 293 | pblk->addrf_len = pblk_set_addrf_12(pblk, geo, |
| 294 | (void *)&pblk->addrf); | ||
| 290 | break; | 295 | break; |
| 291 | case NVM_OCSSD_SPEC_20: | 296 | case NVM_OCSSD_SPEC_20: |
| 292 | pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, | 297 | pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, |
| 293 | &pblk->uaddrf); | 298 | &pblk->uaddrf); |
| 294 | break; | 299 | break; |
| 295 | default: | 300 | default: |
| 296 | pr_err("pblk: OCSSD revision not supported (%d)\n", | 301 | pblk_err(pblk, "OCSSD revision not supported (%d)\n", |
| 297 | geo->version); | 302 | geo->version); |
| 298 | return -EINVAL; | 303 | return -EINVAL; |
| 299 | } | 304 | } |
| @@ -366,15 +371,13 @@ static int pblk_core_init(struct pblk *pblk) | |||
| 366 | atomic64_set(&pblk->nr_flush, 0); | 371 | atomic64_set(&pblk->nr_flush, 0); |
| 367 | pblk->nr_flush_rst = 0; | 372 | pblk->nr_flush_rst = 0; |
| 368 | 373 | ||
| 369 | pblk->pgs_in_buffer = geo->mw_cunits * geo->all_luns; | ||
| 370 | |||
| 371 | pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE); | 374 | pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE); |
| 372 | max_write_ppas = pblk->min_write_pgs * geo->all_luns; | 375 | max_write_ppas = pblk->min_write_pgs * geo->all_luns; |
| 373 | pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); | 376 | pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); |
| 374 | pblk_set_sec_per_write(pblk, pblk->min_write_pgs); | 377 | pblk_set_sec_per_write(pblk, pblk->min_write_pgs); |
| 375 | 378 | ||
| 376 | if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { | 379 | if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { |
| 377 | pr_err("pblk: vector list too big(%u > %u)\n", | 380 | pblk_err(pblk, "vector list too big(%u > %u)\n", |
| 378 | pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); | 381 | pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); |
| 379 | return -EINVAL; | 382 | return -EINVAL; |
| 380 | } | 383 | } |
| @@ -607,7 +610,7 @@ static int pblk_luns_init(struct pblk *pblk) | |||
| 607 | 610 | ||
| 608 | /* TODO: Implement unbalanced LUN support */ | 611 | /* TODO: Implement unbalanced LUN support */ |
| 609 | if (geo->num_lun < 0) { | 612 | if (geo->num_lun < 0) { |
| 610 | pr_err("pblk: unbalanced LUN config.\n"); | 613 | pblk_err(pblk, "unbalanced LUN config.\n"); |
| 611 | return -EINVAL; | 614 | return -EINVAL; |
| 612 | } | 615 | } |
| 613 | 616 | ||
| @@ -716,10 +719,11 @@ static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line, | |||
| 716 | 719 | ||
| 717 | /* | 720 | /* |
| 718 | * In 1.2 spec. chunk state is not persisted by the device. Thus | 721 | * In 1.2 spec. chunk state is not persisted by the device. Thus |
| 719 | * some of the values are reset each time pblk is instantiated. | 722 | * some of the values are reset each time pblk is instantiated, |
| 723 | * so we have to assume that the block is closed. | ||
| 720 | */ | 724 | */ |
| 721 | if (lun_bb_meta[line->id] == NVM_BLK_T_FREE) | 725 | if (lun_bb_meta[line->id] == NVM_BLK_T_FREE) |
| 722 | chunk->state = NVM_CHK_ST_FREE; | 726 | chunk->state = NVM_CHK_ST_CLOSED; |
| 723 | else | 727 | else |
| 724 | chunk->state = NVM_CHK_ST_OFFLINE; | 728 | chunk->state = NVM_CHK_ST_OFFLINE; |
| 725 | 729 | ||
| @@ -1026,7 +1030,7 @@ add_emeta_page: | |||
| 1026 | lm->emeta_sec[0], geo->clba); | 1030 | lm->emeta_sec[0], geo->clba); |
| 1027 | 1031 | ||
| 1028 | if (lm->min_blk_line > lm->blk_per_line) { | 1032 | if (lm->min_blk_line > lm->blk_per_line) { |
| 1029 | pr_err("pblk: config. not supported. Min. LUN in line:%d\n", | 1033 | pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n", |
| 1030 | lm->blk_per_line); | 1034 | lm->blk_per_line); |
| 1031 | return -EINVAL; | 1035 | return -EINVAL; |
| 1032 | } | 1036 | } |
| @@ -1078,7 +1082,7 @@ static int pblk_lines_init(struct pblk *pblk) | |||
| 1078 | } | 1082 | } |
| 1079 | 1083 | ||
| 1080 | if (!nr_free_chks) { | 1084 | if (!nr_free_chks) { |
| 1081 | pr_err("pblk: too many bad blocks prevent for sane instance\n"); | 1085 | pblk_err(pblk, "too many bad blocks prevent for sane instance\n"); |
| 1082 | return -EINTR; | 1086 | return -EINTR; |
| 1083 | } | 1087 | } |
| 1084 | 1088 | ||
| @@ -1108,7 +1112,7 @@ static int pblk_writer_init(struct pblk *pblk) | |||
| 1108 | int err = PTR_ERR(pblk->writer_ts); | 1112 | int err = PTR_ERR(pblk->writer_ts); |
| 1109 | 1113 | ||
| 1110 | if (err != -EINTR) | 1114 | if (err != -EINTR) |
| 1111 | pr_err("pblk: could not allocate writer kthread (%d)\n", | 1115 | pblk_err(pblk, "could not allocate writer kthread (%d)\n", |
| 1112 | err); | 1116 | err); |
| 1113 | return err; | 1117 | return err; |
| 1114 | } | 1118 | } |
| @@ -1154,7 +1158,7 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful) | |||
| 1154 | pblk_rb_sync_l2p(&pblk->rwb); | 1158 | pblk_rb_sync_l2p(&pblk->rwb); |
| 1155 | pblk_rl_free(&pblk->rl); | 1159 | pblk_rl_free(&pblk->rl); |
| 1156 | 1160 | ||
| 1157 | pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful); | 1161 | pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful); |
| 1158 | } | 1162 | } |
| 1159 | 1163 | ||
| 1160 | static void pblk_exit(void *private, bool graceful) | 1164 | static void pblk_exit(void *private, bool graceful) |
| @@ -1165,8 +1169,8 @@ static void pblk_exit(void *private, bool graceful) | |||
| 1165 | pblk_gc_exit(pblk, graceful); | 1169 | pblk_gc_exit(pblk, graceful); |
| 1166 | pblk_tear_down(pblk, graceful); | 1170 | pblk_tear_down(pblk, graceful); |
| 1167 | 1171 | ||
| 1168 | #ifdef CONFIG_NVM_DEBUG | 1172 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1169 | pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); | 1173 | pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); |
| 1170 | #endif | 1174 | #endif |
| 1171 | 1175 | ||
| 1172 | pblk_free(pblk); | 1176 | pblk_free(pblk); |
| @@ -1189,34 +1193,35 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
| 1189 | struct pblk *pblk; | 1193 | struct pblk *pblk; |
| 1190 | int ret; | 1194 | int ret; |
| 1191 | 1195 | ||
| 1192 | /* pblk supports 1.2 and 2.0 versions */ | 1196 | pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); |
| 1197 | if (!pblk) | ||
| 1198 | return ERR_PTR(-ENOMEM); | ||
| 1199 | |||
| 1200 | pblk->dev = dev; | ||
| 1201 | pblk->disk = tdisk; | ||
| 1202 | pblk->state = PBLK_STATE_RUNNING; | ||
| 1203 | pblk->gc.gc_enabled = 0; | ||
| 1204 | |||
| 1193 | if (!(geo->version == NVM_OCSSD_SPEC_12 || | 1205 | if (!(geo->version == NVM_OCSSD_SPEC_12 || |
| 1194 | geo->version == NVM_OCSSD_SPEC_20)) { | 1206 | geo->version == NVM_OCSSD_SPEC_20)) { |
| 1195 | pr_err("pblk: OCSSD version not supported (%u)\n", | 1207 | pblk_err(pblk, "OCSSD version not supported (%u)\n", |
| 1196 | geo->version); | 1208 | geo->version); |
| 1209 | kfree(pblk); | ||
| 1197 | return ERR_PTR(-EINVAL); | 1210 | return ERR_PTR(-EINVAL); |
| 1198 | } | 1211 | } |
| 1199 | 1212 | ||
| 1200 | if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) { | 1213 | if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) { |
| 1201 | pr_err("pblk: host-side L2P table not supported. (%x)\n", | 1214 | pblk_err(pblk, "host-side L2P table not supported. (%x)\n", |
| 1202 | geo->dom); | 1215 | geo->dom); |
| 1216 | kfree(pblk); | ||
| 1203 | return ERR_PTR(-EINVAL); | 1217 | return ERR_PTR(-EINVAL); |
| 1204 | } | 1218 | } |
| 1205 | 1219 | ||
| 1206 | pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); | ||
| 1207 | if (!pblk) | ||
| 1208 | return ERR_PTR(-ENOMEM); | ||
| 1209 | |||
| 1210 | pblk->dev = dev; | ||
| 1211 | pblk->disk = tdisk; | ||
| 1212 | pblk->state = PBLK_STATE_RUNNING; | ||
| 1213 | pblk->gc.gc_enabled = 0; | ||
| 1214 | |||
| 1215 | spin_lock_init(&pblk->resubmit_lock); | 1220 | spin_lock_init(&pblk->resubmit_lock); |
| 1216 | spin_lock_init(&pblk->trans_lock); | 1221 | spin_lock_init(&pblk->trans_lock); |
| 1217 | spin_lock_init(&pblk->lock); | 1222 | spin_lock_init(&pblk->lock); |
| 1218 | 1223 | ||
| 1219 | #ifdef CONFIG_NVM_DEBUG | 1224 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1220 | atomic_long_set(&pblk->inflight_writes, 0); | 1225 | atomic_long_set(&pblk->inflight_writes, 0); |
| 1221 | atomic_long_set(&pblk->padded_writes, 0); | 1226 | atomic_long_set(&pblk->padded_writes, 0); |
| 1222 | atomic_long_set(&pblk->padded_wb, 0); | 1227 | atomic_long_set(&pblk->padded_wb, 0); |
| @@ -1241,38 +1246,38 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
| 1241 | 1246 | ||
| 1242 | ret = pblk_core_init(pblk); | 1247 | ret = pblk_core_init(pblk); |
| 1243 | if (ret) { | 1248 | if (ret) { |
| 1244 | pr_err("pblk: could not initialize core\n"); | 1249 | pblk_err(pblk, "could not initialize core\n"); |
| 1245 | goto fail; | 1250 | goto fail; |
| 1246 | } | 1251 | } |
| 1247 | 1252 | ||
| 1248 | ret = pblk_lines_init(pblk); | 1253 | ret = pblk_lines_init(pblk); |
| 1249 | if (ret) { | 1254 | if (ret) { |
| 1250 | pr_err("pblk: could not initialize lines\n"); | 1255 | pblk_err(pblk, "could not initialize lines\n"); |
| 1251 | goto fail_free_core; | 1256 | goto fail_free_core; |
| 1252 | } | 1257 | } |
| 1253 | 1258 | ||
| 1254 | ret = pblk_rwb_init(pblk); | 1259 | ret = pblk_rwb_init(pblk); |
| 1255 | if (ret) { | 1260 | if (ret) { |
| 1256 | pr_err("pblk: could not initialize write buffer\n"); | 1261 | pblk_err(pblk, "could not initialize write buffer\n"); |
| 1257 | goto fail_free_lines; | 1262 | goto fail_free_lines; |
| 1258 | } | 1263 | } |
| 1259 | 1264 | ||
| 1260 | ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); | 1265 | ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); |
| 1261 | if (ret) { | 1266 | if (ret) { |
| 1262 | pr_err("pblk: could not initialize maps\n"); | 1267 | pblk_err(pblk, "could not initialize maps\n"); |
| 1263 | goto fail_free_rwb; | 1268 | goto fail_free_rwb; |
| 1264 | } | 1269 | } |
| 1265 | 1270 | ||
| 1266 | ret = pblk_writer_init(pblk); | 1271 | ret = pblk_writer_init(pblk); |
| 1267 | if (ret) { | 1272 | if (ret) { |
| 1268 | if (ret != -EINTR) | 1273 | if (ret != -EINTR) |
| 1269 | pr_err("pblk: could not initialize write thread\n"); | 1274 | pblk_err(pblk, "could not initialize write thread\n"); |
| 1270 | goto fail_free_l2p; | 1275 | goto fail_free_l2p; |
| 1271 | } | 1276 | } |
| 1272 | 1277 | ||
| 1273 | ret = pblk_gc_init(pblk); | 1278 | ret = pblk_gc_init(pblk); |
| 1274 | if (ret) { | 1279 | if (ret) { |
| 1275 | pr_err("pblk: could not initialize gc\n"); | 1280 | pblk_err(pblk, "could not initialize gc\n"); |
| 1276 | goto fail_stop_writer; | 1281 | goto fail_stop_writer; |
| 1277 | } | 1282 | } |
| 1278 | 1283 | ||
| @@ -1287,8 +1292,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, | |||
| 1287 | blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); | 1292 | blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); |
| 1288 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); | 1293 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); |
| 1289 | 1294 | ||
| 1290 | pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n", | 1295 | pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n", |
| 1291 | tdisk->disk_name, | ||
| 1292 | geo->all_luns, pblk->l_mg.nr_lines, | 1296 | geo->all_luns, pblk->l_mg.nr_lines, |
| 1293 | (unsigned long long)pblk->rl.nr_secs, | 1297 | (unsigned long long)pblk->rl.nr_secs, |
| 1294 | pblk->rwb.nr_entries); | 1298 | pblk->rwb.nr_entries); |
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 55e9442a99e2..f6eec0212dfc 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c | |||
| @@ -111,7 +111,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, | |||
| 111 | } while (iter > 0); | 111 | } while (iter > 0); |
| 112 | up_write(&pblk_rb_lock); | 112 | up_write(&pblk_rb_lock); |
| 113 | 113 | ||
| 114 | #ifdef CONFIG_NVM_DEBUG | 114 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 115 | atomic_set(&rb->inflight_flush_point, 0); | 115 | atomic_set(&rb->inflight_flush_point, 0); |
| 116 | #endif | 116 | #endif |
| 117 | 117 | ||
| @@ -308,7 +308,7 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, | |||
| 308 | 308 | ||
| 309 | entry = &rb->entries[ring_pos]; | 309 | entry = &rb->entries[ring_pos]; |
| 310 | flags = READ_ONCE(entry->w_ctx.flags); | 310 | flags = READ_ONCE(entry->w_ctx.flags); |
| 311 | #ifdef CONFIG_NVM_DEBUG | 311 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 312 | /* Caller must guarantee that the entry is free */ | 312 | /* Caller must guarantee that the entry is free */ |
| 313 | BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); | 313 | BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); |
| 314 | #endif | 314 | #endif |
| @@ -332,7 +332,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, | |||
| 332 | 332 | ||
| 333 | entry = &rb->entries[ring_pos]; | 333 | entry = &rb->entries[ring_pos]; |
| 334 | flags = READ_ONCE(entry->w_ctx.flags); | 334 | flags = READ_ONCE(entry->w_ctx.flags); |
| 335 | #ifdef CONFIG_NVM_DEBUG | 335 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 336 | /* Caller must guarantee that the entry is free */ | 336 | /* Caller must guarantee that the entry is free */ |
| 337 | BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); | 337 | BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); |
| 338 | #endif | 338 | #endif |
| @@ -362,7 +362,7 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, | |||
| 362 | return 0; | 362 | return 0; |
| 363 | } | 363 | } |
| 364 | 364 | ||
| 365 | #ifdef CONFIG_NVM_DEBUG | 365 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 366 | atomic_inc(&rb->inflight_flush_point); | 366 | atomic_inc(&rb->inflight_flush_point); |
| 367 | #endif | 367 | #endif |
| 368 | 368 | ||
| @@ -547,7 +547,7 @@ try: | |||
| 547 | 547 | ||
| 548 | page = virt_to_page(entry->data); | 548 | page = virt_to_page(entry->data); |
| 549 | if (!page) { | 549 | if (!page) { |
| 550 | pr_err("pblk: could not allocate write bio page\n"); | 550 | pblk_err(pblk, "could not allocate write bio page\n"); |
| 551 | flags &= ~PBLK_WRITTEN_DATA; | 551 | flags &= ~PBLK_WRITTEN_DATA; |
| 552 | flags |= PBLK_SUBMITTED_ENTRY; | 552 | flags |= PBLK_SUBMITTED_ENTRY; |
| 553 | /* Release flags on context. Protect from writes */ | 553 | /* Release flags on context. Protect from writes */ |
| @@ -557,7 +557,7 @@ try: | |||
| 557 | 557 | ||
| 558 | if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != | 558 | if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != |
| 559 | rb->seg_size) { | 559 | rb->seg_size) { |
| 560 | pr_err("pblk: could not add page to write bio\n"); | 560 | pblk_err(pblk, "could not add page to write bio\n"); |
| 561 | flags &= ~PBLK_WRITTEN_DATA; | 561 | flags &= ~PBLK_WRITTEN_DATA; |
| 562 | flags |= PBLK_SUBMITTED_ENTRY; | 562 | flags |= PBLK_SUBMITTED_ENTRY; |
| 563 | /* Release flags on context. Protect from writes */ | 563 | /* Release flags on context. Protect from writes */ |
| @@ -576,19 +576,19 @@ try: | |||
| 576 | 576 | ||
| 577 | if (pad) { | 577 | if (pad) { |
| 578 | if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { | 578 | if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { |
| 579 | pr_err("pblk: could not pad page in write bio\n"); | 579 | pblk_err(pblk, "could not pad page in write bio\n"); |
| 580 | return NVM_IO_ERR; | 580 | return NVM_IO_ERR; |
| 581 | } | 581 | } |
| 582 | 582 | ||
| 583 | if (pad < pblk->min_write_pgs) | 583 | if (pad < pblk->min_write_pgs) |
| 584 | atomic64_inc(&pblk->pad_dist[pad - 1]); | 584 | atomic64_inc(&pblk->pad_dist[pad - 1]); |
| 585 | else | 585 | else |
| 586 | pr_warn("pblk: padding more than min. sectors\n"); | 586 | pblk_warn(pblk, "padding more than min. sectors\n"); |
| 587 | 587 | ||
| 588 | atomic64_add(pad, &pblk->pad_wa); | 588 | atomic64_add(pad, &pblk->pad_wa); |
| 589 | } | 589 | } |
| 590 | 590 | ||
| 591 | #ifdef CONFIG_NVM_DEBUG | 591 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 592 | atomic_long_add(pad, &pblk->padded_writes); | 592 | atomic_long_add(pad, &pblk->padded_writes); |
| 593 | #endif | 593 | #endif |
| 594 | 594 | ||
| @@ -613,7 +613,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, | |||
| 613 | int ret = 1; | 613 | int ret = 1; |
| 614 | 614 | ||
| 615 | 615 | ||
| 616 | #ifdef CONFIG_NVM_DEBUG | 616 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 617 | /* Caller must ensure that the access will not cause an overflow */ | 617 | /* Caller must ensure that the access will not cause an overflow */ |
| 618 | BUG_ON(pos >= rb->nr_entries); | 618 | BUG_ON(pos >= rb->nr_entries); |
| 619 | #endif | 619 | #endif |
| @@ -820,7 +820,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) | |||
| 820 | rb->subm, | 820 | rb->subm, |
| 821 | rb->sync, | 821 | rb->sync, |
| 822 | rb->l2p_update, | 822 | rb->l2p_update, |
| 823 | #ifdef CONFIG_NVM_DEBUG | 823 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 824 | atomic_read(&rb->inflight_flush_point), | 824 | atomic_read(&rb->inflight_flush_point), |
| 825 | #else | 825 | #else |
| 826 | 0, | 826 | 0, |
| @@ -838,7 +838,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) | |||
| 838 | rb->subm, | 838 | rb->subm, |
| 839 | rb->sync, | 839 | rb->sync, |
| 840 | rb->l2p_update, | 840 | rb->l2p_update, |
| 841 | #ifdef CONFIG_NVM_DEBUG | 841 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 842 | atomic_read(&rb->inflight_flush_point), | 842 | atomic_read(&rb->inflight_flush_point), |
| 843 | #else | 843 | #else |
| 844 | 0, | 844 | 0, |
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 18694694e5f0..5a46d7f9302f 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c | |||
| @@ -28,7 +28,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, | |||
| 28 | sector_t lba, struct ppa_addr ppa, | 28 | sector_t lba, struct ppa_addr ppa, |
| 29 | int bio_iter, bool advanced_bio) | 29 | int bio_iter, bool advanced_bio) |
| 30 | { | 30 | { |
| 31 | #ifdef CONFIG_NVM_DEBUG | 31 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 32 | /* Callers must ensure that the ppa points to a cache address */ | 32 | /* Callers must ensure that the ppa points to a cache address */ |
| 33 | BUG_ON(pblk_ppa_empty(ppa)); | 33 | BUG_ON(pblk_ppa_empty(ppa)); |
| 34 | BUG_ON(!pblk_addr_in_cache(ppa)); | 34 | BUG_ON(!pblk_addr_in_cache(ppa)); |
| @@ -79,7 +79,7 @@ retry: | |||
| 79 | WARN_ON(test_and_set_bit(i, read_bitmap)); | 79 | WARN_ON(test_and_set_bit(i, read_bitmap)); |
| 80 | meta_list[i].lba = cpu_to_le64(lba); | 80 | meta_list[i].lba = cpu_to_le64(lba); |
| 81 | advanced_bio = true; | 81 | advanced_bio = true; |
| 82 | #ifdef CONFIG_NVM_DEBUG | 82 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 83 | atomic_long_inc(&pblk->cache_reads); | 83 | atomic_long_inc(&pblk->cache_reads); |
| 84 | #endif | 84 | #endif |
| 85 | } else { | 85 | } else { |
| @@ -97,7 +97,7 @@ next: | |||
| 97 | else | 97 | else |
| 98 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); | 98 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); |
| 99 | 99 | ||
| 100 | #ifdef CONFIG_NVM_DEBUG | 100 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 101 | atomic_long_add(nr_secs, &pblk->inflight_reads); | 101 | atomic_long_add(nr_secs, &pblk->inflight_reads); |
| 102 | #endif | 102 | #endif |
| 103 | } | 103 | } |
| @@ -117,13 +117,13 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 117 | continue; | 117 | continue; |
| 118 | 118 | ||
| 119 | if (lba != blba + i) { | 119 | if (lba != blba + i) { |
| 120 | #ifdef CONFIG_NVM_DEBUG | 120 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 121 | struct ppa_addr *p; | 121 | struct ppa_addr *p; |
| 122 | 122 | ||
| 123 | p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr; | 123 | p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr; |
| 124 | print_ppa(&pblk->dev->geo, p, "seq", i); | 124 | print_ppa(pblk, p, "seq", i); |
| 125 | #endif | 125 | #endif |
| 126 | pr_err("pblk: corrupted read LBA (%llu/%llu)\n", | 126 | pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", |
| 127 | lba, (u64)blba + i); | 127 | lba, (u64)blba + i); |
| 128 | WARN_ON(1); | 128 | WARN_ON(1); |
| 129 | } | 129 | } |
| @@ -149,14 +149,14 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 149 | meta_lba = le64_to_cpu(meta_lba_list[j].lba); | 149 | meta_lba = le64_to_cpu(meta_lba_list[j].lba); |
| 150 | 150 | ||
| 151 | if (lba != meta_lba) { | 151 | if (lba != meta_lba) { |
| 152 | #ifdef CONFIG_NVM_DEBUG | 152 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 153 | struct ppa_addr *p; | 153 | struct ppa_addr *p; |
| 154 | int nr_ppas = rqd->nr_ppas; | 154 | int nr_ppas = rqd->nr_ppas; |
| 155 | 155 | ||
| 156 | p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr; | 156 | p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr; |
| 157 | print_ppa(&pblk->dev->geo, p, "seq", j); | 157 | print_ppa(pblk, p, "seq", j); |
| 158 | #endif | 158 | #endif |
| 159 | pr_err("pblk: corrupted read LBA (%llu/%llu)\n", | 159 | pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", |
| 160 | lba, meta_lba); | 160 | lba, meta_lba); |
| 161 | WARN_ON(1); | 161 | WARN_ON(1); |
| 162 | } | 162 | } |
| @@ -185,7 +185,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 185 | 185 | ||
| 186 | static void pblk_end_user_read(struct bio *bio) | 186 | static void pblk_end_user_read(struct bio *bio) |
| 187 | { | 187 | { |
| 188 | #ifdef CONFIG_NVM_DEBUG | 188 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 189 | WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); | 189 | WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); |
| 190 | #endif | 190 | #endif |
| 191 | bio_endio(bio); | 191 | bio_endio(bio); |
| @@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 199 | struct bio *int_bio = rqd->bio; | 199 | struct bio *int_bio = rqd->bio; |
| 200 | unsigned long start_time = r_ctx->start_time; | 200 | unsigned long start_time = r_ctx->start_time; |
| 201 | 201 | ||
| 202 | generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time); | 202 | generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time); |
| 203 | 203 | ||
| 204 | if (rqd->error) | 204 | if (rqd->error) |
| 205 | pblk_log_read_err(pblk, rqd); | 205 | pblk_log_read_err(pblk, rqd); |
| @@ -212,7 +212,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 212 | if (put_line) | 212 | if (put_line) |
| 213 | pblk_read_put_rqd_kref(pblk, rqd); | 213 | pblk_read_put_rqd_kref(pblk, rqd); |
| 214 | 214 | ||
| 215 | #ifdef CONFIG_NVM_DEBUG | 215 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 216 | atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); | 216 | atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); |
| 217 | atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); | 217 | atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); |
| 218 | #endif | 218 | #endif |
| @@ -231,74 +231,36 @@ static void pblk_end_io_read(struct nvm_rq *rqd) | |||
| 231 | __pblk_end_io_read(pblk, rqd, true); | 231 | __pblk_end_io_read(pblk, rqd, true); |
| 232 | } | 232 | } |
| 233 | 233 | ||
| 234 | static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, | 234 | static void pblk_end_partial_read(struct nvm_rq *rqd) |
| 235 | struct bio *orig_bio, unsigned int bio_init_idx, | ||
| 236 | unsigned long *read_bitmap) | ||
| 237 | { | 235 | { |
| 238 | struct pblk_sec_meta *meta_list = rqd->meta_list; | 236 | struct pblk *pblk = rqd->private; |
| 239 | struct bio *new_bio; | 237 | struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); |
| 238 | struct pblk_pr_ctx *pr_ctx = r_ctx->private; | ||
| 239 | struct bio *new_bio = rqd->bio; | ||
| 240 | struct bio *bio = pr_ctx->orig_bio; | ||
| 240 | struct bio_vec src_bv, dst_bv; | 241 | struct bio_vec src_bv, dst_bv; |
| 241 | void *ppa_ptr = NULL; | 242 | struct pblk_sec_meta *meta_list = rqd->meta_list; |
| 242 | void *src_p, *dst_p; | 243 | int bio_init_idx = pr_ctx->bio_init_idx; |
| 243 | dma_addr_t dma_ppa_list = 0; | 244 | unsigned long *read_bitmap = pr_ctx->bitmap; |
| 244 | __le64 *lba_list_mem, *lba_list_media; | 245 | int nr_secs = pr_ctx->orig_nr_secs; |
| 245 | int nr_secs = rqd->nr_ppas; | ||
| 246 | int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); | 246 | int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); |
| 247 | int i, ret, hole; | 247 | __le64 *lba_list_mem, *lba_list_media; |
| 248 | 248 | void *src_p, *dst_p; | |
| 249 | /* Re-use allocated memory for intermediate lbas */ | 249 | int hole, i; |
| 250 | lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); | ||
| 251 | lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); | ||
| 252 | |||
| 253 | new_bio = bio_alloc(GFP_KERNEL, nr_holes); | ||
| 254 | |||
| 255 | if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) | ||
| 256 | goto fail_add_pages; | ||
| 257 | |||
| 258 | if (nr_holes != new_bio->bi_vcnt) { | ||
| 259 | pr_err("pblk: malformed bio\n"); | ||
| 260 | goto fail; | ||
| 261 | } | ||
| 262 | |||
| 263 | for (i = 0; i < nr_secs; i++) | ||
| 264 | lba_list_mem[i] = meta_list[i].lba; | ||
| 265 | |||
| 266 | new_bio->bi_iter.bi_sector = 0; /* internal bio */ | ||
| 267 | bio_set_op_attrs(new_bio, REQ_OP_READ, 0); | ||
| 268 | |||
| 269 | rqd->bio = new_bio; | ||
| 270 | rqd->nr_ppas = nr_holes; | ||
| 271 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); | ||
| 272 | |||
| 273 | if (unlikely(nr_holes == 1)) { | ||
| 274 | ppa_ptr = rqd->ppa_list; | ||
| 275 | dma_ppa_list = rqd->dma_ppa_list; | ||
| 276 | rqd->ppa_addr = rqd->ppa_list[0]; | ||
| 277 | } | ||
| 278 | |||
| 279 | ret = pblk_submit_io_sync(pblk, rqd); | ||
| 280 | if (ret) { | ||
| 281 | bio_put(rqd->bio); | ||
| 282 | pr_err("pblk: sync read IO submission failed\n"); | ||
| 283 | goto fail; | ||
| 284 | } | ||
| 285 | |||
| 286 | if (rqd->error) { | ||
| 287 | atomic_long_inc(&pblk->read_failed); | ||
| 288 | #ifdef CONFIG_NVM_DEBUG | ||
| 289 | pblk_print_failed_rqd(pblk, rqd, rqd->error); | ||
| 290 | #endif | ||
| 291 | } | ||
| 292 | 250 | ||
| 293 | if (unlikely(nr_holes == 1)) { | 251 | if (unlikely(nr_holes == 1)) { |
| 294 | struct ppa_addr ppa; | 252 | struct ppa_addr ppa; |
| 295 | 253 | ||
| 296 | ppa = rqd->ppa_addr; | 254 | ppa = rqd->ppa_addr; |
| 297 | rqd->ppa_list = ppa_ptr; | 255 | rqd->ppa_list = pr_ctx->ppa_ptr; |
| 298 | rqd->dma_ppa_list = dma_ppa_list; | 256 | rqd->dma_ppa_list = pr_ctx->dma_ppa_list; |
| 299 | rqd->ppa_list[0] = ppa; | 257 | rqd->ppa_list[0] = ppa; |
| 300 | } | 258 | } |
| 301 | 259 | ||
| 260 | /* Re-use allocated memory for intermediate lbas */ | ||
| 261 | lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); | ||
| 262 | lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); | ||
| 263 | |||
| 302 | for (i = 0; i < nr_secs; i++) { | 264 | for (i = 0; i < nr_secs; i++) { |
| 303 | lba_list_media[i] = meta_list[i].lba; | 265 | lba_list_media[i] = meta_list[i].lba; |
| 304 | meta_list[i].lba = lba_list_mem[i]; | 266 | meta_list[i].lba = lba_list_mem[i]; |
| @@ -316,7 +278,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 316 | meta_list[hole].lba = lba_list_media[i]; | 278 | meta_list[hole].lba = lba_list_media[i]; |
| 317 | 279 | ||
| 318 | src_bv = new_bio->bi_io_vec[i++]; | 280 | src_bv = new_bio->bi_io_vec[i++]; |
| 319 | dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole]; | 281 | dst_bv = bio->bi_io_vec[bio_init_idx + hole]; |
| 320 | 282 | ||
| 321 | src_p = kmap_atomic(src_bv.bv_page); | 283 | src_p = kmap_atomic(src_bv.bv_page); |
| 322 | dst_p = kmap_atomic(dst_bv.bv_page); | 284 | dst_p = kmap_atomic(dst_bv.bv_page); |
| @@ -334,19 +296,107 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 334 | } while (hole < nr_secs); | 296 | } while (hole < nr_secs); |
| 335 | 297 | ||
| 336 | bio_put(new_bio); | 298 | bio_put(new_bio); |
| 299 | kfree(pr_ctx); | ||
| 337 | 300 | ||
| 338 | /* restore original request */ | 301 | /* restore original request */ |
| 339 | rqd->bio = NULL; | 302 | rqd->bio = NULL; |
| 340 | rqd->nr_ppas = nr_secs; | 303 | rqd->nr_ppas = nr_secs; |
| 341 | 304 | ||
| 305 | bio_endio(bio); | ||
| 342 | __pblk_end_io_read(pblk, rqd, false); | 306 | __pblk_end_io_read(pblk, rqd, false); |
| 343 | return NVM_IO_DONE; | 307 | } |
| 344 | 308 | ||
| 345 | fail: | 309 | static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, |
| 346 | /* Free allocated pages in new bio */ | 310 | unsigned int bio_init_idx, |
| 311 | unsigned long *read_bitmap, | ||
| 312 | int nr_holes) | ||
| 313 | { | ||
| 314 | struct pblk_sec_meta *meta_list = rqd->meta_list; | ||
| 315 | struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); | ||
| 316 | struct pblk_pr_ctx *pr_ctx; | ||
| 317 | struct bio *new_bio, *bio = r_ctx->private; | ||
| 318 | __le64 *lba_list_mem; | ||
| 319 | int nr_secs = rqd->nr_ppas; | ||
| 320 | int i; | ||
| 321 | |||
| 322 | /* Re-use allocated memory for intermediate lbas */ | ||
| 323 | lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); | ||
| 324 | |||
| 325 | new_bio = bio_alloc(GFP_KERNEL, nr_holes); | ||
| 326 | |||
| 327 | if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) | ||
| 328 | goto fail_bio_put; | ||
| 329 | |||
| 330 | if (nr_holes != new_bio->bi_vcnt) { | ||
| 331 | WARN_ONCE(1, "pblk: malformed bio\n"); | ||
| 332 | goto fail_free_pages; | ||
| 333 | } | ||
| 334 | |||
| 335 | pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); | ||
| 336 | if (!pr_ctx) | ||
| 337 | goto fail_free_pages; | ||
| 338 | |||
| 339 | for (i = 0; i < nr_secs; i++) | ||
| 340 | lba_list_mem[i] = meta_list[i].lba; | ||
| 341 | |||
| 342 | new_bio->bi_iter.bi_sector = 0; /* internal bio */ | ||
| 343 | bio_set_op_attrs(new_bio, REQ_OP_READ, 0); | ||
| 344 | |||
| 345 | rqd->bio = new_bio; | ||
| 346 | rqd->nr_ppas = nr_holes; | ||
| 347 | rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); | ||
| 348 | |||
| 349 | pr_ctx->ppa_ptr = NULL; | ||
| 350 | pr_ctx->orig_bio = bio; | ||
| 351 | bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA); | ||
| 352 | pr_ctx->bio_init_idx = bio_init_idx; | ||
| 353 | pr_ctx->orig_nr_secs = nr_secs; | ||
| 354 | r_ctx->private = pr_ctx; | ||
| 355 | |||
| 356 | if (unlikely(nr_holes == 1)) { | ||
| 357 | pr_ctx->ppa_ptr = rqd->ppa_list; | ||
| 358 | pr_ctx->dma_ppa_list = rqd->dma_ppa_list; | ||
| 359 | rqd->ppa_addr = rqd->ppa_list[0]; | ||
| 360 | } | ||
| 361 | return 0; | ||
| 362 | |||
| 363 | fail_free_pages: | ||
| 347 | pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt); | 364 | pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt); |
| 348 | fail_add_pages: | 365 | fail_bio_put: |
| 349 | pr_err("pblk: failed to perform partial read\n"); | 366 | bio_put(new_bio); |
| 367 | |||
| 368 | return -ENOMEM; | ||
| 369 | } | ||
| 370 | |||
| 371 | static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, | ||
| 372 | unsigned int bio_init_idx, | ||
| 373 | unsigned long *read_bitmap, int nr_secs) | ||
| 374 | { | ||
| 375 | int nr_holes; | ||
| 376 | int ret; | ||
| 377 | |||
| 378 | nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); | ||
| 379 | |||
| 380 | if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap, | ||
| 381 | nr_holes)) | ||
| 382 | return NVM_IO_ERR; | ||
| 383 | |||
| 384 | rqd->end_io = pblk_end_partial_read; | ||
| 385 | |||
| 386 | ret = pblk_submit_io(pblk, rqd); | ||
| 387 | if (ret) { | ||
| 388 | bio_put(rqd->bio); | ||
| 389 | pblk_err(pblk, "partial read IO submission failed\n"); | ||
| 390 | goto err; | ||
| 391 | } | ||
| 392 | |||
| 393 | return NVM_IO_OK; | ||
| 394 | |||
| 395 | err: | ||
| 396 | pblk_err(pblk, "failed to perform partial read\n"); | ||
| 397 | |||
| 398 | /* Free allocated pages in new bio */ | ||
| 399 | pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt); | ||
| 350 | __pblk_end_io_read(pblk, rqd, false); | 400 | __pblk_end_io_read(pblk, rqd, false); |
| 351 | return NVM_IO_ERR; | 401 | return NVM_IO_ERR; |
| 352 | } | 402 | } |
| @@ -359,7 +409,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, | |||
| 359 | 409 | ||
| 360 | pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); | 410 | pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); |
| 361 | 411 | ||
| 362 | #ifdef CONFIG_NVM_DEBUG | 412 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 363 | atomic_long_inc(&pblk->inflight_reads); | 413 | atomic_long_inc(&pblk->inflight_reads); |
| 364 | #endif | 414 | #endif |
| 365 | 415 | ||
| @@ -382,7 +432,7 @@ retry: | |||
| 382 | WARN_ON(test_and_set_bit(0, read_bitmap)); | 432 | WARN_ON(test_and_set_bit(0, read_bitmap)); |
| 383 | meta_list[0].lba = cpu_to_le64(lba); | 433 | meta_list[0].lba = cpu_to_le64(lba); |
| 384 | 434 | ||
| 385 | #ifdef CONFIG_NVM_DEBUG | 435 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 386 | atomic_long_inc(&pblk->cache_reads); | 436 | atomic_long_inc(&pblk->cache_reads); |
| 387 | #endif | 437 | #endif |
| 388 | } else { | 438 | } else { |
| @@ -401,7 +451,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
| 401 | struct pblk_g_ctx *r_ctx; | 451 | struct pblk_g_ctx *r_ctx; |
| 402 | struct nvm_rq *rqd; | 452 | struct nvm_rq *rqd; |
| 403 | unsigned int bio_init_idx; | 453 | unsigned int bio_init_idx; |
| 404 | unsigned long read_bitmap; /* Max 64 ppas per request */ | 454 | DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA); |
| 405 | int ret = NVM_IO_ERR; | 455 | int ret = NVM_IO_ERR; |
| 406 | 456 | ||
| 407 | /* logic error: lba out-of-bounds. Ignore read request */ | 457 | /* logic error: lba out-of-bounds. Ignore read request */ |
| @@ -411,9 +461,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
| 411 | return NVM_IO_ERR; | 461 | return NVM_IO_ERR; |
| 412 | } | 462 | } |
| 413 | 463 | ||
| 414 | generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0); | 464 | generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio), |
| 465 | &pblk->disk->part0); | ||
| 415 | 466 | ||
| 416 | bitmap_zero(&read_bitmap, nr_secs); | 467 | bitmap_zero(read_bitmap, nr_secs); |
| 417 | 468 | ||
| 418 | rqd = pblk_alloc_rqd(pblk, PBLK_READ); | 469 | rqd = pblk_alloc_rqd(pblk, PBLK_READ); |
| 419 | 470 | ||
| @@ -436,7 +487,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
| 436 | rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, | 487 | rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, |
| 437 | &rqd->dma_meta_list); | 488 | &rqd->dma_meta_list); |
| 438 | if (!rqd->meta_list) { | 489 | if (!rqd->meta_list) { |
| 439 | pr_err("pblk: not able to allocate ppa list\n"); | 490 | pblk_err(pblk, "not able to allocate ppa list\n"); |
| 440 | goto fail_rqd_free; | 491 | goto fail_rqd_free; |
| 441 | } | 492 | } |
| 442 | 493 | ||
| @@ -444,32 +495,32 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
| 444 | rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; | 495 | rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; |
| 445 | rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; | 496 | rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; |
| 446 | 497 | ||
| 447 | pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap); | 498 | pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap); |
| 448 | } else { | 499 | } else { |
| 449 | pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap); | 500 | pblk_read_rq(pblk, rqd, bio, blba, read_bitmap); |
| 450 | } | 501 | } |
| 451 | 502 | ||
| 452 | if (bitmap_full(&read_bitmap, nr_secs)) { | 503 | if (bitmap_full(read_bitmap, nr_secs)) { |
| 453 | atomic_inc(&pblk->inflight_io); | 504 | atomic_inc(&pblk->inflight_io); |
| 454 | __pblk_end_io_read(pblk, rqd, false); | 505 | __pblk_end_io_read(pblk, rqd, false); |
| 455 | return NVM_IO_DONE; | 506 | return NVM_IO_DONE; |
| 456 | } | 507 | } |
| 457 | 508 | ||
| 458 | /* All sectors are to be read from the device */ | 509 | /* All sectors are to be read from the device */ |
| 459 | if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { | 510 | if (bitmap_empty(read_bitmap, rqd->nr_ppas)) { |
| 460 | struct bio *int_bio = NULL; | 511 | struct bio *int_bio = NULL; |
| 461 | 512 | ||
| 462 | /* Clone read bio to deal with read errors internally */ | 513 | /* Clone read bio to deal with read errors internally */ |
| 463 | int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); | 514 | int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); |
| 464 | if (!int_bio) { | 515 | if (!int_bio) { |
| 465 | pr_err("pblk: could not clone read bio\n"); | 516 | pblk_err(pblk, "could not clone read bio\n"); |
| 466 | goto fail_end_io; | 517 | goto fail_end_io; |
| 467 | } | 518 | } |
| 468 | 519 | ||
| 469 | rqd->bio = int_bio; | 520 | rqd->bio = int_bio; |
| 470 | 521 | ||
| 471 | if (pblk_submit_io(pblk, rqd)) { | 522 | if (pblk_submit_io(pblk, rqd)) { |
| 472 | pr_err("pblk: read IO submission failed\n"); | 523 | pblk_err(pblk, "read IO submission failed\n"); |
| 473 | ret = NVM_IO_ERR; | 524 | ret = NVM_IO_ERR; |
| 474 | goto fail_end_io; | 525 | goto fail_end_io; |
| 475 | } | 526 | } |
| @@ -480,8 +531,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) | |||
| 480 | /* The read bio request could be partially filled by the write buffer, | 531 | /* The read bio request could be partially filled by the write buffer, |
| 481 | * but there are some holes that need to be read from the drive. | 532 | * but there are some holes that need to be read from the drive. |
| 482 | */ | 533 | */ |
| 483 | return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap); | 534 | ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap, |
| 535 | nr_secs); | ||
| 536 | if (ret) | ||
| 537 | goto fail_meta_free; | ||
| 538 | |||
| 539 | return NVM_IO_OK; | ||
| 484 | 540 | ||
| 541 | fail_meta_free: | ||
| 542 | nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); | ||
| 485 | fail_rqd_free: | 543 | fail_rqd_free: |
| 486 | pblk_free_rqd(pblk, rqd, PBLK_READ); | 544 | pblk_free_rqd(pblk, rqd, PBLK_READ); |
| 487 | return ret; | 545 | return ret; |
| @@ -514,7 +572,7 @@ static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 514 | rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; | 572 | rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; |
| 515 | } | 573 | } |
| 516 | 574 | ||
| 517 | #ifdef CONFIG_NVM_DEBUG | 575 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 518 | atomic_long_add(valid_secs, &pblk->inflight_reads); | 576 | atomic_long_add(valid_secs, &pblk->inflight_reads); |
| 519 | #endif | 577 | #endif |
| 520 | 578 | ||
| @@ -548,7 +606,7 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 548 | rqd->ppa_addr = ppa_l2p; | 606 | rqd->ppa_addr = ppa_l2p; |
| 549 | valid_secs = 1; | 607 | valid_secs = 1; |
| 550 | 608 | ||
| 551 | #ifdef CONFIG_NVM_DEBUG | 609 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 552 | atomic_long_inc(&pblk->inflight_reads); | 610 | atomic_long_inc(&pblk->inflight_reads); |
| 553 | #endif | 611 | #endif |
| 554 | 612 | ||
| @@ -595,7 +653,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) | |||
| 595 | bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, | 653 | bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, |
| 596 | PBLK_VMALLOC_META, GFP_KERNEL); | 654 | PBLK_VMALLOC_META, GFP_KERNEL); |
| 597 | if (IS_ERR(bio)) { | 655 | if (IS_ERR(bio)) { |
| 598 | pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); | 656 | pblk_err(pblk, "could not allocate GC bio (%lu)\n", |
| 657 | PTR_ERR(bio)); | ||
| 599 | goto err_free_dma; | 658 | goto err_free_dma; |
| 600 | } | 659 | } |
| 601 | 660 | ||
| @@ -609,7 +668,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) | |||
| 609 | 668 | ||
| 610 | if (pblk_submit_io_sync(pblk, &rqd)) { | 669 | if (pblk_submit_io_sync(pblk, &rqd)) { |
| 611 | ret = -EIO; | 670 | ret = -EIO; |
| 612 | pr_err("pblk: GC read request failed\n"); | 671 | pblk_err(pblk, "GC read request failed\n"); |
| 613 | goto err_free_bio; | 672 | goto err_free_bio; |
| 614 | } | 673 | } |
| 615 | 674 | ||
| @@ -619,12 +678,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) | |||
| 619 | 678 | ||
| 620 | if (rqd.error) { | 679 | if (rqd.error) { |
| 621 | atomic_long_inc(&pblk->read_failed_gc); | 680 | atomic_long_inc(&pblk->read_failed_gc); |
| 622 | #ifdef CONFIG_NVM_DEBUG | 681 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 623 | pblk_print_failed_rqd(pblk, &rqd, rqd.error); | 682 | pblk_print_failed_rqd(pblk, &rqd, rqd.error); |
| 624 | #endif | 683 | #endif |
| 625 | } | 684 | } |
| 626 | 685 | ||
| 627 | #ifdef CONFIG_NVM_DEBUG | 686 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 628 | atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); | 687 | atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); |
| 629 | atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); | 688 | atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); |
| 630 | atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); | 689 | atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); |
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 3a5069183859..e232e47e1353 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c | |||
| @@ -77,7 +77,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) | |||
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | if (nr_valid_lbas != nr_lbas) | 79 | if (nr_valid_lbas != nr_lbas) |
| 80 | pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n", | 80 | pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n", |
| 81 | line->id, nr_valid_lbas, nr_lbas); | 81 | line->id, nr_valid_lbas, nr_lbas); |
| 82 | 82 | ||
| 83 | line->left_msecs = 0; | 83 | line->left_msecs = 0; |
| @@ -184,7 +184,7 @@ next_read_rq: | |||
| 184 | /* If read fails, more padding is needed */ | 184 | /* If read fails, more padding is needed */ |
| 185 | ret = pblk_submit_io_sync(pblk, rqd); | 185 | ret = pblk_submit_io_sync(pblk, rqd); |
| 186 | if (ret) { | 186 | if (ret) { |
| 187 | pr_err("pblk: I/O submission failed: %d\n", ret); | 187 | pblk_err(pblk, "I/O submission failed: %d\n", ret); |
| 188 | return ret; | 188 | return ret; |
| 189 | } | 189 | } |
| 190 | 190 | ||
| @@ -194,7 +194,7 @@ next_read_rq: | |||
| 194 | * we cannot recover from here. Need FTL log. | 194 | * we cannot recover from here. Need FTL log. |
| 195 | */ | 195 | */ |
| 196 | if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { | 196 | if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { |
| 197 | pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); | 197 | pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error); |
| 198 | return -EINTR; | 198 | return -EINTR; |
| 199 | } | 199 | } |
| 200 | 200 | ||
| @@ -273,7 +273,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, | |||
| 273 | next_pad_rq: | 273 | next_pad_rq: |
| 274 | rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); | 274 | rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); |
| 275 | if (rq_ppas < pblk->min_write_pgs) { | 275 | if (rq_ppas < pblk->min_write_pgs) { |
| 276 | pr_err("pblk: corrupted pad line %d\n", line->id); | 276 | pblk_err(pblk, "corrupted pad line %d\n", line->id); |
| 277 | goto fail_free_pad; | 277 | goto fail_free_pad; |
| 278 | } | 278 | } |
| 279 | 279 | ||
| @@ -342,7 +342,7 @@ next_pad_rq: | |||
| 342 | 342 | ||
| 343 | ret = pblk_submit_io(pblk, rqd); | 343 | ret = pblk_submit_io(pblk, rqd); |
| 344 | if (ret) { | 344 | if (ret) { |
| 345 | pr_err("pblk: I/O submission failed: %d\n", ret); | 345 | pblk_err(pblk, "I/O submission failed: %d\n", ret); |
| 346 | pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); | 346 | pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); |
| 347 | goto fail_free_bio; | 347 | goto fail_free_bio; |
| 348 | } | 348 | } |
| @@ -356,12 +356,12 @@ next_pad_rq: | |||
| 356 | 356 | ||
| 357 | if (!wait_for_completion_io_timeout(&pad_rq->wait, | 357 | if (!wait_for_completion_io_timeout(&pad_rq->wait, |
| 358 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { | 358 | msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { |
| 359 | pr_err("pblk: pad write timed out\n"); | 359 | pblk_err(pblk, "pad write timed out\n"); |
| 360 | ret = -ETIME; | 360 | ret = -ETIME; |
| 361 | } | 361 | } |
| 362 | 362 | ||
| 363 | if (!pblk_line_is_full(line)) | 363 | if (!pblk_line_is_full(line)) |
| 364 | pr_err("pblk: corrupted padded line: %d\n", line->id); | 364 | pblk_err(pblk, "corrupted padded line: %d\n", line->id); |
| 365 | 365 | ||
| 366 | vfree(data); | 366 | vfree(data); |
| 367 | free_rq: | 367 | free_rq: |
| @@ -461,7 +461,7 @@ next_rq: | |||
| 461 | 461 | ||
| 462 | ret = pblk_submit_io_sync(pblk, rqd); | 462 | ret = pblk_submit_io_sync(pblk, rqd); |
| 463 | if (ret) { | 463 | if (ret) { |
| 464 | pr_err("pblk: I/O submission failed: %d\n", ret); | 464 | pblk_err(pblk, "I/O submission failed: %d\n", ret); |
| 465 | return ret; | 465 | return ret; |
| 466 | } | 466 | } |
| 467 | 467 | ||
| @@ -501,11 +501,11 @@ next_rq: | |||
| 501 | 501 | ||
| 502 | ret = pblk_recov_pad_oob(pblk, line, pad_secs); | 502 | ret = pblk_recov_pad_oob(pblk, line, pad_secs); |
| 503 | if (ret) | 503 | if (ret) |
| 504 | pr_err("pblk: OOB padding failed (err:%d)\n", ret); | 504 | pblk_err(pblk, "OOB padding failed (err:%d)\n", ret); |
| 505 | 505 | ||
| 506 | ret = pblk_recov_read_oob(pblk, line, p, r_ptr); | 506 | ret = pblk_recov_read_oob(pblk, line, p, r_ptr); |
| 507 | if (ret) | 507 | if (ret) |
| 508 | pr_err("pblk: OOB read failed (err:%d)\n", ret); | 508 | pblk_err(pblk, "OOB read failed (err:%d)\n", ret); |
| 509 | 509 | ||
| 510 | left_ppas = 0; | 510 | left_ppas = 0; |
| 511 | } | 511 | } |
| @@ -592,7 +592,7 @@ next_rq: | |||
| 592 | 592 | ||
| 593 | ret = pblk_submit_io_sync(pblk, rqd); | 593 | ret = pblk_submit_io_sync(pblk, rqd); |
| 594 | if (ret) { | 594 | if (ret) { |
| 595 | pr_err("pblk: I/O submission failed: %d\n", ret); | 595 | pblk_err(pblk, "I/O submission failed: %d\n", ret); |
| 596 | bio_put(bio); | 596 | bio_put(bio); |
| 597 | return ret; | 597 | return ret; |
| 598 | } | 598 | } |
| @@ -671,14 +671,14 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) | |||
| 671 | 671 | ||
| 672 | ret = pblk_recov_scan_oob(pblk, line, p, &done); | 672 | ret = pblk_recov_scan_oob(pblk, line, p, &done); |
| 673 | if (ret) { | 673 | if (ret) { |
| 674 | pr_err("pblk: could not recover L2P from OOB\n"); | 674 | pblk_err(pblk, "could not recover L2P from OOB\n"); |
| 675 | goto out; | 675 | goto out; |
| 676 | } | 676 | } |
| 677 | 677 | ||
| 678 | if (!done) { | 678 | if (!done) { |
| 679 | ret = pblk_recov_scan_all_oob(pblk, line, p); | 679 | ret = pblk_recov_scan_all_oob(pblk, line, p); |
| 680 | if (ret) { | 680 | if (ret) { |
| 681 | pr_err("pblk: could not recover L2P from OOB\n"); | 681 | pblk_err(pblk, "could not recover L2P from OOB\n"); |
| 682 | goto out; | 682 | goto out; |
| 683 | } | 683 | } |
| 684 | } | 684 | } |
| @@ -737,14 +737,15 @@ static int pblk_recov_check_line_version(struct pblk *pblk, | |||
| 737 | struct line_header *header = &emeta->header; | 737 | struct line_header *header = &emeta->header; |
| 738 | 738 | ||
| 739 | if (header->version_major != EMETA_VERSION_MAJOR) { | 739 | if (header->version_major != EMETA_VERSION_MAJOR) { |
| 740 | pr_err("pblk: line major version mismatch: %d, expected: %d\n", | 740 | pblk_err(pblk, "line major version mismatch: %d, expected: %d\n", |
| 741 | header->version_major, EMETA_VERSION_MAJOR); | 741 | header->version_major, EMETA_VERSION_MAJOR); |
| 742 | return 1; | 742 | return 1; |
| 743 | } | 743 | } |
| 744 | 744 | ||
| 745 | #ifdef NVM_DEBUG | 745 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 746 | if (header->version_minor > EMETA_VERSION_MINOR) | 746 | if (header->version_minor > EMETA_VERSION_MINOR) |
| 747 | pr_info("pblk: newer line minor version found: %d\n", line_v); | 747 | pblk_info(pblk, "newer line minor version found: %d\n", |
| 748 | header->version_minor); | ||
| 748 | #endif | 749 | #endif |
| 749 | 750 | ||
| 750 | return 0; | 751 | return 0; |
| @@ -851,7 +852,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | |||
| 851 | continue; | 852 | continue; |
| 852 | 853 | ||
| 853 | if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { | 854 | if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { |
| 854 | pr_err("pblk: found incompatible line version %u\n", | 855 | pblk_err(pblk, "found incompatible line version %u\n", |
| 855 | smeta_buf->header.version_major); | 856 | smeta_buf->header.version_major); |
| 856 | return ERR_PTR(-EINVAL); | 857 | return ERR_PTR(-EINVAL); |
| 857 | } | 858 | } |
| @@ -863,7 +864,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | |||
| 863 | } | 864 | } |
| 864 | 865 | ||
| 865 | if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { | 866 | if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { |
| 866 | pr_debug("pblk: ignore line %u due to uuid mismatch\n", | 867 | pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", |
| 867 | i); | 868 | i); |
| 868 | continue; | 869 | continue; |
| 869 | } | 870 | } |
| @@ -887,7 +888,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) | |||
| 887 | 888 | ||
| 888 | pblk_recov_line_add_ordered(&recov_list, line); | 889 | pblk_recov_line_add_ordered(&recov_list, line); |
| 889 | found_lines++; | 890 | found_lines++; |
| 890 | pr_debug("pblk: recovering data line %d, seq:%llu\n", | 891 | pblk_debug(pblk, "recovering data line %d, seq:%llu\n", |
| 891 | line->id, smeta_buf->seq_nr); | 892 | line->id, smeta_buf->seq_nr); |
| 892 | } | 893 | } |
| 893 | 894 | ||
| @@ -947,7 +948,7 @@ next: | |||
| 947 | line->emeta = NULL; | 948 | line->emeta = NULL; |
| 948 | } else { | 949 | } else { |
| 949 | if (open_lines > 1) | 950 | if (open_lines > 1) |
| 950 | pr_err("pblk: failed to recover L2P\n"); | 951 | pblk_err(pblk, "failed to recover L2P\n"); |
| 951 | 952 | ||
| 952 | open_lines++; | 953 | open_lines++; |
| 953 | line->meta_line = meta_line; | 954 | line->meta_line = meta_line; |
| @@ -976,7 +977,7 @@ next: | |||
| 976 | 977 | ||
| 977 | out: | 978 | out: |
| 978 | if (found_lines != recovered_lines) | 979 | if (found_lines != recovered_lines) |
| 979 | pr_err("pblk: failed to recover all found lines %d/%d\n", | 980 | pblk_err(pblk, "failed to recover all found lines %d/%d\n", |
| 980 | found_lines, recovered_lines); | 981 | found_lines, recovered_lines); |
| 981 | 982 | ||
| 982 | return data_line; | 983 | return data_line; |
| @@ -999,7 +1000,7 @@ int pblk_recov_pad(struct pblk *pblk) | |||
| 999 | 1000 | ||
| 1000 | ret = pblk_recov_pad_oob(pblk, line, left_msecs); | 1001 | ret = pblk_recov_pad_oob(pblk, line, left_msecs); |
| 1001 | if (ret) { | 1002 | if (ret) { |
| 1002 | pr_err("pblk: Tear down padding failed (%d)\n", ret); | 1003 | pblk_err(pblk, "tear down padding failed (%d)\n", ret); |
| 1003 | return ret; | 1004 | return ret; |
| 1004 | } | 1005 | } |
| 1005 | 1006 | ||
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index 88a0a7c407aa..9fc3dfa168b4 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c | |||
| @@ -268,7 +268,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) | |||
| 268 | spin_unlock(&l_mg->free_lock); | 268 | spin_unlock(&l_mg->free_lock); |
| 269 | 269 | ||
| 270 | if (nr_free_lines != free_line_cnt) | 270 | if (nr_free_lines != free_line_cnt) |
| 271 | pr_err("pblk: corrupted free line list:%d/%d\n", | 271 | pblk_err(pblk, "corrupted free line list:%d/%d\n", |
| 272 | nr_free_lines, free_line_cnt); | 272 | nr_free_lines, free_line_cnt); |
| 273 | 273 | ||
| 274 | sz = snprintf(page, PAGE_SIZE - sz, | 274 | sz = snprintf(page, PAGE_SIZE - sz, |
| @@ -421,7 +421,7 @@ static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page) | |||
| 421 | return sz; | 421 | return sz; |
| 422 | } | 422 | } |
| 423 | 423 | ||
| 424 | #ifdef CONFIG_NVM_DEBUG | 424 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 425 | static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) | 425 | static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) |
| 426 | { | 426 | { |
| 427 | return snprintf(page, PAGE_SIZE, | 427 | return snprintf(page, PAGE_SIZE, |
| @@ -598,7 +598,7 @@ static struct attribute sys_padding_dist = { | |||
| 598 | .mode = 0644, | 598 | .mode = 0644, |
| 599 | }; | 599 | }; |
| 600 | 600 | ||
| 601 | #ifdef CONFIG_NVM_DEBUG | 601 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 602 | static struct attribute sys_stats_debug_attr = { | 602 | static struct attribute sys_stats_debug_attr = { |
| 603 | .name = "stats", | 603 | .name = "stats", |
| 604 | .mode = 0444, | 604 | .mode = 0444, |
| @@ -619,7 +619,7 @@ static struct attribute *pblk_attrs[] = { | |||
| 619 | &sys_write_amp_mileage, | 619 | &sys_write_amp_mileage, |
| 620 | &sys_write_amp_trip, | 620 | &sys_write_amp_trip, |
| 621 | &sys_padding_dist, | 621 | &sys_padding_dist, |
| 622 | #ifdef CONFIG_NVM_DEBUG | 622 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 623 | &sys_stats_debug_attr, | 623 | &sys_stats_debug_attr, |
| 624 | #endif | 624 | #endif |
| 625 | NULL, | 625 | NULL, |
| @@ -654,7 +654,7 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, | |||
| 654 | return pblk_sysfs_get_write_amp_trip(pblk, buf); | 654 | return pblk_sysfs_get_write_amp_trip(pblk, buf); |
| 655 | else if (strcmp(attr->name, "padding_dist") == 0) | 655 | else if (strcmp(attr->name, "padding_dist") == 0) |
| 656 | return pblk_sysfs_get_padding_dist(pblk, buf); | 656 | return pblk_sysfs_get_padding_dist(pblk, buf); |
| 657 | #ifdef CONFIG_NVM_DEBUG | 657 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 658 | else if (strcmp(attr->name, "stats") == 0) | 658 | else if (strcmp(attr->name, "stats") == 0) |
| 659 | return pblk_sysfs_stats_debug(pblk, buf); | 659 | return pblk_sysfs_stats_debug(pblk, buf); |
| 660 | #endif | 660 | #endif |
| @@ -697,8 +697,7 @@ int pblk_sysfs_init(struct gendisk *tdisk) | |||
| 697 | kobject_get(&parent_dev->kobj), | 697 | kobject_get(&parent_dev->kobj), |
| 698 | "%s", "pblk"); | 698 | "%s", "pblk"); |
| 699 | if (ret) { | 699 | if (ret) { |
| 700 | pr_err("pblk: could not register %s/pblk\n", | 700 | pblk_err(pblk, "could not register\n"); |
| 701 | tdisk->disk_name); | ||
| 702 | return ret; | 701 | return ret; |
| 703 | } | 702 | } |
| 704 | 703 | ||
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index f353e52941f5..ee774a86cf1e 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c | |||
| @@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 38 | /* Release flags on context. Protect from writes */ | 38 | /* Release flags on context. Protect from writes */ |
| 39 | smp_store_release(&w_ctx->flags, flags); | 39 | smp_store_release(&w_ctx->flags, flags); |
| 40 | 40 | ||
| 41 | #ifdef CONFIG_NVM_DEBUG | 41 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 42 | atomic_dec(&rwb->inflight_flush_point); | 42 | atomic_dec(&rwb->inflight_flush_point); |
| 43 | #endif | 43 | #endif |
| 44 | } | 44 | } |
| @@ -51,7 +51,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 51 | pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, | 51 | pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, |
| 52 | c_ctx->nr_padded); | 52 | c_ctx->nr_padded); |
| 53 | 53 | ||
| 54 | #ifdef CONFIG_NVM_DEBUG | 54 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 55 | atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); | 55 | atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); |
| 56 | #endif | 56 | #endif |
| 57 | 57 | ||
| @@ -78,7 +78,7 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 78 | unsigned long flags; | 78 | unsigned long flags; |
| 79 | unsigned long pos; | 79 | unsigned long pos; |
| 80 | 80 | ||
| 81 | #ifdef CONFIG_NVM_DEBUG | 81 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 82 | atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); | 82 | atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); |
| 83 | #endif | 83 | #endif |
| 84 | 84 | ||
| @@ -196,7 +196,7 @@ static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) | |||
| 196 | list_add_tail(&r_ctx->list, &pblk->resubmit_list); | 196 | list_add_tail(&r_ctx->list, &pblk->resubmit_list); |
| 197 | spin_unlock(&pblk->resubmit_lock); | 197 | spin_unlock(&pblk->resubmit_lock); |
| 198 | 198 | ||
| 199 | #ifdef CONFIG_NVM_DEBUG | 199 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 200 | atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); | 200 | atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); |
| 201 | #endif | 201 | #endif |
| 202 | } | 202 | } |
| @@ -238,7 +238,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 238 | 238 | ||
| 239 | recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); | 239 | recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); |
| 240 | if (!recovery) { | 240 | if (!recovery) { |
| 241 | pr_err("pblk: could not allocate recovery work\n"); | 241 | pblk_err(pblk, "could not allocate recovery work\n"); |
| 242 | return; | 242 | return; |
| 243 | } | 243 | } |
| 244 | 244 | ||
| @@ -258,7 +258,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd) | |||
| 258 | pblk_end_w_fail(pblk, rqd); | 258 | pblk_end_w_fail(pblk, rqd); |
| 259 | return; | 259 | return; |
| 260 | } | 260 | } |
| 261 | #ifdef CONFIG_NVM_DEBUG | 261 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 262 | else | 262 | else |
| 263 | WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); | 263 | WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); |
| 264 | #endif | 264 | #endif |
| @@ -279,7 +279,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) | |||
| 279 | 279 | ||
| 280 | if (rqd->error) { | 280 | if (rqd->error) { |
| 281 | pblk_log_write_err(pblk, rqd); | 281 | pblk_log_write_err(pblk, rqd); |
| 282 | pr_err("pblk: metadata I/O failed. Line %d\n", line->id); | 282 | pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id); |
| 283 | line->w_err_gc->has_write_err = 1; | 283 | line->w_err_gc->has_write_err = 1; |
| 284 | } | 284 | } |
| 285 | 285 | ||
| @@ -356,11 +356,11 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, | |||
| 356 | 356 | ||
| 357 | secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); | 357 | secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); |
| 358 | 358 | ||
| 359 | #ifdef CONFIG_NVM_DEBUG | 359 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 360 | if ((!secs_to_sync && secs_to_flush) | 360 | if ((!secs_to_sync && secs_to_flush) |
| 361 | || (secs_to_sync < 0) | 361 | || (secs_to_sync < 0) |
| 362 | || (secs_to_sync > secs_avail && !secs_to_flush)) { | 362 | || (secs_to_sync > secs_avail && !secs_to_flush)) { |
| 363 | pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n", | 363 | pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n", |
| 364 | secs_avail, secs_to_sync, secs_to_flush); | 364 | secs_avail, secs_to_sync, secs_to_flush); |
| 365 | } | 365 | } |
| 366 | #endif | 366 | #endif |
| @@ -397,7 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) | |||
| 397 | bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, | 397 | bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, |
| 398 | l_mg->emeta_alloc_type, GFP_KERNEL); | 398 | l_mg->emeta_alloc_type, GFP_KERNEL); |
| 399 | if (IS_ERR(bio)) { | 399 | if (IS_ERR(bio)) { |
| 400 | pr_err("pblk: failed to map emeta io"); | 400 | pblk_err(pblk, "failed to map emeta io"); |
| 401 | ret = PTR_ERR(bio); | 401 | ret = PTR_ERR(bio); |
| 402 | goto fail_free_rqd; | 402 | goto fail_free_rqd; |
| 403 | } | 403 | } |
| @@ -428,7 +428,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) | |||
| 428 | 428 | ||
| 429 | ret = pblk_submit_io(pblk, rqd); | 429 | ret = pblk_submit_io(pblk, rqd); |
| 430 | if (ret) { | 430 | if (ret) { |
| 431 | pr_err("pblk: emeta I/O submission failed: %d\n", ret); | 431 | pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); |
| 432 | goto fail_rollback; | 432 | goto fail_rollback; |
| 433 | } | 433 | } |
| 434 | 434 | ||
| @@ -518,7 +518,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 518 | /* Assign lbas to ppas and populate request structure */ | 518 | /* Assign lbas to ppas and populate request structure */ |
| 519 | err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); | 519 | err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); |
| 520 | if (err) { | 520 | if (err) { |
| 521 | pr_err("pblk: could not setup write request: %d\n", err); | 521 | pblk_err(pblk, "could not setup write request: %d\n", err); |
| 522 | return NVM_IO_ERR; | 522 | return NVM_IO_ERR; |
| 523 | } | 523 | } |
| 524 | 524 | ||
| @@ -527,7 +527,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 527 | /* Submit data write for current data line */ | 527 | /* Submit data write for current data line */ |
| 528 | err = pblk_submit_io(pblk, rqd); | 528 | err = pblk_submit_io(pblk, rqd); |
| 529 | if (err) { | 529 | if (err) { |
| 530 | pr_err("pblk: data I/O submission failed: %d\n", err); | 530 | pblk_err(pblk, "data I/O submission failed: %d\n", err); |
| 531 | return NVM_IO_ERR; | 531 | return NVM_IO_ERR; |
| 532 | } | 532 | } |
| 533 | 533 | ||
| @@ -549,7 +549,8 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 549 | /* Submit metadata write for previous data line */ | 549 | /* Submit metadata write for previous data line */ |
| 550 | err = pblk_submit_meta_io(pblk, meta_line); | 550 | err = pblk_submit_meta_io(pblk, meta_line); |
| 551 | if (err) { | 551 | if (err) { |
| 552 | pr_err("pblk: metadata I/O submission failed: %d", err); | 552 | pblk_err(pblk, "metadata I/O submission failed: %d", |
| 553 | err); | ||
| 553 | return NVM_IO_ERR; | 554 | return NVM_IO_ERR; |
| 554 | } | 555 | } |
| 555 | } | 556 | } |
| @@ -614,7 +615,7 @@ static int pblk_submit_write(struct pblk *pblk) | |||
| 614 | secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, | 615 | secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, |
| 615 | secs_to_flush); | 616 | secs_to_flush); |
| 616 | if (secs_to_sync > pblk->max_write_pgs) { | 617 | if (secs_to_sync > pblk->max_write_pgs) { |
| 617 | pr_err("pblk: bad buffer sync calculation\n"); | 618 | pblk_err(pblk, "bad buffer sync calculation\n"); |
| 618 | return 1; | 619 | return 1; |
| 619 | } | 620 | } |
| 620 | 621 | ||
| @@ -633,14 +634,14 @@ static int pblk_submit_write(struct pblk *pblk) | |||
| 633 | 634 | ||
| 634 | if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, | 635 | if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, |
| 635 | secs_avail)) { | 636 | secs_avail)) { |
| 636 | pr_err("pblk: corrupted write bio\n"); | 637 | pblk_err(pblk, "corrupted write bio\n"); |
| 637 | goto fail_put_bio; | 638 | goto fail_put_bio; |
| 638 | } | 639 | } |
| 639 | 640 | ||
| 640 | if (pblk_submit_io_set(pblk, rqd)) | 641 | if (pblk_submit_io_set(pblk, rqd)) |
| 641 | goto fail_free_bio; | 642 | goto fail_free_bio; |
| 642 | 643 | ||
| 643 | #ifdef CONFIG_NVM_DEBUG | 644 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 644 | atomic_long_add(secs_to_sync, &pblk->sub_writes); | 645 | atomic_long_add(secs_to_sync, &pblk->sub_writes); |
| 645 | #endif | 646 | #endif |
| 646 | 647 | ||
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 34cc1d64a9d4..4760af7b6499 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h | |||
| @@ -119,6 +119,16 @@ struct pblk_g_ctx { | |||
| 119 | u64 lba; | 119 | u64 lba; |
| 120 | }; | 120 | }; |
| 121 | 121 | ||
| 122 | /* partial read context */ | ||
| 123 | struct pblk_pr_ctx { | ||
| 124 | struct bio *orig_bio; | ||
| 125 | DECLARE_BITMAP(bitmap, NVM_MAX_VLBA); | ||
| 126 | unsigned int orig_nr_secs; | ||
| 127 | unsigned int bio_init_idx; | ||
| 128 | void *ppa_ptr; | ||
| 129 | dma_addr_t dma_ppa_list; | ||
| 130 | }; | ||
| 131 | |||
| 122 | /* Pad context */ | 132 | /* Pad context */ |
| 123 | struct pblk_pad_rq { | 133 | struct pblk_pad_rq { |
| 124 | struct pblk *pblk; | 134 | struct pblk *pblk; |
| @@ -193,7 +203,7 @@ struct pblk_rb { | |||
| 193 | spinlock_t w_lock; /* Write lock */ | 203 | spinlock_t w_lock; /* Write lock */ |
| 194 | spinlock_t s_lock; /* Sync lock */ | 204 | spinlock_t s_lock; /* Sync lock */ |
| 195 | 205 | ||
| 196 | #ifdef CONFIG_NVM_DEBUG | 206 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 197 | atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ | 207 | atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ |
| 198 | #endif | 208 | #endif |
| 199 | }; | 209 | }; |
| @@ -608,9 +618,6 @@ struct pblk { | |||
| 608 | 618 | ||
| 609 | int min_write_pgs; /* Minimum amount of pages required by controller */ | 619 | int min_write_pgs; /* Minimum amount of pages required by controller */ |
| 610 | int max_write_pgs; /* Maximum amount of pages supported by controller */ | 620 | int max_write_pgs; /* Maximum amount of pages supported by controller */ |
| 611 | int pgs_in_buffer; /* Number of pages that need to be held in buffer to | ||
| 612 | * guarantee successful reads. | ||
| 613 | */ | ||
| 614 | 621 | ||
| 615 | sector_t capacity; /* Device capacity when bad blocks are subtracted */ | 622 | sector_t capacity; /* Device capacity when bad blocks are subtracted */ |
| 616 | 623 | ||
| @@ -639,7 +646,7 @@ struct pblk { | |||
| 639 | u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ | 646 | u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ |
| 640 | atomic64_t nr_flush; /* Number of flush/fua I/O */ | 647 | atomic64_t nr_flush; /* Number of flush/fua I/O */ |
| 641 | 648 | ||
| 642 | #ifdef CONFIG_NVM_DEBUG | 649 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 643 | /* Non-persistent debug counters, 4kb sector I/Os */ | 650 | /* Non-persistent debug counters, 4kb sector I/Os */ |
| 644 | atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ | 651 | atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ |
| 645 | atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ | 652 | atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ |
| @@ -706,6 +713,15 @@ struct pblk_line_ws { | |||
| 706 | #define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) | 713 | #define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) |
| 707 | #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) | 714 | #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) |
| 708 | 715 | ||
| 716 | #define pblk_err(pblk, fmt, ...) \ | ||
| 717 | pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) | ||
| 718 | #define pblk_info(pblk, fmt, ...) \ | ||
| 719 | pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) | ||
| 720 | #define pblk_warn(pblk, fmt, ...) \ | ||
| 721 | pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) | ||
| 722 | #define pblk_debug(pblk, fmt, ...) \ | ||
| 723 | pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) | ||
| 724 | |||
| 709 | /* | 725 | /* |
| 710 | * pblk ring buffer operations | 726 | * pblk ring buffer operations |
| 711 | */ | 727 | */ |
| @@ -1282,20 +1298,22 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) | |||
| 1282 | return !(nr_secs % pblk->min_write_pgs); | 1298 | return !(nr_secs % pblk->min_write_pgs); |
| 1283 | } | 1299 | } |
| 1284 | 1300 | ||
| 1285 | #ifdef CONFIG_NVM_DEBUG | 1301 | #ifdef CONFIG_NVM_PBLK_DEBUG |
| 1286 | static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p, | 1302 | static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p, |
| 1287 | char *msg, int error) | 1303 | char *msg, int error) |
| 1288 | { | 1304 | { |
| 1305 | struct nvm_geo *geo = &pblk->dev->geo; | ||
| 1306 | |||
| 1289 | if (p->c.is_cached) { | 1307 | if (p->c.is_cached) { |
| 1290 | pr_err("ppa: (%s: %x) cache line: %llu\n", | 1308 | pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n", |
| 1291 | msg, error, (u64)p->c.line); | 1309 | msg, error, (u64)p->c.line); |
| 1292 | } else if (geo->version == NVM_OCSSD_SPEC_12) { | 1310 | } else if (geo->version == NVM_OCSSD_SPEC_12) { |
| 1293 | pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", | 1311 | pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", |
| 1294 | msg, error, | 1312 | msg, error, |
| 1295 | p->g.ch, p->g.lun, p->g.blk, | 1313 | p->g.ch, p->g.lun, p->g.blk, |
| 1296 | p->g.pg, p->g.pl, p->g.sec); | 1314 | p->g.pg, p->g.pl, p->g.sec); |
| 1297 | } else { | 1315 | } else { |
| 1298 | pr_err("ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", | 1316 | pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", |
| 1299 | msg, error, | 1317 | msg, error, |
| 1300 | p->m.grp, p->m.pu, p->m.chk, p->m.sec); | 1318 | p->m.grp, p->m.pu, p->m.chk, p->m.sec); |
| 1301 | } | 1319 | } |
| @@ -1307,16 +1325,16 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, | |||
| 1307 | int bit = -1; | 1325 | int bit = -1; |
| 1308 | 1326 | ||
| 1309 | if (rqd->nr_ppas == 1) { | 1327 | if (rqd->nr_ppas == 1) { |
| 1310 | print_ppa(&pblk->dev->geo, &rqd->ppa_addr, "rqd", error); | 1328 | print_ppa(pblk, &rqd->ppa_addr, "rqd", error); |
| 1311 | return; | 1329 | return; |
| 1312 | } | 1330 | } |
| 1313 | 1331 | ||
| 1314 | while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, | 1332 | while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, |
| 1315 | bit + 1)) < rqd->nr_ppas) { | 1333 | bit + 1)) < rqd->nr_ppas) { |
| 1316 | print_ppa(&pblk->dev->geo, &rqd->ppa_list[bit], "rqd", error); | 1334 | print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error); |
| 1317 | } | 1335 | } |
| 1318 | 1336 | ||
| 1319 | pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); | 1337 | pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status); |
| 1320 | } | 1338 | } |
| 1321 | 1339 | ||
| 1322 | static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, | 1340 | static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, |
| @@ -1347,7 +1365,7 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, | |||
| 1347 | continue; | 1365 | continue; |
| 1348 | } | 1366 | } |
| 1349 | 1367 | ||
| 1350 | print_ppa(geo, ppa, "boundary", i); | 1368 | print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i); |
| 1351 | 1369 | ||
| 1352 | return 1; | 1370 | return 1; |
| 1353 | } | 1371 | } |
| @@ -1377,7 +1395,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) | |||
| 1377 | 1395 | ||
| 1378 | spin_lock(&line->lock); | 1396 | spin_lock(&line->lock); |
| 1379 | if (line->state != PBLK_LINESTATE_OPEN) { | 1397 | if (line->state != PBLK_LINESTATE_OPEN) { |
| 1380 | pr_err("pblk: bad ppa: line:%d,state:%d\n", | 1398 | pblk_err(pblk, "bad ppa: line:%d,state:%d\n", |
| 1381 | line->id, line->state); | 1399 | line->id, line->state); |
| 1382 | WARN_ON(1); | 1400 | WARN_ON(1); |
| 1383 | spin_unlock(&line->lock); | 1401 | spin_unlock(&line->lock); |
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d6bf294f3907..05f82ff6f016 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h | |||
| @@ -328,13 +328,6 @@ struct cached_dev { | |||
| 328 | */ | 328 | */ |
| 329 | atomic_t has_dirty; | 329 | atomic_t has_dirty; |
| 330 | 330 | ||
| 331 | /* | ||
| 332 | * Set to zero by things that touch the backing volume-- except | ||
| 333 | * writeback. Incremented by writeback. Used to determine when to | ||
| 334 | * accelerate idle writeback. | ||
| 335 | */ | ||
| 336 | atomic_t backing_idle; | ||
| 337 | |||
| 338 | struct bch_ratelimit writeback_rate; | 331 | struct bch_ratelimit writeback_rate; |
| 339 | struct delayed_work writeback_rate_update; | 332 | struct delayed_work writeback_rate_update; |
| 340 | 333 | ||
| @@ -423,9 +416,9 @@ struct cache { | |||
| 423 | /* | 416 | /* |
| 424 | * When allocating new buckets, prio_write() gets first dibs - since we | 417 | * When allocating new buckets, prio_write() gets first dibs - since we |
| 425 | * may not be able to allocate at all without writing priorities and gens. | 418 | * may not be able to allocate at all without writing priorities and gens. |
| 426 | * prio_buckets[] contains the last buckets we wrote priorities to (so | 419 | * prio_last_buckets[] contains the last buckets we wrote priorities to |
| 427 | * gc can mark them as metadata), prio_next[] contains the buckets | 420 | * (so gc can mark them as metadata), prio_buckets[] contains the |
| 428 | * allocated for the next prio write. | 421 | * buckets allocated for the next prio write. |
| 429 | */ | 422 | */ |
| 430 | uint64_t *prio_buckets; | 423 | uint64_t *prio_buckets; |
| 431 | uint64_t *prio_last_buckets; | 424 | uint64_t *prio_last_buckets; |
| @@ -474,6 +467,7 @@ struct cache { | |||
| 474 | 467 | ||
| 475 | struct gc_stat { | 468 | struct gc_stat { |
| 476 | size_t nodes; | 469 | size_t nodes; |
| 470 | size_t nodes_pre; | ||
| 477 | size_t key_bytes; | 471 | size_t key_bytes; |
| 478 | 472 | ||
| 479 | size_t nkeys; | 473 | size_t nkeys; |
| @@ -514,6 +508,8 @@ struct cache_set { | |||
| 514 | struct cache_accounting accounting; | 508 | struct cache_accounting accounting; |
| 515 | 509 | ||
| 516 | unsigned long flags; | 510 | unsigned long flags; |
| 511 | atomic_t idle_counter; | ||
| 512 | atomic_t at_max_writeback_rate; | ||
| 517 | 513 | ||
| 518 | struct cache_sb sb; | 514 | struct cache_sb sb; |
| 519 | 515 | ||
| @@ -523,8 +519,10 @@ struct cache_set { | |||
| 523 | 519 | ||
| 524 | struct bcache_device **devices; | 520 | struct bcache_device **devices; |
| 525 | unsigned devices_max_used; | 521 | unsigned devices_max_used; |
| 522 | atomic_t attached_dev_nr; | ||
| 526 | struct list_head cached_devs; | 523 | struct list_head cached_devs; |
| 527 | uint64_t cached_dev_sectors; | 524 | uint64_t cached_dev_sectors; |
| 525 | atomic_long_t flash_dev_dirty_sectors; | ||
| 528 | struct closure caching; | 526 | struct closure caching; |
| 529 | 527 | ||
| 530 | struct closure sb_write; | 528 | struct closure sb_write; |
| @@ -603,6 +601,10 @@ struct cache_set { | |||
| 603 | */ | 601 | */ |
| 604 | atomic_t rescale; | 602 | atomic_t rescale; |
| 605 | /* | 603 | /* |
| 604 | * used for GC, identify if any front side I/Os is inflight | ||
| 605 | */ | ||
| 606 | atomic_t search_inflight; | ||
| 607 | /* | ||
| 606 | * When we invalidate buckets, we use both the priority and the amount | 608 | * When we invalidate buckets, we use both the priority and the amount |
| 607 | * of good data to determine which buckets to reuse first - to weight | 609 | * of good data to determine which buckets to reuse first - to weight |
| 608 | * those together consistently we keep track of the smallest nonzero | 610 | * those together consistently we keep track of the smallest nonzero |
| @@ -995,7 +997,7 @@ void bch_open_buckets_free(struct cache_set *); | |||
| 995 | int bch_cache_allocator_start(struct cache *ca); | 997 | int bch_cache_allocator_start(struct cache *ca); |
| 996 | 998 | ||
| 997 | void bch_debug_exit(void); | 999 | void bch_debug_exit(void); |
| 998 | int bch_debug_init(struct kobject *); | 1000 | void bch_debug_init(struct kobject *kobj); |
| 999 | void bch_request_exit(void); | 1001 | void bch_request_exit(void); |
| 1000 | int bch_request_init(void); | 1002 | int bch_request_init(void); |
| 1001 | 1003 | ||
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f3403b45bc28..596c93b44e9b 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c | |||
| @@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init); | |||
| 366 | 366 | ||
| 367 | /* Binary tree stuff for auxiliary search trees */ | 367 | /* Binary tree stuff for auxiliary search trees */ |
| 368 | 368 | ||
| 369 | /* | ||
| 370 | * return array index next to j when doing in-order traversal | ||
| 371 | * of a binary tree which is stored in a linear array | ||
| 372 | */ | ||
| 369 | static unsigned inorder_next(unsigned j, unsigned size) | 373 | static unsigned inorder_next(unsigned j, unsigned size) |
| 370 | { | 374 | { |
| 371 | if (j * 2 + 1 < size) { | 375 | if (j * 2 + 1 < size) { |
| @@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size) | |||
| 379 | return j; | 383 | return j; |
| 380 | } | 384 | } |
| 381 | 385 | ||
| 386 | /* | ||
| 387 | * return array index previous to j when doing in-order traversal | ||
| 388 | * of a binary tree which is stored in a linear array | ||
| 389 | */ | ||
| 382 | static unsigned inorder_prev(unsigned j, unsigned size) | 390 | static unsigned inorder_prev(unsigned j, unsigned size) |
| 383 | { | 391 | { |
| 384 | if (j * 2 < size) { | 392 | if (j * 2 < size) { |
| @@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) | |||
| 421 | return j; | 429 | return j; |
| 422 | } | 430 | } |
| 423 | 431 | ||
| 432 | /* | ||
| 433 | * Return the cacheline index in bset_tree->data, where j is index | ||
| 434 | * from a linear array which stores the auxiliary binary tree | ||
| 435 | */ | ||
| 424 | static unsigned to_inorder(unsigned j, struct bset_tree *t) | 436 | static unsigned to_inorder(unsigned j, struct bset_tree *t) |
| 425 | { | 437 | { |
| 426 | return __to_inorder(j, t->size, t->extra); | 438 | return __to_inorder(j, t->size, t->extra); |
| @@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) | |||
| 441 | return j; | 453 | return j; |
| 442 | } | 454 | } |
| 443 | 455 | ||
| 456 | /* | ||
| 457 | * Return an index from a linear array which stores the auxiliary binary | ||
| 458 | * tree, j is the cacheline index of t->data. | ||
| 459 | */ | ||
| 444 | static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) | 460 | static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) |
| 445 | { | 461 | { |
| 446 | return __inorder_to_tree(j, t->size, t->extra); | 462 | return __inorder_to_tree(j, t->size, t->extra); |
| @@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) | |||
| 546 | return low; | 562 | return low; |
| 547 | } | 563 | } |
| 548 | 564 | ||
| 565 | /* | ||
| 566 | * Calculate mantissa value for struct bkey_float. | ||
| 567 | * If most significant bit of f->exponent is not set, then | ||
| 568 | * - f->exponent >> 6 is 0 | ||
| 569 | * - p[0] points to bkey->low | ||
| 570 | * - p[-1] borrows bits from KEY_INODE() of bkey->high | ||
| 571 | * if the most significant bit of f->exponent is set, then | ||
| 572 | * - f->exponent >> 6 is 1 | ||
| 573 | * - p[0] points to bits from KEY_INODE() of bkey->high | ||
| 574 | * - p[-1] points to other bits from KEY_INODE() of | ||
| 575 | * bkey->high too. | ||
| 576 | * See make_bfloat() to check when most significant bit of f->exponent | ||
| 577 | * is set or not. | ||
| 578 | */ | ||
| 549 | static inline unsigned bfloat_mantissa(const struct bkey *k, | 579 | static inline unsigned bfloat_mantissa(const struct bkey *k, |
| 550 | struct bkey_float *f) | 580 | struct bkey_float *f) |
| 551 | { | 581 | { |
| @@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j) | |||
| 570 | BUG_ON(m < l || m > r); | 600 | BUG_ON(m < l || m > r); |
| 571 | BUG_ON(bkey_next(p) != m); | 601 | BUG_ON(bkey_next(p) != m); |
| 572 | 602 | ||
| 603 | /* | ||
| 604 | * If l and r have different KEY_INODE values (different backing | ||
| 605 | * device), f->exponent records how many least significant bits | ||
| 606 | * are different in KEY_INODE values and sets most significant | ||
| 607 | * bits to 1 (by +64). | ||
| 608 | * If l and r have same KEY_INODE value, f->exponent records | ||
| 609 | * how many different bits in least significant bits of bkey->low. | ||
| 610 | * See bfloat_mantissa() for how the most significant bit of | ||
| 611 | * f->exponent is used to calculate bfloat mantissa value. | ||
| 612 | */ | ||
| 573 | if (KEY_INODE(l) != KEY_INODE(r)) | 613 | if (KEY_INODE(l) != KEY_INODE(r)) |
| 574 | f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; | 614 | f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; |
| 575 | else | 615 | else |
| @@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) | |||
| 633 | } | 673 | } |
| 634 | EXPORT_SYMBOL(bch_bset_init_next); | 674 | EXPORT_SYMBOL(bch_bset_init_next); |
| 635 | 675 | ||
| 676 | /* | ||
| 677 | * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to | ||
| 678 | * accelerate bkey search in a btree node (pointed by bset_tree->data in | ||
| 679 | * memory). After search in the auxiliar tree by calling bset_search_tree(), | ||
| 680 | * a struct bset_search_iter is returned which indicates range [l, r] from | ||
| 681 | * bset_tree->data where the searching bkey might be inside. Then a followed | ||
| 682 | * linear comparison does the exact search, see __bch_bset_search() for how | ||
| 683 | * the auxiliary tree is used. | ||
| 684 | */ | ||
| 636 | void bch_bset_build_written_tree(struct btree_keys *b) | 685 | void bch_bset_build_written_tree(struct btree_keys *b) |
| 637 | { | 686 | { |
| 638 | struct bset_tree *t = bset_tree_last(b); | 687 | struct bset_tree *t = bset_tree_last(b); |
| @@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, | |||
| 898 | unsigned inorder, j, n = 1; | 947 | unsigned inorder, j, n = 1; |
| 899 | 948 | ||
| 900 | do { | 949 | do { |
| 950 | /* | ||
| 951 | * A bit trick here. | ||
| 952 | * If p < t->size, (int)(p - t->size) is a minus value and | ||
| 953 | * the most significant bit is set, right shifting 31 bits | ||
| 954 | * gets 1. If p >= t->size, the most significant bit is | ||
| 955 | * not set, right shifting 31 bits gets 0. | ||
| 956 | * So the following 2 lines equals to | ||
| 957 | * if (p >= t->size) | ||
| 958 | * p = 0; | ||
| 959 | * but a branch instruction is avoided. | ||
| 960 | */ | ||
| 901 | unsigned p = n << 4; | 961 | unsigned p = n << 4; |
| 902 | p &= ((int) (p - t->size)) >> 31; | 962 | p &= ((int) (p - t->size)) >> 31; |
| 903 | 963 | ||
| @@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, | |||
| 907 | f = &t->tree[j]; | 967 | f = &t->tree[j]; |
| 908 | 968 | ||
| 909 | /* | 969 | /* |
| 970 | * Similar bit trick, use subtract operation to avoid a branch | ||
| 971 | * instruction. | ||
| 972 | * | ||
| 910 | * n = (f->mantissa > bfloat_mantissa()) | 973 | * n = (f->mantissa > bfloat_mantissa()) |
| 911 | * ? j * 2 | 974 | * ? j * 2 |
| 912 | * : j * 2 + 1; | 975 | * : j * 2 + 1; |
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 547c9eedc2f4..c19f7716df88 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c | |||
| @@ -90,6 +90,9 @@ | |||
| 90 | 90 | ||
| 91 | #define MAX_NEED_GC 64 | 91 | #define MAX_NEED_GC 64 |
| 92 | #define MAX_SAVE_PRIO 72 | 92 | #define MAX_SAVE_PRIO 72 |
| 93 | #define MAX_GC_TIMES 100 | ||
| 94 | #define MIN_GC_NODES 100 | ||
| 95 | #define GC_SLEEP_MS 100 | ||
| 93 | 96 | ||
| 94 | #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) | 97 | #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) |
| 95 | 98 | ||
| @@ -1008,6 +1011,13 @@ retry: | |||
| 1008 | BUG_ON(b->level != level); | 1011 | BUG_ON(b->level != level); |
| 1009 | } | 1012 | } |
| 1010 | 1013 | ||
| 1014 | if (btree_node_io_error(b)) { | ||
| 1015 | rw_unlock(write, b); | ||
| 1016 | return ERR_PTR(-EIO); | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | BUG_ON(!b->written); | ||
| 1020 | |||
| 1011 | b->parent = parent; | 1021 | b->parent = parent; |
| 1012 | b->accessed = 1; | 1022 | b->accessed = 1; |
| 1013 | 1023 | ||
| @@ -1019,13 +1029,6 @@ retry: | |||
| 1019 | for (; i <= b->keys.nsets; i++) | 1029 | for (; i <= b->keys.nsets; i++) |
| 1020 | prefetch(b->keys.set[i].data); | 1030 | prefetch(b->keys.set[i].data); |
| 1021 | 1031 | ||
| 1022 | if (btree_node_io_error(b)) { | ||
| 1023 | rw_unlock(write, b); | ||
| 1024 | return ERR_PTR(-EIO); | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | BUG_ON(!b->written); | ||
| 1028 | |||
| 1029 | return b; | 1032 | return b; |
| 1030 | } | 1033 | } |
| 1031 | 1034 | ||
| @@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b) | |||
| 1520 | return ret; | 1523 | return ret; |
| 1521 | } | 1524 | } |
| 1522 | 1525 | ||
| 1526 | static size_t btree_gc_min_nodes(struct cache_set *c) | ||
| 1527 | { | ||
| 1528 | size_t min_nodes; | ||
| 1529 | |||
| 1530 | /* | ||
| 1531 | * Since incremental GC would stop 100ms when front | ||
| 1532 | * side I/O comes, so when there are many btree nodes, | ||
| 1533 | * if GC only processes constant (100) nodes each time, | ||
| 1534 | * GC would last a long time, and the front side I/Os | ||
| 1535 | * would run out of the buckets (since no new bucket | ||
| 1536 | * can be allocated during GC), and be blocked again. | ||
| 1537 | * So GC should not process constant nodes, but varied | ||
| 1538 | * nodes according to the number of btree nodes, which | ||
| 1539 | * realized by dividing GC into constant(100) times, | ||
| 1540 | * so when there are many btree nodes, GC can process | ||
| 1541 | * more nodes each time, otherwise, GC will process less | ||
| 1542 | * nodes each time (but no less than MIN_GC_NODES) | ||
| 1543 | */ | ||
| 1544 | min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; | ||
| 1545 | if (min_nodes < MIN_GC_NODES) | ||
| 1546 | min_nodes = MIN_GC_NODES; | ||
| 1547 | |||
| 1548 | return min_nodes; | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | |||
| 1523 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | 1552 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, |
| 1524 | struct closure *writes, struct gc_stat *gc) | 1553 | struct closure *writes, struct gc_stat *gc) |
| 1525 | { | 1554 | { |
| @@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
| 1585 | memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); | 1614 | memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); |
| 1586 | r->b = NULL; | 1615 | r->b = NULL; |
| 1587 | 1616 | ||
| 1617 | if (atomic_read(&b->c->search_inflight) && | ||
| 1618 | gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { | ||
| 1619 | gc->nodes_pre = gc->nodes; | ||
| 1620 | ret = -EAGAIN; | ||
| 1621 | break; | ||
| 1622 | } | ||
| 1623 | |||
| 1588 | if (need_resched()) { | 1624 | if (need_resched()) { |
| 1589 | ret = -EAGAIN; | 1625 | ret = -EAGAIN; |
| 1590 | break; | 1626 | break; |
| @@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c) | |||
| 1753 | closure_sync(&writes); | 1789 | closure_sync(&writes); |
| 1754 | cond_resched(); | 1790 | cond_resched(); |
| 1755 | 1791 | ||
| 1756 | if (ret && ret != -EAGAIN) | 1792 | if (ret == -EAGAIN) |
| 1793 | schedule_timeout_interruptible(msecs_to_jiffies | ||
| 1794 | (GC_SLEEP_MS)); | ||
| 1795 | else if (ret) | ||
| 1757 | pr_warn("gc failed!"); | 1796 | pr_warn("gc failed!"); |
| 1758 | } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); | 1797 | } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); |
| 1759 | 1798 | ||
| @@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) | |||
| 1834 | do { | 1873 | do { |
| 1835 | k = bch_btree_iter_next_filter(&iter, &b->keys, | 1874 | k = bch_btree_iter_next_filter(&iter, &b->keys, |
| 1836 | bch_ptr_bad); | 1875 | bch_ptr_bad); |
| 1837 | if (k) | 1876 | if (k) { |
| 1838 | btree_node_prefetch(b, k); | 1877 | btree_node_prefetch(b, k); |
| 1878 | /* | ||
| 1879 | * initiallize c->gc_stats.nodes | ||
| 1880 | * for incremental GC | ||
| 1881 | */ | ||
| 1882 | b->c->gc_stats.nodes++; | ||
| 1883 | } | ||
| 1839 | 1884 | ||
| 1840 | if (p) | 1885 | if (p) |
| 1841 | ret = btree(check_recurse, p, b, op); | 1886 | ret = btree(check_recurse, p, b, op); |
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d211e2c25b6b..68e9d926134d 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h | |||
| @@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \ | |||
| 152 | { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ | 152 | { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ |
| 153 | \ | 153 | \ |
| 154 | static inline void set_btree_node_ ## flag(struct btree *b) \ | 154 | static inline void set_btree_node_ ## flag(struct btree *b) \ |
| 155 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ | 155 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } |
| 156 | 156 | ||
| 157 | enum btree_flags { | 157 | enum btree_flags { |
| 158 | BTREE_NODE_io_error, | 158 | BTREE_NODE_io_error, |
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 0e14969182c6..618253683d40 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c | |||
| @@ -199,11 +199,16 @@ static const struct file_operations debug_ops = { | |||
| 199 | .release = single_release | 199 | .release = single_release |
| 200 | }; | 200 | }; |
| 201 | 201 | ||
| 202 | int __init closure_debug_init(void) | 202 | void __init closure_debug_init(void) |
| 203 | { | 203 | { |
| 204 | closure_debug = debugfs_create_file("closures", | 204 | if (!IS_ERR_OR_NULL(bcache_debug)) |
| 205 | 0400, bcache_debug, NULL, &debug_ops); | 205 | /* |
| 206 | return IS_ERR_OR_NULL(closure_debug); | 206 | * it is unnecessary to check return value of |
| 207 | * debugfs_create_file(), we should not care | ||
| 208 | * about this. | ||
| 209 | */ | ||
| 210 | closure_debug = debugfs_create_file( | ||
| 211 | "closures", 0400, bcache_debug, NULL, &debug_ops); | ||
| 207 | } | 212 | } |
| 208 | #endif | 213 | #endif |
| 209 | 214 | ||
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 71427eb5fdae..7c2c5bc7c88b 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h | |||
| @@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl) | |||
| 186 | 186 | ||
| 187 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 187 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
| 188 | 188 | ||
| 189 | int closure_debug_init(void); | 189 | void closure_debug_init(void); |
| 190 | void closure_debug_create(struct closure *cl); | 190 | void closure_debug_create(struct closure *cl); |
| 191 | void closure_debug_destroy(struct closure *cl); | 191 | void closure_debug_destroy(struct closure *cl); |
| 192 | 192 | ||
| 193 | #else | 193 | #else |
| 194 | 194 | ||
| 195 | static inline int closure_debug_init(void) { return 0; } | 195 | static inline void closure_debug_init(void) {} |
| 196 | static inline void closure_debug_create(struct closure *cl) {} | 196 | static inline void closure_debug_create(struct closure *cl) {} |
| 197 | static inline void closure_debug_destroy(struct closure *cl) {} | 197 | static inline void closure_debug_destroy(struct closure *cl) {} |
| 198 | 198 | ||
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index d030ce3025a6..12034c07257b 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c | |||
| @@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) | |||
| 110 | struct bio_vec bv, cbv; | 110 | struct bio_vec bv, cbv; |
| 111 | struct bvec_iter iter, citer = { 0 }; | 111 | struct bvec_iter iter, citer = { 0 }; |
| 112 | 112 | ||
| 113 | check = bio_clone_kmalloc(bio, GFP_NOIO); | 113 | check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); |
| 114 | if (!check) | 114 | if (!check) |
| 115 | return; | 115 | return; |
| 116 | check->bi_disk = bio->bi_disk; | ||
| 116 | check->bi_opf = REQ_OP_READ; | 117 | check->bi_opf = REQ_OP_READ; |
| 118 | check->bi_iter.bi_sector = bio->bi_iter.bi_sector; | ||
| 119 | check->bi_iter.bi_size = bio->bi_iter.bi_size; | ||
| 117 | 120 | ||
| 121 | bch_bio_map(check, NULL); | ||
| 118 | if (bch_bio_alloc_pages(check, GFP_NOIO)) | 122 | if (bch_bio_alloc_pages(check, GFP_NOIO)) |
| 119 | goto out_put; | 123 | goto out_put; |
| 120 | 124 | ||
| @@ -248,11 +252,12 @@ void bch_debug_exit(void) | |||
| 248 | debugfs_remove_recursive(bcache_debug); | 252 | debugfs_remove_recursive(bcache_debug); |
| 249 | } | 253 | } |
| 250 | 254 | ||
| 251 | int __init bch_debug_init(struct kobject *kobj) | 255 | void __init bch_debug_init(struct kobject *kobj) |
| 252 | { | 256 | { |
| 253 | if (!IS_ENABLED(CONFIG_DEBUG_FS)) | 257 | /* |
| 254 | return 0; | 258 | * it is unnecessary to check return value of |
| 255 | 259 | * debugfs_create_file(), we should not care | |
| 260 | * about this. | ||
| 261 | */ | ||
| 256 | bcache_debug = debugfs_create_dir("bcache", NULL); | 262 | bcache_debug = debugfs_create_dir("bcache", NULL); |
| 257 | return IS_ERR_OR_NULL(bcache_debug); | ||
| 258 | } | 263 | } |
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 18f1b5239620..10748c626a1d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c | |||
| @@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c) | |||
| 828 | free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); | 828 | free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); |
| 829 | free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); | 829 | free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); |
| 830 | free_fifo(&c->journal.pin); | 830 | free_fifo(&c->journal.pin); |
| 831 | free_heap(&c->flush_btree); | ||
| 831 | } | 832 | } |
| 832 | 833 | ||
| 833 | int bch_journal_alloc(struct cache_set *c) | 834 | int bch_journal_alloc(struct cache_set *c) |
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ae67f5fa8047..7dbe8b6316a0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
| @@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s, | |||
| 107 | /* | 107 | /* |
| 108 | * The journalling code doesn't handle the case where the keys to insert | 108 | * The journalling code doesn't handle the case where the keys to insert |
| 109 | * is bigger than an empty write: If we just return -ENOMEM here, | 109 | * is bigger than an empty write: If we just return -ENOMEM here, |
| 110 | * bio_insert() and bio_invalidate() will insert the keys created so far | 110 | * bch_data_insert_keys() will insert the keys created so far |
| 111 | * and finish the rest when the keylist is empty. | 111 | * and finish the rest when the keylist is empty. |
| 112 | */ | 112 | */ |
| 113 | if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) | 113 | if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) |
| @@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio) | |||
| 667 | static void bio_complete(struct search *s) | 667 | static void bio_complete(struct search *s) |
| 668 | { | 668 | { |
| 669 | if (s->orig_bio) { | 669 | if (s->orig_bio) { |
| 670 | generic_end_io_acct(s->d->disk->queue, | 670 | generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio), |
| 671 | bio_data_dir(s->orig_bio), | ||
| 672 | &s->d->disk->part0, s->start_time); | 671 | &s->d->disk->part0, s->start_time); |
| 673 | 672 | ||
| 674 | trace_bcache_request_end(s->d, s->orig_bio); | 673 | trace_bcache_request_end(s->d, s->orig_bio); |
| @@ -702,6 +701,8 @@ static void search_free(struct closure *cl) | |||
| 702 | { | 701 | { |
| 703 | struct search *s = container_of(cl, struct search, cl); | 702 | struct search *s = container_of(cl, struct search, cl); |
| 704 | 703 | ||
| 704 | atomic_dec(&s->d->c->search_inflight); | ||
| 705 | |||
| 705 | if (s->iop.bio) | 706 | if (s->iop.bio) |
| 706 | bio_put(s->iop.bio); | 707 | bio_put(s->iop.bio); |
| 707 | 708 | ||
| @@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio, | |||
| 719 | 720 | ||
| 720 | closure_init(&s->cl, NULL); | 721 | closure_init(&s->cl, NULL); |
| 721 | do_bio_hook(s, bio, request_endio); | 722 | do_bio_hook(s, bio, request_endio); |
| 723 | atomic_inc(&d->c->search_inflight); | ||
| 722 | 724 | ||
| 723 | s->orig_bio = bio; | 725 | s->orig_bio = bio; |
| 724 | s->cache_miss = NULL; | 726 | s->cache_miss = NULL; |
| @@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio) | |||
| 1062 | bio->bi_end_io = ddip->bi_end_io; | 1064 | bio->bi_end_io = ddip->bi_end_io; |
| 1063 | bio->bi_private = ddip->bi_private; | 1065 | bio->bi_private = ddip->bi_private; |
| 1064 | 1066 | ||
| 1065 | generic_end_io_acct(ddip->d->disk->queue, | 1067 | generic_end_io_acct(ddip->d->disk->queue, bio_op(bio), |
| 1066 | bio_data_dir(bio), | ||
| 1067 | &ddip->d->disk->part0, ddip->start_time); | 1068 | &ddip->d->disk->part0, ddip->start_time); |
| 1068 | 1069 | ||
| 1069 | if (bio->bi_status) { | 1070 | if (bio->bi_status) { |
| @@ -1102,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) | |||
| 1102 | generic_make_request(bio); | 1103 | generic_make_request(bio); |
| 1103 | } | 1104 | } |
| 1104 | 1105 | ||
| 1106 | static void quit_max_writeback_rate(struct cache_set *c, | ||
| 1107 | struct cached_dev *this_dc) | ||
| 1108 | { | ||
| 1109 | int i; | ||
| 1110 | struct bcache_device *d; | ||
| 1111 | struct cached_dev *dc; | ||
| 1112 | |||
| 1113 | /* | ||
| 1114 | * mutex bch_register_lock may compete with other parallel requesters, | ||
| 1115 | * or attach/detach operations on other backing device. Waiting to | ||
| 1116 | * the mutex lock may increase I/O request latency for seconds or more. | ||
| 1117 | * To avoid such situation, if mutext_trylock() failed, only writeback | ||
| 1118 | * rate of current cached device is set to 1, and __update_write_back() | ||
| 1119 | * will decide writeback rate of other cached devices (remember now | ||
| 1120 | * c->idle_counter is 0 already). | ||
| 1121 | */ | ||
| 1122 | if (mutex_trylock(&bch_register_lock)) { | ||
| 1123 | for (i = 0; i < c->devices_max_used; i++) { | ||
| 1124 | if (!c->devices[i]) | ||
| 1125 | continue; | ||
| 1126 | |||
| 1127 | if (UUID_FLASH_ONLY(&c->uuids[i])) | ||
| 1128 | continue; | ||
| 1129 | |||
| 1130 | d = c->devices[i]; | ||
| 1131 | dc = container_of(d, struct cached_dev, disk); | ||
| 1132 | /* | ||
| 1133 | * set writeback rate to default minimum value, | ||
| 1134 | * then let update_writeback_rate() to decide the | ||
| 1135 | * upcoming rate. | ||
| 1136 | */ | ||
| 1137 | atomic_long_set(&dc->writeback_rate.rate, 1); | ||
| 1138 | } | ||
| 1139 | mutex_unlock(&bch_register_lock); | ||
| 1140 | } else | ||
| 1141 | atomic_long_set(&this_dc->writeback_rate.rate, 1); | ||
| 1142 | } | ||
| 1143 | |||
| 1105 | /* Cached devices - read & write stuff */ | 1144 | /* Cached devices - read & write stuff */ |
| 1106 | 1145 | ||
| 1107 | static blk_qc_t cached_dev_make_request(struct request_queue *q, | 1146 | static blk_qc_t cached_dev_make_request(struct request_queue *q, |
| @@ -1119,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, | |||
| 1119 | return BLK_QC_T_NONE; | 1158 | return BLK_QC_T_NONE; |
| 1120 | } | 1159 | } |
| 1121 | 1160 | ||
| 1122 | atomic_set(&dc->backing_idle, 0); | 1161 | if (likely(d->c)) { |
| 1123 | generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); | 1162 | if (atomic_read(&d->c->idle_counter)) |
| 1163 | atomic_set(&d->c->idle_counter, 0); | ||
| 1164 | /* | ||
| 1165 | * If at_max_writeback_rate of cache set is true and new I/O | ||
| 1166 | * comes, quit max writeback rate of all cached devices | ||
| 1167 | * attached to this cache set, and set at_max_writeback_rate | ||
| 1168 | * to false. | ||
| 1169 | */ | ||
| 1170 | if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) { | ||
| 1171 | atomic_set(&d->c->at_max_writeback_rate, 0); | ||
| 1172 | quit_max_writeback_rate(d->c, dc); | ||
| 1173 | } | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | generic_start_io_acct(q, | ||
| 1177 | bio_op(bio), | ||
| 1178 | bio_sectors(bio), | ||
| 1179 | &d->disk->part0); | ||
| 1124 | 1180 | ||
| 1125 | bio_set_dev(bio, dc->bdev); | 1181 | bio_set_dev(bio, dc->bdev); |
| 1126 | bio->bi_iter.bi_sector += dc->sb.data_offset; | 1182 | bio->bi_iter.bi_sector += dc->sb.data_offset; |
| @@ -1229,7 +1285,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, | |||
| 1229 | struct search *s; | 1285 | struct search *s; |
| 1230 | struct closure *cl; | 1286 | struct closure *cl; |
| 1231 | struct bcache_device *d = bio->bi_disk->private_data; | 1287 | struct bcache_device *d = bio->bi_disk->private_data; |
| 1232 | int rw = bio_data_dir(bio); | ||
| 1233 | 1288 | ||
| 1234 | if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { | 1289 | if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { |
| 1235 | bio->bi_status = BLK_STS_IOERR; | 1290 | bio->bi_status = BLK_STS_IOERR; |
| @@ -1237,7 +1292,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, | |||
| 1237 | return BLK_QC_T_NONE; | 1292 | return BLK_QC_T_NONE; |
| 1238 | } | 1293 | } |
| 1239 | 1294 | ||
| 1240 | generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); | 1295 | generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); |
| 1241 | 1296 | ||
| 1242 | s = search_alloc(bio, d); | 1297 | s = search_alloc(bio, d); |
| 1243 | cl = &s->cl; | 1298 | cl = &s->cl; |
| @@ -1254,7 +1309,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, | |||
| 1254 | flash_dev_nodata, | 1309 | flash_dev_nodata, |
| 1255 | bcache_wq); | 1310 | bcache_wq); |
| 1256 | return BLK_QC_T_NONE; | 1311 | return BLK_QC_T_NONE; |
| 1257 | } else if (rw) { | 1312 | } else if (bio_data_dir(bio)) { |
| 1258 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, | 1313 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, |
| 1259 | &KEY(d->id, bio->bi_iter.bi_sector, 0), | 1314 | &KEY(d->id, bio->bi_iter.bi_sector, 0), |
| 1260 | &KEY(d->id, bio_end_sector(bio), 0)); | 1315 | &KEY(d->id, bio_end_sector(bio), 0)); |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fa4058e43202..55a37641aa95 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
| @@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, | |||
| 181 | goto err; | 181 | goto err; |
| 182 | } | 182 | } |
| 183 | 183 | ||
| 184 | sb->last_mount = get_seconds(); | 184 | sb->last_mount = (u32)ktime_get_real_seconds(); |
| 185 | err = NULL; | 185 | err = NULL; |
| 186 | 186 | ||
| 187 | get_page(bh->b_page); | 187 | get_page(bh->b_page); |
| @@ -696,12 +696,14 @@ static void bcache_device_detach(struct bcache_device *d) | |||
| 696 | { | 696 | { |
| 697 | lockdep_assert_held(&bch_register_lock); | 697 | lockdep_assert_held(&bch_register_lock); |
| 698 | 698 | ||
| 699 | atomic_dec(&d->c->attached_dev_nr); | ||
| 700 | |||
| 699 | if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { | 701 | if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { |
| 700 | struct uuid_entry *u = d->c->uuids + d->id; | 702 | struct uuid_entry *u = d->c->uuids + d->id; |
| 701 | 703 | ||
| 702 | SET_UUID_FLASH_ONLY(u, 0); | 704 | SET_UUID_FLASH_ONLY(u, 0); |
| 703 | memcpy(u->uuid, invalid_uuid, 16); | 705 | memcpy(u->uuid, invalid_uuid, 16); |
| 704 | u->invalidated = cpu_to_le32(get_seconds()); | 706 | u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); |
| 705 | bch_uuid_write(d->c); | 707 | bch_uuid_write(d->c); |
| 706 | } | 708 | } |
| 707 | 709 | ||
| @@ -796,11 +798,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
| 796 | return idx; | 798 | return idx; |
| 797 | 799 | ||
| 798 | if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), | 800 | if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), |
| 799 | BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || | 801 | BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) |
| 800 | !(d->disk = alloc_disk(BCACHE_MINORS))) { | 802 | goto err; |
| 801 | ida_simple_remove(&bcache_device_idx, idx); | 803 | |
| 802 | return -ENOMEM; | 804 | d->disk = alloc_disk(BCACHE_MINORS); |
| 803 | } | 805 | if (!d->disk) |
| 806 | goto err; | ||
| 804 | 807 | ||
| 805 | set_capacity(d->disk, sectors); | 808 | set_capacity(d->disk, sectors); |
| 806 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); | 809 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); |
| @@ -834,6 +837,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
| 834 | blk_queue_write_cache(q, true, true); | 837 | blk_queue_write_cache(q, true, true); |
| 835 | 838 | ||
| 836 | return 0; | 839 | return 0; |
| 840 | |||
| 841 | err: | ||
| 842 | ida_simple_remove(&bcache_device_idx, idx); | ||
| 843 | return -ENOMEM; | ||
| 844 | |||
| 837 | } | 845 | } |
| 838 | 846 | ||
| 839 | /* Cached device */ | 847 | /* Cached device */ |
| @@ -1027,7 +1035,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) | |||
| 1027 | int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, | 1035 | int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, |
| 1028 | uint8_t *set_uuid) | 1036 | uint8_t *set_uuid) |
| 1029 | { | 1037 | { |
| 1030 | uint32_t rtime = cpu_to_le32(get_seconds()); | 1038 | uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); |
| 1031 | struct uuid_entry *u; | 1039 | struct uuid_entry *u; |
| 1032 | struct cached_dev *exist_dc, *t; | 1040 | struct cached_dev *exist_dc, *t; |
| 1033 | 1041 | ||
| @@ -1070,7 +1078,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, | |||
| 1070 | (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || | 1078 | (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || |
| 1071 | BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { | 1079 | BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { |
| 1072 | memcpy(u->uuid, invalid_uuid, 16); | 1080 | memcpy(u->uuid, invalid_uuid, 16); |
| 1073 | u->invalidated = cpu_to_le32(get_seconds()); | 1081 | u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); |
| 1074 | u = NULL; | 1082 | u = NULL; |
| 1075 | } | 1083 | } |
| 1076 | 1084 | ||
| @@ -1138,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, | |||
| 1138 | 1146 | ||
| 1139 | bch_cached_dev_run(dc); | 1147 | bch_cached_dev_run(dc); |
| 1140 | bcache_device_link(&dc->disk, c, "bdev"); | 1148 | bcache_device_link(&dc->disk, c, "bdev"); |
| 1149 | atomic_inc(&c->attached_dev_nr); | ||
| 1141 | 1150 | ||
| 1142 | /* Allow the writeback thread to proceed */ | 1151 | /* Allow the writeback thread to proceed */ |
| 1143 | up_write(&dc->writeback_lock); | 1152 | up_write(&dc->writeback_lock); |
| @@ -1285,6 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, | |||
| 1285 | pr_info("registered backing device %s", dc->backing_dev_name); | 1294 | pr_info("registered backing device %s", dc->backing_dev_name); |
| 1286 | 1295 | ||
| 1287 | list_add(&dc->list, &uncached_devices); | 1296 | list_add(&dc->list, &uncached_devices); |
| 1297 | /* attach to a matched cache set if it exists */ | ||
| 1288 | list_for_each_entry(c, &bch_cache_sets, list) | 1298 | list_for_each_entry(c, &bch_cache_sets, list) |
| 1289 | bch_cached_dev_attach(dc, c, NULL); | 1299 | bch_cached_dev_attach(dc, c, NULL); |
| 1290 | 1300 | ||
| @@ -1311,6 +1321,8 @@ static void flash_dev_free(struct closure *cl) | |||
| 1311 | { | 1321 | { |
| 1312 | struct bcache_device *d = container_of(cl, struct bcache_device, cl); | 1322 | struct bcache_device *d = container_of(cl, struct bcache_device, cl); |
| 1313 | mutex_lock(&bch_register_lock); | 1323 | mutex_lock(&bch_register_lock); |
| 1324 | atomic_long_sub(bcache_dev_sectors_dirty(d), | ||
| 1325 | &d->c->flash_dev_dirty_sectors); | ||
| 1314 | bcache_device_free(d); | 1326 | bcache_device_free(d); |
| 1315 | mutex_unlock(&bch_register_lock); | 1327 | mutex_unlock(&bch_register_lock); |
| 1316 | kobject_put(&d->kobj); | 1328 | kobject_put(&d->kobj); |
| @@ -1390,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) | |||
| 1390 | 1402 | ||
| 1391 | get_random_bytes(u->uuid, 16); | 1403 | get_random_bytes(u->uuid, 16); |
| 1392 | memset(u->label, 0, 32); | 1404 | memset(u->label, 0, 32); |
| 1393 | u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); | 1405 | u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds()); |
| 1394 | 1406 | ||
| 1395 | SET_UUID_FLASH_ONLY(u, 1); | 1407 | SET_UUID_FLASH_ONLY(u, 1); |
| 1396 | u->sectors = size >> 9; | 1408 | u->sectors = size >> 9; |
| @@ -1687,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
| 1687 | c->block_bits = ilog2(sb->block_size); | 1699 | c->block_bits = ilog2(sb->block_size); |
| 1688 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); | 1700 | c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); |
| 1689 | c->devices_max_used = 0; | 1701 | c->devices_max_used = 0; |
| 1702 | atomic_set(&c->attached_dev_nr, 0); | ||
| 1690 | c->btree_pages = bucket_pages(c); | 1703 | c->btree_pages = bucket_pages(c); |
| 1691 | if (c->btree_pages > BTREE_MAX_PAGES) | 1704 | if (c->btree_pages > BTREE_MAX_PAGES) |
| 1692 | c->btree_pages = max_t(int, c->btree_pages / 4, | 1705 | c->btree_pages = max_t(int, c->btree_pages / 4, |
| @@ -1894,7 +1907,7 @@ static void run_cache_set(struct cache_set *c) | |||
| 1894 | goto err; | 1907 | goto err; |
| 1895 | 1908 | ||
| 1896 | closure_sync(&cl); | 1909 | closure_sync(&cl); |
| 1897 | c->sb.last_mount = get_seconds(); | 1910 | c->sb.last_mount = (u32)ktime_get_real_seconds(); |
| 1898 | bcache_write_super(c); | 1911 | bcache_write_super(c); |
| 1899 | 1912 | ||
| 1900 | list_for_each_entry_safe(dc, t, &uncached_devices, list) | 1913 | list_for_each_entry_safe(dc, t, &uncached_devices, list) |
| @@ -2163,8 +2176,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | |||
| 2163 | if (!try_module_get(THIS_MODULE)) | 2176 | if (!try_module_get(THIS_MODULE)) |
| 2164 | return -EBUSY; | 2177 | return -EBUSY; |
| 2165 | 2178 | ||
| 2166 | if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || | 2179 | path = kstrndup(buffer, size, GFP_KERNEL); |
| 2167 | !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) | 2180 | if (!path) |
| 2181 | goto err; | ||
| 2182 | |||
| 2183 | sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); | ||
| 2184 | if (!sb) | ||
| 2168 | goto err; | 2185 | goto err; |
| 2169 | 2186 | ||
| 2170 | err = "failed to open device"; | 2187 | err = "failed to open device"; |
| @@ -2324,13 +2341,21 @@ static int __init bcache_init(void) | |||
| 2324 | return bcache_major; | 2341 | return bcache_major; |
| 2325 | } | 2342 | } |
| 2326 | 2343 | ||
| 2327 | if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || | 2344 | bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); |
| 2328 | !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || | 2345 | if (!bcache_wq) |
| 2329 | bch_request_init() || | 2346 | goto err; |
| 2330 | bch_debug_init(bcache_kobj) || closure_debug_init() || | 2347 | |
| 2348 | bcache_kobj = kobject_create_and_add("bcache", fs_kobj); | ||
| 2349 | if (!bcache_kobj) | ||
| 2350 | goto err; | ||
| 2351 | |||
| 2352 | if (bch_request_init() || | ||
| 2331 | sysfs_create_files(bcache_kobj, files)) | 2353 | sysfs_create_files(bcache_kobj, files)) |
| 2332 | goto err; | 2354 | goto err; |
| 2333 | 2355 | ||
| 2356 | bch_debug_init(bcache_kobj); | ||
| 2357 | closure_debug_init(); | ||
| 2358 | |||
| 2334 | return 0; | 2359 | return 0; |
| 2335 | err: | 2360 | err: |
| 2336 | bcache_exit(); | 2361 | bcache_exit(); |
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 225b15aa0340..81d3520b0702 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
| @@ -149,6 +149,7 @@ SHOW(__bch_cached_dev) | |||
| 149 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | 149 | struct cached_dev *dc = container_of(kobj, struct cached_dev, |
| 150 | disk.kobj); | 150 | disk.kobj); |
| 151 | const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; | 151 | const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; |
| 152 | int wb = dc->writeback_running; | ||
| 152 | 153 | ||
| 153 | #define var(stat) (dc->stat) | 154 | #define var(stat) (dc->stat) |
| 154 | 155 | ||
| @@ -170,7 +171,8 @@ SHOW(__bch_cached_dev) | |||
| 170 | var_printf(writeback_running, "%i"); | 171 | var_printf(writeback_running, "%i"); |
| 171 | var_print(writeback_delay); | 172 | var_print(writeback_delay); |
| 172 | var_print(writeback_percent); | 173 | var_print(writeback_percent); |
| 173 | sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); | 174 | sysfs_hprint(writeback_rate, |
| 175 | wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); | ||
| 174 | sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); | 176 | sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); |
| 175 | sysfs_printf(io_error_limit, "%i", dc->error_limit); | 177 | sysfs_printf(io_error_limit, "%i", dc->error_limit); |
| 176 | sysfs_printf(io_disable, "%i", dc->io_disable); | 178 | sysfs_printf(io_disable, "%i", dc->io_disable); |
| @@ -188,15 +190,22 @@ SHOW(__bch_cached_dev) | |||
| 188 | char change[20]; | 190 | char change[20]; |
| 189 | s64 next_io; | 191 | s64 next_io; |
| 190 | 192 | ||
| 191 | bch_hprint(rate, dc->writeback_rate.rate << 9); | 193 | /* |
| 192 | bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); | 194 | * Except for dirty and target, other values should |
| 193 | bch_hprint(target, dc->writeback_rate_target << 9); | 195 | * be 0 if writeback is not running. |
| 194 | bch_hprint(proportional,dc->writeback_rate_proportional << 9); | 196 | */ |
| 195 | bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); | 197 | bch_hprint(rate, |
| 196 | bch_hprint(change, dc->writeback_rate_change << 9); | 198 | wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 |
| 197 | 199 | : 0); | |
| 198 | next_io = div64_s64(dc->writeback_rate.next - local_clock(), | 200 | bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); |
| 199 | NSEC_PER_MSEC); | 201 | bch_hprint(target, dc->writeback_rate_target << 9); |
| 202 | bch_hprint(proportional, | ||
| 203 | wb ? dc->writeback_rate_proportional << 9 : 0); | ||
| 204 | bch_hprint(integral, | ||
| 205 | wb ? dc->writeback_rate_integral_scaled << 9 : 0); | ||
| 206 | bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0); | ||
| 207 | next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(), | ||
| 208 | NSEC_PER_MSEC) : 0; | ||
| 200 | 209 | ||
| 201 | return sprintf(buf, | 210 | return sprintf(buf, |
| 202 | "rate:\t\t%s/sec\n" | 211 | "rate:\t\t%s/sec\n" |
| @@ -255,8 +264,19 @@ STORE(__cached_dev) | |||
| 255 | 264 | ||
| 256 | sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); | 265 | sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); |
| 257 | 266 | ||
| 258 | sysfs_strtoul_clamp(writeback_rate, | 267 | if (attr == &sysfs_writeback_rate) { |
| 259 | dc->writeback_rate.rate, 1, INT_MAX); | 268 | ssize_t ret; |
| 269 | long int v = atomic_long_read(&dc->writeback_rate.rate); | ||
| 270 | |||
| 271 | ret = strtoul_safe_clamp(buf, v, 1, INT_MAX); | ||
| 272 | |||
| 273 | if (!ret) { | ||
| 274 | atomic_long_set(&dc->writeback_rate.rate, v); | ||
| 275 | ret = size; | ||
| 276 | } | ||
| 277 | |||
| 278 | return ret; | ||
| 279 | } | ||
| 260 | 280 | ||
| 261 | sysfs_strtoul_clamp(writeback_rate_update_seconds, | 281 | sysfs_strtoul_clamp(writeback_rate_update_seconds, |
| 262 | dc->writeback_rate_update_seconds, | 282 | dc->writeback_rate_update_seconds, |
| @@ -338,8 +358,8 @@ STORE(__cached_dev) | |||
| 338 | if (!v) | 358 | if (!v) |
| 339 | return size; | 359 | return size; |
| 340 | } | 360 | } |
| 341 | 361 | if (v == -ENOENT) | |
| 342 | pr_err("Can't attach %s: cache set not found", buf); | 362 | pr_err("Can't attach %s: cache set not found", buf); |
| 343 | return v; | 363 | return v; |
| 344 | } | 364 | } |
| 345 | 365 | ||
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index fc479b026d6d..b15256bcf0e7 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
| @@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) | |||
| 200 | { | 200 | { |
| 201 | uint64_t now = local_clock(); | 201 | uint64_t now = local_clock(); |
| 202 | 202 | ||
| 203 | d->next += div_u64(done * NSEC_PER_SEC, d->rate); | 203 | d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate)); |
| 204 | 204 | ||
| 205 | /* Bound the time. Don't let us fall further than 2 seconds behind | 205 | /* Bound the time. Don't let us fall further than 2 seconds behind |
| 206 | * (this prevents unnecessary backlog that would make it impossible | 206 | * (this prevents unnecessary backlog that would make it impossible |
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cced87f8eb27..f7b0133c9d2f 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
| @@ -442,7 +442,7 @@ struct bch_ratelimit { | |||
| 442 | * Rate at which we want to do work, in units per second | 442 | * Rate at which we want to do work, in units per second |
| 443 | * The units here correspond to the units passed to bch_next_delay() | 443 | * The units here correspond to the units passed to bch_next_delay() |
| 444 | */ | 444 | */ |
| 445 | uint32_t rate; | 445 | atomic_long_t rate; |
| 446 | }; | 446 | }; |
| 447 | 447 | ||
| 448 | static inline void bch_ratelimit_reset(struct bch_ratelimit *d) | 448 | static inline void bch_ratelimit_reset(struct bch_ratelimit *d) |
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ad45ebe1a74b..481d4cf38ac0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
| @@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) | |||
| 27 | * flash-only devices | 27 | * flash-only devices |
| 28 | */ | 28 | */ |
| 29 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - | 29 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - |
| 30 | bcache_flash_devs_sectors_dirty(c); | 30 | atomic_long_read(&c->flash_dev_dirty_sectors); |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| 33 | * Unfortunately there is no control of global dirty data. If the | 33 | * Unfortunately there is no control of global dirty data. If the |
| @@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc) | |||
| 104 | 104 | ||
| 105 | dc->writeback_rate_proportional = proportional_scaled; | 105 | dc->writeback_rate_proportional = proportional_scaled; |
| 106 | dc->writeback_rate_integral_scaled = integral_scaled; | 106 | dc->writeback_rate_integral_scaled = integral_scaled; |
| 107 | dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; | 107 | dc->writeback_rate_change = new_rate - |
| 108 | dc->writeback_rate.rate = new_rate; | 108 | atomic_long_read(&dc->writeback_rate.rate); |
| 109 | atomic_long_set(&dc->writeback_rate.rate, new_rate); | ||
| 109 | dc->writeback_rate_target = target; | 110 | dc->writeback_rate_target = target; |
| 110 | } | 111 | } |
| 111 | 112 | ||
| 113 | static bool set_at_max_writeback_rate(struct cache_set *c, | ||
| 114 | struct cached_dev *dc) | ||
| 115 | { | ||
| 116 | /* | ||
| 117 | * Idle_counter is increased every time update_writeback_rate() is | ||
| 118 | * called. If all backing devices attached to the same cache set have | ||
| 119 | * identical dc->writeback_rate_update_seconds values, it is about 6 | ||
| 120 | * rounds of update_writeback_rate() on each backing device before | ||
| 121 | * c->at_max_writeback_rate is set to 1, and then max writeback rate set | ||
| 122 | * to each dc->writeback_rate.rate. | ||
| 123 | * In order to avoid extra locking cost for counting exact dirty cached | ||
| 124 | * devices number, c->attached_dev_nr is used to calculate the idle | ||
| 125 | * threshold. It might be bigger if not all cached devices are in write- | ||
| 126 | * back mode, but it still works well with limited extra rounds of | ||
| 127 | * update_writeback_rate(). | ||
| 128 | */ | ||
| 129 | if (atomic_inc_return(&c->idle_counter) < | ||
| 130 | atomic_read(&c->attached_dev_nr) * 6) | ||
| 131 | return false; | ||
| 132 | |||
| 133 | if (atomic_read(&c->at_max_writeback_rate) != 1) | ||
| 134 | atomic_set(&c->at_max_writeback_rate, 1); | ||
| 135 | |||
| 136 | atomic_long_set(&dc->writeback_rate.rate, INT_MAX); | ||
| 137 | |||
| 138 | /* keep writeback_rate_target as existing value */ | ||
| 139 | dc->writeback_rate_proportional = 0; | ||
| 140 | dc->writeback_rate_integral_scaled = 0; | ||
| 141 | dc->writeback_rate_change = 0; | ||
| 142 | |||
| 143 | /* | ||
| 144 | * Check c->idle_counter and c->at_max_writeback_rate again in case | ||
| 145 | * new I/O arrives before set_at_max_writeback_rate() returns. | ||
| 146 | * Then the writeback rate is set to 1, and its new value should be | ||
| 147 | * decided via __update_writeback_rate(). | ||
| 148 | */ | ||
| 149 | if ((atomic_read(&c->idle_counter) < | ||
| 150 | atomic_read(&c->attached_dev_nr) * 6) || | ||
| 151 | !atomic_read(&c->at_max_writeback_rate)) | ||
| 152 | return false; | ||
| 153 | |||
| 154 | return true; | ||
| 155 | } | ||
| 156 | |||
| 112 | static void update_writeback_rate(struct work_struct *work) | 157 | static void update_writeback_rate(struct work_struct *work) |
| 113 | { | 158 | { |
| 114 | struct cached_dev *dc = container_of(to_delayed_work(work), | 159 | struct cached_dev *dc = container_of(to_delayed_work(work), |
| @@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work) | |||
| 136 | return; | 181 | return; |
| 137 | } | 182 | } |
| 138 | 183 | ||
| 139 | down_read(&dc->writeback_lock); | 184 | if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { |
| 140 | 185 | /* | |
| 141 | if (atomic_read(&dc->has_dirty) && | 186 | * If the whole cache set is idle, set_at_max_writeback_rate() |
| 142 | dc->writeback_percent) | 187 | * will set writeback rate to a max number. Then it is |
| 143 | __update_writeback_rate(dc); | 188 | * unnecessary to update writeback rate for an idle cache set |
| 189 | * in maximum writeback rate number(s). | ||
| 190 | */ | ||
| 191 | if (!set_at_max_writeback_rate(c, dc)) { | ||
| 192 | down_read(&dc->writeback_lock); | ||
| 193 | __update_writeback_rate(dc); | ||
| 194 | up_read(&dc->writeback_lock); | ||
| 195 | } | ||
| 196 | } | ||
| 144 | 197 | ||
| 145 | up_read(&dc->writeback_lock); | ||
| 146 | 198 | ||
| 147 | /* | 199 | /* |
| 148 | * CACHE_SET_IO_DISABLE might be set via sysfs interface, | 200 | * CACHE_SET_IO_DISABLE might be set via sysfs interface, |
| @@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc) | |||
| 422 | 474 | ||
| 423 | delay = writeback_delay(dc, size); | 475 | delay = writeback_delay(dc, size); |
| 424 | 476 | ||
| 425 | /* If the control system would wait for at least half a | ||
| 426 | * second, and there's been no reqs hitting the backing disk | ||
| 427 | * for a while: use an alternate mode where we have at most | ||
| 428 | * one contiguous set of writebacks in flight at a time. If | ||
| 429 | * someone wants to do IO it will be quick, as it will only | ||
| 430 | * have to contend with one operation in flight, and we'll | ||
| 431 | * be round-tripping data to the backing disk as quickly as | ||
| 432 | * it can accept it. | ||
| 433 | */ | ||
| 434 | if (delay >= HZ / 2) { | ||
| 435 | /* 3 means at least 1.5 seconds, up to 7.5 if we | ||
| 436 | * have slowed way down. | ||
| 437 | */ | ||
| 438 | if (atomic_inc_return(&dc->backing_idle) >= 3) { | ||
| 439 | /* Wait for current I/Os to finish */ | ||
| 440 | closure_sync(&cl); | ||
| 441 | /* And immediately launch a new set. */ | ||
| 442 | delay = 0; | ||
| 443 | } | ||
| 444 | } | ||
| 445 | |||
| 446 | while (!kthread_should_stop() && | 477 | while (!kthread_should_stop() && |
| 447 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && | 478 | !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && |
| 448 | delay) { | 479 | delay) { |
| @@ -476,6 +507,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | |||
| 476 | if (!d) | 507 | if (!d) |
| 477 | return; | 508 | return; |
| 478 | 509 | ||
| 510 | if (UUID_FLASH_ONLY(&c->uuids[inode])) | ||
| 511 | atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); | ||
| 512 | |||
| 479 | stripe = offset_to_stripe(d, offset); | 513 | stripe = offset_to_stripe(d, offset); |
| 480 | stripe_offset = offset & (d->stripe_size - 1); | 514 | stripe_offset = offset & (d->stripe_size - 1); |
| 481 | 515 | ||
| @@ -673,10 +707,14 @@ static int bch_writeback_thread(void *arg) | |||
| 673 | } | 707 | } |
| 674 | 708 | ||
| 675 | /* Init */ | 709 | /* Init */ |
| 710 | #define INIT_KEYS_EACH_TIME 500000 | ||
| 711 | #define INIT_KEYS_SLEEP_MS 100 | ||
| 676 | 712 | ||
| 677 | struct sectors_dirty_init { | 713 | struct sectors_dirty_init { |
| 678 | struct btree_op op; | 714 | struct btree_op op; |
| 679 | unsigned inode; | 715 | unsigned inode; |
| 716 | size_t count; | ||
| 717 | struct bkey start; | ||
| 680 | }; | 718 | }; |
| 681 | 719 | ||
| 682 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, | 720 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, |
| @@ -691,18 +729,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, | |||
| 691 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | 729 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), |
| 692 | KEY_START(k), KEY_SIZE(k)); | 730 | KEY_START(k), KEY_SIZE(k)); |
| 693 | 731 | ||
| 732 | op->count++; | ||
| 733 | if (atomic_read(&b->c->search_inflight) && | ||
| 734 | !(op->count % INIT_KEYS_EACH_TIME)) { | ||
| 735 | bkey_copy_key(&op->start, k); | ||
| 736 | return -EAGAIN; | ||
| 737 | } | ||
| 738 | |||
| 694 | return MAP_CONTINUE; | 739 | return MAP_CONTINUE; |
| 695 | } | 740 | } |
| 696 | 741 | ||
| 697 | void bch_sectors_dirty_init(struct bcache_device *d) | 742 | void bch_sectors_dirty_init(struct bcache_device *d) |
| 698 | { | 743 | { |
| 699 | struct sectors_dirty_init op; | 744 | struct sectors_dirty_init op; |
| 745 | int ret; | ||
| 700 | 746 | ||
| 701 | bch_btree_op_init(&op.op, -1); | 747 | bch_btree_op_init(&op.op, -1); |
| 702 | op.inode = d->id; | 748 | op.inode = d->id; |
| 703 | 749 | op.count = 0; | |
| 704 | bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), | 750 | op.start = KEY(op.inode, 0, 0); |
| 705 | sectors_dirty_init_fn, 0); | 751 | |
| 752 | do { | ||
| 753 | ret = bch_btree_map_keys(&op.op, d->c, &op.start, | ||
| 754 | sectors_dirty_init_fn, 0); | ||
| 755 | if (ret == -EAGAIN) | ||
| 756 | schedule_timeout_interruptible( | ||
| 757 | msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); | ||
| 758 | else if (ret < 0) { | ||
| 759 | pr_warn("sectors dirty init failed, ret=%d!", ret); | ||
| 760 | break; | ||
| 761 | } | ||
| 762 | } while (ret == -EAGAIN); | ||
| 706 | } | 763 | } |
| 707 | 764 | ||
| 708 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 765 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
| @@ -715,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) | |||
| 715 | dc->writeback_running = true; | 772 | dc->writeback_running = true; |
| 716 | dc->writeback_percent = 10; | 773 | dc->writeback_percent = 10; |
| 717 | dc->writeback_delay = 30; | 774 | dc->writeback_delay = 30; |
| 718 | dc->writeback_rate.rate = 1024; | 775 | atomic_long_set(&dc->writeback_rate.rate, 1024); |
| 719 | dc->writeback_rate_minimum = 8; | 776 | dc->writeback_rate_minimum = 8; |
| 720 | 777 | ||
| 721 | dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; | 778 | dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; |
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 610fb01de629..3745d7004c47 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h | |||
| @@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) | |||
| 28 | return ret; | 28 | return ret; |
| 29 | } | 29 | } |
| 30 | 30 | ||
| 31 | static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) | ||
| 32 | { | ||
| 33 | uint64_t i, ret = 0; | ||
| 34 | |||
| 35 | mutex_lock(&bch_register_lock); | ||
| 36 | |||
| 37 | for (i = 0; i < c->devices_max_used; i++) { | ||
| 38 | struct bcache_device *d = c->devices[i]; | ||
| 39 | |||
| 40 | if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) | ||
| 41 | continue; | ||
| 42 | ret += bcache_dev_sectors_dirty(d); | ||
| 43 | } | ||
| 44 | |||
| 45 | mutex_unlock(&bch_register_lock); | ||
| 46 | |||
| 47 | return ret; | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline unsigned offset_to_stripe(struct bcache_device *d, | 31 | static inline unsigned offset_to_stripe(struct bcache_device *d, |
| 51 | uint64_t offset) | 32 | uint64_t offset) |
| 52 | { | 33 | { |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b0dd7027848b..20f7e4ef5342 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
| @@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io) | |||
| 609 | 609 | ||
| 610 | io->start_time = jiffies; | 610 | io->start_time = jiffies; |
| 611 | 611 | ||
| 612 | generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); | 612 | generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), |
| 613 | &dm_disk(md)->part0); | ||
| 613 | 614 | ||
| 614 | atomic_set(&dm_disk(md)->part0.in_flight[rw], | 615 | atomic_set(&dm_disk(md)->part0.in_flight[rw], |
| 615 | atomic_inc_return(&md->pending[rw])); | 616 | atomic_inc_return(&md->pending[rw])); |
| @@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io) | |||
| 628 | int pending; | 629 | int pending; |
| 629 | int rw = bio_data_dir(bio); | 630 | int rw = bio_data_dir(bio); |
| 630 | 631 | ||
| 631 | generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); | 632 | generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, |
| 633 | io->start_time); | ||
| 632 | 634 | ||
| 633 | if (unlikely(dm_stats_used(&md->stats))) | 635 | if (unlikely(dm_stats_used(&md->stats))) |
| 634 | dm_stats_account_io(&md->stats, bio_data_dir(bio), | 636 | dm_stats_account_io(&md->stats, bio_data_dir(bio), |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 994aed2f9dff..cb4eb5faa519 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -204,10 +204,6 @@ static int start_readonly; | |||
| 204 | */ | 204 | */ |
| 205 | static bool create_on_open = true; | 205 | static bool create_on_open = true; |
| 206 | 206 | ||
| 207 | /* bio_clone_mddev | ||
| 208 | * like bio_clone_bioset, but with a local bio set | ||
| 209 | */ | ||
| 210 | |||
| 211 | struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | 207 | struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
| 212 | struct mddev *mddev) | 208 | struct mddev *mddev) |
| 213 | { | 209 | { |
| @@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request); | |||
| 335 | static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) | 331 | static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) |
| 336 | { | 332 | { |
| 337 | const int rw = bio_data_dir(bio); | 333 | const int rw = bio_data_dir(bio); |
| 334 | const int sgrp = op_stat_group(bio_op(bio)); | ||
| 338 | struct mddev *mddev = q->queuedata; | 335 | struct mddev *mddev = q->queuedata; |
| 339 | unsigned int sectors; | 336 | unsigned int sectors; |
| 340 | int cpu; | 337 | int cpu; |
| @@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) | |||
| 363 | md_handle_request(mddev, bio); | 360 | md_handle_request(mddev, bio); |
| 364 | 361 | ||
| 365 | cpu = part_stat_lock(); | 362 | cpu = part_stat_lock(); |
| 366 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | 363 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); |
| 367 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); | 364 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); |
| 368 | part_stat_unlock(); | 365 | part_stat_unlock(); |
| 369 | 366 | ||
| 370 | return BLK_QC_T_NONE; | 367 | return BLK_QC_T_NONE; |
| @@ -8046,8 +8043,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) | |||
| 8046 | rcu_read_lock(); | 8043 | rcu_read_lock(); |
| 8047 | rdev_for_each_rcu(rdev, mddev) { | 8044 | rdev_for_each_rcu(rdev, mddev) { |
| 8048 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; | 8045 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; |
| 8049 | curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + | 8046 | curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - |
| 8050 | (int)part_stat_read(&disk->part0, sectors[1]) - | ||
| 8051 | atomic_read(&disk->sync_io); | 8047 | atomic_read(&disk->sync_io); |
| 8052 | /* sync IO will cause sync_io to increase before the disk_stats | 8048 | /* sync IO will cause sync_io to increase before the disk_stats |
| 8053 | * as sync_io is counted when a request starts, and | 8049 | * as sync_io is counted when a request starts, and |
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 85de8053aa34..0360c015f658 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c | |||
| @@ -1423,11 +1423,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, | |||
| 1423 | 1423 | ||
| 1424 | static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, | 1424 | static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, |
| 1425 | struct page *page, unsigned int len, unsigned int off, | 1425 | struct page *page, unsigned int len, unsigned int off, |
| 1426 | bool is_write, sector_t sector) | 1426 | unsigned int op, sector_t sector) |
| 1427 | { | 1427 | { |
| 1428 | int ret; | 1428 | int ret; |
| 1429 | 1429 | ||
| 1430 | if (!is_write) { | 1430 | if (!op_is_write(op)) { |
| 1431 | ret = btt_read_pg(btt, bip, page, off, sector, len); | 1431 | ret = btt_read_pg(btt, bip, page, off, sector, len); |
| 1432 | flush_dcache_page(page); | 1432 | flush_dcache_page(page); |
| 1433 | } else { | 1433 | } else { |
| @@ -1464,7 +1464,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) | |||
| 1464 | } | 1464 | } |
| 1465 | 1465 | ||
| 1466 | err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, | 1466 | err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, |
| 1467 | op_is_write(bio_op(bio)), iter.bi_sector); | 1467 | bio_op(bio), iter.bi_sector); |
| 1468 | if (err) { | 1468 | if (err) { |
| 1469 | dev_err(&btt->nd_btt->dev, | 1469 | dev_err(&btt->nd_btt->dev, |
| 1470 | "io error in %s sector %lld, len %d,\n", | 1470 | "io error in %s sector %lld, len %d,\n", |
| @@ -1483,16 +1483,16 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) | |||
| 1483 | } | 1483 | } |
| 1484 | 1484 | ||
| 1485 | static int btt_rw_page(struct block_device *bdev, sector_t sector, | 1485 | static int btt_rw_page(struct block_device *bdev, sector_t sector, |
| 1486 | struct page *page, bool is_write) | 1486 | struct page *page, unsigned int op) |
| 1487 | { | 1487 | { |
| 1488 | struct btt *btt = bdev->bd_disk->private_data; | 1488 | struct btt *btt = bdev->bd_disk->private_data; |
| 1489 | int rc; | 1489 | int rc; |
| 1490 | unsigned int len; | 1490 | unsigned int len; |
| 1491 | 1491 | ||
| 1492 | len = hpage_nr_pages(page) * PAGE_SIZE; | 1492 | len = hpage_nr_pages(page) * PAGE_SIZE; |
| 1493 | rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector); | 1493 | rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector); |
| 1494 | if (rc == 0) | 1494 | if (rc == 0) |
| 1495 | page_endio(page, is_write, 0); | 1495 | page_endio(page, op_is_write(op), 0); |
| 1496 | 1496 | ||
| 1497 | return rc; | 1497 | return rc; |
| 1498 | } | 1498 | } |
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 32e0364b48b9..6ee7fd7e4bbd 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h | |||
| @@ -396,16 +396,15 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) | |||
| 396 | return false; | 396 | return false; |
| 397 | 397 | ||
| 398 | *start = jiffies; | 398 | *start = jiffies; |
| 399 | generic_start_io_acct(disk->queue, bio_data_dir(bio), | 399 | generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio), |
| 400 | bio_sectors(bio), &disk->part0); | 400 | &disk->part0); |
| 401 | return true; | 401 | return true; |
| 402 | } | 402 | } |
| 403 | static inline void nd_iostat_end(struct bio *bio, unsigned long start) | 403 | static inline void nd_iostat_end(struct bio *bio, unsigned long start) |
| 404 | { | 404 | { |
| 405 | struct gendisk *disk = bio->bi_disk; | 405 | struct gendisk *disk = bio->bi_disk; |
| 406 | 406 | ||
| 407 | generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, | 407 | generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start); |
| 408 | start); | ||
| 409 | } | 408 | } |
| 410 | static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, | 409 | static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, |
| 411 | unsigned int len) | 410 | unsigned int len) |
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8b1fd7f1a224..dd17acd8fe68 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c | |||
| @@ -120,7 +120,7 @@ static blk_status_t read_pmem(struct page *page, unsigned int off, | |||
| 120 | } | 120 | } |
| 121 | 121 | ||
| 122 | static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, | 122 | static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, |
| 123 | unsigned int len, unsigned int off, bool is_write, | 123 | unsigned int len, unsigned int off, unsigned int op, |
| 124 | sector_t sector) | 124 | sector_t sector) |
| 125 | { | 125 | { |
| 126 | blk_status_t rc = BLK_STS_OK; | 126 | blk_status_t rc = BLK_STS_OK; |
| @@ -131,7 +131,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, | |||
| 131 | if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) | 131 | if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) |
| 132 | bad_pmem = true; | 132 | bad_pmem = true; |
| 133 | 133 | ||
| 134 | if (!is_write) { | 134 | if (!op_is_write(op)) { |
| 135 | if (unlikely(bad_pmem)) | 135 | if (unlikely(bad_pmem)) |
| 136 | rc = BLK_STS_IOERR; | 136 | rc = BLK_STS_IOERR; |
| 137 | else { | 137 | else { |
| @@ -180,8 +180,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) | |||
| 180 | do_acct = nd_iostat_start(bio, &start); | 180 | do_acct = nd_iostat_start(bio, &start); |
| 181 | bio_for_each_segment(bvec, bio, iter) { | 181 | bio_for_each_segment(bvec, bio, iter) { |
| 182 | rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, | 182 | rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, |
| 183 | bvec.bv_offset, op_is_write(bio_op(bio)), | 183 | bvec.bv_offset, bio_op(bio), iter.bi_sector); |
| 184 | iter.bi_sector); | ||
| 185 | if (rc) { | 184 | if (rc) { |
| 186 | bio->bi_status = rc; | 185 | bio->bi_status = rc; |
| 187 | break; | 186 | break; |
| @@ -198,13 +197,13 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) | |||
| 198 | } | 197 | } |
| 199 | 198 | ||
| 200 | static int pmem_rw_page(struct block_device *bdev, sector_t sector, | 199 | static int pmem_rw_page(struct block_device *bdev, sector_t sector, |
| 201 | struct page *page, bool is_write) | 200 | struct page *page, unsigned int op) |
| 202 | { | 201 | { |
| 203 | struct pmem_device *pmem = bdev->bd_queue->queuedata; | 202 | struct pmem_device *pmem = bdev->bd_queue->queuedata; |
| 204 | blk_status_t rc; | 203 | blk_status_t rc; |
| 205 | 204 | ||
| 206 | rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, | 205 | rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, |
| 207 | 0, is_write, sector); | 206 | 0, op, sector); |
| 208 | 207 | ||
| 209 | /* | 208 | /* |
| 210 | * The ->rw_page interface is subtle and tricky. The core | 209 | * The ->rw_page interface is subtle and tricky. The core |
| @@ -213,7 +212,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, | |||
| 213 | * caused by double completion. | 212 | * caused by double completion. |
| 214 | */ | 213 | */ |
| 215 | if (rc == 0) | 214 | if (rc == 0) |
| 216 | page_endio(page, is_write, 0); | 215 | page_endio(page, op_is_write(op), 0); |
| 217 | 216 | ||
| 218 | return blk_status_to_errno(rc); | 217 | return blk_status_to_errno(rc); |
| 219 | } | 218 | } |
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bf65501e6ed6..dd8ec1dd9219 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c | |||
| @@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req) | |||
| 252 | trace_nvme_complete_rq(req); | 252 | trace_nvme_complete_rq(req); |
| 253 | 253 | ||
| 254 | if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { | 254 | if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { |
| 255 | if (nvme_req_needs_failover(req, status)) { | 255 | if ((req->cmd_flags & REQ_NVME_MPATH) && |
| 256 | blk_path_error(status)) { | ||
| 256 | nvme_failover_req(req); | 257 | nvme_failover_req(req); |
| 257 | return; | 258 | return; |
| 258 | } | 259 | } |
| @@ -617,6 +618,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, | |||
| 617 | if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) | 618 | if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) |
| 618 | return BLK_STS_NOTSUPP; | 619 | return BLK_STS_NOTSUPP; |
| 619 | control |= NVME_RW_PRINFO_PRACT; | 620 | control |= NVME_RW_PRINFO_PRACT; |
| 621 | } else if (req_op(req) == REQ_OP_WRITE) { | ||
| 622 | t10_pi_prepare(req, ns->pi_type); | ||
| 620 | } | 623 | } |
| 621 | 624 | ||
| 622 | switch (ns->pi_type) { | 625 | switch (ns->pi_type) { |
| @@ -627,8 +630,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, | |||
| 627 | case NVME_NS_DPS_PI_TYPE2: | 630 | case NVME_NS_DPS_PI_TYPE2: |
| 628 | control |= NVME_RW_PRINFO_PRCHK_GUARD | | 631 | control |= NVME_RW_PRINFO_PRCHK_GUARD | |
| 629 | NVME_RW_PRINFO_PRCHK_REF; | 632 | NVME_RW_PRINFO_PRCHK_REF; |
| 630 | cmnd->rw.reftag = cpu_to_le32( | 633 | cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); |
| 631 | nvme_block_nr(ns, blk_rq_pos(req))); | ||
| 632 | break; | 634 | break; |
| 633 | } | 635 | } |
| 634 | } | 636 | } |
| @@ -638,6 +640,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, | |||
| 638 | return 0; | 640 | return 0; |
| 639 | } | 641 | } |
| 640 | 642 | ||
| 643 | void nvme_cleanup_cmd(struct request *req) | ||
| 644 | { | ||
| 645 | if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && | ||
| 646 | nvme_req(req)->status == 0) { | ||
| 647 | struct nvme_ns *ns = req->rq_disk->private_data; | ||
| 648 | |||
| 649 | t10_pi_complete(req, ns->pi_type, | ||
| 650 | blk_rq_bytes(req) >> ns->lba_shift); | ||
| 651 | } | ||
| 652 | if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { | ||
| 653 | kfree(page_address(req->special_vec.bv_page) + | ||
| 654 | req->special_vec.bv_offset); | ||
| 655 | } | ||
| 656 | } | ||
| 657 | EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); | ||
| 658 | |||
| 641 | blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, | 659 | blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, |
| 642 | struct nvme_command *cmd) | 660 | struct nvme_command *cmd) |
| 643 | { | 661 | { |
| @@ -668,10 +686,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, | |||
| 668 | } | 686 | } |
| 669 | 687 | ||
| 670 | cmd->common.command_id = req->tag; | 688 | cmd->common.command_id = req->tag; |
| 671 | if (ns) | 689 | trace_nvme_setup_cmd(req, cmd); |
| 672 | trace_nvme_setup_nvm_cmd(req->q->id, cmd); | ||
| 673 | else | ||
| 674 | trace_nvme_setup_admin_cmd(cmd); | ||
| 675 | return ret; | 690 | return ret; |
| 676 | } | 691 | } |
| 677 | EXPORT_SYMBOL_GPL(nvme_setup_cmd); | 692 | EXPORT_SYMBOL_GPL(nvme_setup_cmd); |
| @@ -864,9 +879,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) | |||
| 864 | if (unlikely(ctrl->kato == 0)) | 879 | if (unlikely(ctrl->kato == 0)) |
| 865 | return; | 880 | return; |
| 866 | 881 | ||
| 867 | INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); | ||
| 868 | memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); | ||
| 869 | ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; | ||
| 870 | schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); | 882 | schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); |
| 871 | } | 883 | } |
| 872 | 884 | ||
| @@ -1056,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) | |||
| 1056 | EXPORT_SYMBOL_GPL(nvme_set_queue_count); | 1068 | EXPORT_SYMBOL_GPL(nvme_set_queue_count); |
| 1057 | 1069 | ||
| 1058 | #define NVME_AEN_SUPPORTED \ | 1070 | #define NVME_AEN_SUPPORTED \ |
| 1059 | (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT) | 1071 | (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) |
| 1060 | 1072 | ||
| 1061 | static void nvme_enable_aen(struct nvme_ctrl *ctrl) | 1073 | static void nvme_enable_aen(struct nvme_ctrl *ctrl) |
| 1062 | { | 1074 | { |
| @@ -1472,6 +1484,12 @@ static void nvme_update_disk_info(struct gendisk *disk, | |||
| 1472 | 1484 | ||
| 1473 | set_capacity(disk, capacity); | 1485 | set_capacity(disk, capacity); |
| 1474 | nvme_config_discard(ns); | 1486 | nvme_config_discard(ns); |
| 1487 | |||
| 1488 | if (id->nsattr & (1 << 0)) | ||
| 1489 | set_disk_ro(disk, true); | ||
| 1490 | else | ||
| 1491 | set_disk_ro(disk, false); | ||
| 1492 | |||
| 1475 | blk_mq_unfreeze_queue(disk->queue); | 1493 | blk_mq_unfreeze_queue(disk->queue); |
| 1476 | } | 1494 | } |
| 1477 | 1495 | ||
| @@ -2270,21 +2288,16 @@ out_unlock: | |||
| 2270 | return ret; | 2288 | return ret; |
| 2271 | } | 2289 | } |
| 2272 | 2290 | ||
| 2273 | int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, | 2291 | int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, |
| 2274 | u8 log_page, void *log, | 2292 | void *log, size_t size, u64 offset) |
| 2275 | size_t size, u64 offset) | ||
| 2276 | { | 2293 | { |
| 2277 | struct nvme_command c = { }; | 2294 | struct nvme_command c = { }; |
| 2278 | unsigned long dwlen = size / 4 - 1; | 2295 | unsigned long dwlen = size / 4 - 1; |
| 2279 | 2296 | ||
| 2280 | c.get_log_page.opcode = nvme_admin_get_log_page; | 2297 | c.get_log_page.opcode = nvme_admin_get_log_page; |
| 2281 | 2298 | c.get_log_page.nsid = cpu_to_le32(nsid); | |
| 2282 | if (ns) | ||
| 2283 | c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id); | ||
| 2284 | else | ||
| 2285 | c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL); | ||
| 2286 | |||
| 2287 | c.get_log_page.lid = log_page; | 2299 | c.get_log_page.lid = log_page; |
| 2300 | c.get_log_page.lsp = lsp; | ||
| 2288 | c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); | 2301 | c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); |
| 2289 | c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); | 2302 | c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); |
| 2290 | c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); | 2303 | c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); |
| @@ -2293,12 +2306,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, | |||
| 2293 | return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); | 2306 | return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); |
| 2294 | } | 2307 | } |
| 2295 | 2308 | ||
| 2296 | static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, | ||
| 2297 | size_t size) | ||
| 2298 | { | ||
| 2299 | return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0); | ||
| 2300 | } | ||
| 2301 | |||
| 2302 | static int nvme_get_effects_log(struct nvme_ctrl *ctrl) | 2309 | static int nvme_get_effects_log(struct nvme_ctrl *ctrl) |
| 2303 | { | 2310 | { |
| 2304 | int ret; | 2311 | int ret; |
| @@ -2309,8 +2316,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) | |||
| 2309 | if (!ctrl->effects) | 2316 | if (!ctrl->effects) |
| 2310 | return 0; | 2317 | return 0; |
| 2311 | 2318 | ||
| 2312 | ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, | 2319 | ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, |
| 2313 | sizeof(*ctrl->effects)); | 2320 | ctrl->effects, sizeof(*ctrl->effects), 0); |
| 2314 | if (ret) { | 2321 | if (ret) { |
| 2315 | kfree(ctrl->effects); | 2322 | kfree(ctrl->effects); |
| 2316 | ctrl->effects = NULL; | 2323 | ctrl->effects = NULL; |
| @@ -2401,6 +2408,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) | |||
| 2401 | nvme_set_queue_limits(ctrl, ctrl->admin_q); | 2408 | nvme_set_queue_limits(ctrl, ctrl->admin_q); |
| 2402 | ctrl->sgls = le32_to_cpu(id->sgls); | 2409 | ctrl->sgls = le32_to_cpu(id->sgls); |
| 2403 | ctrl->kas = le16_to_cpu(id->kas); | 2410 | ctrl->kas = le16_to_cpu(id->kas); |
| 2411 | ctrl->max_namespaces = le32_to_cpu(id->mnan); | ||
| 2404 | 2412 | ||
| 2405 | if (id->rtd3e) { | 2413 | if (id->rtd3e) { |
| 2406 | /* us -> s */ | 2414 | /* us -> s */ |
| @@ -2460,8 +2468,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) | |||
| 2460 | ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); | 2468 | ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); |
| 2461 | } | 2469 | } |
| 2462 | 2470 | ||
| 2471 | ret = nvme_mpath_init(ctrl, id); | ||
| 2463 | kfree(id); | 2472 | kfree(id); |
| 2464 | 2473 | ||
| 2474 | if (ret < 0) | ||
| 2475 | return ret; | ||
| 2476 | |||
| 2465 | if (ctrl->apst_enabled && !prev_apst_enabled) | 2477 | if (ctrl->apst_enabled && !prev_apst_enabled) |
| 2466 | dev_pm_qos_expose_latency_tolerance(ctrl->device); | 2478 | dev_pm_qos_expose_latency_tolerance(ctrl->device); |
| 2467 | else if (!ctrl->apst_enabled && prev_apst_enabled) | 2479 | else if (!ctrl->apst_enabled && prev_apst_enabled) |
| @@ -2680,6 +2692,10 @@ static struct attribute *nvme_ns_id_attrs[] = { | |||
| 2680 | &dev_attr_nguid.attr, | 2692 | &dev_attr_nguid.attr, |
| 2681 | &dev_attr_eui.attr, | 2693 | &dev_attr_eui.attr, |
| 2682 | &dev_attr_nsid.attr, | 2694 | &dev_attr_nsid.attr, |
| 2695 | #ifdef CONFIG_NVME_MULTIPATH | ||
| 2696 | &dev_attr_ana_grpid.attr, | ||
| 2697 | &dev_attr_ana_state.attr, | ||
| 2698 | #endif | ||
| 2683 | NULL, | 2699 | NULL, |
| 2684 | }; | 2700 | }; |
| 2685 | 2701 | ||
| @@ -2702,6 +2718,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, | |||
| 2702 | if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) | 2718 | if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) |
| 2703 | return 0; | 2719 | return 0; |
| 2704 | } | 2720 | } |
| 2721 | #ifdef CONFIG_NVME_MULTIPATH | ||
| 2722 | if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { | ||
| 2723 | if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ | ||
| 2724 | return 0; | ||
| 2725 | if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) | ||
| 2726 | return 0; | ||
| 2727 | } | ||
| 2728 | #endif | ||
| 2705 | return a->mode; | 2729 | return a->mode; |
| 2706 | } | 2730 | } |
| 2707 | 2731 | ||
| @@ -3075,8 +3099,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) | |||
| 3075 | 3099 | ||
| 3076 | nvme_get_ctrl(ctrl); | 3100 | nvme_get_ctrl(ctrl); |
| 3077 | 3101 | ||
| 3078 | kfree(id); | ||
| 3079 | |||
| 3080 | device_add_disk(ctrl->device, ns->disk); | 3102 | device_add_disk(ctrl->device, ns->disk); |
| 3081 | if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, | 3103 | if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, |
| 3082 | &nvme_ns_id_attr_group)) | 3104 | &nvme_ns_id_attr_group)) |
| @@ -3086,8 +3108,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) | |||
| 3086 | pr_warn("%s: failed to register lightnvm sysfs group for identification\n", | 3108 | pr_warn("%s: failed to register lightnvm sysfs group for identification\n", |
| 3087 | ns->disk->disk_name); | 3109 | ns->disk->disk_name); |
| 3088 | 3110 | ||
| 3089 | nvme_mpath_add_disk(ns->head); | 3111 | nvme_mpath_add_disk(ns, id); |
| 3090 | nvme_fault_inject_init(ns); | 3112 | nvme_fault_inject_init(ns); |
| 3113 | kfree(id); | ||
| 3114 | |||
| 3091 | return; | 3115 | return; |
| 3092 | out_unlink_ns: | 3116 | out_unlink_ns: |
| 3093 | mutex_lock(&ctrl->subsys->lock); | 3117 | mutex_lock(&ctrl->subsys->lock); |
| @@ -3229,7 +3253,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) | |||
| 3229 | * raced with us in reading the log page, which could cause us to miss | 3253 | * raced with us in reading the log page, which could cause us to miss |
| 3230 | * updates. | 3254 | * updates. |
| 3231 | */ | 3255 | */ |
| 3232 | error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); | 3256 | error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, |
| 3257 | log_size, 0); | ||
| 3233 | if (error) | 3258 | if (error) |
| 3234 | dev_warn(ctrl->device, | 3259 | dev_warn(ctrl->device, |
| 3235 | "reading changed ns log failed: %d\n", error); | 3260 | "reading changed ns log failed: %d\n", error); |
| @@ -3346,9 +3371,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) | |||
| 3346 | if (!log) | 3371 | if (!log) |
| 3347 | return; | 3372 | return; |
| 3348 | 3373 | ||
| 3349 | if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) | 3374 | if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log, |
| 3350 | dev_warn(ctrl->device, | 3375 | sizeof(*log), 0)) |
| 3351 | "Get FW SLOT INFO log error\n"); | 3376 | dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); |
| 3352 | kfree(log); | 3377 | kfree(log); |
| 3353 | } | 3378 | } |
| 3354 | 3379 | ||
| @@ -3394,6 +3419,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) | |||
| 3394 | case NVME_AER_NOTICE_FW_ACT_STARTING: | 3419 | case NVME_AER_NOTICE_FW_ACT_STARTING: |
| 3395 | queue_work(nvme_wq, &ctrl->fw_act_work); | 3420 | queue_work(nvme_wq, &ctrl->fw_act_work); |
| 3396 | break; | 3421 | break; |
| 3422 | #ifdef CONFIG_NVME_MULTIPATH | ||
| 3423 | case NVME_AER_NOTICE_ANA: | ||
| 3424 | if (!ctrl->ana_log_buf) | ||
| 3425 | break; | ||
| 3426 | queue_work(nvme_wq, &ctrl->ana_work); | ||
| 3427 | break; | ||
| 3428 | #endif | ||
| 3397 | default: | 3429 | default: |
| 3398 | dev_warn(ctrl->device, "async event result %08x\n", result); | 3430 | dev_warn(ctrl->device, "async event result %08x\n", result); |
| 3399 | } | 3431 | } |
| @@ -3426,6 +3458,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); | |||
| 3426 | 3458 | ||
| 3427 | void nvme_stop_ctrl(struct nvme_ctrl *ctrl) | 3459 | void nvme_stop_ctrl(struct nvme_ctrl *ctrl) |
| 3428 | { | 3460 | { |
| 3461 | nvme_mpath_stop(ctrl); | ||
| 3429 | nvme_stop_keep_alive(ctrl); | 3462 | nvme_stop_keep_alive(ctrl); |
| 3430 | flush_work(&ctrl->async_event_work); | 3463 | flush_work(&ctrl->async_event_work); |
| 3431 | flush_work(&ctrl->scan_work); | 3464 | flush_work(&ctrl->scan_work); |
| @@ -3463,6 +3496,7 @@ static void nvme_free_ctrl(struct device *dev) | |||
| 3463 | 3496 | ||
| 3464 | ida_simple_remove(&nvme_instance_ida, ctrl->instance); | 3497 | ida_simple_remove(&nvme_instance_ida, ctrl->instance); |
| 3465 | kfree(ctrl->effects); | 3498 | kfree(ctrl->effects); |
| 3499 | nvme_mpath_uninit(ctrl); | ||
| 3466 | 3500 | ||
| 3467 | if (subsys) { | 3501 | if (subsys) { |
| 3468 | mutex_lock(&subsys->lock); | 3502 | mutex_lock(&subsys->lock); |
| @@ -3499,6 +3533,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, | |||
| 3499 | INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); | 3533 | INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); |
| 3500 | INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); | 3534 | INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); |
| 3501 | 3535 | ||
| 3536 | INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); | ||
| 3537 | memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); | ||
| 3538 | ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; | ||
| 3539 | |||
| 3502 | ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); | 3540 | ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); |
| 3503 | if (ret < 0) | 3541 | if (ret < 0) |
| 3504 | goto out; | 3542 | goto out; |
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f7efe5a58cc7..206d63cb1afc 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c | |||
| @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); | |||
| 474 | 474 | ||
| 475 | bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) | 475 | bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) |
| 476 | { | 476 | { |
| 477 | if (ctrl->opts->max_reconnects != -1 && | 477 | if (ctrl->opts->max_reconnects == -1 || |
| 478 | ctrl->nr_reconnects < ctrl->opts->max_reconnects) | 478 | ctrl->nr_reconnects < ctrl->opts->max_reconnects) |
| 479 | return true; | 479 | return true; |
| 480 | 480 | ||
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9bac912173ba..611e70cae754 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c | |||
| @@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, | |||
| 1737 | int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; | 1737 | int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; |
| 1738 | struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; | 1738 | struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; |
| 1739 | 1739 | ||
| 1740 | nvme_req(rq)->ctrl = &ctrl->ctrl; | ||
| 1740 | return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); | 1741 | return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); |
| 1741 | } | 1742 | } |
| 1742 | 1743 | ||
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 41279da799ed..6fe5923c95d4 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c | |||
| @@ -414,12 +414,6 @@ static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, | |||
| 414 | /* Set compacted version for upper layers */ | 414 | /* Set compacted version for upper layers */ |
| 415 | geo->version = NVM_OCSSD_SPEC_20; | 415 | geo->version = NVM_OCSSD_SPEC_20; |
| 416 | 416 | ||
| 417 | if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) { | ||
| 418 | pr_err("nvm: OCSSD version not supported (v%d.%d)\n", | ||
| 419 | geo->major_ver_id, geo->minor_ver_id); | ||
| 420 | return -EINVAL; | ||
| 421 | } | ||
| 422 | |||
| 423 | geo->num_ch = le16_to_cpu(id->num_grp); | 417 | geo->num_ch = le16_to_cpu(id->num_grp); |
| 424 | geo->num_lun = le16_to_cpu(id->num_pu); | 418 | geo->num_lun = le16_to_cpu(id->num_pu); |
| 425 | geo->all_luns = geo->num_ch * geo->num_lun; | 419 | geo->all_luns = geo->num_ch * geo->num_lun; |
| @@ -583,7 +577,13 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, | |||
| 583 | struct ppa_addr ppa; | 577 | struct ppa_addr ppa; |
| 584 | size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); | 578 | size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); |
| 585 | size_t log_pos, offset, len; | 579 | size_t log_pos, offset, len; |
| 586 | int ret, i; | 580 | int ret, i, max_len; |
| 581 | |||
| 582 | /* | ||
| 583 | * limit requests to maximum 256K to avoid issuing arbitrary large | ||
| 584 | * requests when the device does not specify a maximum transfer size. | ||
| 585 | */ | ||
| 586 | max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024); | ||
| 587 | 587 | ||
| 588 | /* Normalize lba address space to obtain log offset */ | 588 | /* Normalize lba address space to obtain log offset */ |
| 589 | ppa.ppa = slba; | 589 | ppa.ppa = slba; |
| @@ -596,10 +596,11 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, | |||
| 596 | offset = log_pos * sizeof(struct nvme_nvm_chk_meta); | 596 | offset = log_pos * sizeof(struct nvme_nvm_chk_meta); |
| 597 | 597 | ||
| 598 | while (left) { | 598 | while (left) { |
| 599 | len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9); | 599 | len = min_t(unsigned int, left, max_len); |
| 600 | 600 | ||
| 601 | ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK, | 601 | ret = nvme_get_log(ctrl, ns->head->ns_id, |
| 602 | dev_meta, len, offset); | 602 | NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, |
| 603 | offset); | ||
| 603 | if (ret) { | 604 | if (ret) { |
| 604 | dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); | 605 | dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); |
| 605 | break; | 606 | break; |
| @@ -662,12 +663,10 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, | |||
| 662 | 663 | ||
| 663 | rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; | 664 | rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; |
| 664 | 665 | ||
| 665 | if (rqd->bio) { | 666 | if (rqd->bio) |
| 666 | blk_init_request_from_bio(rq, rqd->bio); | 667 | blk_init_request_from_bio(rq, rqd->bio); |
| 667 | } else { | 668 | else |
| 668 | rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); | 669 | rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); |
| 669 | rq->__data_len = 0; | ||
| 670 | } | ||
| 671 | 670 | ||
| 672 | return rq; | 671 | return rq; |
| 673 | } | 672 | } |
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1ffd3e8b13a1..5a9562881d4e 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (c) 2017 Christoph Hellwig. | 2 | * Copyright (c) 2017-2018 Christoph Hellwig. |
| 3 | * | 3 | * |
| 4 | * This program is free software; you can redistribute it and/or modify it | 4 | * This program is free software; you can redistribute it and/or modify it |
| 5 | * under the terms and conditions of the GNU General Public License, | 5 | * under the terms and conditions of the GNU General Public License, |
| @@ -20,6 +20,11 @@ module_param(multipath, bool, 0444); | |||
| 20 | MODULE_PARM_DESC(multipath, | 20 | MODULE_PARM_DESC(multipath, |
| 21 | "turn on native support for multiple controllers per subsystem"); | 21 | "turn on native support for multiple controllers per subsystem"); |
| 22 | 22 | ||
| 23 | inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) | ||
| 24 | { | ||
| 25 | return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3)); | ||
| 26 | } | ||
| 27 | |||
| 23 | /* | 28 | /* |
| 24 | * If multipathing is enabled we need to always use the subsystem instance | 29 | * If multipathing is enabled we need to always use the subsystem instance |
| 25 | * number for numbering our devices to avoid conflicts between subsystems that | 30 | * number for numbering our devices to avoid conflicts between subsystems that |
| @@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, | |||
| 45 | void nvme_failover_req(struct request *req) | 50 | void nvme_failover_req(struct request *req) |
| 46 | { | 51 | { |
| 47 | struct nvme_ns *ns = req->q->queuedata; | 52 | struct nvme_ns *ns = req->q->queuedata; |
| 53 | u16 status = nvme_req(req)->status; | ||
| 48 | unsigned long flags; | 54 | unsigned long flags; |
| 49 | 55 | ||
| 50 | spin_lock_irqsave(&ns->head->requeue_lock, flags); | 56 | spin_lock_irqsave(&ns->head->requeue_lock, flags); |
| @@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req) | |||
| 52 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags); | 58 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags); |
| 53 | blk_mq_end_request(req, 0); | 59 | blk_mq_end_request(req, 0); |
| 54 | 60 | ||
| 55 | nvme_reset_ctrl(ns->ctrl); | 61 | switch (status & 0x7ff) { |
| 56 | kblockd_schedule_work(&ns->head->requeue_work); | 62 | case NVME_SC_ANA_TRANSITION: |
| 57 | } | 63 | case NVME_SC_ANA_INACCESSIBLE: |
| 64 | case NVME_SC_ANA_PERSISTENT_LOSS: | ||
| 65 | /* | ||
| 66 | * If we got back an ANA error we know the controller is alive, | ||
| 67 | * but not ready to serve this namespace. The spec suggests | ||
| 68 | * we should update our general state here, but due to the fact | ||
| 69 | * that the admin and I/O queues are not serialized that is | ||
| 70 | * fundamentally racy. So instead just clear the current path, | ||
| 71 | * mark the the path as pending and kick of a re-read of the ANA | ||
| 72 | * log page ASAP. | ||
| 73 | */ | ||
| 74 | nvme_mpath_clear_current_path(ns); | ||
| 75 | if (ns->ctrl->ana_log_buf) { | ||
| 76 | set_bit(NVME_NS_ANA_PENDING, &ns->flags); | ||
| 77 | queue_work(nvme_wq, &ns->ctrl->ana_work); | ||
| 78 | } | ||
| 79 | break; | ||
| 80 | default: | ||
| 81 | /* | ||
| 82 | * Reset the controller for any non-ANA error as we don't know | ||
| 83 | * what caused the error. | ||
| 84 | */ | ||
| 85 | nvme_reset_ctrl(ns->ctrl); | ||
| 86 | break; | ||
| 87 | } | ||
| 58 | 88 | ||
| 59 | bool nvme_req_needs_failover(struct request *req, blk_status_t error) | 89 | kblockd_schedule_work(&ns->head->requeue_work); |
| 60 | { | ||
| 61 | if (!(req->cmd_flags & REQ_NVME_MPATH)) | ||
| 62 | return false; | ||
| 63 | return blk_path_error(error); | ||
| 64 | } | 90 | } |
| 65 | 91 | ||
| 66 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) | 92 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) |
| @@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) | |||
| 75 | up_read(&ctrl->namespaces_rwsem); | 101 | up_read(&ctrl->namespaces_rwsem); |
| 76 | } | 102 | } |
| 77 | 103 | ||
| 104 | static const char *nvme_ana_state_names[] = { | ||
| 105 | [0] = "invalid state", | ||
| 106 | [NVME_ANA_OPTIMIZED] = "optimized", | ||
| 107 | [NVME_ANA_NONOPTIMIZED] = "non-optimized", | ||
| 108 | [NVME_ANA_INACCESSIBLE] = "inaccessible", | ||
| 109 | [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", | ||
| 110 | [NVME_ANA_CHANGE] = "change", | ||
| 111 | }; | ||
| 112 | |||
| 78 | static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) | 113 | static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) |
| 79 | { | 114 | { |
| 80 | struct nvme_ns *ns; | 115 | struct nvme_ns *ns, *fallback = NULL; |
| 81 | 116 | ||
| 82 | list_for_each_entry_rcu(ns, &head->list, siblings) { | 117 | list_for_each_entry_rcu(ns, &head->list, siblings) { |
| 83 | if (ns->ctrl->state == NVME_CTRL_LIVE) { | 118 | if (ns->ctrl->state != NVME_CTRL_LIVE || |
| 119 | test_bit(NVME_NS_ANA_PENDING, &ns->flags)) | ||
| 120 | continue; | ||
| 121 | switch (ns->ana_state) { | ||
| 122 | case NVME_ANA_OPTIMIZED: | ||
| 84 | rcu_assign_pointer(head->current_path, ns); | 123 | rcu_assign_pointer(head->current_path, ns); |
| 85 | return ns; | 124 | return ns; |
| 125 | case NVME_ANA_NONOPTIMIZED: | ||
| 126 | fallback = ns; | ||
| 127 | break; | ||
| 128 | default: | ||
| 129 | break; | ||
| 86 | } | 130 | } |
| 87 | } | 131 | } |
| 88 | 132 | ||
| 89 | return NULL; | 133 | if (fallback) |
| 134 | rcu_assign_pointer(head->current_path, fallback); | ||
| 135 | return fallback; | ||
| 136 | } | ||
| 137 | |||
| 138 | static inline bool nvme_path_is_optimized(struct nvme_ns *ns) | ||
| 139 | { | ||
| 140 | return ns->ctrl->state == NVME_CTRL_LIVE && | ||
| 141 | ns->ana_state == NVME_ANA_OPTIMIZED; | ||
| 90 | } | 142 | } |
| 91 | 143 | ||
| 92 | inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) | 144 | inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) |
| 93 | { | 145 | { |
| 94 | struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); | 146 | struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); |
| 95 | 147 | ||
| 96 | if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) | 148 | if (unlikely(!ns || !nvme_path_is_optimized(ns))) |
| 97 | ns = __nvme_find_path(head); | 149 | ns = __nvme_find_path(head); |
| 98 | return ns; | 150 | return ns; |
| 99 | } | 151 | } |
| @@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) | |||
| 142 | 194 | ||
| 143 | srcu_idx = srcu_read_lock(&head->srcu); | 195 | srcu_idx = srcu_read_lock(&head->srcu); |
| 144 | ns = srcu_dereference(head->current_path, &head->srcu); | 196 | ns = srcu_dereference(head->current_path, &head->srcu); |
| 145 | if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) | 197 | if (likely(ns && nvme_path_is_optimized(ns))) |
| 146 | found = ns->queue->poll_fn(q, qc); | 198 | found = ns->queue->poll_fn(q, qc); |
| 147 | srcu_read_unlock(&head->srcu, srcu_idx); | 199 | srcu_read_unlock(&head->srcu, srcu_idx); |
| 148 | return found; | 200 | return found; |
| @@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) | |||
| 176 | struct request_queue *q; | 228 | struct request_queue *q; |
| 177 | bool vwc = false; | 229 | bool vwc = false; |
| 178 | 230 | ||
| 231 | mutex_init(&head->lock); | ||
| 179 | bio_list_init(&head->requeue_list); | 232 | bio_list_init(&head->requeue_list); |
| 180 | spin_lock_init(&head->requeue_lock); | 233 | spin_lock_init(&head->requeue_lock); |
| 181 | INIT_WORK(&head->requeue_work, nvme_requeue_work); | 234 | INIT_WORK(&head->requeue_work, nvme_requeue_work); |
| @@ -220,29 +273,232 @@ out: | |||
| 220 | return -ENOMEM; | 273 | return -ENOMEM; |
| 221 | } | 274 | } |
| 222 | 275 | ||
| 223 | void nvme_mpath_add_disk(struct nvme_ns_head *head) | 276 | static void nvme_mpath_set_live(struct nvme_ns *ns) |
| 224 | { | 277 | { |
| 278 | struct nvme_ns_head *head = ns->head; | ||
| 279 | |||
| 280 | lockdep_assert_held(&ns->head->lock); | ||
| 281 | |||
| 225 | if (!head->disk) | 282 | if (!head->disk) |
| 226 | return; | 283 | return; |
| 227 | 284 | ||
| 228 | mutex_lock(&head->subsys->lock); | ||
| 229 | if (!(head->disk->flags & GENHD_FL_UP)) { | 285 | if (!(head->disk->flags & GENHD_FL_UP)) { |
| 230 | device_add_disk(&head->subsys->dev, head->disk); | 286 | device_add_disk(&head->subsys->dev, head->disk); |
| 231 | if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, | 287 | if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, |
| 232 | &nvme_ns_id_attr_group)) | 288 | &nvme_ns_id_attr_group)) |
| 233 | pr_warn("%s: failed to create sysfs group for identification\n", | 289 | dev_warn(&head->subsys->dev, |
| 234 | head->disk->disk_name); | 290 | "failed to create id group.\n"); |
| 291 | } | ||
| 292 | |||
| 293 | kblockd_schedule_work(&ns->head->requeue_work); | ||
| 294 | } | ||
| 295 | |||
| 296 | static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, | ||
| 297 | int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, | ||
| 298 | void *)) | ||
| 299 | { | ||
| 300 | void *base = ctrl->ana_log_buf; | ||
| 301 | size_t offset = sizeof(struct nvme_ana_rsp_hdr); | ||
| 302 | int error, i; | ||
| 303 | |||
| 304 | lockdep_assert_held(&ctrl->ana_lock); | ||
| 305 | |||
| 306 | for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { | ||
| 307 | struct nvme_ana_group_desc *desc = base + offset; | ||
| 308 | u32 nr_nsids = le32_to_cpu(desc->nnsids); | ||
| 309 | size_t nsid_buf_size = nr_nsids * sizeof(__le32); | ||
| 310 | |||
| 311 | if (WARN_ON_ONCE(desc->grpid == 0)) | ||
| 312 | return -EINVAL; | ||
| 313 | if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) | ||
| 314 | return -EINVAL; | ||
| 315 | if (WARN_ON_ONCE(desc->state == 0)) | ||
| 316 | return -EINVAL; | ||
| 317 | if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) | ||
| 318 | return -EINVAL; | ||
| 319 | |||
| 320 | offset += sizeof(*desc); | ||
| 321 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) | ||
| 322 | return -EINVAL; | ||
| 323 | |||
| 324 | error = cb(ctrl, desc, data); | ||
| 325 | if (error) | ||
| 326 | return error; | ||
| 327 | |||
| 328 | offset += nsid_buf_size; | ||
| 329 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) | ||
| 330 | return -EINVAL; | ||
| 331 | } | ||
| 332 | |||
| 333 | return 0; | ||
| 334 | } | ||
| 335 | |||
| 336 | static inline bool nvme_state_is_live(enum nvme_ana_state state) | ||
| 337 | { | ||
| 338 | return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; | ||
| 339 | } | ||
| 340 | |||
| 341 | static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, | ||
| 342 | struct nvme_ns *ns) | ||
| 343 | { | ||
| 344 | enum nvme_ana_state old; | ||
| 345 | |||
| 346 | mutex_lock(&ns->head->lock); | ||
| 347 | old = ns->ana_state; | ||
| 348 | ns->ana_grpid = le32_to_cpu(desc->grpid); | ||
| 349 | ns->ana_state = desc->state; | ||
| 350 | clear_bit(NVME_NS_ANA_PENDING, &ns->flags); | ||
| 351 | |||
| 352 | if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old)) | ||
| 353 | nvme_mpath_set_live(ns); | ||
| 354 | mutex_unlock(&ns->head->lock); | ||
| 355 | } | ||
| 356 | |||
| 357 | static int nvme_update_ana_state(struct nvme_ctrl *ctrl, | ||
| 358 | struct nvme_ana_group_desc *desc, void *data) | ||
| 359 | { | ||
| 360 | u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; | ||
| 361 | unsigned *nr_change_groups = data; | ||
| 362 | struct nvme_ns *ns; | ||
| 363 | |||
| 364 | dev_info(ctrl->device, "ANA group %d: %s.\n", | ||
| 365 | le32_to_cpu(desc->grpid), | ||
| 366 | nvme_ana_state_names[desc->state]); | ||
| 367 | |||
| 368 | if (desc->state == NVME_ANA_CHANGE) | ||
| 369 | (*nr_change_groups)++; | ||
| 370 | |||
| 371 | if (!nr_nsids) | ||
| 372 | return 0; | ||
| 373 | |||
| 374 | down_write(&ctrl->namespaces_rwsem); | ||
| 375 | list_for_each_entry(ns, &ctrl->namespaces, list) { | ||
| 376 | if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) | ||
| 377 | continue; | ||
| 378 | nvme_update_ns_ana_state(desc, ns); | ||
| 379 | if (++n == nr_nsids) | ||
| 380 | break; | ||
| 381 | } | ||
| 382 | up_write(&ctrl->namespaces_rwsem); | ||
| 383 | WARN_ON_ONCE(n < nr_nsids); | ||
| 384 | return 0; | ||
| 385 | } | ||
| 386 | |||
| 387 | static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only) | ||
| 388 | { | ||
| 389 | u32 nr_change_groups = 0; | ||
| 390 | int error; | ||
| 391 | |||
| 392 | mutex_lock(&ctrl->ana_lock); | ||
| 393 | error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, | ||
| 394 | groups_only ? NVME_ANA_LOG_RGO : 0, | ||
| 395 | ctrl->ana_log_buf, ctrl->ana_log_size, 0); | ||
| 396 | if (error) { | ||
| 397 | dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); | ||
| 398 | goto out_unlock; | ||
| 399 | } | ||
| 400 | |||
| 401 | error = nvme_parse_ana_log(ctrl, &nr_change_groups, | ||
| 402 | nvme_update_ana_state); | ||
| 403 | if (error) | ||
| 404 | goto out_unlock; | ||
| 405 | |||
| 406 | /* | ||
| 407 | * In theory we should have an ANATT timer per group as they might enter | ||
| 408 | * the change state at different times. But that is a lot of overhead | ||
| 409 | * just to protect against a target that keeps entering new change | ||
| 410 | * states while never finishing previous ones. But we'll still | ||
| 411 | * eventually time out once all groups are in change state, so this | ||
| 412 | * isn't a big deal. | ||
| 413 | * | ||
| 414 | * We also double the ANATT value to provide some slack for transports | ||
| 415 | * or AEN processing overhead. | ||
| 416 | */ | ||
| 417 | if (nr_change_groups) | ||
| 418 | mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); | ||
| 419 | else | ||
| 420 | del_timer_sync(&ctrl->anatt_timer); | ||
| 421 | out_unlock: | ||
| 422 | mutex_unlock(&ctrl->ana_lock); | ||
| 423 | return error; | ||
| 424 | } | ||
| 425 | |||
| 426 | static void nvme_ana_work(struct work_struct *work) | ||
| 427 | { | ||
| 428 | struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); | ||
| 429 | |||
| 430 | nvme_read_ana_log(ctrl, false); | ||
| 431 | } | ||
| 432 | |||
| 433 | static void nvme_anatt_timeout(struct timer_list *t) | ||
| 434 | { | ||
| 435 | struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); | ||
| 436 | |||
| 437 | dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); | ||
| 438 | nvme_reset_ctrl(ctrl); | ||
| 439 | } | ||
| 440 | |||
| 441 | void nvme_mpath_stop(struct nvme_ctrl *ctrl) | ||
| 442 | { | ||
| 443 | if (!nvme_ctrl_use_ana(ctrl)) | ||
| 444 | return; | ||
| 445 | del_timer_sync(&ctrl->anatt_timer); | ||
| 446 | cancel_work_sync(&ctrl->ana_work); | ||
| 447 | } | ||
| 448 | |||
| 449 | static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, | ||
| 450 | char *buf) | ||
| 451 | { | ||
| 452 | return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); | ||
| 453 | } | ||
| 454 | DEVICE_ATTR_RO(ana_grpid); | ||
| 455 | |||
| 456 | static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, | ||
| 457 | char *buf) | ||
| 458 | { | ||
| 459 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | ||
| 460 | |||
| 461 | return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); | ||
| 462 | } | ||
| 463 | DEVICE_ATTR_RO(ana_state); | ||
| 464 | |||
| 465 | static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl, | ||
| 466 | struct nvme_ana_group_desc *desc, void *data) | ||
| 467 | { | ||
| 468 | struct nvme_ns *ns = data; | ||
| 469 | |||
| 470 | if (ns->ana_grpid == le32_to_cpu(desc->grpid)) { | ||
| 471 | nvme_update_ns_ana_state(desc, ns); | ||
| 472 | return -ENXIO; /* just break out of the loop */ | ||
| 473 | } | ||
| 474 | |||
| 475 | return 0; | ||
| 476 | } | ||
| 477 | |||
| 478 | void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) | ||
| 479 | { | ||
| 480 | if (nvme_ctrl_use_ana(ns->ctrl)) { | ||
| 481 | mutex_lock(&ns->ctrl->ana_lock); | ||
| 482 | ns->ana_grpid = le32_to_cpu(id->anagrpid); | ||
| 483 | nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state); | ||
| 484 | mutex_unlock(&ns->ctrl->ana_lock); | ||
| 485 | } else { | ||
| 486 | mutex_lock(&ns->head->lock); | ||
| 487 | ns->ana_state = NVME_ANA_OPTIMIZED; | ||
| 488 | nvme_mpath_set_live(ns); | ||
| 489 | mutex_unlock(&ns->head->lock); | ||
| 235 | } | 490 | } |
| 236 | mutex_unlock(&head->subsys->lock); | ||
| 237 | } | 491 | } |
| 238 | 492 | ||
| 239 | void nvme_mpath_remove_disk(struct nvme_ns_head *head) | 493 | void nvme_mpath_remove_disk(struct nvme_ns_head *head) |
| 240 | { | 494 | { |
| 241 | if (!head->disk) | 495 | if (!head->disk) |
| 242 | return; | 496 | return; |
| 243 | sysfs_remove_group(&disk_to_dev(head->disk)->kobj, | 497 | if (head->disk->flags & GENHD_FL_UP) { |
| 244 | &nvme_ns_id_attr_group); | 498 | sysfs_remove_group(&disk_to_dev(head->disk)->kobj, |
| 245 | del_gendisk(head->disk); | 499 | &nvme_ns_id_attr_group); |
| 500 | del_gendisk(head->disk); | ||
| 501 | } | ||
| 246 | blk_set_queue_dying(head->disk->queue); | 502 | blk_set_queue_dying(head->disk->queue); |
| 247 | /* make sure all pending bios are cleaned up */ | 503 | /* make sure all pending bios are cleaned up */ |
| 248 | kblockd_schedule_work(&head->requeue_work); | 504 | kblockd_schedule_work(&head->requeue_work); |
| @@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) | |||
| 250 | blk_cleanup_queue(head->disk->queue); | 506 | blk_cleanup_queue(head->disk->queue); |
| 251 | put_disk(head->disk); | 507 | put_disk(head->disk); |
| 252 | } | 508 | } |
| 509 | |||
| 510 | int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | ||
| 511 | { | ||
| 512 | int error; | ||
| 513 | |||
| 514 | if (!nvme_ctrl_use_ana(ctrl)) | ||
| 515 | return 0; | ||
| 516 | |||
| 517 | ctrl->anacap = id->anacap; | ||
| 518 | ctrl->anatt = id->anatt; | ||
| 519 | ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); | ||
| 520 | ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); | ||
| 521 | |||
| 522 | mutex_init(&ctrl->ana_lock); | ||
| 523 | timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); | ||
| 524 | ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + | ||
| 525 | ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); | ||
| 526 | if (!(ctrl->anacap & (1 << 6))) | ||
| 527 | ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); | ||
| 528 | |||
| 529 | if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { | ||
| 530 | dev_err(ctrl->device, | ||
| 531 | "ANA log page size (%zd) larger than MDTS (%d).\n", | ||
| 532 | ctrl->ana_log_size, | ||
| 533 | ctrl->max_hw_sectors << SECTOR_SHIFT); | ||
| 534 | dev_err(ctrl->device, "disabling ANA support.\n"); | ||
| 535 | return 0; | ||
| 536 | } | ||
| 537 | |||
| 538 | INIT_WORK(&ctrl->ana_work, nvme_ana_work); | ||
| 539 | ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); | ||
| 540 | if (!ctrl->ana_log_buf) | ||
| 541 | goto out; | ||
| 542 | |||
| 543 | error = nvme_read_ana_log(ctrl, true); | ||
| 544 | if (error) | ||
| 545 | goto out_free_ana_log_buf; | ||
| 546 | return 0; | ||
| 547 | out_free_ana_log_buf: | ||
| 548 | kfree(ctrl->ana_log_buf); | ||
| 549 | out: | ||
| 550 | return -ENOMEM; | ||
| 551 | } | ||
| 552 | |||
| 553 | void nvme_mpath_uninit(struct nvme_ctrl *ctrl) | ||
| 554 | { | ||
| 555 | kfree(ctrl->ana_log_buf); | ||
| 556 | } | ||
| 557 | |||
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0c4a33df3b2f..bb4a2003c097 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h | |||
| @@ -102,6 +102,7 @@ struct nvme_request { | |||
| 102 | u8 retries; | 102 | u8 retries; |
| 103 | u8 flags; | 103 | u8 flags; |
| 104 | u16 status; | 104 | u16 status; |
| 105 | struct nvme_ctrl *ctrl; | ||
| 105 | }; | 106 | }; |
| 106 | 107 | ||
| 107 | /* | 108 | /* |
| @@ -119,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req) | |||
| 119 | return blk_mq_rq_to_pdu(req); | 120 | return blk_mq_rq_to_pdu(req); |
| 120 | } | 121 | } |
| 121 | 122 | ||
| 123 | static inline u16 nvme_req_qid(struct request *req) | ||
| 124 | { | ||
| 125 | if (!req->rq_disk) | ||
| 126 | return 0; | ||
| 127 | return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; | ||
| 128 | } | ||
| 129 | |||
| 122 | /* The below value is the specific amount of delay needed before checking | 130 | /* The below value is the specific amount of delay needed before checking |
| 123 | * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the | 131 | * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the |
| 124 | * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was | 132 | * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was |
| @@ -175,6 +183,7 @@ struct nvme_ctrl { | |||
| 175 | u16 oacs; | 183 | u16 oacs; |
| 176 | u16 nssa; | 184 | u16 nssa; |
| 177 | u16 nr_streams; | 185 | u16 nr_streams; |
| 186 | u32 max_namespaces; | ||
| 178 | atomic_t abort_limit; | 187 | atomic_t abort_limit; |
| 179 | u8 vwc; | 188 | u8 vwc; |
| 180 | u32 vs; | 189 | u32 vs; |
| @@ -197,6 +206,19 @@ struct nvme_ctrl { | |||
| 197 | struct work_struct fw_act_work; | 206 | struct work_struct fw_act_work; |
| 198 | unsigned long events; | 207 | unsigned long events; |
| 199 | 208 | ||
| 209 | #ifdef CONFIG_NVME_MULTIPATH | ||
| 210 | /* asymmetric namespace access: */ | ||
| 211 | u8 anacap; | ||
| 212 | u8 anatt; | ||
| 213 | u32 anagrpmax; | ||
| 214 | u32 nanagrpid; | ||
| 215 | struct mutex ana_lock; | ||
| 216 | struct nvme_ana_rsp_hdr *ana_log_buf; | ||
| 217 | size_t ana_log_size; | ||
| 218 | struct timer_list anatt_timer; | ||
| 219 | struct work_struct ana_work; | ||
| 220 | #endif | ||
| 221 | |||
| 200 | /* Power saving configuration */ | 222 | /* Power saving configuration */ |
| 201 | u64 ps_max_latency_us; | 223 | u64 ps_max_latency_us; |
| 202 | bool apst_enabled; | 224 | bool apst_enabled; |
| @@ -261,6 +283,7 @@ struct nvme_ns_head { | |||
| 261 | struct bio_list requeue_list; | 283 | struct bio_list requeue_list; |
| 262 | spinlock_t requeue_lock; | 284 | spinlock_t requeue_lock; |
| 263 | struct work_struct requeue_work; | 285 | struct work_struct requeue_work; |
| 286 | struct mutex lock; | ||
| 264 | #endif | 287 | #endif |
| 265 | struct list_head list; | 288 | struct list_head list; |
| 266 | struct srcu_struct srcu; | 289 | struct srcu_struct srcu; |
| @@ -287,6 +310,10 @@ struct nvme_ns { | |||
| 287 | struct nvme_ctrl *ctrl; | 310 | struct nvme_ctrl *ctrl; |
| 288 | struct request_queue *queue; | 311 | struct request_queue *queue; |
| 289 | struct gendisk *disk; | 312 | struct gendisk *disk; |
| 313 | #ifdef CONFIG_NVME_MULTIPATH | ||
| 314 | enum nvme_ana_state ana_state; | ||
| 315 | u32 ana_grpid; | ||
| 316 | #endif | ||
| 290 | struct list_head siblings; | 317 | struct list_head siblings; |
| 291 | struct nvm_dev *ndev; | 318 | struct nvm_dev *ndev; |
| 292 | struct kref kref; | 319 | struct kref kref; |
| @@ -299,8 +326,9 @@ struct nvme_ns { | |||
| 299 | bool ext; | 326 | bool ext; |
| 300 | u8 pi_type; | 327 | u8 pi_type; |
| 301 | unsigned long flags; | 328 | unsigned long flags; |
| 302 | #define NVME_NS_REMOVING 0 | 329 | #define NVME_NS_REMOVING 0 |
| 303 | #define NVME_NS_DEAD 1 | 330 | #define NVME_NS_DEAD 1 |
| 331 | #define NVME_NS_ANA_PENDING 2 | ||
| 304 | u16 noiob; | 332 | u16 noiob; |
| 305 | 333 | ||
| 306 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 334 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
| @@ -356,14 +384,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) | |||
| 356 | return (sector >> (ns->lba_shift - 9)); | 384 | return (sector >> (ns->lba_shift - 9)); |
| 357 | } | 385 | } |
| 358 | 386 | ||
| 359 | static inline void nvme_cleanup_cmd(struct request *req) | ||
| 360 | { | ||
| 361 | if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { | ||
| 362 | kfree(page_address(req->special_vec.bv_page) + | ||
| 363 | req->special_vec.bv_offset); | ||
| 364 | } | ||
| 365 | } | ||
| 366 | |||
| 367 | static inline void nvme_end_request(struct request *req, __le16 status, | 387 | static inline void nvme_end_request(struct request *req, __le16 status, |
| 368 | union nvme_result result) | 388 | union nvme_result result) |
| 369 | { | 389 | { |
| @@ -420,6 +440,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); | |||
| 420 | #define NVME_QID_ANY -1 | 440 | #define NVME_QID_ANY -1 |
| 421 | struct request *nvme_alloc_request(struct request_queue *q, | 441 | struct request *nvme_alloc_request(struct request_queue *q, |
| 422 | struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); | 442 | struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); |
| 443 | void nvme_cleanup_cmd(struct request *req); | ||
| 423 | blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, | 444 | blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, |
| 424 | struct nvme_command *cmd); | 445 | struct nvme_command *cmd); |
| 425 | int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, | 446 | int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, |
| @@ -435,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); | |||
| 435 | int nvme_delete_ctrl(struct nvme_ctrl *ctrl); | 456 | int nvme_delete_ctrl(struct nvme_ctrl *ctrl); |
| 436 | int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); | 457 | int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); |
| 437 | 458 | ||
| 438 | int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, | 459 | int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, |
| 439 | u8 log_page, void *log, size_t size, u64 offset); | 460 | void *log, size_t size, u64 offset); |
| 440 | 461 | ||
| 441 | extern const struct attribute_group nvme_ns_id_attr_group; | 462 | extern const struct attribute_group nvme_ns_id_attr_group; |
| 442 | extern const struct block_device_operations nvme_ns_head_ops; | 463 | extern const struct block_device_operations nvme_ns_head_ops; |
| 443 | 464 | ||
| 444 | #ifdef CONFIG_NVME_MULTIPATH | 465 | #ifdef CONFIG_NVME_MULTIPATH |
| 466 | bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl); | ||
| 445 | void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, | 467 | void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, |
| 446 | struct nvme_ctrl *ctrl, int *flags); | 468 | struct nvme_ctrl *ctrl, int *flags); |
| 447 | void nvme_failover_req(struct request *req); | 469 | void nvme_failover_req(struct request *req); |
| 448 | bool nvme_req_needs_failover(struct request *req, blk_status_t error); | ||
| 449 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); | 470 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); |
| 450 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); | 471 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); |
| 451 | void nvme_mpath_add_disk(struct nvme_ns_head *head); | 472 | void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); |
| 452 | void nvme_mpath_remove_disk(struct nvme_ns_head *head); | 473 | void nvme_mpath_remove_disk(struct nvme_ns_head *head); |
| 474 | int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); | ||
| 475 | void nvme_mpath_uninit(struct nvme_ctrl *ctrl); | ||
| 476 | void nvme_mpath_stop(struct nvme_ctrl *ctrl); | ||
| 453 | 477 | ||
| 454 | static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) | 478 | static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) |
| 455 | { | 479 | { |
| @@ -468,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) | |||
| 468 | kblockd_schedule_work(&head->requeue_work); | 492 | kblockd_schedule_work(&head->requeue_work); |
| 469 | } | 493 | } |
| 470 | 494 | ||
| 495 | extern struct device_attribute dev_attr_ana_grpid; | ||
| 496 | extern struct device_attribute dev_attr_ana_state; | ||
| 497 | |||
| 471 | #else | 498 | #else |
| 499 | static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) | ||
| 500 | { | ||
| 501 | return false; | ||
| 502 | } | ||
| 472 | /* | 503 | /* |
| 473 | * Without the multipath code enabled, multiple controller per subsystems are | 504 | * Without the multipath code enabled, multiple controller per subsystems are |
| 474 | * visible as devices and thus we cannot use the subsystem instance. | 505 | * visible as devices and thus we cannot use the subsystem instance. |
| @@ -482,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, | |||
| 482 | static inline void nvme_failover_req(struct request *req) | 513 | static inline void nvme_failover_req(struct request *req) |
| 483 | { | 514 | { |
| 484 | } | 515 | } |
| 485 | static inline bool nvme_req_needs_failover(struct request *req, | ||
| 486 | blk_status_t error) | ||
| 487 | { | ||
| 488 | return false; | ||
| 489 | } | ||
| 490 | static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) | 516 | static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) |
| 491 | { | 517 | { |
| 492 | } | 518 | } |
| @@ -495,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, | |||
| 495 | { | 521 | { |
| 496 | return 0; | 522 | return 0; |
| 497 | } | 523 | } |
| 498 | static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) | 524 | static inline void nvme_mpath_add_disk(struct nvme_ns *ns, |
| 525 | struct nvme_id_ns *id) | ||
| 499 | { | 526 | { |
| 500 | } | 527 | } |
| 501 | static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) | 528 | static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) |
| @@ -507,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) | |||
| 507 | static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) | 534 | static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) |
| 508 | { | 535 | { |
| 509 | } | 536 | } |
| 537 | static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, | ||
| 538 | struct nvme_id_ctrl *id) | ||
| 539 | { | ||
| 540 | return 0; | ||
| 541 | } | ||
| 542 | static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) | ||
| 543 | { | ||
| 544 | } | ||
| 545 | static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) | ||
| 546 | { | ||
| 547 | } | ||
| 510 | #endif /* CONFIG_NVME_MULTIPATH */ | 548 | #endif /* CONFIG_NVME_MULTIPATH */ |
| 511 | 549 | ||
| 512 | #ifdef CONFIG_NVM | 550 | #ifdef CONFIG_NVM |
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ddd441b1516a..1b9951d2067e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c | |||
| @@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, | |||
| 418 | 418 | ||
| 419 | BUG_ON(!nvmeq); | 419 | BUG_ON(!nvmeq); |
| 420 | iod->nvmeq = nvmeq; | 420 | iod->nvmeq = nvmeq; |
| 421 | |||
| 422 | nvme_req(req)->ctrl = &dev->ctrl; | ||
| 421 | return 0; | 423 | return 0; |
| 422 | } | 424 | } |
| 423 | 425 | ||
| @@ -535,73 +537,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) | |||
| 535 | mempool_free(iod->sg, dev->iod_mempool); | 537 | mempool_free(iod->sg, dev->iod_mempool); |
| 536 | } | 538 | } |
| 537 | 539 | ||
| 538 | #ifdef CONFIG_BLK_DEV_INTEGRITY | ||
| 539 | static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) | ||
| 540 | { | ||
| 541 | if (be32_to_cpu(pi->ref_tag) == v) | ||
| 542 | pi->ref_tag = cpu_to_be32(p); | ||
| 543 | } | ||
| 544 | |||
| 545 | static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) | ||
| 546 | { | ||
| 547 | if (be32_to_cpu(pi->ref_tag) == p) | ||
| 548 | pi->ref_tag = cpu_to_be32(v); | ||
| 549 | } | ||
| 550 | |||
| 551 | /** | ||
| 552 | * nvme_dif_remap - remaps ref tags to bip seed and physical lba | ||
| 553 | * | ||
| 554 | * The virtual start sector is the one that was originally submitted by the | ||
| 555 | * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical | ||
| 556 | * start sector may be different. Remap protection information to match the | ||
| 557 | * physical LBA on writes, and back to the original seed on reads. | ||
| 558 | * | ||
| 559 | * Type 0 and 3 do not have a ref tag, so no remapping required. | ||
| 560 | */ | ||
| 561 | static void nvme_dif_remap(struct request *req, | ||
| 562 | void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) | ||
| 563 | { | ||
| 564 | struct nvme_ns *ns = req->rq_disk->private_data; | ||
| 565 | struct bio_integrity_payload *bip; | ||
| 566 | struct t10_pi_tuple *pi; | ||
| 567 | void *p, *pmap; | ||
| 568 | u32 i, nlb, ts, phys, virt; | ||
| 569 | |||
| 570 | if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) | ||
| 571 | return; | ||
| 572 | |||
| 573 | bip = bio_integrity(req->bio); | ||
| 574 | if (!bip) | ||
| 575 | return; | ||
| 576 | |||
| 577 | pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; | ||
| 578 | |||
| 579 | p = pmap; | ||
| 580 | virt = bip_get_seed(bip); | ||
| 581 | phys = nvme_block_nr(ns, blk_rq_pos(req)); | ||
| 582 | nlb = (blk_rq_bytes(req) >> ns->lba_shift); | ||
| 583 | ts = ns->disk->queue->integrity.tuple_size; | ||
| 584 | |||
| 585 | for (i = 0; i < nlb; i++, virt++, phys++) { | ||
| 586 | pi = (struct t10_pi_tuple *)p; | ||
| 587 | dif_swap(phys, virt, pi); | ||
| 588 | p += ts; | ||
| 589 | } | ||
| 590 | kunmap_atomic(pmap); | ||
| 591 | } | ||
| 592 | #else /* CONFIG_BLK_DEV_INTEGRITY */ | ||
| 593 | static void nvme_dif_remap(struct request *req, | ||
| 594 | void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) | ||
| 595 | { | ||
| 596 | } | ||
| 597 | static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) | ||
| 598 | { | ||
| 599 | } | ||
| 600 | static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) | ||
| 601 | { | ||
| 602 | } | ||
| 603 | #endif | ||
| 604 | |||
| 605 | static void nvme_print_sgl(struct scatterlist *sgl, int nents) | 540 | static void nvme_print_sgl(struct scatterlist *sgl, int nents) |
| 606 | { | 541 | { |
| 607 | int i; | 542 | int i; |
| @@ -827,9 +762,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, | |||
| 827 | if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) | 762 | if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) |
| 828 | goto out_unmap; | 763 | goto out_unmap; |
| 829 | 764 | ||
| 830 | if (req_op(req) == REQ_OP_WRITE) | ||
| 831 | nvme_dif_remap(req, nvme_dif_prep); | ||
| 832 | |||
| 833 | if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) | 765 | if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) |
| 834 | goto out_unmap; | 766 | goto out_unmap; |
| 835 | } | 767 | } |
| @@ -852,11 +784,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) | |||
| 852 | 784 | ||
| 853 | if (iod->nents) { | 785 | if (iod->nents) { |
| 854 | dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); | 786 | dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); |
| 855 | if (blk_integrity_rq(req)) { | 787 | if (blk_integrity_rq(req)) |
| 856 | if (req_op(req) == REQ_OP_READ) | ||
| 857 | nvme_dif_remap(req, nvme_dif_complete); | ||
| 858 | dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); | 788 | dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); |
| 859 | } | ||
| 860 | } | 789 | } |
| 861 | 790 | ||
| 862 | nvme_cleanup_cmd(req); | 791 | nvme_cleanup_cmd(req); |
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 66ec5985c9f3..0805fa6215ee 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c | |||
| @@ -40,13 +40,14 @@ | |||
| 40 | 40 | ||
| 41 | #define NVME_RDMA_MAX_SEGMENTS 256 | 41 | #define NVME_RDMA_MAX_SEGMENTS 256 |
| 42 | 42 | ||
| 43 | #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 | 43 | #define NVME_RDMA_MAX_INLINE_SEGMENTS 4 |
| 44 | 44 | ||
| 45 | struct nvme_rdma_device { | 45 | struct nvme_rdma_device { |
| 46 | struct ib_device *dev; | 46 | struct ib_device *dev; |
| 47 | struct ib_pd *pd; | 47 | struct ib_pd *pd; |
| 48 | struct kref ref; | 48 | struct kref ref; |
| 49 | struct list_head entry; | 49 | struct list_head entry; |
| 50 | unsigned int num_inline_segments; | ||
| 50 | }; | 51 | }; |
| 51 | 52 | ||
| 52 | struct nvme_rdma_qe { | 53 | struct nvme_rdma_qe { |
| @@ -117,6 +118,7 @@ struct nvme_rdma_ctrl { | |||
| 117 | struct sockaddr_storage src_addr; | 118 | struct sockaddr_storage src_addr; |
| 118 | 119 | ||
| 119 | struct nvme_ctrl ctrl; | 120 | struct nvme_ctrl ctrl; |
| 121 | bool use_inline_data; | ||
| 120 | }; | 122 | }; |
| 121 | 123 | ||
| 122 | static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) | 124 | static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) |
| @@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) | |||
| 249 | /* +1 for drain */ | 251 | /* +1 for drain */ |
| 250 | init_attr.cap.max_recv_wr = queue->queue_size + 1; | 252 | init_attr.cap.max_recv_wr = queue->queue_size + 1; |
| 251 | init_attr.cap.max_recv_sge = 1; | 253 | init_attr.cap.max_recv_sge = 1; |
| 252 | init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; | 254 | init_attr.cap.max_send_sge = 1 + dev->num_inline_segments; |
| 253 | init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; | 255 | init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; |
| 254 | init_attr.qp_type = IB_QPT_RC; | 256 | init_attr.qp_type = IB_QPT_RC; |
| 255 | init_attr.send_cq = queue->ib_cq; | 257 | init_attr.send_cq = queue->ib_cq; |
| @@ -286,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, | |||
| 286 | struct ib_device *ibdev = dev->dev; | 288 | struct ib_device *ibdev = dev->dev; |
| 287 | int ret; | 289 | int ret; |
| 288 | 290 | ||
| 291 | nvme_req(rq)->ctrl = &ctrl->ctrl; | ||
| 289 | ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), | 292 | ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), |
| 290 | DMA_TO_DEVICE); | 293 | DMA_TO_DEVICE); |
| 291 | if (ret) | 294 | if (ret) |
| @@ -374,6 +377,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) | |||
| 374 | goto out_free_pd; | 377 | goto out_free_pd; |
| 375 | } | 378 | } |
| 376 | 379 | ||
| 380 | ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS, | ||
| 381 | ndev->dev->attrs.max_sge - 1); | ||
| 377 | list_add(&ndev->entry, &device_list); | 382 | list_add(&ndev->entry, &device_list); |
| 378 | out_unlock: | 383 | out_unlock: |
| 379 | mutex_unlock(&device_list_mutex); | 384 | mutex_unlock(&device_list_mutex); |
| @@ -868,6 +873,31 @@ out_free_io_queues: | |||
| 868 | return ret; | 873 | return ret; |
| 869 | } | 874 | } |
| 870 | 875 | ||
| 876 | static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, | ||
| 877 | bool remove) | ||
| 878 | { | ||
| 879 | blk_mq_quiesce_queue(ctrl->ctrl.admin_q); | ||
| 880 | nvme_rdma_stop_queue(&ctrl->queues[0]); | ||
| 881 | blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, | ||
| 882 | &ctrl->ctrl); | ||
| 883 | blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); | ||
| 884 | nvme_rdma_destroy_admin_queue(ctrl, remove); | ||
| 885 | } | ||
| 886 | |||
| 887 | static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, | ||
| 888 | bool remove) | ||
| 889 | { | ||
| 890 | if (ctrl->ctrl.queue_count > 1) { | ||
| 891 | nvme_stop_queues(&ctrl->ctrl); | ||
| 892 | nvme_rdma_stop_io_queues(ctrl); | ||
| 893 | blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, | ||
| 894 | &ctrl->ctrl); | ||
| 895 | if (remove) | ||
| 896 | nvme_start_queues(&ctrl->ctrl); | ||
| 897 | nvme_rdma_destroy_io_queues(ctrl, remove); | ||
| 898 | } | ||
| 899 | } | ||
| 900 | |||
| 871 | static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) | 901 | static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) |
| 872 | { | 902 | { |
| 873 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); | 903 | struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); |
| @@ -912,21 +942,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) | |||
| 912 | } | 942 | } |
| 913 | } | 943 | } |
| 914 | 944 | ||
| 915 | static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) | 945 | static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) |
| 916 | { | 946 | { |
| 917 | struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), | 947 | int ret = -EINVAL; |
| 918 | struct nvme_rdma_ctrl, reconnect_work); | ||
| 919 | bool changed; | 948 | bool changed; |
| 920 | int ret; | ||
| 921 | 949 | ||
| 922 | ++ctrl->ctrl.nr_reconnects; | 950 | ret = nvme_rdma_configure_admin_queue(ctrl, new); |
| 923 | |||
| 924 | ret = nvme_rdma_configure_admin_queue(ctrl, false); | ||
| 925 | if (ret) | 951 | if (ret) |
| 926 | goto requeue; | 952 | return ret; |
| 953 | |||
| 954 | if (ctrl->ctrl.icdoff) { | ||
| 955 | dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); | ||
| 956 | goto destroy_admin; | ||
| 957 | } | ||
| 958 | |||
| 959 | if (!(ctrl->ctrl.sgls & (1 << 2))) { | ||
| 960 | dev_err(ctrl->ctrl.device, | ||
| 961 | "Mandatory keyed sgls are not supported!\n"); | ||
| 962 | goto destroy_admin; | ||
| 963 | } | ||
| 964 | |||
| 965 | if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) { | ||
| 966 | dev_warn(ctrl->ctrl.device, | ||
| 967 | "queue_size %zu > ctrl sqsize %u, clamping down\n", | ||
| 968 | ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); | ||
| 969 | } | ||
| 970 | |||
| 971 | if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { | ||
| 972 | dev_warn(ctrl->ctrl.device, | ||
| 973 | "sqsize %u > ctrl maxcmd %u, clamping down\n", | ||
| 974 | ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); | ||
| 975 | ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; | ||
| 976 | } | ||
| 977 | |||
| 978 | if (ctrl->ctrl.sgls & (1 << 20)) | ||
| 979 | ctrl->use_inline_data = true; | ||
| 927 | 980 | ||
| 928 | if (ctrl->ctrl.queue_count > 1) { | 981 | if (ctrl->ctrl.queue_count > 1) { |
| 929 | ret = nvme_rdma_configure_io_queues(ctrl, false); | 982 | ret = nvme_rdma_configure_io_queues(ctrl, new); |
| 930 | if (ret) | 983 | if (ret) |
| 931 | goto destroy_admin; | 984 | goto destroy_admin; |
| 932 | } | 985 | } |
| @@ -935,10 +988,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) | |||
| 935 | if (!changed) { | 988 | if (!changed) { |
| 936 | /* state change failure is ok if we're in DELETING state */ | 989 | /* state change failure is ok if we're in DELETING state */ |
| 937 | WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); | 990 | WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); |
| 938 | return; | 991 | ret = -EINVAL; |
| 992 | goto destroy_io; | ||
| 939 | } | 993 | } |
| 940 | 994 | ||
| 941 | nvme_start_ctrl(&ctrl->ctrl); | 995 | nvme_start_ctrl(&ctrl->ctrl); |
| 996 | return 0; | ||
| 997 | |||
| 998 | destroy_io: | ||
| 999 | if (ctrl->ctrl.queue_count > 1) | ||
| 1000 | nvme_rdma_destroy_io_queues(ctrl, new); | ||
| 1001 | destroy_admin: | ||
| 1002 | nvme_rdma_stop_queue(&ctrl->queues[0]); | ||
| 1003 | nvme_rdma_destroy_admin_queue(ctrl, new); | ||
| 1004 | return ret; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) | ||
| 1008 | { | ||
| 1009 | struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), | ||
| 1010 | struct nvme_rdma_ctrl, reconnect_work); | ||
| 1011 | |||
| 1012 | ++ctrl->ctrl.nr_reconnects; | ||
| 1013 | |||
| 1014 | if (nvme_rdma_setup_ctrl(ctrl, false)) | ||
| 1015 | goto requeue; | ||
| 942 | 1016 | ||
| 943 | dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", | 1017 | dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", |
| 944 | ctrl->ctrl.nr_reconnects); | 1018 | ctrl->ctrl.nr_reconnects); |
| @@ -947,9 +1021,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) | |||
| 947 | 1021 | ||
| 948 | return; | 1022 | return; |
| 949 | 1023 | ||
| 950 | destroy_admin: | ||
| 951 | nvme_rdma_stop_queue(&ctrl->queues[0]); | ||
| 952 | nvme_rdma_destroy_admin_queue(ctrl, false); | ||
| 953 | requeue: | 1024 | requeue: |
| 954 | dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", | 1025 | dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", |
| 955 | ctrl->ctrl.nr_reconnects); | 1026 | ctrl->ctrl.nr_reconnects); |
| @@ -962,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) | |||
| 962 | struct nvme_rdma_ctrl, err_work); | 1033 | struct nvme_rdma_ctrl, err_work); |
| 963 | 1034 | ||
| 964 | nvme_stop_keep_alive(&ctrl->ctrl); | 1035 | nvme_stop_keep_alive(&ctrl->ctrl); |
| 965 | 1036 | nvme_rdma_teardown_io_queues(ctrl, false); | |
| 966 | if (ctrl->ctrl.queue_count > 1) { | ||
| 967 | nvme_stop_queues(&ctrl->ctrl); | ||
| 968 | nvme_rdma_stop_io_queues(ctrl); | ||
| 969 | blk_mq_tagset_busy_iter(&ctrl->tag_set, | ||
| 970 | nvme_cancel_request, &ctrl->ctrl); | ||
| 971 | nvme_rdma_destroy_io_queues(ctrl, false); | ||
| 972 | } | ||
| 973 | |||
| 974 | blk_mq_quiesce_queue(ctrl->ctrl.admin_q); | ||
| 975 | nvme_rdma_stop_queue(&ctrl->queues[0]); | ||
| 976 | blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, | ||
| 977 | nvme_cancel_request, &ctrl->ctrl); | ||
| 978 | nvme_rdma_destroy_admin_queue(ctrl, false); | ||
| 979 | |||
| 980 | /* | ||
| 981 | * queues are not a live anymore, so restart the queues to fail fast | ||
| 982 | * new IO | ||
| 983 | */ | ||
| 984 | blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); | ||
| 985 | nvme_start_queues(&ctrl->ctrl); | 1037 | nvme_start_queues(&ctrl->ctrl); |
| 1038 | nvme_rdma_teardown_admin_queue(ctrl, false); | ||
| 986 | 1039 | ||
| 987 | if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { | 1040 | if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { |
| 988 | /* state change failure is ok if we're in DELETING state */ | 1041 | /* state change failure is ok if we're in DELETING state */ |
| @@ -1090,19 +1143,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c) | |||
| 1090 | } | 1143 | } |
| 1091 | 1144 | ||
| 1092 | static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, | 1145 | static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, |
| 1093 | struct nvme_rdma_request *req, struct nvme_command *c) | 1146 | struct nvme_rdma_request *req, struct nvme_command *c, |
| 1147 | int count) | ||
| 1094 | { | 1148 | { |
| 1095 | struct nvme_sgl_desc *sg = &c->common.dptr.sgl; | 1149 | struct nvme_sgl_desc *sg = &c->common.dptr.sgl; |
| 1150 | struct scatterlist *sgl = req->sg_table.sgl; | ||
| 1151 | struct ib_sge *sge = &req->sge[1]; | ||
| 1152 | u32 len = 0; | ||
| 1153 | int i; | ||
| 1096 | 1154 | ||
| 1097 | req->sge[1].addr = sg_dma_address(req->sg_table.sgl); | 1155 | for (i = 0; i < count; i++, sgl++, sge++) { |
| 1098 | req->sge[1].length = sg_dma_len(req->sg_table.sgl); | 1156 | sge->addr = sg_dma_address(sgl); |
| 1099 | req->sge[1].lkey = queue->device->pd->local_dma_lkey; | 1157 | sge->length = sg_dma_len(sgl); |
| 1158 | sge->lkey = queue->device->pd->local_dma_lkey; | ||
| 1159 | len += sge->length; | ||
| 1160 | } | ||
| 1100 | 1161 | ||
| 1101 | sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); | 1162 | sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); |
| 1102 | sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); | 1163 | sg->length = cpu_to_le32(len); |
| 1103 | sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; | 1164 | sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; |
| 1104 | 1165 | ||
| 1105 | req->num_sge++; | 1166 | req->num_sge += count; |
| 1106 | return 0; | 1167 | return 0; |
| 1107 | } | 1168 | } |
| 1108 | 1169 | ||
| @@ -1195,15 +1256,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, | |||
| 1195 | goto out_free_table; | 1256 | goto out_free_table; |
| 1196 | } | 1257 | } |
| 1197 | 1258 | ||
| 1198 | if (count == 1) { | 1259 | if (count <= dev->num_inline_segments) { |
| 1199 | if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && | 1260 | if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && |
| 1261 | queue->ctrl->use_inline_data && | ||
| 1200 | blk_rq_payload_bytes(rq) <= | 1262 | blk_rq_payload_bytes(rq) <= |
| 1201 | nvme_rdma_inline_data_size(queue)) { | 1263 | nvme_rdma_inline_data_size(queue)) { |
| 1202 | ret = nvme_rdma_map_sg_inline(queue, req, c); | 1264 | ret = nvme_rdma_map_sg_inline(queue, req, c, count); |
| 1203 | goto out; | 1265 | goto out; |
| 1204 | } | 1266 | } |
| 1205 | 1267 | ||
| 1206 | if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { | 1268 | if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { |
| 1207 | ret = nvme_rdma_map_sg_single(queue, req, c); | 1269 | ret = nvme_rdma_map_sg_single(queue, req, c); |
| 1208 | goto out; | 1270 | goto out; |
| 1209 | } | 1271 | } |
| @@ -1574,6 +1636,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, | |||
| 1574 | case RDMA_CM_EVENT_CONNECT_ERROR: | 1636 | case RDMA_CM_EVENT_CONNECT_ERROR: |
| 1575 | case RDMA_CM_EVENT_UNREACHABLE: | 1637 | case RDMA_CM_EVENT_UNREACHABLE: |
| 1576 | nvme_rdma_destroy_queue_ib(queue); | 1638 | nvme_rdma_destroy_queue_ib(queue); |
| 1639 | /* fall through */ | ||
| 1577 | case RDMA_CM_EVENT_ADDR_ERROR: | 1640 | case RDMA_CM_EVENT_ADDR_ERROR: |
| 1578 | dev_dbg(queue->ctrl->ctrl.device, | 1641 | dev_dbg(queue->ctrl->ctrl.device, |
| 1579 | "CM error event %d\n", ev->event); | 1642 | "CM error event %d\n", ev->event); |
| @@ -1736,25 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { | |||
| 1736 | 1799 | ||
| 1737 | static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) | 1800 | static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) |
| 1738 | { | 1801 | { |
| 1739 | if (ctrl->ctrl.queue_count > 1) { | 1802 | nvme_rdma_teardown_io_queues(ctrl, shutdown); |
| 1740 | nvme_stop_queues(&ctrl->ctrl); | ||
| 1741 | nvme_rdma_stop_io_queues(ctrl); | ||
| 1742 | blk_mq_tagset_busy_iter(&ctrl->tag_set, | ||
| 1743 | nvme_cancel_request, &ctrl->ctrl); | ||
| 1744 | nvme_rdma_destroy_io_queues(ctrl, shutdown); | ||
| 1745 | } | ||
| 1746 | |||
| 1747 | if (shutdown) | 1803 | if (shutdown) |
| 1748 | nvme_shutdown_ctrl(&ctrl->ctrl); | 1804 | nvme_shutdown_ctrl(&ctrl->ctrl); |
| 1749 | else | 1805 | else |
| 1750 | nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); | 1806 | nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); |
| 1751 | 1807 | nvme_rdma_teardown_admin_queue(ctrl, shutdown); | |
| 1752 | blk_mq_quiesce_queue(ctrl->ctrl.admin_q); | ||
| 1753 | nvme_rdma_stop_queue(&ctrl->queues[0]); | ||
| 1754 | blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, | ||
| 1755 | nvme_cancel_request, &ctrl->ctrl); | ||
| 1756 | blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); | ||
| 1757 | nvme_rdma_destroy_admin_queue(ctrl, shutdown); | ||
| 1758 | } | 1808 | } |
| 1759 | 1809 | ||
| 1760 | static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) | 1810 | static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) |
| @@ -1766,8 +1816,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) | |||
| 1766 | { | 1816 | { |
| 1767 | struct nvme_rdma_ctrl *ctrl = | 1817 | struct nvme_rdma_ctrl *ctrl = |
| 1768 | container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); | 1818 | container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); |
| 1769 | int ret; | ||
| 1770 | bool changed; | ||
| 1771 | 1819 | ||
| 1772 | nvme_stop_ctrl(&ctrl->ctrl); | 1820 | nvme_stop_ctrl(&ctrl->ctrl); |
| 1773 | nvme_rdma_shutdown_ctrl(ctrl, false); | 1821 | nvme_rdma_shutdown_ctrl(ctrl, false); |
| @@ -1778,25 +1826,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) | |||
| 1778 | return; | 1826 | return; |
| 1779 | } | 1827 | } |
| 1780 | 1828 | ||
| 1781 | ret = nvme_rdma_configure_admin_queue(ctrl, false); | 1829 | if (nvme_rdma_setup_ctrl(ctrl, false)) |
| 1782 | if (ret) | ||
| 1783 | goto out_fail; | 1830 | goto out_fail; |
| 1784 | 1831 | ||
| 1785 | if (ctrl->ctrl.queue_count > 1) { | ||
| 1786 | ret = nvme_rdma_configure_io_queues(ctrl, false); | ||
| 1787 | if (ret) | ||
| 1788 | goto out_fail; | ||
| 1789 | } | ||
| 1790 | |||
| 1791 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); | ||
| 1792 | if (!changed) { | ||
| 1793 | /* state change failure is ok if we're in DELETING state */ | ||
| 1794 | WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); | ||
| 1795 | return; | ||
| 1796 | } | ||
| 1797 | |||
| 1798 | nvme_start_ctrl(&ctrl->ctrl); | ||
| 1799 | |||
| 1800 | return; | 1832 | return; |
| 1801 | 1833 | ||
| 1802 | out_fail: | 1834 | out_fail: |
| @@ -1959,49 +1991,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, | |||
| 1959 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); | 1991 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); |
| 1960 | WARN_ON_ONCE(!changed); | 1992 | WARN_ON_ONCE(!changed); |
| 1961 | 1993 | ||
| 1962 | ret = nvme_rdma_configure_admin_queue(ctrl, true); | 1994 | ret = nvme_rdma_setup_ctrl(ctrl, true); |
| 1963 | if (ret) | 1995 | if (ret) |
| 1964 | goto out_uninit_ctrl; | 1996 | goto out_uninit_ctrl; |
| 1965 | 1997 | ||
| 1966 | /* sanity check icdoff */ | ||
| 1967 | if (ctrl->ctrl.icdoff) { | ||
| 1968 | dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); | ||
| 1969 | ret = -EINVAL; | ||
| 1970 | goto out_remove_admin_queue; | ||
| 1971 | } | ||
| 1972 | |||
| 1973 | /* sanity check keyed sgls */ | ||
| 1974 | if (!(ctrl->ctrl.sgls & (1 << 2))) { | ||
| 1975 | dev_err(ctrl->ctrl.device, | ||
| 1976 | "Mandatory keyed sgls are not supported!\n"); | ||
| 1977 | ret = -EINVAL; | ||
| 1978 | goto out_remove_admin_queue; | ||
| 1979 | } | ||
| 1980 | |||
| 1981 | /* only warn if argument is too large here, will clamp later */ | ||
| 1982 | if (opts->queue_size > ctrl->ctrl.sqsize + 1) { | ||
| 1983 | dev_warn(ctrl->ctrl.device, | ||
| 1984 | "queue_size %zu > ctrl sqsize %u, clamping down\n", | ||
| 1985 | opts->queue_size, ctrl->ctrl.sqsize + 1); | ||
| 1986 | } | ||
| 1987 | |||
| 1988 | /* warn if maxcmd is lower than sqsize+1 */ | ||
| 1989 | if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { | ||
| 1990 | dev_warn(ctrl->ctrl.device, | ||
| 1991 | "sqsize %u > ctrl maxcmd %u, clamping down\n", | ||
| 1992 | ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); | ||
| 1993 | ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; | ||
| 1994 | } | ||
| 1995 | |||
| 1996 | if (opts->nr_io_queues) { | ||
| 1997 | ret = nvme_rdma_configure_io_queues(ctrl, true); | ||
| 1998 | if (ret) | ||
| 1999 | goto out_remove_admin_queue; | ||
| 2000 | } | ||
| 2001 | |||
| 2002 | changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); | ||
| 2003 | WARN_ON_ONCE(!changed); | ||
| 2004 | |||
| 2005 | dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", | 1998 | dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", |
| 2006 | ctrl->ctrl.opts->subsysnqn, &ctrl->addr); | 1999 | ctrl->ctrl.opts->subsysnqn, &ctrl->addr); |
| 2007 | 2000 | ||
| @@ -2011,13 +2004,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, | |||
| 2011 | list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); | 2004 | list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); |
| 2012 | mutex_unlock(&nvme_rdma_ctrl_mutex); | 2005 | mutex_unlock(&nvme_rdma_ctrl_mutex); |
| 2013 | 2006 | ||
| 2014 | nvme_start_ctrl(&ctrl->ctrl); | ||
| 2015 | |||
| 2016 | return &ctrl->ctrl; | 2007 | return &ctrl->ctrl; |
| 2017 | 2008 | ||
| 2018 | out_remove_admin_queue: | ||
| 2019 | nvme_rdma_stop_queue(&ctrl->queues[0]); | ||
| 2020 | nvme_rdma_destroy_admin_queue(ctrl, true); | ||
| 2021 | out_uninit_ctrl: | 2009 | out_uninit_ctrl: |
| 2022 | nvme_uninit_ctrl(&ctrl->ctrl); | 2010 | nvme_uninit_ctrl(&ctrl->ctrl); |
| 2023 | nvme_put_ctrl(&ctrl->ctrl); | 2011 | nvme_put_ctrl(&ctrl->ctrl); |
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 41944bbef835..25b0e310f4a8 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c | |||
| @@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, | |||
| 128 | return nvme_trace_common(p, cdw10); | 128 | return nvme_trace_common(p, cdw10); |
| 129 | } | 129 | } |
| 130 | } | 130 | } |
| 131 | |||
| 132 | const char *nvme_trace_disk_name(struct trace_seq *p, char *name) | ||
| 133 | { | ||
| 134 | const char *ret = trace_seq_buffer_ptr(p); | ||
| 135 | |||
| 136 | if (*name) | ||
| 137 | trace_seq_printf(p, "disk=%s, ", name); | ||
| 138 | trace_seq_putc(p, 0); | ||
| 139 | |||
| 140 | return ret; | ||
| 141 | } | ||
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 01390f0e1671..a490790d6691 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h | |||
| @@ -50,13 +50,8 @@ | |||
| 50 | nvme_admin_opcode_name(nvme_admin_security_recv), \ | 50 | nvme_admin_opcode_name(nvme_admin_security_recv), \ |
| 51 | nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) | 51 | nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) |
| 52 | 52 | ||
| 53 | const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, | ||
| 54 | u8 *cdw10); | ||
| 55 | #define __parse_nvme_admin_cmd(opcode, cdw10) \ | ||
| 56 | nvme_trace_parse_admin_cmd(p, opcode, cdw10) | ||
| 57 | |||
| 58 | #define nvme_opcode_name(opcode) { opcode, #opcode } | 53 | #define nvme_opcode_name(opcode) { opcode, #opcode } |
| 59 | #define show_opcode_name(val) \ | 54 | #define show_nvm_opcode_name(val) \ |
| 60 | __print_symbolic(val, \ | 55 | __print_symbolic(val, \ |
| 61 | nvme_opcode_name(nvme_cmd_flush), \ | 56 | nvme_opcode_name(nvme_cmd_flush), \ |
| 62 | nvme_opcode_name(nvme_cmd_write), \ | 57 | nvme_opcode_name(nvme_cmd_write), \ |
| @@ -70,85 +65,92 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, | |||
| 70 | nvme_opcode_name(nvme_cmd_resv_acquire), \ | 65 | nvme_opcode_name(nvme_cmd_resv_acquire), \ |
| 71 | nvme_opcode_name(nvme_cmd_resv_release)) | 66 | nvme_opcode_name(nvme_cmd_resv_release)) |
| 72 | 67 | ||
| 73 | const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, | 68 | #define show_opcode_name(qid, opcode) \ |
| 74 | u8 *cdw10); | 69 | (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) |
| 75 | #define __parse_nvme_cmd(opcode, cdw10) \ | ||
| 76 | nvme_trace_parse_nvm_cmd(p, opcode, cdw10) | ||
| 77 | |||
| 78 | TRACE_EVENT(nvme_setup_admin_cmd, | ||
| 79 | TP_PROTO(struct nvme_command *cmd), | ||
| 80 | TP_ARGS(cmd), | ||
| 81 | TP_STRUCT__entry( | ||
| 82 | __field(u8, opcode) | ||
| 83 | __field(u8, flags) | ||
| 84 | __field(u16, cid) | ||
| 85 | __field(u64, metadata) | ||
| 86 | __array(u8, cdw10, 24) | ||
| 87 | ), | ||
| 88 | TP_fast_assign( | ||
| 89 | __entry->opcode = cmd->common.opcode; | ||
| 90 | __entry->flags = cmd->common.flags; | ||
| 91 | __entry->cid = cmd->common.command_id; | ||
| 92 | __entry->metadata = le64_to_cpu(cmd->common.metadata); | ||
| 93 | memcpy(__entry->cdw10, cmd->common.cdw10, | ||
| 94 | sizeof(__entry->cdw10)); | ||
| 95 | ), | ||
| 96 | TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", | ||
| 97 | __entry->cid, __entry->flags, __entry->metadata, | ||
| 98 | show_admin_opcode_name(__entry->opcode), | ||
| 99 | __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) | ||
| 100 | ); | ||
| 101 | |||
| 102 | 70 | ||
| 103 | TRACE_EVENT(nvme_setup_nvm_cmd, | 71 | const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, |
| 104 | TP_PROTO(int qid, struct nvme_command *cmd), | 72 | u8 *cdw10); |
| 105 | TP_ARGS(qid, cmd), | 73 | const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, |
| 74 | u8 *cdw10); | ||
| 75 | |||
| 76 | #define parse_nvme_cmd(qid, opcode, cdw10) \ | ||
| 77 | (qid ? \ | ||
| 78 | nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ | ||
| 79 | nvme_trace_parse_admin_cmd(p, opcode, cdw10)) | ||
| 80 | |||
| 81 | const char *nvme_trace_disk_name(struct trace_seq *p, char *name); | ||
| 82 | #define __print_disk_name(name) \ | ||
| 83 | nvme_trace_disk_name(p, name) | ||
| 84 | |||
| 85 | #ifndef TRACE_HEADER_MULTI_READ | ||
| 86 | static inline void __assign_disk_name(char *name, struct gendisk *disk) | ||
| 87 | { | ||
| 88 | if (disk) | ||
| 89 | memcpy(name, disk->disk_name, DISK_NAME_LEN); | ||
| 90 | else | ||
| 91 | memset(name, 0, DISK_NAME_LEN); | ||
| 92 | } | ||
| 93 | #endif | ||
| 94 | |||
| 95 | TRACE_EVENT(nvme_setup_cmd, | ||
| 96 | TP_PROTO(struct request *req, struct nvme_command *cmd), | ||
| 97 | TP_ARGS(req, cmd), | ||
| 106 | TP_STRUCT__entry( | 98 | TP_STRUCT__entry( |
| 107 | __field(int, qid) | 99 | __array(char, disk, DISK_NAME_LEN) |
| 108 | __field(u8, opcode) | 100 | __field(int, ctrl_id) |
| 109 | __field(u8, flags) | 101 | __field(int, qid) |
| 110 | __field(u16, cid) | 102 | __field(u8, opcode) |
| 111 | __field(u32, nsid) | 103 | __field(u8, flags) |
| 112 | __field(u64, metadata) | 104 | __field(u16, cid) |
| 113 | __array(u8, cdw10, 24) | 105 | __field(u32, nsid) |
| 106 | __field(u64, metadata) | ||
| 107 | __array(u8, cdw10, 24) | ||
| 114 | ), | 108 | ), |
| 115 | TP_fast_assign( | 109 | TP_fast_assign( |
| 116 | __entry->qid = qid; | 110 | __entry->ctrl_id = nvme_req(req)->ctrl->instance; |
| 117 | __entry->opcode = cmd->common.opcode; | 111 | __entry->qid = nvme_req_qid(req); |
| 118 | __entry->flags = cmd->common.flags; | 112 | __entry->opcode = cmd->common.opcode; |
| 119 | __entry->cid = cmd->common.command_id; | 113 | __entry->flags = cmd->common.flags; |
| 120 | __entry->nsid = le32_to_cpu(cmd->common.nsid); | 114 | __entry->cid = cmd->common.command_id; |
| 121 | __entry->metadata = le64_to_cpu(cmd->common.metadata); | 115 | __entry->nsid = le32_to_cpu(cmd->common.nsid); |
| 122 | memcpy(__entry->cdw10, cmd->common.cdw10, | 116 | __entry->metadata = le64_to_cpu(cmd->common.metadata); |
| 123 | sizeof(__entry->cdw10)); | 117 | __assign_disk_name(__entry->disk, req->rq_disk); |
| 118 | memcpy(__entry->cdw10, cmd->common.cdw10, | ||
| 119 | sizeof(__entry->cdw10)); | ||
| 124 | ), | 120 | ), |
| 125 | TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", | 121 | TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", |
| 126 | __entry->qid, __entry->nsid, __entry->cid, | 122 | __entry->ctrl_id, __print_disk_name(__entry->disk), |
| 123 | __entry->qid, __entry->cid, __entry->nsid, | ||
| 127 | __entry->flags, __entry->metadata, | 124 | __entry->flags, __entry->metadata, |
| 128 | show_opcode_name(__entry->opcode), | 125 | show_opcode_name(__entry->qid, __entry->opcode), |
| 129 | __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) | 126 | parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) |
| 130 | ); | 127 | ); |
| 131 | 128 | ||
| 132 | TRACE_EVENT(nvme_complete_rq, | 129 | TRACE_EVENT(nvme_complete_rq, |
| 133 | TP_PROTO(struct request *req), | 130 | TP_PROTO(struct request *req), |
| 134 | TP_ARGS(req), | 131 | TP_ARGS(req), |
| 135 | TP_STRUCT__entry( | 132 | TP_STRUCT__entry( |
| 136 | __field(int, qid) | 133 | __array(char, disk, DISK_NAME_LEN) |
| 137 | __field(int, cid) | 134 | __field(int, ctrl_id) |
| 138 | __field(u64, result) | 135 | __field(int, qid) |
| 139 | __field(u8, retries) | 136 | __field(int, cid) |
| 140 | __field(u8, flags) | 137 | __field(u64, result) |
| 141 | __field(u16, status) | 138 | __field(u8, retries) |
| 139 | __field(u8, flags) | ||
| 140 | __field(u16, status) | ||
| 142 | ), | 141 | ), |
| 143 | TP_fast_assign( | 142 | TP_fast_assign( |
| 144 | __entry->qid = req->q->id; | 143 | __entry->ctrl_id = nvme_req(req)->ctrl->instance; |
| 145 | __entry->cid = req->tag; | 144 | __entry->qid = nvme_req_qid(req); |
| 146 | __entry->result = le64_to_cpu(nvme_req(req)->result.u64); | 145 | __entry->cid = req->tag; |
| 147 | __entry->retries = nvme_req(req)->retries; | 146 | __entry->result = le64_to_cpu(nvme_req(req)->result.u64); |
| 148 | __entry->flags = nvme_req(req)->flags; | 147 | __entry->retries = nvme_req(req)->retries; |
| 149 | __entry->status = nvme_req(req)->status; | 148 | __entry->flags = nvme_req(req)->flags; |
| 149 | __entry->status = nvme_req(req)->status; | ||
| 150 | __assign_disk_name(__entry->disk, req->rq_disk); | ||
| 150 | ), | 151 | ), |
| 151 | TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", | 152 | TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", |
| 153 | __entry->ctrl_id, __print_disk_name(__entry->disk), | ||
| 152 | __entry->qid, __entry->cid, __entry->result, | 154 | __entry->qid, __entry->cid, __entry->result, |
| 153 | __entry->retries, __entry->flags, __entry->status) | 155 | __entry->retries, __entry->flags, __entry->status) |
| 154 | 156 | ||
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 38803576d5e1..a21caea1e080 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c | |||
| @@ -19,6 +19,19 @@ | |||
| 19 | #include <asm/unaligned.h> | 19 | #include <asm/unaligned.h> |
| 20 | #include "nvmet.h" | 20 | #include "nvmet.h" |
| 21 | 21 | ||
| 22 | /* | ||
| 23 | * This helper allows us to clear the AEN based on the RAE bit, | ||
| 24 | * Please use this helper when processing the log pages which are | ||
| 25 | * associated with the AEN. | ||
| 26 | */ | ||
| 27 | static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) | ||
| 28 | { | ||
| 29 | int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; | ||
| 30 | |||
| 31 | if (!rae) | ||
| 32 | clear_bit(aen_bit, &req->sq->ctrl->aen_masked); | ||
| 33 | } | ||
| 34 | |||
| 22 | u32 nvmet_get_log_page_len(struct nvme_command *cmd) | 35 | u32 nvmet_get_log_page_len(struct nvme_command *cmd) |
| 23 | { | 36 | { |
| 24 | u32 len = le16_to_cpu(cmd->get_log_page.numdu); | 37 | u32 len = le16_to_cpu(cmd->get_log_page.numdu); |
| @@ -128,6 +141,36 @@ out: | |||
| 128 | nvmet_req_complete(req, status); | 141 | nvmet_req_complete(req, status); |
| 129 | } | 142 | } |
| 130 | 143 | ||
| 144 | static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) | ||
| 145 | { | ||
| 146 | u16 status = NVME_SC_INTERNAL; | ||
| 147 | struct nvme_effects_log *log; | ||
| 148 | |||
| 149 | log = kzalloc(sizeof(*log), GFP_KERNEL); | ||
| 150 | if (!log) | ||
| 151 | goto out; | ||
| 152 | |||
| 153 | log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0); | ||
| 154 | log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0); | ||
| 155 | log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0); | ||
| 156 | log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0); | ||
| 157 | log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0); | ||
| 158 | log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0); | ||
| 159 | log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0); | ||
| 160 | |||
| 161 | log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0); | ||
| 162 | log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0); | ||
| 163 | log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0); | ||
| 164 | log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0); | ||
| 165 | log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0); | ||
| 166 | |||
| 167 | status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); | ||
| 168 | |||
| 169 | kfree(log); | ||
| 170 | out: | ||
| 171 | nvmet_req_complete(req, status); | ||
| 172 | } | ||
| 173 | |||
| 131 | static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) | 174 | static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) |
| 132 | { | 175 | { |
| 133 | struct nvmet_ctrl *ctrl = req->sq->ctrl; | 176 | struct nvmet_ctrl *ctrl = req->sq->ctrl; |
| @@ -146,12 +189,76 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) | |||
| 146 | if (!status) | 189 | if (!status) |
| 147 | status = nvmet_zero_sgl(req, len, req->data_len - len); | 190 | status = nvmet_zero_sgl(req, len, req->data_len - len); |
| 148 | ctrl->nr_changed_ns = 0; | 191 | ctrl->nr_changed_ns = 0; |
| 149 | clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked); | 192 | nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR); |
| 150 | mutex_unlock(&ctrl->lock); | 193 | mutex_unlock(&ctrl->lock); |
| 151 | out: | 194 | out: |
| 152 | nvmet_req_complete(req, status); | 195 | nvmet_req_complete(req, status); |
| 153 | } | 196 | } |
| 154 | 197 | ||
| 198 | static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, | ||
| 199 | struct nvme_ana_group_desc *desc) | ||
| 200 | { | ||
| 201 | struct nvmet_ctrl *ctrl = req->sq->ctrl; | ||
| 202 | struct nvmet_ns *ns; | ||
| 203 | u32 count = 0; | ||
| 204 | |||
| 205 | if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { | ||
| 206 | rcu_read_lock(); | ||
| 207 | list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) | ||
| 208 | if (ns->anagrpid == grpid) | ||
| 209 | desc->nsids[count++] = cpu_to_le32(ns->nsid); | ||
| 210 | rcu_read_unlock(); | ||
| 211 | } | ||
| 212 | |||
| 213 | desc->grpid = cpu_to_le32(grpid); | ||
| 214 | desc->nnsids = cpu_to_le32(count); | ||
| 215 | desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt); | ||
| 216 | desc->state = req->port->ana_state[grpid]; | ||
| 217 | memset(desc->rsvd17, 0, sizeof(desc->rsvd17)); | ||
| 218 | return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32); | ||
| 219 | } | ||
| 220 | |||
| 221 | static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) | ||
| 222 | { | ||
| 223 | struct nvme_ana_rsp_hdr hdr = { 0, }; | ||
| 224 | struct nvme_ana_group_desc *desc; | ||
| 225 | size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */ | ||
| 226 | size_t len; | ||
| 227 | u32 grpid; | ||
| 228 | u16 ngrps = 0; | ||
| 229 | u16 status; | ||
| 230 | |||
| 231 | status = NVME_SC_INTERNAL; | ||
| 232 | desc = kmalloc(sizeof(struct nvme_ana_group_desc) + | ||
| 233 | NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL); | ||
| 234 | if (!desc) | ||
| 235 | goto out; | ||
| 236 | |||
| 237 | down_read(&nvmet_ana_sem); | ||
| 238 | for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) { | ||
| 239 | if (!nvmet_ana_group_enabled[grpid]) | ||
| 240 | continue; | ||
| 241 | len = nvmet_format_ana_group(req, grpid, desc); | ||
| 242 | status = nvmet_copy_to_sgl(req, offset, desc, len); | ||
| 243 | if (status) | ||
| 244 | break; | ||
| 245 | offset += len; | ||
| 246 | ngrps++; | ||
| 247 | } | ||
| 248 | |||
| 249 | hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); | ||
| 250 | hdr.ngrps = cpu_to_le16(ngrps); | ||
| 251 | nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE); | ||
| 252 | up_read(&nvmet_ana_sem); | ||
| 253 | |||
| 254 | kfree(desc); | ||
| 255 | |||
| 256 | /* copy the header last once we know the number of groups */ | ||
| 257 | status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr)); | ||
| 258 | out: | ||
| 259 | nvmet_req_complete(req, status); | ||
| 260 | } | ||
| 261 | |||
| 155 | static void nvmet_execute_identify_ctrl(struct nvmet_req *req) | 262 | static void nvmet_execute_identify_ctrl(struct nvmet_req *req) |
| 156 | { | 263 | { |
| 157 | struct nvmet_ctrl *ctrl = req->sq->ctrl; | 264 | struct nvmet_ctrl *ctrl = req->sq->ctrl; |
| @@ -183,8 +290,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) | |||
| 183 | * the safest is to leave it as zeroes. | 290 | * the safest is to leave it as zeroes. |
| 184 | */ | 291 | */ |
| 185 | 292 | ||
| 186 | /* we support multiple ports and multiples hosts: */ | 293 | /* we support multiple ports, multiples hosts and ANA: */ |
| 187 | id->cmic = (1 << 0) | (1 << 1); | 294 | id->cmic = (1 << 0) | (1 << 1) | (1 << 3); |
| 188 | 295 | ||
| 189 | /* no limit on data transfer sizes for now */ | 296 | /* no limit on data transfer sizes for now */ |
| 190 | id->mdts = 0; | 297 | id->mdts = 0; |
| @@ -208,7 +315,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) | |||
| 208 | 315 | ||
| 209 | /* first slot is read-only, only one slot supported */ | 316 | /* first slot is read-only, only one slot supported */ |
| 210 | id->frmw = (1 << 0) | (1 << 1); | 317 | id->frmw = (1 << 0) | (1 << 1); |
| 211 | id->lpa = (1 << 0) | (1 << 2); | 318 | id->lpa = (1 << 0) | (1 << 1) | (1 << 2); |
| 212 | id->elpe = NVMET_ERROR_LOG_SLOTS - 1; | 319 | id->elpe = NVMET_ERROR_LOG_SLOTS - 1; |
| 213 | id->npss = 0; | 320 | id->npss = 0; |
| 214 | 321 | ||
| @@ -222,6 +329,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) | |||
| 222 | id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); | 329 | id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); |
| 223 | 330 | ||
| 224 | id->nn = cpu_to_le32(ctrl->subsys->max_nsid); | 331 | id->nn = cpu_to_le32(ctrl->subsys->max_nsid); |
| 332 | id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); | ||
| 225 | id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | | 333 | id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | |
| 226 | NVME_CTRL_ONCS_WRITE_ZEROES); | 334 | NVME_CTRL_ONCS_WRITE_ZEROES); |
| 227 | 335 | ||
| @@ -238,19 +346,24 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) | |||
| 238 | id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ | 346 | id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ |
| 239 | if (ctrl->ops->has_keyed_sgls) | 347 | if (ctrl->ops->has_keyed_sgls) |
| 240 | id->sgls |= cpu_to_le32(1 << 2); | 348 | id->sgls |= cpu_to_le32(1 << 2); |
| 241 | if (ctrl->ops->sqe_inline_size) | 349 | if (req->port->inline_data_size) |
| 242 | id->sgls |= cpu_to_le32(1 << 20); | 350 | id->sgls |= cpu_to_le32(1 << 20); |
| 243 | 351 | ||
| 244 | strcpy(id->subnqn, ctrl->subsys->subsysnqn); | 352 | strcpy(id->subnqn, ctrl->subsys->subsysnqn); |
| 245 | 353 | ||
| 246 | /* Max command capsule size is sqe + single page of in-capsule data */ | 354 | /* Max command capsule size is sqe + single page of in-capsule data */ |
| 247 | id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + | 355 | id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + |
| 248 | ctrl->ops->sqe_inline_size) / 16); | 356 | req->port->inline_data_size) / 16); |
| 249 | /* Max response capsule size is cqe */ | 357 | /* Max response capsule size is cqe */ |
| 250 | id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); | 358 | id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); |
| 251 | 359 | ||
| 252 | id->msdbd = ctrl->ops->msdbd; | 360 | id->msdbd = ctrl->ops->msdbd; |
| 253 | 361 | ||
| 362 | id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4); | ||
| 363 | id->anatt = 10; /* random value */ | ||
| 364 | id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS); | ||
| 365 | id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS); | ||
| 366 | |||
| 254 | /* | 367 | /* |
| 255 | * Meh, we don't really support any power state. Fake up the same | 368 | * Meh, we don't really support any power state. Fake up the same |
| 256 | * values that qemu does. | 369 | * values that qemu does. |
| @@ -259,6 +372,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) | |||
| 259 | id->psd[0].entry_lat = cpu_to_le32(0x10); | 372 | id->psd[0].entry_lat = cpu_to_le32(0x10); |
| 260 | id->psd[0].exit_lat = cpu_to_le32(0x4); | 373 | id->psd[0].exit_lat = cpu_to_le32(0x4); |
| 261 | 374 | ||
| 375 | id->nwpc = 1 << 0; /* write protect and no write protect */ | ||
| 376 | |||
| 262 | status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); | 377 | status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); |
| 263 | 378 | ||
| 264 | kfree(id); | 379 | kfree(id); |
| @@ -292,8 +407,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) | |||
| 292 | * nuse = ncap = nsze isn't always true, but we have no way to find | 407 | * nuse = ncap = nsze isn't always true, but we have no way to find |
| 293 | * that out from the underlying device. | 408 | * that out from the underlying device. |
| 294 | */ | 409 | */ |
| 295 | id->ncap = id->nuse = id->nsze = | 410 | id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift); |
| 296 | cpu_to_le64(ns->size >> ns->blksize_shift); | 411 | switch (req->port->ana_state[ns->anagrpid]) { |
| 412 | case NVME_ANA_INACCESSIBLE: | ||
| 413 | case NVME_ANA_PERSISTENT_LOSS: | ||
| 414 | break; | ||
| 415 | default: | ||
| 416 | id->nuse = id->nsze; | ||
| 417 | break; | ||
| 418 | } | ||
| 297 | 419 | ||
| 298 | /* | 420 | /* |
| 299 | * We just provide a single LBA format that matches what the | 421 | * We just provide a single LBA format that matches what the |
| @@ -307,11 +429,14 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) | |||
| 307 | * controllers, but also with any other user of the block device. | 429 | * controllers, but also with any other user of the block device. |
| 308 | */ | 430 | */ |
| 309 | id->nmic = (1 << 0); | 431 | id->nmic = (1 << 0); |
| 432 | id->anagrpid = cpu_to_le32(ns->anagrpid); | ||
| 310 | 433 | ||
| 311 | memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); | 434 | memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid)); |
| 312 | 435 | ||
| 313 | id->lbaf[0].ds = ns->blksize_shift; | 436 | id->lbaf[0].ds = ns->blksize_shift; |
| 314 | 437 | ||
| 438 | if (ns->readonly) | ||
| 439 | id->nsattr |= (1 << 0); | ||
| 315 | nvmet_put_namespace(ns); | 440 | nvmet_put_namespace(ns); |
| 316 | done: | 441 | done: |
| 317 | status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); | 442 | status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); |
| @@ -424,6 +549,52 @@ static void nvmet_execute_abort(struct nvmet_req *req) | |||
| 424 | nvmet_req_complete(req, 0); | 549 | nvmet_req_complete(req, 0); |
| 425 | } | 550 | } |
| 426 | 551 | ||
| 552 | static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req) | ||
| 553 | { | ||
| 554 | u16 status; | ||
| 555 | |||
| 556 | if (req->ns->file) | ||
| 557 | status = nvmet_file_flush(req); | ||
| 558 | else | ||
| 559 | status = nvmet_bdev_flush(req); | ||
| 560 | |||
| 561 | if (status) | ||
| 562 | pr_err("write protect flush failed nsid: %u\n", req->ns->nsid); | ||
| 563 | return status; | ||
| 564 | } | ||
| 565 | |||
| 566 | static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) | ||
| 567 | { | ||
| 568 | u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]); | ||
| 569 | struct nvmet_subsys *subsys = req->sq->ctrl->subsys; | ||
| 570 | u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; | ||
| 571 | |||
| 572 | req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid); | ||
| 573 | if (unlikely(!req->ns)) | ||
| 574 | return status; | ||
| 575 | |||
| 576 | mutex_lock(&subsys->lock); | ||
| 577 | switch (write_protect) { | ||
| 578 | case NVME_NS_WRITE_PROTECT: | ||
| 579 | req->ns->readonly = true; | ||
| 580 | status = nvmet_write_protect_flush_sync(req); | ||
| 581 | if (status) | ||
| 582 | req->ns->readonly = false; | ||
| 583 | break; | ||
| 584 | case NVME_NS_NO_WRITE_PROTECT: | ||
| 585 | req->ns->readonly = false; | ||
| 586 | status = 0; | ||
| 587 | break; | ||
| 588 | default: | ||
| 589 | break; | ||
| 590 | } | ||
| 591 | |||
| 592 | if (!status) | ||
| 593 | nvmet_ns_changed(subsys, req->ns->nsid); | ||
| 594 | mutex_unlock(&subsys->lock); | ||
| 595 | return status; | ||
| 596 | } | ||
| 597 | |||
| 427 | static void nvmet_execute_set_features(struct nvmet_req *req) | 598 | static void nvmet_execute_set_features(struct nvmet_req *req) |
| 428 | { | 599 | { |
| 429 | struct nvmet_subsys *subsys = req->sq->ctrl->subsys; | 600 | struct nvmet_subsys *subsys = req->sq->ctrl->subsys; |
| @@ -454,6 +625,9 @@ static void nvmet_execute_set_features(struct nvmet_req *req) | |||
| 454 | case NVME_FEAT_HOST_ID: | 625 | case NVME_FEAT_HOST_ID: |
| 455 | status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; | 626 | status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; |
| 456 | break; | 627 | break; |
| 628 | case NVME_FEAT_WRITE_PROTECT: | ||
| 629 | status = nvmet_set_feat_write_protect(req); | ||
| 630 | break; | ||
| 457 | default: | 631 | default: |
| 458 | status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; | 632 | status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; |
| 459 | break; | 633 | break; |
| @@ -462,6 +636,26 @@ static void nvmet_execute_set_features(struct nvmet_req *req) | |||
| 462 | nvmet_req_complete(req, status); | 636 | nvmet_req_complete(req, status); |
| 463 | } | 637 | } |
| 464 | 638 | ||
| 639 | static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) | ||
| 640 | { | ||
| 641 | struct nvmet_subsys *subsys = req->sq->ctrl->subsys; | ||
| 642 | u32 result; | ||
| 643 | |||
| 644 | req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid); | ||
| 645 | if (!req->ns) | ||
| 646 | return NVME_SC_INVALID_NS | NVME_SC_DNR; | ||
| 647 | |||
| 648 | mutex_lock(&subsys->lock); | ||
| 649 | if (req->ns->readonly == true) | ||
| 650 | result = NVME_NS_WRITE_PROTECT; | ||
| 651 | else | ||
| 652 | result = NVME_NS_NO_WRITE_PROTECT; | ||
| 653 | nvmet_set_result(req, result); | ||
| 654 | mutex_unlock(&subsys->lock); | ||
| 655 | |||
| 656 | return 0; | ||
| 657 | } | ||
| 658 | |||
| 465 | static void nvmet_execute_get_features(struct nvmet_req *req) | 659 | static void nvmet_execute_get_features(struct nvmet_req *req) |
| 466 | { | 660 | { |
| 467 | struct nvmet_subsys *subsys = req->sq->ctrl->subsys; | 661 | struct nvmet_subsys *subsys = req->sq->ctrl->subsys; |
| @@ -513,6 +707,9 @@ static void nvmet_execute_get_features(struct nvmet_req *req) | |||
| 513 | status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, | 707 | status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, |
| 514 | sizeof(req->sq->ctrl->hostid)); | 708 | sizeof(req->sq->ctrl->hostid)); |
| 515 | break; | 709 | break; |
| 710 | case NVME_FEAT_WRITE_PROTECT: | ||
| 711 | status = nvmet_get_feat_write_protect(req); | ||
| 712 | break; | ||
| 516 | default: | 713 | default: |
| 517 | status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; | 714 | status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; |
| 518 | break; | 715 | break; |
| @@ -586,6 +783,12 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) | |||
| 586 | case NVME_LOG_CHANGED_NS: | 783 | case NVME_LOG_CHANGED_NS: |
| 587 | req->execute = nvmet_execute_get_log_changed_ns; | 784 | req->execute = nvmet_execute_get_log_changed_ns; |
| 588 | return 0; | 785 | return 0; |
| 786 | case NVME_LOG_CMD_EFFECTS: | ||
| 787 | req->execute = nvmet_execute_get_log_cmd_effects_ns; | ||
| 788 | return 0; | ||
| 789 | case NVME_LOG_ANA: | ||
| 790 | req->execute = nvmet_execute_get_log_page_ana; | ||
| 791 | return 0; | ||
| 589 | } | 792 | } |
| 590 | break; | 793 | break; |
| 591 | case nvme_admin_identify: | 794 | case nvme_admin_identify: |
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index ebea1373d1b7..b37a8e3e3f80 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c | |||
| @@ -218,6 +218,35 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, | |||
| 218 | 218 | ||
| 219 | CONFIGFS_ATTR(nvmet_, addr_trsvcid); | 219 | CONFIGFS_ATTR(nvmet_, addr_trsvcid); |
| 220 | 220 | ||
| 221 | static ssize_t nvmet_param_inline_data_size_show(struct config_item *item, | ||
| 222 | char *page) | ||
| 223 | { | ||
| 224 | struct nvmet_port *port = to_nvmet_port(item); | ||
| 225 | |||
| 226 | return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size); | ||
| 227 | } | ||
| 228 | |||
| 229 | static ssize_t nvmet_param_inline_data_size_store(struct config_item *item, | ||
| 230 | const char *page, size_t count) | ||
| 231 | { | ||
| 232 | struct nvmet_port *port = to_nvmet_port(item); | ||
| 233 | int ret; | ||
| 234 | |||
| 235 | if (port->enabled) { | ||
| 236 | pr_err("Cannot modify inline_data_size while port enabled\n"); | ||
| 237 | pr_err("Disable the port before modifying\n"); | ||
| 238 | return -EACCES; | ||
| 239 | } | ||
| 240 | ret = kstrtoint(page, 0, &port->inline_data_size); | ||
| 241 | if (ret) { | ||
| 242 | pr_err("Invalid value '%s' for inline_data_size\n", page); | ||
| 243 | return -EINVAL; | ||
| 244 | } | ||
| 245 | return count; | ||
| 246 | } | ||
| 247 | |||
| 248 | CONFIGFS_ATTR(nvmet_, param_inline_data_size); | ||
| 249 | |||
| 221 | static ssize_t nvmet_addr_trtype_show(struct config_item *item, | 250 | static ssize_t nvmet_addr_trtype_show(struct config_item *item, |
| 222 | char *page) | 251 | char *page) |
| 223 | { | 252 | { |
| @@ -387,6 +416,39 @@ out_unlock: | |||
| 387 | 416 | ||
| 388 | CONFIGFS_ATTR(nvmet_ns_, device_nguid); | 417 | CONFIGFS_ATTR(nvmet_ns_, device_nguid); |
| 389 | 418 | ||
| 419 | static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page) | ||
| 420 | { | ||
| 421 | return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid); | ||
| 422 | } | ||
| 423 | |||
| 424 | static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item, | ||
| 425 | const char *page, size_t count) | ||
| 426 | { | ||
| 427 | struct nvmet_ns *ns = to_nvmet_ns(item); | ||
| 428 | u32 oldgrpid, newgrpid; | ||
| 429 | int ret; | ||
| 430 | |||
| 431 | ret = kstrtou32(page, 0, &newgrpid); | ||
| 432 | if (ret) | ||
| 433 | return ret; | ||
| 434 | |||
| 435 | if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS) | ||
| 436 | return -EINVAL; | ||
| 437 | |||
| 438 | down_write(&nvmet_ana_sem); | ||
| 439 | oldgrpid = ns->anagrpid; | ||
| 440 | nvmet_ana_group_enabled[newgrpid]++; | ||
| 441 | ns->anagrpid = newgrpid; | ||
| 442 | nvmet_ana_group_enabled[oldgrpid]--; | ||
| 443 | nvmet_ana_chgcnt++; | ||
| 444 | up_write(&nvmet_ana_sem); | ||
| 445 | |||
| 446 | nvmet_send_ana_event(ns->subsys, NULL); | ||
| 447 | return count; | ||
| 448 | } | ||
| 449 | |||
| 450 | CONFIGFS_ATTR(nvmet_ns_, ana_grpid); | ||
| 451 | |||
| 390 | static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) | 452 | static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) |
| 391 | { | 453 | { |
| 392 | return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled); | 454 | return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled); |
| @@ -412,11 +474,41 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, | |||
| 412 | 474 | ||
| 413 | CONFIGFS_ATTR(nvmet_ns_, enable); | 475 | CONFIGFS_ATTR(nvmet_ns_, enable); |
| 414 | 476 | ||
| 477 | static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page) | ||
| 478 | { | ||
| 479 | return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io); | ||
| 480 | } | ||
| 481 | |||
| 482 | static ssize_t nvmet_ns_buffered_io_store(struct config_item *item, | ||
| 483 | const char *page, size_t count) | ||
| 484 | { | ||
| 485 | struct nvmet_ns *ns = to_nvmet_ns(item); | ||
| 486 | bool val; | ||
| 487 | |||
| 488 | if (strtobool(page, &val)) | ||
| 489 | return -EINVAL; | ||
| 490 | |||
| 491 | mutex_lock(&ns->subsys->lock); | ||
| 492 | if (ns->enabled) { | ||
| 493 | pr_err("disable ns before setting buffered_io value.\n"); | ||
| 494 | mutex_unlock(&ns->subsys->lock); | ||
| 495 | return -EINVAL; | ||
| 496 | } | ||
| 497 | |||
| 498 | ns->buffered_io = val; | ||
| 499 | mutex_unlock(&ns->subsys->lock); | ||
| 500 | return count; | ||
| 501 | } | ||
| 502 | |||
| 503 | CONFIGFS_ATTR(nvmet_ns_, buffered_io); | ||
| 504 | |||
| 415 | static struct configfs_attribute *nvmet_ns_attrs[] = { | 505 | static struct configfs_attribute *nvmet_ns_attrs[] = { |
| 416 | &nvmet_ns_attr_device_path, | 506 | &nvmet_ns_attr_device_path, |
| 417 | &nvmet_ns_attr_device_nguid, | 507 | &nvmet_ns_attr_device_nguid, |
| 418 | &nvmet_ns_attr_device_uuid, | 508 | &nvmet_ns_attr_device_uuid, |
| 509 | &nvmet_ns_attr_ana_grpid, | ||
| 419 | &nvmet_ns_attr_enable, | 510 | &nvmet_ns_attr_enable, |
| 511 | &nvmet_ns_attr_buffered_io, | ||
| 420 | NULL, | 512 | NULL, |
| 421 | }; | 513 | }; |
| 422 | 514 | ||
| @@ -863,6 +955,134 @@ static const struct config_item_type nvmet_referrals_type = { | |||
| 863 | .ct_group_ops = &nvmet_referral_group_ops, | 955 | .ct_group_ops = &nvmet_referral_group_ops, |
| 864 | }; | 956 | }; |
| 865 | 957 | ||
| 958 | static struct { | ||
| 959 | enum nvme_ana_state state; | ||
| 960 | const char *name; | ||
| 961 | } nvmet_ana_state_names[] = { | ||
| 962 | { NVME_ANA_OPTIMIZED, "optimized" }, | ||
| 963 | { NVME_ANA_NONOPTIMIZED, "non-optimized" }, | ||
| 964 | { NVME_ANA_INACCESSIBLE, "inaccessible" }, | ||
| 965 | { NVME_ANA_PERSISTENT_LOSS, "persistent-loss" }, | ||
| 966 | { NVME_ANA_CHANGE, "change" }, | ||
| 967 | }; | ||
| 968 | |||
| 969 | static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item, | ||
| 970 | char *page) | ||
| 971 | { | ||
| 972 | struct nvmet_ana_group *grp = to_ana_group(item); | ||
| 973 | enum nvme_ana_state state = grp->port->ana_state[grp->grpid]; | ||
| 974 | int i; | ||
| 975 | |||
| 976 | for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) { | ||
| 977 | if (state != nvmet_ana_state_names[i].state) | ||
| 978 | continue; | ||
| 979 | return sprintf(page, "%s\n", nvmet_ana_state_names[i].name); | ||
| 980 | } | ||
| 981 | |||
| 982 | return sprintf(page, "\n"); | ||
| 983 | } | ||
| 984 | |||
| 985 | static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item, | ||
| 986 | const char *page, size_t count) | ||
| 987 | { | ||
| 988 | struct nvmet_ana_group *grp = to_ana_group(item); | ||
| 989 | int i; | ||
| 990 | |||
| 991 | for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) { | ||
| 992 | if (sysfs_streq(page, nvmet_ana_state_names[i].name)) | ||
| 993 | goto found; | ||
| 994 | } | ||
| 995 | |||
| 996 | pr_err("Invalid value '%s' for ana_state\n", page); | ||
| 997 | return -EINVAL; | ||
| 998 | |||
| 999 | found: | ||
| 1000 | down_write(&nvmet_ana_sem); | ||
| 1001 | grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state; | ||
| 1002 | nvmet_ana_chgcnt++; | ||
| 1003 | up_write(&nvmet_ana_sem); | ||
| 1004 | |||
| 1005 | nvmet_port_send_ana_event(grp->port); | ||
| 1006 | return count; | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | CONFIGFS_ATTR(nvmet_ana_group_, ana_state); | ||
| 1010 | |||
| 1011 | static struct configfs_attribute *nvmet_ana_group_attrs[] = { | ||
| 1012 | &nvmet_ana_group_attr_ana_state, | ||
| 1013 | NULL, | ||
| 1014 | }; | ||
| 1015 | |||
| 1016 | static void nvmet_ana_group_release(struct config_item *item) | ||
| 1017 | { | ||
| 1018 | struct nvmet_ana_group *grp = to_ana_group(item); | ||
| 1019 | |||
| 1020 | if (grp == &grp->port->ana_default_group) | ||
| 1021 | return; | ||
| 1022 | |||
| 1023 | down_write(&nvmet_ana_sem); | ||
| 1024 | grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE; | ||
| 1025 | nvmet_ana_group_enabled[grp->grpid]--; | ||
| 1026 | up_write(&nvmet_ana_sem); | ||
| 1027 | |||
| 1028 | nvmet_port_send_ana_event(grp->port); | ||
| 1029 | kfree(grp); | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | static struct configfs_item_operations nvmet_ana_group_item_ops = { | ||
| 1033 | .release = nvmet_ana_group_release, | ||
| 1034 | }; | ||
| 1035 | |||
| 1036 | static const struct config_item_type nvmet_ana_group_type = { | ||
| 1037 | .ct_item_ops = &nvmet_ana_group_item_ops, | ||
| 1038 | .ct_attrs = nvmet_ana_group_attrs, | ||
| 1039 | .ct_owner = THIS_MODULE, | ||
| 1040 | }; | ||
| 1041 | |||
| 1042 | static struct config_group *nvmet_ana_groups_make_group( | ||
| 1043 | struct config_group *group, const char *name) | ||
| 1044 | { | ||
| 1045 | struct nvmet_port *port = ana_groups_to_port(&group->cg_item); | ||
| 1046 | struct nvmet_ana_group *grp; | ||
| 1047 | u32 grpid; | ||
| 1048 | int ret; | ||
| 1049 | |||
| 1050 | ret = kstrtou32(name, 0, &grpid); | ||
| 1051 | if (ret) | ||
| 1052 | goto out; | ||
| 1053 | |||
| 1054 | ret = -EINVAL; | ||
| 1055 | if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS) | ||
| 1056 | goto out; | ||
| 1057 | |||
| 1058 | ret = -ENOMEM; | ||
| 1059 | grp = kzalloc(sizeof(*grp), GFP_KERNEL); | ||
| 1060 | if (!grp) | ||
| 1061 | goto out; | ||
| 1062 | grp->port = port; | ||
| 1063 | grp->grpid = grpid; | ||
| 1064 | |||
| 1065 | down_write(&nvmet_ana_sem); | ||
| 1066 | nvmet_ana_group_enabled[grpid]++; | ||
| 1067 | up_write(&nvmet_ana_sem); | ||
| 1068 | |||
| 1069 | nvmet_port_send_ana_event(grp->port); | ||
| 1070 | |||
| 1071 | config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type); | ||
| 1072 | return &grp->group; | ||
| 1073 | out: | ||
| 1074 | return ERR_PTR(ret); | ||
| 1075 | } | ||
| 1076 | |||
| 1077 | static struct configfs_group_operations nvmet_ana_groups_group_ops = { | ||
| 1078 | .make_group = nvmet_ana_groups_make_group, | ||
| 1079 | }; | ||
| 1080 | |||
| 1081 | static const struct config_item_type nvmet_ana_groups_type = { | ||
| 1082 | .ct_group_ops = &nvmet_ana_groups_group_ops, | ||
| 1083 | .ct_owner = THIS_MODULE, | ||
| 1084 | }; | ||
| 1085 | |||
| 866 | /* | 1086 | /* |
| 867 | * Ports definitions. | 1087 | * Ports definitions. |
| 868 | */ | 1088 | */ |
| @@ -870,6 +1090,7 @@ static void nvmet_port_release(struct config_item *item) | |||
| 870 | { | 1090 | { |
| 871 | struct nvmet_port *port = to_nvmet_port(item); | 1091 | struct nvmet_port *port = to_nvmet_port(item); |
| 872 | 1092 | ||
| 1093 | kfree(port->ana_state); | ||
| 873 | kfree(port); | 1094 | kfree(port); |
| 874 | } | 1095 | } |
| 875 | 1096 | ||
| @@ -879,6 +1100,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = { | |||
| 879 | &nvmet_attr_addr_traddr, | 1100 | &nvmet_attr_addr_traddr, |
| 880 | &nvmet_attr_addr_trsvcid, | 1101 | &nvmet_attr_addr_trsvcid, |
| 881 | &nvmet_attr_addr_trtype, | 1102 | &nvmet_attr_addr_trtype, |
| 1103 | &nvmet_attr_param_inline_data_size, | ||
| 882 | NULL, | 1104 | NULL, |
| 883 | }; | 1105 | }; |
| 884 | 1106 | ||
| @@ -897,6 +1119,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, | |||
| 897 | { | 1119 | { |
| 898 | struct nvmet_port *port; | 1120 | struct nvmet_port *port; |
| 899 | u16 portid; | 1121 | u16 portid; |
| 1122 | u32 i; | ||
| 900 | 1123 | ||
| 901 | if (kstrtou16(name, 0, &portid)) | 1124 | if (kstrtou16(name, 0, &portid)) |
| 902 | return ERR_PTR(-EINVAL); | 1125 | return ERR_PTR(-EINVAL); |
| @@ -905,9 +1128,24 @@ static struct config_group *nvmet_ports_make(struct config_group *group, | |||
| 905 | if (!port) | 1128 | if (!port) |
| 906 | return ERR_PTR(-ENOMEM); | 1129 | return ERR_PTR(-ENOMEM); |
| 907 | 1130 | ||
| 1131 | port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1, | ||
| 1132 | sizeof(*port->ana_state), GFP_KERNEL); | ||
| 1133 | if (!port->ana_state) { | ||
| 1134 | kfree(port); | ||
| 1135 | return ERR_PTR(-ENOMEM); | ||
| 1136 | } | ||
| 1137 | |||
| 1138 | for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) { | ||
| 1139 | if (i == NVMET_DEFAULT_ANA_GRPID) | ||
| 1140 | port->ana_state[1] = NVME_ANA_OPTIMIZED; | ||
| 1141 | else | ||
| 1142 | port->ana_state[i] = NVME_ANA_INACCESSIBLE; | ||
| 1143 | } | ||
| 1144 | |||
| 908 | INIT_LIST_HEAD(&port->entry); | 1145 | INIT_LIST_HEAD(&port->entry); |
| 909 | INIT_LIST_HEAD(&port->subsystems); | 1146 | INIT_LIST_HEAD(&port->subsystems); |
| 910 | INIT_LIST_HEAD(&port->referrals); | 1147 | INIT_LIST_HEAD(&port->referrals); |
| 1148 | port->inline_data_size = -1; /* < 0 == let the transport choose */ | ||
| 911 | 1149 | ||
| 912 | port->disc_addr.portid = cpu_to_le16(portid); | 1150 | port->disc_addr.portid = cpu_to_le16(portid); |
| 913 | config_group_init_type_name(&port->group, name, &nvmet_port_type); | 1151 | config_group_init_type_name(&port->group, name, &nvmet_port_type); |
| @@ -920,6 +1158,18 @@ static struct config_group *nvmet_ports_make(struct config_group *group, | |||
| 920 | "referrals", &nvmet_referrals_type); | 1158 | "referrals", &nvmet_referrals_type); |
| 921 | configfs_add_default_group(&port->referrals_group, &port->group); | 1159 | configfs_add_default_group(&port->referrals_group, &port->group); |
| 922 | 1160 | ||
| 1161 | config_group_init_type_name(&port->ana_groups_group, | ||
| 1162 | "ana_groups", &nvmet_ana_groups_type); | ||
| 1163 | configfs_add_default_group(&port->ana_groups_group, &port->group); | ||
| 1164 | |||
| 1165 | port->ana_default_group.port = port; | ||
| 1166 | port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID; | ||
| 1167 | config_group_init_type_name(&port->ana_default_group.group, | ||
| 1168 | __stringify(NVMET_DEFAULT_ANA_GRPID), | ||
| 1169 | &nvmet_ana_group_type); | ||
| 1170 | configfs_add_default_group(&port->ana_default_group.group, | ||
| 1171 | &port->ana_groups_group); | ||
| 1172 | |||
| 923 | return &port->group; | 1173 | return &port->group; |
| 924 | } | 1174 | } |
| 925 | 1175 | ||
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 9838103f2d62..ebf3e7a6c49e 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | 18 | ||
| 19 | #include "nvmet.h" | 19 | #include "nvmet.h" |
| 20 | 20 | ||
| 21 | struct workqueue_struct *buffered_io_wq; | ||
| 21 | static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; | 22 | static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; |
| 22 | static DEFINE_IDA(cntlid_ida); | 23 | static DEFINE_IDA(cntlid_ida); |
| 23 | 24 | ||
| @@ -39,6 +40,10 @@ static DEFINE_IDA(cntlid_ida); | |||
| 39 | */ | 40 | */ |
| 40 | DECLARE_RWSEM(nvmet_config_sem); | 41 | DECLARE_RWSEM(nvmet_config_sem); |
| 41 | 42 | ||
| 43 | u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; | ||
| 44 | u64 nvmet_ana_chgcnt; | ||
| 45 | DECLARE_RWSEM(nvmet_ana_sem); | ||
| 46 | |||
| 42 | static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, | 47 | static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, |
| 43 | const char *subsysnqn); | 48 | const char *subsysnqn); |
| 44 | 49 | ||
| @@ -175,7 +180,7 @@ out_unlock: | |||
| 175 | mutex_unlock(&ctrl->lock); | 180 | mutex_unlock(&ctrl->lock); |
| 176 | } | 181 | } |
| 177 | 182 | ||
| 178 | static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) | 183 | void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) |
| 179 | { | 184 | { |
| 180 | struct nvmet_ctrl *ctrl; | 185 | struct nvmet_ctrl *ctrl; |
| 181 | 186 | ||
| @@ -189,6 +194,33 @@ static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) | |||
| 189 | } | 194 | } |
| 190 | } | 195 | } |
| 191 | 196 | ||
| 197 | void nvmet_send_ana_event(struct nvmet_subsys *subsys, | ||
| 198 | struct nvmet_port *port) | ||
| 199 | { | ||
| 200 | struct nvmet_ctrl *ctrl; | ||
| 201 | |||
| 202 | mutex_lock(&subsys->lock); | ||
| 203 | list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { | ||
| 204 | if (port && ctrl->port != port) | ||
| 205 | continue; | ||
| 206 | if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) | ||
| 207 | continue; | ||
| 208 | nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, | ||
| 209 | NVME_AER_NOTICE_ANA, NVME_LOG_ANA); | ||
| 210 | } | ||
| 211 | mutex_unlock(&subsys->lock); | ||
| 212 | } | ||
| 213 | |||
| 214 | void nvmet_port_send_ana_event(struct nvmet_port *port) | ||
| 215 | { | ||
| 216 | struct nvmet_subsys_link *p; | ||
| 217 | |||
| 218 | down_read(&nvmet_config_sem); | ||
| 219 | list_for_each_entry(p, &port->subsystems, entry) | ||
| 220 | nvmet_send_ana_event(p->subsys, port); | ||
| 221 | up_read(&nvmet_config_sem); | ||
| 222 | } | ||
| 223 | |||
| 192 | int nvmet_register_transport(const struct nvmet_fabrics_ops *ops) | 224 | int nvmet_register_transport(const struct nvmet_fabrics_ops *ops) |
| 193 | { | 225 | { |
| 194 | int ret = 0; | 226 | int ret = 0; |
| @@ -241,6 +273,10 @@ int nvmet_enable_port(struct nvmet_port *port) | |||
| 241 | return ret; | 273 | return ret; |
| 242 | } | 274 | } |
| 243 | 275 | ||
| 276 | /* If the transport didn't set inline_data_size, then disable it. */ | ||
| 277 | if (port->inline_data_size < 0) | ||
| 278 | port->inline_data_size = 0; | ||
| 279 | |||
| 244 | port->enabled = true; | 280 | port->enabled = true; |
| 245 | return 0; | 281 | return 0; |
| 246 | } | 282 | } |
| @@ -332,9 +368,13 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns) | |||
| 332 | int nvmet_ns_enable(struct nvmet_ns *ns) | 368 | int nvmet_ns_enable(struct nvmet_ns *ns) |
| 333 | { | 369 | { |
| 334 | struct nvmet_subsys *subsys = ns->subsys; | 370 | struct nvmet_subsys *subsys = ns->subsys; |
| 335 | int ret = 0; | 371 | int ret; |
| 336 | 372 | ||
| 337 | mutex_lock(&subsys->lock); | 373 | mutex_lock(&subsys->lock); |
| 374 | ret = -EMFILE; | ||
| 375 | if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) | ||
| 376 | goto out_unlock; | ||
| 377 | ret = 0; | ||
| 338 | if (ns->enabled) | 378 | if (ns->enabled) |
| 339 | goto out_unlock; | 379 | goto out_unlock; |
| 340 | 380 | ||
| @@ -369,6 +409,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns) | |||
| 369 | 409 | ||
| 370 | list_add_tail_rcu(&ns->dev_link, &old->dev_link); | 410 | list_add_tail_rcu(&ns->dev_link, &old->dev_link); |
| 371 | } | 411 | } |
| 412 | subsys->nr_namespaces++; | ||
| 372 | 413 | ||
| 373 | nvmet_ns_changed(subsys, ns->nsid); | 414 | nvmet_ns_changed(subsys, ns->nsid); |
| 374 | ns->enabled = true; | 415 | ns->enabled = true; |
| @@ -409,6 +450,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) | |||
| 409 | percpu_ref_exit(&ns->ref); | 450 | percpu_ref_exit(&ns->ref); |
| 410 | 451 | ||
| 411 | mutex_lock(&subsys->lock); | 452 | mutex_lock(&subsys->lock); |
| 453 | subsys->nr_namespaces--; | ||
| 412 | nvmet_ns_changed(subsys, ns->nsid); | 454 | nvmet_ns_changed(subsys, ns->nsid); |
| 413 | nvmet_ns_dev_disable(ns); | 455 | nvmet_ns_dev_disable(ns); |
| 414 | out_unlock: | 456 | out_unlock: |
| @@ -419,6 +461,10 @@ void nvmet_ns_free(struct nvmet_ns *ns) | |||
| 419 | { | 461 | { |
| 420 | nvmet_ns_disable(ns); | 462 | nvmet_ns_disable(ns); |
| 421 | 463 | ||
| 464 | down_write(&nvmet_ana_sem); | ||
| 465 | nvmet_ana_group_enabled[ns->anagrpid]--; | ||
| 466 | up_write(&nvmet_ana_sem); | ||
| 467 | |||
| 422 | kfree(ns->device_path); | 468 | kfree(ns->device_path); |
| 423 | kfree(ns); | 469 | kfree(ns); |
| 424 | } | 470 | } |
| @@ -436,7 +482,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) | |||
| 436 | 482 | ||
| 437 | ns->nsid = nsid; | 483 | ns->nsid = nsid; |
| 438 | ns->subsys = subsys; | 484 | ns->subsys = subsys; |
| 485 | |||
| 486 | down_write(&nvmet_ana_sem); | ||
| 487 | ns->anagrpid = NVMET_DEFAULT_ANA_GRPID; | ||
| 488 | nvmet_ana_group_enabled[ns->anagrpid]++; | ||
| 489 | up_write(&nvmet_ana_sem); | ||
| 490 | |||
| 439 | uuid_gen(&ns->uuid); | 491 | uuid_gen(&ns->uuid); |
| 492 | ns->buffered_io = false; | ||
| 440 | 493 | ||
| 441 | return ns; | 494 | return ns; |
| 442 | } | 495 | } |
| @@ -542,6 +595,35 @@ int nvmet_sq_init(struct nvmet_sq *sq) | |||
| 542 | } | 595 | } |
| 543 | EXPORT_SYMBOL_GPL(nvmet_sq_init); | 596 | EXPORT_SYMBOL_GPL(nvmet_sq_init); |
| 544 | 597 | ||
| 598 | static inline u16 nvmet_check_ana_state(struct nvmet_port *port, | ||
| 599 | struct nvmet_ns *ns) | ||
| 600 | { | ||
| 601 | enum nvme_ana_state state = port->ana_state[ns->anagrpid]; | ||
| 602 | |||
| 603 | if (unlikely(state == NVME_ANA_INACCESSIBLE)) | ||
| 604 | return NVME_SC_ANA_INACCESSIBLE; | ||
| 605 | if (unlikely(state == NVME_ANA_PERSISTENT_LOSS)) | ||
| 606 | return NVME_SC_ANA_PERSISTENT_LOSS; | ||
| 607 | if (unlikely(state == NVME_ANA_CHANGE)) | ||
| 608 | return NVME_SC_ANA_TRANSITION; | ||
| 609 | return 0; | ||
| 610 | } | ||
| 611 | |||
| 612 | static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req) | ||
| 613 | { | ||
| 614 | if (unlikely(req->ns->readonly)) { | ||
| 615 | switch (req->cmd->common.opcode) { | ||
| 616 | case nvme_cmd_read: | ||
| 617 | case nvme_cmd_flush: | ||
| 618 | break; | ||
| 619 | default: | ||
| 620 | return NVME_SC_NS_WRITE_PROTECTED; | ||
| 621 | } | ||
| 622 | } | ||
| 623 | |||
| 624 | return 0; | ||
| 625 | } | ||
| 626 | |||
| 545 | static u16 nvmet_parse_io_cmd(struct nvmet_req *req) | 627 | static u16 nvmet_parse_io_cmd(struct nvmet_req *req) |
| 546 | { | 628 | { |
| 547 | struct nvme_command *cmd = req->cmd; | 629 | struct nvme_command *cmd = req->cmd; |
| @@ -554,6 +636,12 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) | |||
| 554 | req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); | 636 | req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); |
| 555 | if (unlikely(!req->ns)) | 637 | if (unlikely(!req->ns)) |
| 556 | return NVME_SC_INVALID_NS | NVME_SC_DNR; | 638 | return NVME_SC_INVALID_NS | NVME_SC_DNR; |
| 639 | ret = nvmet_check_ana_state(req->port, req->ns); | ||
| 640 | if (unlikely(ret)) | ||
| 641 | return ret; | ||
| 642 | ret = nvmet_io_cmd_check_access(req); | ||
| 643 | if (unlikely(ret)) | ||
| 644 | return ret; | ||
| 557 | 645 | ||
| 558 | if (req->ns->file) | 646 | if (req->ns->file) |
| 559 | return nvmet_file_parse_io_cmd(req); | 647 | return nvmet_file_parse_io_cmd(req); |
| @@ -870,6 +958,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, | |||
| 870 | 958 | ||
| 871 | nvmet_init_cap(ctrl); | 959 | nvmet_init_cap(ctrl); |
| 872 | 960 | ||
| 961 | ctrl->port = req->port; | ||
| 962 | |||
| 873 | INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); | 963 | INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); |
| 874 | INIT_LIST_HEAD(&ctrl->async_events); | 964 | INIT_LIST_HEAD(&ctrl->async_events); |
| 875 | 965 | ||
| @@ -1109,6 +1199,15 @@ static int __init nvmet_init(void) | |||
| 1109 | { | 1199 | { |
| 1110 | int error; | 1200 | int error; |
| 1111 | 1201 | ||
| 1202 | nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1; | ||
| 1203 | |||
| 1204 | buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", | ||
| 1205 | WQ_MEM_RECLAIM, 0); | ||
| 1206 | if (!buffered_io_wq) { | ||
| 1207 | error = -ENOMEM; | ||
| 1208 | goto out; | ||
| 1209 | } | ||
| 1210 | |||
| 1112 | error = nvmet_init_discovery(); | 1211 | error = nvmet_init_discovery(); |
| 1113 | if (error) | 1212 | if (error) |
| 1114 | goto out; | 1213 | goto out; |
| @@ -1129,6 +1228,7 @@ static void __exit nvmet_exit(void) | |||
| 1129 | nvmet_exit_configfs(); | 1228 | nvmet_exit_configfs(); |
| 1130 | nvmet_exit_discovery(); | 1229 | nvmet_exit_discovery(); |
| 1131 | ida_destroy(&cntlid_ida); | 1230 | ida_destroy(&cntlid_ida); |
| 1231 | destroy_workqueue(buffered_io_wq); | ||
| 1132 | 1232 | ||
| 1133 | BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); | 1233 | BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); |
| 1134 | BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); | 1234 | BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); |
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 08656b849bd6..eae29f493a07 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c | |||
| @@ -171,7 +171,7 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) | |||
| 171 | id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ | 171 | id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ |
| 172 | if (ctrl->ops->has_keyed_sgls) | 172 | if (ctrl->ops->has_keyed_sgls) |
| 173 | id->sgls |= cpu_to_le32(1 << 2); | 173 | id->sgls |= cpu_to_le32(1 << 2); |
| 174 | if (ctrl->ops->sqe_inline_size) | 174 | if (req->port->inline_data_size) |
| 175 | id->sgls |= cpu_to_le32(1 << 20); | 175 | id->sgls |= cpu_to_le32(1 << 20); |
| 176 | 176 | ||
| 177 | strcpy(id->subnqn, ctrl->subsys->subsysnqn); | 177 | strcpy(id->subnqn, ctrl->subsys->subsysnqn); |
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index e0b0f7df70c2..7bc9f6240432 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c | |||
| @@ -124,6 +124,13 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) | |||
| 124 | submit_bio(bio); | 124 | submit_bio(bio); |
| 125 | } | 125 | } |
| 126 | 126 | ||
| 127 | u16 nvmet_bdev_flush(struct nvmet_req *req) | ||
| 128 | { | ||
| 129 | if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL, NULL)) | ||
| 130 | return NVME_SC_INTERNAL | NVME_SC_DNR; | ||
| 131 | return 0; | ||
| 132 | } | ||
| 133 | |||
| 127 | static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, | 134 | static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, |
| 128 | struct nvme_dsm_range *range, struct bio **bio) | 135 | struct nvme_dsm_range *range, struct bio **bio) |
| 129 | { | 136 | { |
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 8c42b3a8c420..81a9dc5290a8 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c | |||
| @@ -16,6 +16,8 @@ | |||
| 16 | void nvmet_file_ns_disable(struct nvmet_ns *ns) | 16 | void nvmet_file_ns_disable(struct nvmet_ns *ns) |
| 17 | { | 17 | { |
| 18 | if (ns->file) { | 18 | if (ns->file) { |
| 19 | if (ns->buffered_io) | ||
| 20 | flush_workqueue(buffered_io_wq); | ||
| 19 | mempool_destroy(ns->bvec_pool); | 21 | mempool_destroy(ns->bvec_pool); |
| 20 | ns->bvec_pool = NULL; | 22 | ns->bvec_pool = NULL; |
| 21 | kmem_cache_destroy(ns->bvec_cache); | 23 | kmem_cache_destroy(ns->bvec_cache); |
| @@ -27,11 +29,14 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) | |||
| 27 | 29 | ||
| 28 | int nvmet_file_ns_enable(struct nvmet_ns *ns) | 30 | int nvmet_file_ns_enable(struct nvmet_ns *ns) |
| 29 | { | 31 | { |
| 30 | int ret; | 32 | int flags = O_RDWR | O_LARGEFILE; |
| 31 | struct kstat stat; | 33 | struct kstat stat; |
| 34 | int ret; | ||
| 35 | |||
| 36 | if (!ns->buffered_io) | ||
| 37 | flags |= O_DIRECT; | ||
| 32 | 38 | ||
| 33 | ns->file = filp_open(ns->device_path, | 39 | ns->file = filp_open(ns->device_path, flags, 0); |
| 34 | O_RDWR | O_LARGEFILE | O_DIRECT, 0); | ||
| 35 | if (IS_ERR(ns->file)) { | 40 | if (IS_ERR(ns->file)) { |
| 36 | pr_err("failed to open file %s: (%ld)\n", | 41 | pr_err("failed to open file %s: (%ld)\n", |
| 37 | ns->device_path, PTR_ERR(ns->file)); | 42 | ns->device_path, PTR_ERR(ns->file)); |
| @@ -100,7 +105,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, | |||
| 100 | 105 | ||
| 101 | iocb->ki_pos = pos; | 106 | iocb->ki_pos = pos; |
| 102 | iocb->ki_filp = req->ns->file; | 107 | iocb->ki_filp = req->ns->file; |
| 103 | iocb->ki_flags = IOCB_DIRECT | ki_flags; | 108 | iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); |
| 104 | 109 | ||
| 105 | ret = call_iter(iocb, &iter); | 110 | ret = call_iter(iocb, &iter); |
| 106 | 111 | ||
| @@ -140,6 +145,12 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) | |||
| 140 | return; | 145 | return; |
| 141 | } | 146 | } |
| 142 | 147 | ||
| 148 | pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; | ||
| 149 | if (unlikely(pos + req->data_len > req->ns->size)) { | ||
| 150 | nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); | ||
| 151 | return; | ||
| 152 | } | ||
| 153 | |||
| 143 | if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) | 154 | if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) |
| 144 | req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), | 155 | req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), |
| 145 | GFP_KERNEL); | 156 | GFP_KERNEL); |
| @@ -155,8 +166,6 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) | |||
| 155 | is_sync = true; | 166 | is_sync = true; |
| 156 | } | 167 | } |
| 157 | 168 | ||
| 158 | pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; | ||
| 159 | |||
| 160 | memset(&req->f.iocb, 0, sizeof(struct kiocb)); | 169 | memset(&req->f.iocb, 0, sizeof(struct kiocb)); |
| 161 | for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) { | 170 | for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) { |
| 162 | nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter); | 171 | nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter); |
| @@ -189,14 +198,31 @@ out: | |||
| 189 | nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); | 198 | nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); |
| 190 | } | 199 | } |
| 191 | 200 | ||
| 192 | static void nvmet_file_flush_work(struct work_struct *w) | 201 | static void nvmet_file_buffered_io_work(struct work_struct *w) |
| 193 | { | 202 | { |
| 194 | struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); | 203 | struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); |
| 195 | int ret; | ||
| 196 | 204 | ||
| 197 | ret = vfs_fsync(req->ns->file, 1); | 205 | nvmet_file_execute_rw(req); |
| 206 | } | ||
| 198 | 207 | ||
| 199 | nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); | 208 | static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) |
| 209 | { | ||
| 210 | INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); | ||
| 211 | queue_work(buffered_io_wq, &req->f.work); | ||
| 212 | } | ||
| 213 | |||
| 214 | u16 nvmet_file_flush(struct nvmet_req *req) | ||
| 215 | { | ||
| 216 | if (vfs_fsync(req->ns->file, 1) < 0) | ||
| 217 | return NVME_SC_INTERNAL | NVME_SC_DNR; | ||
| 218 | return 0; | ||
| 219 | } | ||
| 220 | |||
| 221 | static void nvmet_file_flush_work(struct work_struct *w) | ||
| 222 | { | ||
| 223 | struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); | ||
| 224 | |||
| 225 | nvmet_req_complete(req, nvmet_file_flush(req)); | ||
| 200 | } | 226 | } |
| 201 | 227 | ||
| 202 | static void nvmet_file_execute_flush(struct nvmet_req *req) | 228 | static void nvmet_file_execute_flush(struct nvmet_req *req) |
| @@ -209,22 +235,30 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) | |||
| 209 | { | 235 | { |
| 210 | int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; | 236 | int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; |
| 211 | struct nvme_dsm_range range; | 237 | struct nvme_dsm_range range; |
| 212 | loff_t offset; | 238 | loff_t offset, len; |
| 213 | loff_t len; | 239 | u16 ret; |
| 214 | int i, ret; | 240 | int i; |
| 215 | 241 | ||
| 216 | for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { | 242 | for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { |
| 217 | if (nvmet_copy_from_sgl(req, i * sizeof(range), &range, | 243 | ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range, |
| 218 | sizeof(range))) | 244 | sizeof(range)); |
| 245 | if (ret) | ||
| 219 | break; | 246 | break; |
| 247 | |||
| 220 | offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; | 248 | offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; |
| 221 | len = le32_to_cpu(range.nlb) << req->ns->blksize_shift; | 249 | len = le32_to_cpu(range.nlb) << req->ns->blksize_shift; |
| 222 | ret = vfs_fallocate(req->ns->file, mode, offset, len); | 250 | if (offset + len > req->ns->size) { |
| 223 | if (ret) | 251 | ret = NVME_SC_LBA_RANGE | NVME_SC_DNR; |
| 224 | break; | 252 | break; |
| 253 | } | ||
| 254 | |||
| 255 | if (vfs_fallocate(req->ns->file, mode, offset, len)) { | ||
| 256 | ret = NVME_SC_INTERNAL | NVME_SC_DNR; | ||
| 257 | break; | ||
| 258 | } | ||
| 225 | } | 259 | } |
| 226 | 260 | ||
| 227 | nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); | 261 | nvmet_req_complete(req, ret); |
| 228 | } | 262 | } |
| 229 | 263 | ||
| 230 | static void nvmet_file_dsm_work(struct work_struct *w) | 264 | static void nvmet_file_dsm_work(struct work_struct *w) |
| @@ -263,6 +297,11 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w) | |||
| 263 | len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << | 297 | len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << |
| 264 | req->ns->blksize_shift); | 298 | req->ns->blksize_shift); |
| 265 | 299 | ||
| 300 | if (unlikely(offset + len > req->ns->size)) { | ||
| 301 | nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); | ||
| 302 | return; | ||
| 303 | } | ||
| 304 | |||
| 266 | ret = vfs_fallocate(req->ns->file, mode, offset, len); | 305 | ret = vfs_fallocate(req->ns->file, mode, offset, len); |
| 267 | nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); | 306 | nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); |
| 268 | } | 307 | } |
| @@ -280,7 +319,10 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) | |||
| 280 | switch (cmd->common.opcode) { | 319 | switch (cmd->common.opcode) { |
| 281 | case nvme_cmd_read: | 320 | case nvme_cmd_read: |
| 282 | case nvme_cmd_write: | 321 | case nvme_cmd_write: |
| 283 | req->execute = nvmet_file_execute_rw; | 322 | if (req->ns->buffered_io) |
| 323 | req->execute = nvmet_file_execute_rw_buffered_io; | ||
| 324 | else | ||
| 325 | req->execute = nvmet_file_execute_rw; | ||
| 284 | req->data_len = nvmet_rw_len(req); | 326 | req->data_len = nvmet_rw_len(req); |
| 285 | return 0; | 327 | return 0; |
| 286 | case nvme_cmd_flush: | 328 | case nvme_cmd_flush: |
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index ae7586b8be07..9908082b32c4 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c | |||
| @@ -227,6 +227,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, | |||
| 227 | { | 227 | { |
| 228 | struct nvme_loop_ctrl *ctrl = set->driver_data; | 228 | struct nvme_loop_ctrl *ctrl = set->driver_data; |
| 229 | 229 | ||
| 230 | nvme_req(req)->ctrl = &ctrl->ctrl; | ||
| 230 | return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), | 231 | return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), |
| 231 | (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); | 232 | (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); |
| 232 | } | 233 | } |
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 480dfe10fad9..ec9af4ee03b6 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h | |||
| @@ -30,12 +30,11 @@ | |||
| 30 | #define NVMET_ASYNC_EVENTS 4 | 30 | #define NVMET_ASYNC_EVENTS 4 |
| 31 | #define NVMET_ERROR_LOG_SLOTS 128 | 31 | #define NVMET_ERROR_LOG_SLOTS 128 |
| 32 | 32 | ||
| 33 | |||
| 34 | /* | 33 | /* |
| 35 | * Supported optional AENs: | 34 | * Supported optional AENs: |
| 36 | */ | 35 | */ |
| 37 | #define NVMET_AEN_CFG_OPTIONAL \ | 36 | #define NVMET_AEN_CFG_OPTIONAL \ |
| 38 | NVME_AEN_CFG_NS_ATTR | 37 | (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) |
| 39 | 38 | ||
| 40 | /* | 39 | /* |
| 41 | * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): | 40 | * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): |
| @@ -59,12 +58,15 @@ struct nvmet_ns { | |||
| 59 | struct percpu_ref ref; | 58 | struct percpu_ref ref; |
| 60 | struct block_device *bdev; | 59 | struct block_device *bdev; |
| 61 | struct file *file; | 60 | struct file *file; |
| 61 | bool readonly; | ||
| 62 | u32 nsid; | 62 | u32 nsid; |
| 63 | u32 blksize_shift; | 63 | u32 blksize_shift; |
| 64 | loff_t size; | 64 | loff_t size; |
| 65 | u8 nguid[16]; | 65 | u8 nguid[16]; |
| 66 | uuid_t uuid; | 66 | uuid_t uuid; |
| 67 | u32 anagrpid; | ||
| 67 | 68 | ||
| 69 | bool buffered_io; | ||
| 68 | bool enabled; | 70 | bool enabled; |
| 69 | struct nvmet_subsys *subsys; | 71 | struct nvmet_subsys *subsys; |
| 70 | const char *device_path; | 72 | const char *device_path; |
| @@ -97,6 +99,18 @@ struct nvmet_sq { | |||
| 97 | struct completion confirm_done; | 99 | struct completion confirm_done; |
| 98 | }; | 100 | }; |
| 99 | 101 | ||
| 102 | struct nvmet_ana_group { | ||
| 103 | struct config_group group; | ||
| 104 | struct nvmet_port *port; | ||
| 105 | u32 grpid; | ||
| 106 | }; | ||
| 107 | |||
| 108 | static inline struct nvmet_ana_group *to_ana_group(struct config_item *item) | ||
| 109 | { | ||
| 110 | return container_of(to_config_group(item), struct nvmet_ana_group, | ||
| 111 | group); | ||
| 112 | } | ||
| 113 | |||
| 100 | /** | 114 | /** |
| 101 | * struct nvmet_port - Common structure to keep port | 115 | * struct nvmet_port - Common structure to keep port |
| 102 | * information for the target. | 116 | * information for the target. |
| @@ -114,8 +128,12 @@ struct nvmet_port { | |||
| 114 | struct list_head subsystems; | 128 | struct list_head subsystems; |
| 115 | struct config_group referrals_group; | 129 | struct config_group referrals_group; |
| 116 | struct list_head referrals; | 130 | struct list_head referrals; |
| 131 | struct config_group ana_groups_group; | ||
| 132 | struct nvmet_ana_group ana_default_group; | ||
| 133 | enum nvme_ana_state *ana_state; | ||
| 117 | void *priv; | 134 | void *priv; |
| 118 | bool enabled; | 135 | bool enabled; |
| 136 | int inline_data_size; | ||
| 119 | }; | 137 | }; |
| 120 | 138 | ||
| 121 | static inline struct nvmet_port *to_nvmet_port(struct config_item *item) | 139 | static inline struct nvmet_port *to_nvmet_port(struct config_item *item) |
| @@ -124,6 +142,13 @@ static inline struct nvmet_port *to_nvmet_port(struct config_item *item) | |||
| 124 | group); | 142 | group); |
| 125 | } | 143 | } |
| 126 | 144 | ||
| 145 | static inline struct nvmet_port *ana_groups_to_port( | ||
| 146 | struct config_item *item) | ||
| 147 | { | ||
| 148 | return container_of(to_config_group(item), struct nvmet_port, | ||
| 149 | ana_groups_group); | ||
| 150 | } | ||
| 151 | |||
| 127 | struct nvmet_ctrl { | 152 | struct nvmet_ctrl { |
| 128 | struct nvmet_subsys *subsys; | 153 | struct nvmet_subsys *subsys; |
| 129 | struct nvmet_cq **cqs; | 154 | struct nvmet_cq **cqs; |
| @@ -138,6 +163,8 @@ struct nvmet_ctrl { | |||
| 138 | u16 cntlid; | 163 | u16 cntlid; |
| 139 | u32 kato; | 164 | u32 kato; |
| 140 | 165 | ||
| 166 | struct nvmet_port *port; | ||
| 167 | |||
| 141 | u32 aen_enabled; | 168 | u32 aen_enabled; |
| 142 | unsigned long aen_masked; | 169 | unsigned long aen_masked; |
| 143 | struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; | 170 | struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; |
| @@ -166,6 +193,7 @@ struct nvmet_subsys { | |||
| 166 | struct kref ref; | 193 | struct kref ref; |
| 167 | 194 | ||
| 168 | struct list_head namespaces; | 195 | struct list_head namespaces; |
| 196 | unsigned int nr_namespaces; | ||
| 169 | unsigned int max_nsid; | 197 | unsigned int max_nsid; |
| 170 | 198 | ||
| 171 | struct list_head ctrls; | 199 | struct list_head ctrls; |
| @@ -225,7 +253,6 @@ struct nvmet_req; | |||
| 225 | struct nvmet_fabrics_ops { | 253 | struct nvmet_fabrics_ops { |
| 226 | struct module *owner; | 254 | struct module *owner; |
| 227 | unsigned int type; | 255 | unsigned int type; |
| 228 | unsigned int sqe_inline_size; | ||
| 229 | unsigned int msdbd; | 256 | unsigned int msdbd; |
| 230 | bool has_keyed_sgls : 1; | 257 | bool has_keyed_sgls : 1; |
| 231 | void (*queue_response)(struct nvmet_req *req); | 258 | void (*queue_response)(struct nvmet_req *req); |
| @@ -269,6 +296,8 @@ struct nvmet_req { | |||
| 269 | const struct nvmet_fabrics_ops *ops; | 296 | const struct nvmet_fabrics_ops *ops; |
| 270 | }; | 297 | }; |
| 271 | 298 | ||
| 299 | extern struct workqueue_struct *buffered_io_wq; | ||
| 300 | |||
| 272 | static inline void nvmet_set_status(struct nvmet_req *req, u16 status) | 301 | static inline void nvmet_set_status(struct nvmet_req *req, u16 status) |
| 273 | { | 302 | { |
| 274 | req->rsp->status = cpu_to_le16(status << 1); | 303 | req->rsp->status = cpu_to_le16(status << 1); |
| @@ -337,6 +366,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns); | |||
| 337 | struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); | 366 | struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); |
| 338 | void nvmet_ns_free(struct nvmet_ns *ns); | 367 | void nvmet_ns_free(struct nvmet_ns *ns); |
| 339 | 368 | ||
| 369 | void nvmet_send_ana_event(struct nvmet_subsys *subsys, | ||
| 370 | struct nvmet_port *port); | ||
| 371 | void nvmet_port_send_ana_event(struct nvmet_port *port); | ||
| 372 | |||
| 340 | int nvmet_register_transport(const struct nvmet_fabrics_ops *ops); | 373 | int nvmet_register_transport(const struct nvmet_fabrics_ops *ops); |
| 341 | void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops); | 374 | void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops); |
| 342 | 375 | ||
| @@ -357,6 +390,22 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); | |||
| 357 | #define NVMET_QUEUE_SIZE 1024 | 390 | #define NVMET_QUEUE_SIZE 1024 |
| 358 | #define NVMET_NR_QUEUES 128 | 391 | #define NVMET_NR_QUEUES 128 |
| 359 | #define NVMET_MAX_CMD NVMET_QUEUE_SIZE | 392 | #define NVMET_MAX_CMD NVMET_QUEUE_SIZE |
| 393 | |||
| 394 | /* | ||
| 395 | * Nice round number that makes a list of nsids fit into a page. | ||
| 396 | * Should become tunable at some point in the future. | ||
| 397 | */ | ||
| 398 | #define NVMET_MAX_NAMESPACES 1024 | ||
| 399 | |||
| 400 | /* | ||
| 401 | * 0 is not a valid ANA group ID, so we start numbering at 1. | ||
| 402 | * | ||
| 403 | * ANA Group 1 exists without manual intervention, has namespaces assigned to it | ||
| 404 | * by default, and is available in an optimized state through all ports. | ||
| 405 | */ | ||
| 406 | #define NVMET_MAX_ANAGRPS 128 | ||
| 407 | #define NVMET_DEFAULT_ANA_GRPID 1 | ||
| 408 | |||
| 360 | #define NVMET_KAS 10 | 409 | #define NVMET_KAS 10 |
| 361 | #define NVMET_DISC_KATO 120 | 410 | #define NVMET_DISC_KATO 120 |
| 362 | 411 | ||
| @@ -370,6 +419,10 @@ extern struct nvmet_subsys *nvmet_disc_subsys; | |||
| 370 | extern u64 nvmet_genctr; | 419 | extern u64 nvmet_genctr; |
| 371 | extern struct rw_semaphore nvmet_config_sem; | 420 | extern struct rw_semaphore nvmet_config_sem; |
| 372 | 421 | ||
| 422 | extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; | ||
| 423 | extern u64 nvmet_ana_chgcnt; | ||
| 424 | extern struct rw_semaphore nvmet_ana_sem; | ||
| 425 | |||
| 373 | bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, | 426 | bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, |
| 374 | const char *hostnqn); | 427 | const char *hostnqn); |
| 375 | 428 | ||
| @@ -377,6 +430,9 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns); | |||
| 377 | int nvmet_file_ns_enable(struct nvmet_ns *ns); | 430 | int nvmet_file_ns_enable(struct nvmet_ns *ns); |
| 378 | void nvmet_bdev_ns_disable(struct nvmet_ns *ns); | 431 | void nvmet_bdev_ns_disable(struct nvmet_ns *ns); |
| 379 | void nvmet_file_ns_disable(struct nvmet_ns *ns); | 432 | void nvmet_file_ns_disable(struct nvmet_ns *ns); |
| 433 | u16 nvmet_bdev_flush(struct nvmet_req *req); | ||
| 434 | u16 nvmet_file_flush(struct nvmet_req *req); | ||
| 435 | void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid); | ||
| 380 | 436 | ||
| 381 | static inline u32 nvmet_rw_len(struct nvmet_req *req) | 437 | static inline u32 nvmet_rw_len(struct nvmet_req *req) |
| 382 | { | 438 | { |
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 52e0c5d579a7..e7f43d1e1779 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c | |||
| @@ -33,16 +33,17 @@ | |||
| 33 | #include "nvmet.h" | 33 | #include "nvmet.h" |
| 34 | 34 | ||
| 35 | /* | 35 | /* |
| 36 | * We allow up to a page of inline data to go with the SQE | 36 | * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data |
| 37 | */ | 37 | */ |
| 38 | #define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE | 38 | #define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE |
| 39 | #define NVMET_RDMA_MAX_INLINE_SGE 4 | ||
| 40 | #define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) | ||
| 39 | 41 | ||
| 40 | struct nvmet_rdma_cmd { | 42 | struct nvmet_rdma_cmd { |
| 41 | struct ib_sge sge[2]; | 43 | struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; |
| 42 | struct ib_cqe cqe; | 44 | struct ib_cqe cqe; |
| 43 | struct ib_recv_wr wr; | 45 | struct ib_recv_wr wr; |
| 44 | struct scatterlist inline_sg; | 46 | struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; |
| 45 | struct page *inline_page; | ||
| 46 | struct nvme_command *nvme_cmd; | 47 | struct nvme_command *nvme_cmd; |
| 47 | struct nvmet_rdma_queue *queue; | 48 | struct nvmet_rdma_queue *queue; |
| 48 | }; | 49 | }; |
| @@ -116,6 +117,8 @@ struct nvmet_rdma_device { | |||
| 116 | size_t srq_size; | 117 | size_t srq_size; |
| 117 | struct kref ref; | 118 | struct kref ref; |
| 118 | struct list_head entry; | 119 | struct list_head entry; |
| 120 | int inline_data_size; | ||
| 121 | int inline_page_count; | ||
| 119 | }; | 122 | }; |
| 120 | 123 | ||
| 121 | static bool nvmet_rdma_use_srq; | 124 | static bool nvmet_rdma_use_srq; |
| @@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); | |||
| 138 | 141 | ||
| 139 | static const struct nvmet_fabrics_ops nvmet_rdma_ops; | 142 | static const struct nvmet_fabrics_ops nvmet_rdma_ops; |
| 140 | 143 | ||
| 144 | static int num_pages(int len) | ||
| 145 | { | ||
| 146 | return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); | ||
| 147 | } | ||
| 148 | |||
| 141 | /* XXX: really should move to a generic header sooner or later.. */ | 149 | /* XXX: really should move to a generic header sooner or later.. */ |
| 142 | static inline u32 get_unaligned_le24(const u8 *p) | 150 | static inline u32 get_unaligned_le24(const u8 *p) |
| 143 | { | 151 | { |
| @@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) | |||
| 184 | spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); | 192 | spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); |
| 185 | } | 193 | } |
| 186 | 194 | ||
| 195 | static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, | ||
| 196 | struct nvmet_rdma_cmd *c) | ||
| 197 | { | ||
| 198 | struct scatterlist *sg; | ||
| 199 | struct ib_sge *sge; | ||
| 200 | int i; | ||
| 201 | |||
| 202 | if (!ndev->inline_data_size) | ||
| 203 | return; | ||
| 204 | |||
| 205 | sg = c->inline_sg; | ||
| 206 | sge = &c->sge[1]; | ||
| 207 | |||
| 208 | for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { | ||
| 209 | if (sge->length) | ||
| 210 | ib_dma_unmap_page(ndev->device, sge->addr, | ||
| 211 | sge->length, DMA_FROM_DEVICE); | ||
| 212 | if (sg_page(sg)) | ||
| 213 | __free_page(sg_page(sg)); | ||
| 214 | } | ||
| 215 | } | ||
| 216 | |||
| 217 | static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev, | ||
| 218 | struct nvmet_rdma_cmd *c) | ||
| 219 | { | ||
| 220 | struct scatterlist *sg; | ||
| 221 | struct ib_sge *sge; | ||
| 222 | struct page *pg; | ||
| 223 | int len; | ||
| 224 | int i; | ||
| 225 | |||
| 226 | if (!ndev->inline_data_size) | ||
| 227 | return 0; | ||
| 228 | |||
| 229 | sg = c->inline_sg; | ||
| 230 | sg_init_table(sg, ndev->inline_page_count); | ||
| 231 | sge = &c->sge[1]; | ||
| 232 | len = ndev->inline_data_size; | ||
| 233 | |||
| 234 | for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { | ||
| 235 | pg = alloc_page(GFP_KERNEL); | ||
| 236 | if (!pg) | ||
| 237 | goto out_err; | ||
| 238 | sg_assign_page(sg, pg); | ||
| 239 | sge->addr = ib_dma_map_page(ndev->device, | ||
| 240 | pg, 0, PAGE_SIZE, DMA_FROM_DEVICE); | ||
| 241 | if (ib_dma_mapping_error(ndev->device, sge->addr)) | ||
| 242 | goto out_err; | ||
| 243 | sge->length = min_t(int, len, PAGE_SIZE); | ||
| 244 | sge->lkey = ndev->pd->local_dma_lkey; | ||
| 245 | len -= sge->length; | ||
| 246 | } | ||
| 247 | |||
| 248 | return 0; | ||
| 249 | out_err: | ||
| 250 | for (; i >= 0; i--, sg--, sge--) { | ||
| 251 | if (sge->length) | ||
| 252 | ib_dma_unmap_page(ndev->device, sge->addr, | ||
| 253 | sge->length, DMA_FROM_DEVICE); | ||
| 254 | if (sg_page(sg)) | ||
| 255 | __free_page(sg_page(sg)); | ||
| 256 | } | ||
| 257 | return -ENOMEM; | ||
| 258 | } | ||
| 259 | |||
| 187 | static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, | 260 | static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, |
| 188 | struct nvmet_rdma_cmd *c, bool admin) | 261 | struct nvmet_rdma_cmd *c, bool admin) |
| 189 | { | 262 | { |
| @@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, | |||
| 200 | c->sge[0].length = sizeof(*c->nvme_cmd); | 273 | c->sge[0].length = sizeof(*c->nvme_cmd); |
| 201 | c->sge[0].lkey = ndev->pd->local_dma_lkey; | 274 | c->sge[0].lkey = ndev->pd->local_dma_lkey; |
| 202 | 275 | ||
| 203 | if (!admin) { | 276 | if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c)) |
| 204 | c->inline_page = alloc_pages(GFP_KERNEL, | 277 | goto out_unmap_cmd; |
| 205 | get_order(NVMET_RDMA_INLINE_DATA_SIZE)); | ||
| 206 | if (!c->inline_page) | ||
| 207 | goto out_unmap_cmd; | ||
| 208 | c->sge[1].addr = ib_dma_map_page(ndev->device, | ||
| 209 | c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, | ||
| 210 | DMA_FROM_DEVICE); | ||
| 211 | if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) | ||
| 212 | goto out_free_inline_page; | ||
| 213 | c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; | ||
| 214 | c->sge[1].lkey = ndev->pd->local_dma_lkey; | ||
| 215 | } | ||
| 216 | 278 | ||
| 217 | c->cqe.done = nvmet_rdma_recv_done; | 279 | c->cqe.done = nvmet_rdma_recv_done; |
| 218 | 280 | ||
| 219 | c->wr.wr_cqe = &c->cqe; | 281 | c->wr.wr_cqe = &c->cqe; |
| 220 | c->wr.sg_list = c->sge; | 282 | c->wr.sg_list = c->sge; |
| 221 | c->wr.num_sge = admin ? 1 : 2; | 283 | c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1; |
| 222 | 284 | ||
| 223 | return 0; | 285 | return 0; |
| 224 | 286 | ||
| 225 | out_free_inline_page: | ||
| 226 | if (!admin) { | ||
| 227 | __free_pages(c->inline_page, | ||
| 228 | get_order(NVMET_RDMA_INLINE_DATA_SIZE)); | ||
| 229 | } | ||
| 230 | out_unmap_cmd: | 287 | out_unmap_cmd: |
| 231 | ib_dma_unmap_single(ndev->device, c->sge[0].addr, | 288 | ib_dma_unmap_single(ndev->device, c->sge[0].addr, |
| 232 | sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); | 289 | sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); |
| @@ -240,12 +297,8 @@ out: | |||
| 240 | static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, | 297 | static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, |
| 241 | struct nvmet_rdma_cmd *c, bool admin) | 298 | struct nvmet_rdma_cmd *c, bool admin) |
| 242 | { | 299 | { |
| 243 | if (!admin) { | 300 | if (!admin) |
| 244 | ib_dma_unmap_page(ndev->device, c->sge[1].addr, | 301 | nvmet_rdma_free_inline_pages(ndev, c); |
| 245 | NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); | ||
| 246 | __free_pages(c->inline_page, | ||
| 247 | get_order(NVMET_RDMA_INLINE_DATA_SIZE)); | ||
| 248 | } | ||
| 249 | ib_dma_unmap_single(ndev->device, c->sge[0].addr, | 302 | ib_dma_unmap_single(ndev->device, c->sge[0].addr, |
| 250 | sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); | 303 | sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); |
| 251 | kfree(c->nvme_cmd); | 304 | kfree(c->nvme_cmd); |
| @@ -383,14 +436,21 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, | |||
| 383 | struct nvmet_rdma_cmd *cmd) | 436 | struct nvmet_rdma_cmd *cmd) |
| 384 | { | 437 | { |
| 385 | struct ib_recv_wr *bad_wr; | 438 | struct ib_recv_wr *bad_wr; |
| 439 | int ret; | ||
| 386 | 440 | ||
| 387 | ib_dma_sync_single_for_device(ndev->device, | 441 | ib_dma_sync_single_for_device(ndev->device, |
| 388 | cmd->sge[0].addr, cmd->sge[0].length, | 442 | cmd->sge[0].addr, cmd->sge[0].length, |
| 389 | DMA_FROM_DEVICE); | 443 | DMA_FROM_DEVICE); |
| 390 | 444 | ||
| 391 | if (ndev->srq) | 445 | if (ndev->srq) |
| 392 | return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); | 446 | ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); |
| 393 | return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); | 447 | else |
| 448 | ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); | ||
| 449 | |||
| 450 | if (unlikely(ret)) | ||
| 451 | pr_err("post_recv cmd failed\n"); | ||
| 452 | |||
| 453 | return ret; | ||
| 394 | } | 454 | } |
| 395 | 455 | ||
| 396 | static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) | 456 | static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) |
| @@ -429,7 +489,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) | |||
| 429 | rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); | 489 | rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); |
| 430 | } | 490 | } |
| 431 | 491 | ||
| 432 | if (rsp->req.sg != &rsp->cmd->inline_sg) | 492 | if (rsp->req.sg != rsp->cmd->inline_sg) |
| 433 | sgl_free(rsp->req.sg); | 493 | sgl_free(rsp->req.sg); |
| 434 | 494 | ||
| 435 | if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) | 495 | if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) |
| @@ -493,7 +553,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) | |||
| 493 | rsp->send_sge.addr, rsp->send_sge.length, | 553 | rsp->send_sge.addr, rsp->send_sge.length, |
| 494 | DMA_TO_DEVICE); | 554 | DMA_TO_DEVICE); |
| 495 | 555 | ||
| 496 | if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { | 556 | if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) { |
| 497 | pr_err("sending cmd response failed\n"); | 557 | pr_err("sending cmd response failed\n"); |
| 498 | nvmet_rdma_release_rsp(rsp); | 558 | nvmet_rdma_release_rsp(rsp); |
| 499 | } | 559 | } |
| @@ -529,10 +589,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) | |||
| 529 | static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, | 589 | static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, |
| 530 | u64 off) | 590 | u64 off) |
| 531 | { | 591 | { |
| 532 | sg_init_table(&rsp->cmd->inline_sg, 1); | 592 | int sg_count = num_pages(len); |
| 533 | sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); | 593 | struct scatterlist *sg; |
| 534 | rsp->req.sg = &rsp->cmd->inline_sg; | 594 | int i; |
| 535 | rsp->req.sg_cnt = 1; | 595 | |
| 596 | sg = rsp->cmd->inline_sg; | ||
| 597 | for (i = 0; i < sg_count; i++, sg++) { | ||
| 598 | if (i < sg_count - 1) | ||
| 599 | sg_unmark_end(sg); | ||
| 600 | else | ||
| 601 | sg_mark_end(sg); | ||
| 602 | sg->offset = off; | ||
| 603 | sg->length = min_t(int, len, PAGE_SIZE - off); | ||
| 604 | len -= sg->length; | ||
| 605 | if (!i) | ||
| 606 | off = 0; | ||
| 607 | } | ||
| 608 | |||
| 609 | rsp->req.sg = rsp->cmd->inline_sg; | ||
| 610 | rsp->req.sg_cnt = sg_count; | ||
| 536 | } | 611 | } |
| 537 | 612 | ||
| 538 | static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) | 613 | static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) |
| @@ -544,7 +619,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) | |||
| 544 | if (!nvme_is_write(rsp->req.cmd)) | 619 | if (!nvme_is_write(rsp->req.cmd)) |
| 545 | return NVME_SC_INVALID_FIELD | NVME_SC_DNR; | 620 | return NVME_SC_INVALID_FIELD | NVME_SC_DNR; |
| 546 | 621 | ||
| 547 | if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { | 622 | if (off + len > rsp->queue->dev->inline_data_size) { |
| 548 | pr_err("invalid inline data offset!\n"); | 623 | pr_err("invalid inline data offset!\n"); |
| 549 | return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; | 624 | return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; |
| 550 | } | 625 | } |
| @@ -743,7 +818,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) | |||
| 743 | srq_size = 4095; /* XXX: tune */ | 818 | srq_size = 4095; /* XXX: tune */ |
| 744 | 819 | ||
| 745 | srq_attr.attr.max_wr = srq_size; | 820 | srq_attr.attr.max_wr = srq_size; |
| 746 | srq_attr.attr.max_sge = 2; | 821 | srq_attr.attr.max_sge = 1 + ndev->inline_page_count; |
| 747 | srq_attr.attr.srq_limit = 0; | 822 | srq_attr.attr.srq_limit = 0; |
| 748 | srq_attr.srq_type = IB_SRQT_BASIC; | 823 | srq_attr.srq_type = IB_SRQT_BASIC; |
| 749 | srq = ib_create_srq(ndev->pd, &srq_attr); | 824 | srq = ib_create_srq(ndev->pd, &srq_attr); |
| @@ -765,11 +840,16 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) | |||
| 765 | ndev->srq = srq; | 840 | ndev->srq = srq; |
| 766 | ndev->srq_size = srq_size; | 841 | ndev->srq_size = srq_size; |
| 767 | 842 | ||
| 768 | for (i = 0; i < srq_size; i++) | 843 | for (i = 0; i < srq_size; i++) { |
| 769 | nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); | 844 | ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); |
| 845 | if (ret) | ||
| 846 | goto out_free_cmds; | ||
| 847 | } | ||
| 770 | 848 | ||
| 771 | return 0; | 849 | return 0; |
| 772 | 850 | ||
| 851 | out_free_cmds: | ||
| 852 | nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); | ||
| 773 | out_destroy_srq: | 853 | out_destroy_srq: |
| 774 | ib_destroy_srq(srq); | 854 | ib_destroy_srq(srq); |
| 775 | return ret; | 855 | return ret; |
| @@ -793,7 +873,10 @@ static void nvmet_rdma_free_dev(struct kref *ref) | |||
| 793 | static struct nvmet_rdma_device * | 873 | static struct nvmet_rdma_device * |
| 794 | nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) | 874 | nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) |
| 795 | { | 875 | { |
| 876 | struct nvmet_port *port = cm_id->context; | ||
| 796 | struct nvmet_rdma_device *ndev; | 877 | struct nvmet_rdma_device *ndev; |
| 878 | int inline_page_count; | ||
| 879 | int inline_sge_count; | ||
| 797 | int ret; | 880 | int ret; |
| 798 | 881 | ||
| 799 | mutex_lock(&device_list_mutex); | 882 | mutex_lock(&device_list_mutex); |
| @@ -807,6 +890,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) | |||
| 807 | if (!ndev) | 890 | if (!ndev) |
| 808 | goto out_err; | 891 | goto out_err; |
| 809 | 892 | ||
| 893 | inline_page_count = num_pages(port->inline_data_size); | ||
| 894 | inline_sge_count = max(cm_id->device->attrs.max_sge_rd, | ||
| 895 | cm_id->device->attrs.max_sge) - 1; | ||
| 896 | if (inline_page_count > inline_sge_count) { | ||
| 897 | pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", | ||
| 898 | port->inline_data_size, cm_id->device->name, | ||
| 899 | inline_sge_count * PAGE_SIZE); | ||
| 900 | port->inline_data_size = inline_sge_count * PAGE_SIZE; | ||
| 901 | inline_page_count = inline_sge_count; | ||
| 902 | } | ||
| 903 | ndev->inline_data_size = port->inline_data_size; | ||
| 904 | ndev->inline_page_count = inline_page_count; | ||
| 810 | ndev->device = cm_id->device; | 905 | ndev->device = cm_id->device; |
| 811 | kref_init(&ndev->ref); | 906 | kref_init(&ndev->ref); |
| 812 | 907 | ||
| @@ -881,7 +976,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) | |||
| 881 | } else { | 976 | } else { |
| 882 | /* +1 for drain */ | 977 | /* +1 for drain */ |
| 883 | qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; | 978 | qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; |
| 884 | qp_attr.cap.max_recv_sge = 2; | 979 | qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; |
| 885 | } | 980 | } |
| 886 | 981 | ||
| 887 | ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); | 982 | ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); |
| @@ -899,13 +994,17 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) | |||
| 899 | if (!ndev->srq) { | 994 | if (!ndev->srq) { |
| 900 | for (i = 0; i < queue->recv_queue_size; i++) { | 995 | for (i = 0; i < queue->recv_queue_size; i++) { |
| 901 | queue->cmds[i].queue = queue; | 996 | queue->cmds[i].queue = queue; |
| 902 | nvmet_rdma_post_recv(ndev, &queue->cmds[i]); | 997 | ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); |
| 998 | if (ret) | ||
| 999 | goto err_destroy_qp; | ||
| 903 | } | 1000 | } |
| 904 | } | 1001 | } |
| 905 | 1002 | ||
| 906 | out: | 1003 | out: |
| 907 | return ret; | 1004 | return ret; |
| 908 | 1005 | ||
| 1006 | err_destroy_qp: | ||
| 1007 | rdma_destroy_qp(queue->cm_id); | ||
| 909 | err_destroy_cq: | 1008 | err_destroy_cq: |
| 910 | ib_free_cq(queue->cq); | 1009 | ib_free_cq(queue->cq); |
| 911 | goto out; | 1010 | goto out; |
| @@ -1379,6 +1478,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) | |||
| 1379 | return -EINVAL; | 1478 | return -EINVAL; |
| 1380 | } | 1479 | } |
| 1381 | 1480 | ||
| 1481 | if (port->inline_data_size < 0) { | ||
| 1482 | port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; | ||
| 1483 | } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { | ||
| 1484 | pr_warn("inline_data_size %u is too large, reducing to %u\n", | ||
| 1485 | port->inline_data_size, | ||
| 1486 | NVMET_RDMA_MAX_INLINE_DATA_SIZE); | ||
| 1487 | port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; | ||
| 1488 | } | ||
| 1489 | |||
| 1382 | ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, | 1490 | ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, |
| 1383 | port->disc_addr.trsvcid, &addr); | 1491 | port->disc_addr.trsvcid, &addr); |
| 1384 | if (ret) { | 1492 | if (ret) { |
| @@ -1456,7 +1564,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, | |||
| 1456 | static const struct nvmet_fabrics_ops nvmet_rdma_ops = { | 1564 | static const struct nvmet_fabrics_ops nvmet_rdma_ops = { |
| 1457 | .owner = THIS_MODULE, | 1565 | .owner = THIS_MODULE, |
| 1458 | .type = NVMF_TRTYPE_RDMA, | 1566 | .type = NVMF_TRTYPE_RDMA, |
| 1459 | .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, | ||
| 1460 | .msdbd = 1, | 1567 | .msdbd = 1, |
| 1461 | .has_keyed_sgls = 1, | 1568 | .has_keyed_sgls = 1, |
| 1462 | .add_port = nvmet_rdma_add_port, | 1569 | .add_port = nvmet_rdma_add_port, |
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index 80aca2456353..768953881c9e 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile | |||
| @@ -21,6 +21,7 @@ CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS | |||
| 21 | obj-$(CONFIG_PCMCIA) += pcmcia/ | 21 | obj-$(CONFIG_PCMCIA) += pcmcia/ |
| 22 | 22 | ||
| 23 | obj-$(CONFIG_SCSI) += scsi_mod.o | 23 | obj-$(CONFIG_SCSI) += scsi_mod.o |
| 24 | obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_common.o | ||
| 24 | 25 | ||
| 25 | obj-$(CONFIG_RAID_ATTRS) += raid_class.o | 26 | obj-$(CONFIG_RAID_ATTRS) += raid_class.o |
| 26 | 27 | ||
| @@ -156,7 +157,6 @@ obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ | |||
| 156 | obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o | 157 | obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o |
| 157 | scsi_mod-y += scsi.o hosts.o scsi_ioctl.o \ | 158 | scsi_mod-y += scsi.o hosts.o scsi_ioctl.o \ |
| 158 | scsicam.o scsi_error.o scsi_lib.o | 159 | scsicam.o scsi_error.o scsi_lib.o |
| 159 | scsi_mod-y += scsi_common.o | ||
| 160 | scsi_mod-$(CONFIG_SCSI_CONSTANTS) += constants.o | 160 | scsi_mod-$(CONFIG_SCSI_CONSTANTS) += constants.o |
| 161 | scsi_mod-$(CONFIG_SCSI_DMA) += scsi_lib_dma.o | 161 | scsi_mod-$(CONFIG_SCSI_DMA) += scsi_lib_dma.o |
| 162 | scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o | 162 | scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o |
diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index e489d89cbb45..379890c4500b 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c | |||
| @@ -339,7 +339,6 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) | |||
| 339 | struct scsi_sense_hdr sshdr; | 339 | struct scsi_sense_hdr sshdr; |
| 340 | u8 *cmd_buf = NULL; | 340 | u8 *cmd_buf = NULL; |
| 341 | u8 *scsi_cmd = NULL; | 341 | u8 *scsi_cmd = NULL; |
| 342 | u8 *sense_buf = NULL; | ||
| 343 | int rc = 0; | 342 | int rc = 0; |
| 344 | int result = 0; | 343 | int result = 0; |
| 345 | int retry_cnt = 0; | 344 | int retry_cnt = 0; |
| @@ -348,8 +347,7 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) | |||
| 348 | retry: | 347 | retry: |
| 349 | cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); | 348 | cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); |
| 350 | scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); | 349 | scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); |
| 351 | sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); | 350 | if (unlikely(!cmd_buf || !scsi_cmd)) { |
| 352 | if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) { | ||
| 353 | rc = -ENOMEM; | 351 | rc = -ENOMEM; |
| 354 | goto out; | 352 | goto out; |
| 355 | } | 353 | } |
| @@ -364,7 +362,7 @@ retry: | |||
| 364 | /* Drop the ioctl read semahpore across lengthy call */ | 362 | /* Drop the ioctl read semahpore across lengthy call */ |
| 365 | up_read(&cfg->ioctl_rwsem); | 363 | up_read(&cfg->ioctl_rwsem); |
| 366 | result = scsi_execute(sdev, scsi_cmd, DMA_FROM_DEVICE, cmd_buf, | 364 | result = scsi_execute(sdev, scsi_cmd, DMA_FROM_DEVICE, cmd_buf, |
| 367 | CMD_BUFSIZE, sense_buf, &sshdr, to, CMD_RETRIES, | 365 | CMD_BUFSIZE, NULL, &sshdr, to, CMD_RETRIES, |
| 368 | 0, 0, NULL); | 366 | 0, 0, NULL); |
| 369 | down_read(&cfg->ioctl_rwsem); | 367 | down_read(&cfg->ioctl_rwsem); |
| 370 | rc = check_state(cfg); | 368 | rc = check_state(cfg); |
| @@ -395,7 +393,6 @@ retry: | |||
| 395 | if (retry_cnt++ < 1) { | 393 | if (retry_cnt++ < 1) { |
| 396 | kfree(cmd_buf); | 394 | kfree(cmd_buf); |
| 397 | kfree(scsi_cmd); | 395 | kfree(scsi_cmd); |
| 398 | kfree(sense_buf); | ||
| 399 | goto retry; | 396 | goto retry; |
| 400 | } | 397 | } |
| 401 | } | 398 | } |
| @@ -426,7 +423,6 @@ retry: | |||
| 426 | out: | 423 | out: |
| 427 | kfree(cmd_buf); | 424 | kfree(cmd_buf); |
| 428 | kfree(scsi_cmd); | 425 | kfree(scsi_cmd); |
| 429 | kfree(sense_buf); | ||
| 430 | 426 | ||
| 431 | dev_dbg(dev, "%s: maxlba=%lld blklen=%d rc=%d\n", | 427 | dev_dbg(dev, "%s: maxlba=%lld blklen=%d rc=%d\n", |
| 432 | __func__, gli->max_lba, gli->blk_len, rc); | 428 | __func__, gli->max_lba, gli->blk_len, rc); |
diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c index 66e445a17d6c..2c904bf16b65 100644 --- a/drivers/scsi/cxlflash/vlun.c +++ b/drivers/scsi/cxlflash/vlun.c | |||
| @@ -426,7 +426,6 @@ static int write_same16(struct scsi_device *sdev, | |||
| 426 | { | 426 | { |
| 427 | u8 *cmd_buf = NULL; | 427 | u8 *cmd_buf = NULL; |
| 428 | u8 *scsi_cmd = NULL; | 428 | u8 *scsi_cmd = NULL; |
| 429 | u8 *sense_buf = NULL; | ||
| 430 | int rc = 0; | 429 | int rc = 0; |
| 431 | int result = 0; | 430 | int result = 0; |
| 432 | u64 offset = lba; | 431 | u64 offset = lba; |
| @@ -440,8 +439,7 @@ static int write_same16(struct scsi_device *sdev, | |||
| 440 | 439 | ||
| 441 | cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); | 440 | cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); |
| 442 | scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); | 441 | scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); |
| 443 | sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); | 442 | if (unlikely(!cmd_buf || !scsi_cmd)) { |
| 444 | if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) { | ||
| 445 | rc = -ENOMEM; | 443 | rc = -ENOMEM; |
| 446 | goto out; | 444 | goto out; |
| 447 | } | 445 | } |
| @@ -457,7 +455,7 @@ static int write_same16(struct scsi_device *sdev, | |||
| 457 | /* Drop the ioctl read semahpore across lengthy call */ | 455 | /* Drop the ioctl read semahpore across lengthy call */ |
| 458 | up_read(&cfg->ioctl_rwsem); | 456 | up_read(&cfg->ioctl_rwsem); |
| 459 | result = scsi_execute(sdev, scsi_cmd, DMA_TO_DEVICE, cmd_buf, | 457 | result = scsi_execute(sdev, scsi_cmd, DMA_TO_DEVICE, cmd_buf, |
| 460 | CMD_BUFSIZE, sense_buf, NULL, to, | 458 | CMD_BUFSIZE, NULL, NULL, to, |
| 461 | CMD_RETRIES, 0, 0, NULL); | 459 | CMD_RETRIES, 0, 0, NULL); |
| 462 | down_read(&cfg->ioctl_rwsem); | 460 | down_read(&cfg->ioctl_rwsem); |
| 463 | rc = check_state(cfg); | 461 | rc = check_state(cfg); |
| @@ -482,7 +480,6 @@ static int write_same16(struct scsi_device *sdev, | |||
| 482 | out: | 480 | out: |
| 483 | kfree(cmd_buf); | 481 | kfree(cmd_buf); |
| 484 | kfree(scsi_cmd); | 482 | kfree(scsi_cmd); |
| 485 | kfree(sense_buf); | ||
| 486 | dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc); | 483 | dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc); |
| 487 | return rc; | 484 | return rc; |
| 488 | } | 485 | } |
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index b8d131a455d0..dd738ae5c75b 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c | |||
| @@ -4568,7 +4568,7 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd, | |||
| 4568 | MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG | | 4568 | MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG | |
| 4569 | MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD; | 4569 | MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD; |
| 4570 | mpi_request->CDB.EEDP32.PrimaryReferenceTag = | 4570 | mpi_request->CDB.EEDP32.PrimaryReferenceTag = |
| 4571 | cpu_to_be32(scsi_prot_ref_tag(scmd)); | 4571 | cpu_to_be32(t10_pi_ref_tag(scmd->request)); |
| 4572 | break; | 4572 | break; |
| 4573 | 4573 | ||
| 4574 | case SCSI_PROT_DIF_TYPE3: | 4574 | case SCSI_PROT_DIF_TYPE3: |
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 41e9ac9fc138..9cb9a166fa0c 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c | |||
| @@ -238,7 +238,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) | |||
| 238 | 238 | ||
| 239 | 239 | ||
| 240 | /** | 240 | /** |
| 241 | * scsi_execute - insert request and wait for the result | 241 | * __scsi_execute - insert request and wait for the result |
| 242 | * @sdev: scsi device | 242 | * @sdev: scsi device |
| 243 | * @cmd: scsi command | 243 | * @cmd: scsi command |
| 244 | * @data_direction: data direction | 244 | * @data_direction: data direction |
| @@ -255,7 +255,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) | |||
| 255 | * Returns the scsi_cmnd result field if a command was executed, or a negative | 255 | * Returns the scsi_cmnd result field if a command was executed, or a negative |
| 256 | * Linux error code if we didn't get that far. | 256 | * Linux error code if we didn't get that far. |
| 257 | */ | 257 | */ |
| 258 | int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, | 258 | int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, |
| 259 | int data_direction, void *buffer, unsigned bufflen, | 259 | int data_direction, void *buffer, unsigned bufflen, |
| 260 | unsigned char *sense, struct scsi_sense_hdr *sshdr, | 260 | unsigned char *sense, struct scsi_sense_hdr *sshdr, |
| 261 | int timeout, int retries, u64 flags, req_flags_t rq_flags, | 261 | int timeout, int retries, u64 flags, req_flags_t rq_flags, |
| @@ -309,7 +309,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, | |||
| 309 | 309 | ||
| 310 | return ret; | 310 | return ret; |
| 311 | } | 311 | } |
| 312 | EXPORT_SYMBOL(scsi_execute); | 312 | EXPORT_SYMBOL(__scsi_execute); |
| 313 | 313 | ||
| 314 | /* | 314 | /* |
| 315 | * Function: scsi_init_cmd_errh() | 315 | * Function: scsi_init_cmd_errh() |
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 9421d9877730..bbebdc3769b0 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c | |||
| @@ -1119,7 +1119,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) | |||
| 1119 | SCpnt->cmnd[0] = WRITE_6; | 1119 | SCpnt->cmnd[0] = WRITE_6; |
| 1120 | 1120 | ||
| 1121 | if (blk_integrity_rq(rq)) | 1121 | if (blk_integrity_rq(rq)) |
| 1122 | sd_dif_prepare(SCpnt); | 1122 | t10_pi_prepare(SCpnt->request, sdkp->protection_type); |
| 1123 | 1123 | ||
| 1124 | } else if (rq_data_dir(rq) == READ) { | 1124 | } else if (rq_data_dir(rq) == READ) { |
| 1125 | SCpnt->cmnd[0] = READ_6; | 1125 | SCpnt->cmnd[0] = READ_6; |
| @@ -2047,8 +2047,10 @@ static int sd_done(struct scsi_cmnd *SCpnt) | |||
| 2047 | "sd_done: completed %d of %d bytes\n", | 2047 | "sd_done: completed %d of %d bytes\n", |
| 2048 | good_bytes, scsi_bufflen(SCpnt))); | 2048 | good_bytes, scsi_bufflen(SCpnt))); |
| 2049 | 2049 | ||
| 2050 | if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt)) | 2050 | if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) && |
| 2051 | sd_dif_complete(SCpnt, good_bytes); | 2051 | good_bytes) |
| 2052 | t10_pi_complete(SCpnt->request, sdkp->protection_type, | ||
| 2053 | good_bytes / scsi_prot_interval(SCpnt)); | ||
| 2052 | 2054 | ||
| 2053 | return good_bytes; | 2055 | return good_bytes; |
| 2054 | } | 2056 | } |
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 392c7d078ae3..a7d4f50b67d4 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h | |||
| @@ -254,21 +254,12 @@ static inline unsigned int sd_prot_flag_mask(unsigned int prot_op) | |||
| 254 | #ifdef CONFIG_BLK_DEV_INTEGRITY | 254 | #ifdef CONFIG_BLK_DEV_INTEGRITY |
| 255 | 255 | ||
| 256 | extern void sd_dif_config_host(struct scsi_disk *); | 256 | extern void sd_dif_config_host(struct scsi_disk *); |
| 257 | extern void sd_dif_prepare(struct scsi_cmnd *scmd); | ||
| 258 | extern void sd_dif_complete(struct scsi_cmnd *, unsigned int); | ||
| 259 | 257 | ||
| 260 | #else /* CONFIG_BLK_DEV_INTEGRITY */ | 258 | #else /* CONFIG_BLK_DEV_INTEGRITY */ |
| 261 | 259 | ||
| 262 | static inline void sd_dif_config_host(struct scsi_disk *disk) | 260 | static inline void sd_dif_config_host(struct scsi_disk *disk) |
| 263 | { | 261 | { |
| 264 | } | 262 | } |
| 265 | static inline int sd_dif_prepare(struct scsi_cmnd *scmd) | ||
| 266 | { | ||
| 267 | return 0; | ||
| 268 | } | ||
| 269 | static inline void sd_dif_complete(struct scsi_cmnd *cmd, unsigned int a) | ||
| 270 | { | ||
| 271 | } | ||
| 272 | 263 | ||
| 273 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ | 264 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
| 274 | 265 | ||
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 9035380c0dda..db72c82486e3 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c | |||
| @@ -95,116 +95,3 @@ out: | |||
| 95 | blk_integrity_register(disk, &bi); | 95 | blk_integrity_register(disk, &bi); |
| 96 | } | 96 | } |
| 97 | 97 | ||
| 98 | /* | ||
| 99 | * The virtual start sector is the one that was originally submitted | ||
| 100 | * by the block layer. Due to partitioning, MD/DM cloning, etc. the | ||
| 101 | * actual physical start sector is likely to be different. Remap | ||
| 102 | * protection information to match the physical LBA. | ||
| 103 | * | ||
| 104 | * From a protocol perspective there's a slight difference between | ||
| 105 | * Type 1 and 2. The latter uses 32-byte CDBs exclusively, and the | ||
| 106 | * reference tag is seeded in the CDB. This gives us the potential to | ||
| 107 | * avoid virt->phys remapping during write. However, at read time we | ||
| 108 | * don't know whether the virt sector is the same as when we wrote it | ||
| 109 | * (we could be reading from real disk as opposed to MD/DM device. So | ||
| 110 | * we always remap Type 2 making it identical to Type 1. | ||
| 111 | * | ||
| 112 | * Type 3 does not have a reference tag so no remapping is required. | ||
| 113 | */ | ||
| 114 | void sd_dif_prepare(struct scsi_cmnd *scmd) | ||
| 115 | { | ||
| 116 | const int tuple_sz = sizeof(struct t10_pi_tuple); | ||
| 117 | struct bio *bio; | ||
| 118 | struct scsi_disk *sdkp; | ||
| 119 | struct t10_pi_tuple *pi; | ||
| 120 | u32 phys, virt; | ||
| 121 | |||
| 122 | sdkp = scsi_disk(scmd->request->rq_disk); | ||
| 123 | |||
| 124 | if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION) | ||
| 125 | return; | ||
| 126 | |||
| 127 | phys = scsi_prot_ref_tag(scmd); | ||
| 128 | |||
| 129 | __rq_for_each_bio(bio, scmd->request) { | ||
| 130 | struct bio_integrity_payload *bip = bio_integrity(bio); | ||
| 131 | struct bio_vec iv; | ||
| 132 | struct bvec_iter iter; | ||
| 133 | unsigned int j; | ||
| 134 | |||
| 135 | /* Already remapped? */ | ||
| 136 | if (bip->bip_flags & BIP_MAPPED_INTEGRITY) | ||
| 137 | break; | ||
| 138 | |||
| 139 | virt = bip_get_seed(bip) & 0xffffffff; | ||
| 140 | |||
| 141 | bip_for_each_vec(iv, bip, iter) { | ||
| 142 | pi = kmap_atomic(iv.bv_page) + iv.bv_offset; | ||
| 143 | |||
| 144 | for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { | ||
| 145 | |||
| 146 | if (be32_to_cpu(pi->ref_tag) == virt) | ||
| 147 | pi->ref_tag = cpu_to_be32(phys); | ||
| 148 | |||
| 149 | virt++; | ||
| 150 | phys++; | ||
| 151 | } | ||
| 152 | |||
| 153 | kunmap_atomic(pi); | ||
| 154 | } | ||
| 155 | |||
| 156 | bip->bip_flags |= BIP_MAPPED_INTEGRITY; | ||
| 157 | } | ||
| 158 | } | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Remap physical sector values in the reference tag to the virtual | ||
| 162 | * values expected by the block layer. | ||
| 163 | */ | ||
| 164 | void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) | ||
| 165 | { | ||
| 166 | const int tuple_sz = sizeof(struct t10_pi_tuple); | ||
| 167 | struct scsi_disk *sdkp; | ||
| 168 | struct bio *bio; | ||
| 169 | struct t10_pi_tuple *pi; | ||
| 170 | unsigned int j, intervals; | ||
| 171 | u32 phys, virt; | ||
| 172 | |||
| 173 | sdkp = scsi_disk(scmd->request->rq_disk); | ||
| 174 | |||
| 175 | if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION || good_bytes == 0) | ||
| 176 | return; | ||
| 177 | |||
| 178 | intervals = good_bytes / scsi_prot_interval(scmd); | ||
| 179 | phys = scsi_prot_ref_tag(scmd); | ||
| 180 | |||
| 181 | __rq_for_each_bio(bio, scmd->request) { | ||
| 182 | struct bio_integrity_payload *bip = bio_integrity(bio); | ||
| 183 | struct bio_vec iv; | ||
| 184 | struct bvec_iter iter; | ||
| 185 | |||
| 186 | virt = bip_get_seed(bip) & 0xffffffff; | ||
| 187 | |||
| 188 | bip_for_each_vec(iv, bip, iter) { | ||
| 189 | pi = kmap_atomic(iv.bv_page) + iv.bv_offset; | ||
| 190 | |||
| 191 | for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { | ||
| 192 | |||
| 193 | if (intervals == 0) { | ||
| 194 | kunmap_atomic(pi); | ||
| 195 | return; | ||
| 196 | } | ||
| 197 | |||
| 198 | if (be32_to_cpu(pi->ref_tag) == phys) | ||
| 199 | pi->ref_tag = cpu_to_be32(virt); | ||
| 200 | |||
| 201 | virt++; | ||
| 202 | phys++; | ||
| 203 | intervals--; | ||
| 204 | } | ||
| 205 | |||
| 206 | kunmap_atomic(pi); | ||
| 207 | } | ||
| 208 | } | ||
| 209 | } | ||
| 210 | |||
diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index 35fab1e18adc..ffcf902da390 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c | |||
| @@ -186,14 +186,13 @@ static int sr_play_trkind(struct cdrom_device_info *cdi, | |||
| 186 | int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) | 186 | int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) |
| 187 | { | 187 | { |
| 188 | struct scsi_device *SDev; | 188 | struct scsi_device *SDev; |
| 189 | struct scsi_sense_hdr sshdr; | 189 | struct scsi_sense_hdr local_sshdr, *sshdr = &local_sshdr; |
| 190 | int result, err = 0, retries = 0; | 190 | int result, err = 0, retries = 0; |
| 191 | unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE], *senseptr = NULL; | ||
| 192 | 191 | ||
| 193 | SDev = cd->device; | 192 | SDev = cd->device; |
| 194 | 193 | ||
| 195 | if (cgc->sense) | 194 | if (cgc->sshdr) |
| 196 | senseptr = sense_buffer; | 195 | sshdr = cgc->sshdr; |
| 197 | 196 | ||
| 198 | retry: | 197 | retry: |
| 199 | if (!scsi_block_when_processing_errors(SDev)) { | 198 | if (!scsi_block_when_processing_errors(SDev)) { |
| @@ -202,15 +201,12 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) | |||
| 202 | } | 201 | } |
| 203 | 202 | ||
| 204 | result = scsi_execute(SDev, cgc->cmd, cgc->data_direction, | 203 | result = scsi_execute(SDev, cgc->cmd, cgc->data_direction, |
| 205 | cgc->buffer, cgc->buflen, senseptr, &sshdr, | 204 | cgc->buffer, cgc->buflen, NULL, sshdr, |
| 206 | cgc->timeout, IOCTL_RETRIES, 0, 0, NULL); | 205 | cgc->timeout, IOCTL_RETRIES, 0, 0, NULL); |
| 207 | 206 | ||
| 208 | if (cgc->sense) | ||
| 209 | memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense)); | ||
| 210 | |||
| 211 | /* Minimal error checking. Ignore cases we know about, and report the rest. */ | 207 | /* Minimal error checking. Ignore cases we know about, and report the rest. */ |
| 212 | if (driver_byte(result) != 0) { | 208 | if (driver_byte(result) != 0) { |
| 213 | switch (sshdr.sense_key) { | 209 | switch (sshdr->sense_key) { |
| 214 | case UNIT_ATTENTION: | 210 | case UNIT_ATTENTION: |
| 215 | SDev->changed = 1; | 211 | SDev->changed = 1; |
| 216 | if (!cgc->quiet) | 212 | if (!cgc->quiet) |
| @@ -221,8 +217,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) | |||
| 221 | err = -ENOMEDIUM; | 217 | err = -ENOMEDIUM; |
| 222 | break; | 218 | break; |
| 223 | case NOT_READY: /* This happens if there is no disc in drive */ | 219 | case NOT_READY: /* This happens if there is no disc in drive */ |
| 224 | if (sshdr.asc == 0x04 && | 220 | if (sshdr->asc == 0x04 && |
| 225 | sshdr.ascq == 0x01) { | 221 | sshdr->ascq == 0x01) { |
| 226 | /* sense: Logical unit is in process of becoming ready */ | 222 | /* sense: Logical unit is in process of becoming ready */ |
| 227 | if (!cgc->quiet) | 223 | if (!cgc->quiet) |
| 228 | sr_printk(KERN_INFO, cd, | 224 | sr_printk(KERN_INFO, cd, |
| @@ -245,8 +241,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) | |||
| 245 | break; | 241 | break; |
| 246 | case ILLEGAL_REQUEST: | 242 | case ILLEGAL_REQUEST: |
| 247 | err = -EIO; | 243 | err = -EIO; |
| 248 | if (sshdr.asc == 0x20 && | 244 | if (sshdr->asc == 0x20 && |
| 249 | sshdr.ascq == 0x00) | 245 | sshdr->ascq == 0x00) |
| 250 | /* sense: Invalid command operation code */ | 246 | /* sense: Invalid command operation code */ |
| 251 | err = -EDRIVE_CANT_DO_THIS; | 247 | err = -EDRIVE_CANT_DO_THIS; |
| 252 | break; | 248 | break; |
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 6dc8891ccb74..1c72db94270e 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c | |||
| @@ -513,12 +513,12 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev, | |||
| 513 | 513 | ||
| 514 | if (sc->sc_data_direction == DMA_TO_DEVICE) | 514 | if (sc->sc_data_direction == DMA_TO_DEVICE) |
| 515 | cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, | 515 | cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, |
| 516 | blk_rq_sectors(rq) * | 516 | bio_integrity_bytes(bi, |
| 517 | bi->tuple_size); | 517 | blk_rq_sectors(rq))); |
| 518 | else if (sc->sc_data_direction == DMA_FROM_DEVICE) | 518 | else if (sc->sc_data_direction == DMA_FROM_DEVICE) |
| 519 | cmd_pi->pi_bytesin = cpu_to_virtio32(vdev, | 519 | cmd_pi->pi_bytesin = cpu_to_virtio32(vdev, |
| 520 | blk_rq_sectors(rq) * | 520 | bio_integrity_bytes(bi, |
| 521 | bi->tuple_size); | 521 | blk_rq_sectors(rq))); |
| 522 | } | 522 | } |
| 523 | #endif | 523 | #endif |
| 524 | 524 | ||
diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index 4c44d7bed01a..cb6f32ce7de8 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig | |||
| @@ -1,10 +1,10 @@ | |||
| 1 | 1 | ||
| 2 | menuconfig TARGET_CORE | 2 | menuconfig TARGET_CORE |
| 3 | tristate "Generic Target Core Mod (TCM) and ConfigFS Infrastructure" | 3 | tristate "Generic Target Core Mod (TCM) and ConfigFS Infrastructure" |
| 4 | depends on SCSI && BLOCK | 4 | depends on BLOCK |
| 5 | select CONFIGFS_FS | 5 | select CONFIGFS_FS |
| 6 | select CRC_T10DIF | 6 | select CRC_T10DIF |
| 7 | select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. | 7 | select BLK_SCSI_REQUEST |
| 8 | select SGL_ALLOC | 8 | select SGL_ALLOC |
| 9 | default n | 9 | default n |
| 10 | help | 10 | help |
| @@ -29,6 +29,7 @@ config TCM_FILEIO | |||
| 29 | 29 | ||
| 30 | config TCM_PSCSI | 30 | config TCM_PSCSI |
| 31 | tristate "TCM/pSCSI Subsystem Plugin for Linux/SCSI" | 31 | tristate "TCM/pSCSI Subsystem Plugin for Linux/SCSI" |
| 32 | depends on SCSI | ||
| 32 | help | 33 | help |
| 33 | Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered | 34 | Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered |
| 34 | passthrough access to Linux/SCSI device | 35 | passthrough access to Linux/SCSI device |
diff --git a/drivers/target/loopback/Kconfig b/drivers/target/loopback/Kconfig index abe8ecbcdf06..158ee9d522f7 100644 --- a/drivers/target/loopback/Kconfig +++ b/drivers/target/loopback/Kconfig | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | config LOOPBACK_TARGET | 1 | config LOOPBACK_TARGET |
| 2 | tristate "TCM Virtual SAS target and Linux/SCSI LDD fabric loopback module" | 2 | tristate "TCM Virtual SAS target and Linux/SCSI LDD fabric loopback module" |
| 3 | depends on SCSI | ||
| 3 | help | 4 | help |
| 4 | Say Y here to enable the TCM Virtual SAS target and Linux/SCSI LLD | 5 | Say Y here to enable the TCM Virtual SAS target and Linux/SCSI LLD |
| 5 | fabric loopback module. | 6 | fabric loopback module. |
diff --git a/fs/block_dev.c b/fs/block_dev.c index aba25414231a..38b8ce05cbc7 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -666,7 +666,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, | |||
| 666 | result = blk_queue_enter(bdev->bd_queue, 0); | 666 | result = blk_queue_enter(bdev->bd_queue, 0); |
| 667 | if (result) | 667 | if (result) |
| 668 | return result; | 668 | return result; |
| 669 | result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); | 669 | result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, |
| 670 | REQ_OP_READ); | ||
| 670 | blk_queue_exit(bdev->bd_queue); | 671 | blk_queue_exit(bdev->bd_queue); |
| 671 | return result; | 672 | return result; |
| 672 | } | 673 | } |
| @@ -704,7 +705,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, | |||
| 704 | return result; | 705 | return result; |
| 705 | 706 | ||
| 706 | set_page_writeback(page); | 707 | set_page_writeback(page); |
| 707 | result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true); | 708 | result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, |
| 709 | REQ_OP_WRITE); | ||
| 708 | if (result) { | 710 | if (result) { |
| 709 | end_page_writeback(page); | 711 | end_page_writeback(page); |
| 710 | } else { | 712 | } else { |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 1b8b44637e70..5331a15a61f1 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
| @@ -873,8 +873,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
| 873 | struct bio *bio; | 873 | struct bio *bio; |
| 874 | 874 | ||
| 875 | if (per_dev != master_dev) { | 875 | if (per_dev != master_dev) { |
| 876 | bio = bio_clone_kmalloc(master_dev->bio, | 876 | bio = bio_clone_fast(master_dev->bio, |
| 877 | GFP_KERNEL); | 877 | GFP_KERNEL, NULL); |
| 878 | if (unlikely(!bio)) { | 878 | if (unlikely(!bio)) { |
| 879 | ORE_DBGMSG( | 879 | ORE_DBGMSG( |
| 880 | "Failed to allocate BIO size=%u\n", | 880 | "Failed to allocate BIO size=%u\n", |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f7750bc5b85a..5863fd22e90b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -3529,7 +3529,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
| 3529 | sbi->s_sb_block = sb_block; | 3529 | sbi->s_sb_block = sb_block; |
| 3530 | if (sb->s_bdev->bd_part) | 3530 | if (sb->s_bdev->bd_part) |
| 3531 | sbi->s_sectors_written_start = | 3531 | sbi->s_sectors_written_start = |
| 3532 | part_stat_read(sb->s_bdev->bd_part, sectors[1]); | 3532 | part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]); |
| 3533 | 3533 | ||
| 3534 | /* Cleanup superblock name */ | 3534 | /* Cleanup superblock name */ |
| 3535 | strreplace(sb->s_id, '/', '!'); | 3535 | strreplace(sb->s_id, '/', '!'); |
| @@ -4838,7 +4838,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) | |||
| 4838 | if (sb->s_bdev->bd_part) | 4838 | if (sb->s_bdev->bd_part) |
| 4839 | es->s_kbytes_written = | 4839 | es->s_kbytes_written = |
| 4840 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + | 4840 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + |
| 4841 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | 4841 | ((part_stat_read(sb->s_bdev->bd_part, |
| 4842 | sectors[STAT_WRITE]) - | ||
| 4842 | EXT4_SB(sb)->s_sectors_written_start) >> 1)); | 4843 | EXT4_SB(sb)->s_sectors_written_start) >> 1)); |
| 4843 | else | 4844 | else |
| 4844 | es->s_kbytes_written = | 4845 | es->s_kbytes_written = |
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index e60cc5e89023..9212a026a1f1 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c | |||
| @@ -58,7 +58,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) | |||
| 58 | if (!sb->s_bdev->bd_part) | 58 | if (!sb->s_bdev->bd_part) |
| 59 | return snprintf(buf, PAGE_SIZE, "0\n"); | 59 | return snprintf(buf, PAGE_SIZE, "0\n"); |
| 60 | return snprintf(buf, PAGE_SIZE, "%lu\n", | 60 | return snprintf(buf, PAGE_SIZE, "%lu\n", |
| 61 | (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | 61 | (part_stat_read(sb->s_bdev->bd_part, |
| 62 | sectors[STAT_WRITE]) - | ||
| 62 | sbi->s_sectors_written_start) >> 1); | 63 | sbi->s_sectors_written_start) >> 1); |
| 63 | } | 64 | } |
| 64 | 65 | ||
| @@ -70,7 +71,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) | |||
| 70 | return snprintf(buf, PAGE_SIZE, "0\n"); | 71 | return snprintf(buf, PAGE_SIZE, "0\n"); |
| 71 | return snprintf(buf, PAGE_SIZE, "%llu\n", | 72 | return snprintf(buf, PAGE_SIZE, "%llu\n", |
| 72 | (unsigned long long)(sbi->s_kbytes_written + | 73 | (unsigned long long)(sbi->s_kbytes_written + |
| 73 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | 74 | ((part_stat_read(sb->s_bdev->bd_part, |
| 75 | sectors[STAT_WRITE]) - | ||
| 74 | EXT4_SB(sb)->s_sectors_written_start) >> 1))); | 76 | EXT4_SB(sb)->s_sectors_written_start) >> 1))); |
| 75 | } | 77 | } |
| 76 | 78 | ||
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d8b1de83143..6799c3fc44e3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h | |||
| @@ -1304,7 +1304,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) | |||
| 1304 | * and the return value is in kbytes. s is of struct f2fs_sb_info. | 1304 | * and the return value is in kbytes. s is of struct f2fs_sb_info. |
| 1305 | */ | 1305 | */ |
| 1306 | #define BD_PART_WRITTEN(s) \ | 1306 | #define BD_PART_WRITTEN(s) \ |
| 1307 | (((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ | 1307 | (((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ |
| 1308 | (s)->sectors_written_start) >> 1) | 1308 | (s)->sectors_written_start) >> 1) |
| 1309 | 1309 | ||
| 1310 | static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) | 1310 | static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) |
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3995e926ba3a..17bcff789c08 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c | |||
| @@ -2882,7 +2882,8 @@ try_onemore: | |||
| 2882 | /* For write statistics */ | 2882 | /* For write statistics */ |
| 2883 | if (sb->s_bdev->bd_part) | 2883 | if (sb->s_bdev->bd_part) |
| 2884 | sbi->sectors_written_start = | 2884 | sbi->sectors_written_start = |
| 2885 | (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]); | 2885 | (u64)part_stat_read(sb->s_bdev->bd_part, |
| 2886 | sectors[STAT_WRITE]); | ||
| 2886 | 2887 | ||
| 2887 | /* Read accumulated write IO statistics if exists */ | 2888 | /* Read accumulated write IO statistics if exists */ |
| 2888 | seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); | 2889 | seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); |
diff --git a/fs/mpage.c b/fs/mpage.c index b7e7f570733a..b73638db9866 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
| @@ -51,8 +51,8 @@ static void mpage_end_io(struct bio *bio) | |||
| 51 | 51 | ||
| 52 | bio_for_each_segment_all(bv, bio, i) { | 52 | bio_for_each_segment_all(bv, bio, i) { |
| 53 | struct page *page = bv->bv_page; | 53 | struct page *page = bv->bv_page; |
| 54 | page_endio(page, op_is_write(bio_op(bio)), | 54 | page_endio(page, bio_op(bio), |
| 55 | blk_status_to_errno(bio->bi_status)); | 55 | blk_status_to_errno(bio->bi_status)); |
| 56 | } | 56 | } |
| 57 | 57 | ||
| 58 | bio_put(bio); | 58 | bio_put(bio); |
diff --git a/include/linux/bio.h b/include/linux/bio.h index f08f5fe7bd08..51371740d2a8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h | |||
| @@ -429,7 +429,6 @@ extern void bio_put(struct bio *); | |||
| 429 | 429 | ||
| 430 | extern void __bio_clone_fast(struct bio *, struct bio *); | 430 | extern void __bio_clone_fast(struct bio *, struct bio *); |
| 431 | extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); | 431 | extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); |
| 432 | extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); | ||
| 433 | 432 | ||
| 434 | extern struct bio_set fs_bio_set; | 433 | extern struct bio_set fs_bio_set; |
| 435 | 434 | ||
| @@ -443,12 +442,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) | |||
| 443 | return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); | 442 | return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); |
| 444 | } | 443 | } |
| 445 | 444 | ||
| 446 | static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) | ||
| 447 | { | ||
| 448 | return bio_clone_bioset(bio, gfp_mask, NULL); | ||
| 449 | |||
| 450 | } | ||
| 451 | |||
| 452 | extern blk_qc_t submit_bio(struct bio *); | 445 | extern blk_qc_t submit_bio(struct bio *); |
| 453 | 446 | ||
| 454 | extern void bio_endio(struct bio *); | 447 | extern void bio_endio(struct bio *); |
| @@ -496,9 +489,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, | |||
| 496 | extern void bio_set_pages_dirty(struct bio *bio); | 489 | extern void bio_set_pages_dirty(struct bio *bio); |
| 497 | extern void bio_check_pages_dirty(struct bio *bio); | 490 | extern void bio_check_pages_dirty(struct bio *bio); |
| 498 | 491 | ||
| 499 | void generic_start_io_acct(struct request_queue *q, int rw, | 492 | void generic_start_io_acct(struct request_queue *q, int op, |
| 500 | unsigned long sectors, struct hd_struct *part); | 493 | unsigned long sectors, struct hd_struct *part); |
| 501 | void generic_end_io_acct(struct request_queue *q, int rw, | 494 | void generic_end_io_acct(struct request_queue *q, int op, |
| 502 | struct hd_struct *part, | 495 | struct hd_struct *part, |
| 503 | unsigned long start_time); | 496 | unsigned long start_time); |
| 504 | 497 | ||
| @@ -553,8 +546,16 @@ do { \ | |||
| 553 | #define bio_dev(bio) \ | 546 | #define bio_dev(bio) \ |
| 554 | disk_devt((bio)->bi_disk) | 547 | disk_devt((bio)->bi_disk) |
| 555 | 548 | ||
| 549 | #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) | ||
| 550 | int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); | ||
| 551 | #else | ||
| 552 | static inline int bio_associate_blkcg_from_page(struct bio *bio, | ||
| 553 | struct page *page) { return 0; } | ||
| 554 | #endif | ||
| 555 | |||
| 556 | #ifdef CONFIG_BLK_CGROUP | 556 | #ifdef CONFIG_BLK_CGROUP |
| 557 | int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); | 557 | int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); |
| 558 | int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); | ||
| 558 | void bio_disassociate_task(struct bio *bio); | 559 | void bio_disassociate_task(struct bio *bio); |
| 559 | void bio_clone_blkcg_association(struct bio *dst, struct bio *src); | 560 | void bio_clone_blkcg_association(struct bio *dst, struct bio *src); |
| 560 | #else /* CONFIG_BLK_CGROUP */ | 561 | #else /* CONFIG_BLK_CGROUP */ |
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6c666fd7de3c..34aec30e06c7 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h | |||
| @@ -35,6 +35,7 @@ enum blkg_rwstat_type { | |||
| 35 | BLKG_RWSTAT_WRITE, | 35 | BLKG_RWSTAT_WRITE, |
| 36 | BLKG_RWSTAT_SYNC, | 36 | BLKG_RWSTAT_SYNC, |
| 37 | BLKG_RWSTAT_ASYNC, | 37 | BLKG_RWSTAT_ASYNC, |
| 38 | BLKG_RWSTAT_DISCARD, | ||
| 38 | 39 | ||
| 39 | BLKG_RWSTAT_NR, | 40 | BLKG_RWSTAT_NR, |
| 40 | BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, | 41 | BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, |
| @@ -136,6 +137,12 @@ struct blkcg_gq { | |||
| 136 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; | 137 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; |
| 137 | 138 | ||
| 138 | struct rcu_head rcu_head; | 139 | struct rcu_head rcu_head; |
| 140 | |||
| 141 | atomic_t use_delay; | ||
| 142 | atomic64_t delay_nsec; | ||
| 143 | atomic64_t delay_start; | ||
| 144 | u64 last_delay; | ||
| 145 | int last_use; | ||
| 139 | }; | 146 | }; |
| 140 | 147 | ||
| 141 | typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); | 148 | typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); |
| @@ -148,6 +155,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); | |||
| 148 | typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); | 155 | typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); |
| 149 | typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); | 156 | typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); |
| 150 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); | 157 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); |
| 158 | typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf, | ||
| 159 | size_t size); | ||
| 151 | 160 | ||
| 152 | struct blkcg_policy { | 161 | struct blkcg_policy { |
| 153 | int plid; | 162 | int plid; |
| @@ -167,6 +176,7 @@ struct blkcg_policy { | |||
| 167 | blkcg_pol_offline_pd_fn *pd_offline_fn; | 176 | blkcg_pol_offline_pd_fn *pd_offline_fn; |
| 168 | blkcg_pol_free_pd_fn *pd_free_fn; | 177 | blkcg_pol_free_pd_fn *pd_free_fn; |
| 169 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; | 178 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; |
| 179 | blkcg_pol_stat_pd_fn *pd_stat_fn; | ||
| 170 | }; | 180 | }; |
| 171 | 181 | ||
| 172 | extern struct blkcg blkcg_root; | 182 | extern struct blkcg blkcg_root; |
| @@ -238,6 +248,42 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) | |||
| 238 | return css_to_blkcg(task_css(current, io_cgrp_id)); | 248 | return css_to_blkcg(task_css(current, io_cgrp_id)); |
| 239 | } | 249 | } |
| 240 | 250 | ||
| 251 | static inline bool blk_cgroup_congested(void) | ||
| 252 | { | ||
| 253 | struct cgroup_subsys_state *css; | ||
| 254 | bool ret = false; | ||
| 255 | |||
| 256 | rcu_read_lock(); | ||
| 257 | css = kthread_blkcg(); | ||
| 258 | if (!css) | ||
| 259 | css = task_css(current, io_cgrp_id); | ||
| 260 | while (css) { | ||
| 261 | if (atomic_read(&css->cgroup->congestion_count)) { | ||
| 262 | ret = true; | ||
| 263 | break; | ||
| 264 | } | ||
| 265 | css = css->parent; | ||
| 266 | } | ||
| 267 | rcu_read_unlock(); | ||
| 268 | return ret; | ||
| 269 | } | ||
| 270 | |||
| 271 | /** | ||
| 272 | * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg | ||
| 273 | * @return: true if this bio needs to be submitted with the root blkg context. | ||
| 274 | * | ||
| 275 | * In order to avoid priority inversions we sometimes need to issue a bio as if | ||
| 276 | * it were attached to the root blkg, and then backcharge to the actual owning | ||
| 277 | * blkg. The idea is we do bio_blkcg() to look up the actual context for the | ||
| 278 | * bio and attach the appropriate blkg to the bio. Then we call this helper and | ||
| 279 | * if it is true run with the root blkg for that queue and then do any | ||
| 280 | * backcharging to the originating cgroup once the io is complete. | ||
| 281 | */ | ||
| 282 | static inline bool bio_issue_as_root_blkg(struct bio *bio) | ||
| 283 | { | ||
| 284 | return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; | ||
| 285 | } | ||
| 286 | |||
| 241 | /** | 287 | /** |
| 242 | * blkcg_parent - get the parent of a blkcg | 288 | * blkcg_parent - get the parent of a blkcg |
| 243 | * @blkcg: blkcg of interest | 289 | * @blkcg: blkcg of interest |
| @@ -296,6 +342,17 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, | |||
| 296 | } | 342 | } |
| 297 | 343 | ||
| 298 | /** | 344 | /** |
| 345 | * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair | ||
| 346 | * @q: request_queue of interest | ||
| 347 | * | ||
| 348 | * Lookup blkg for @q at the root level. See also blkg_lookup(). | ||
| 349 | */ | ||
| 350 | static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) | ||
| 351 | { | ||
| 352 | return q->root_blkg; | ||
| 353 | } | ||
| 354 | |||
| 355 | /** | ||
| 299 | * blkg_to_pdata - get policy private data | 356 | * blkg_to_pdata - get policy private data |
| 300 | * @blkg: blkg of interest | 357 | * @blkg: blkg of interest |
| 301 | * @pol: policy of interest | 358 | * @pol: policy of interest |
| @@ -355,6 +412,21 @@ static inline void blkg_get(struct blkcg_gq *blkg) | |||
| 355 | atomic_inc(&blkg->refcnt); | 412 | atomic_inc(&blkg->refcnt); |
| 356 | } | 413 | } |
| 357 | 414 | ||
| 415 | /** | ||
| 416 | * blkg_try_get - try and get a blkg reference | ||
| 417 | * @blkg: blkg to get | ||
| 418 | * | ||
| 419 | * This is for use when doing an RCU lookup of the blkg. We may be in the midst | ||
| 420 | * of freeing this blkg, so we can only use it if the refcnt is not zero. | ||
| 421 | */ | ||
| 422 | static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) | ||
| 423 | { | ||
| 424 | if (atomic_inc_not_zero(&blkg->refcnt)) | ||
| 425 | return blkg; | ||
| 426 | return NULL; | ||
| 427 | } | ||
| 428 | |||
| 429 | |||
| 358 | void __blkg_release_rcu(struct rcu_head *rcu); | 430 | void __blkg_release_rcu(struct rcu_head *rcu); |
| 359 | 431 | ||
| 360 | /** | 432 | /** |
| @@ -589,7 +661,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, | |||
| 589 | { | 661 | { |
| 590 | struct percpu_counter *cnt; | 662 | struct percpu_counter *cnt; |
| 591 | 663 | ||
| 592 | if (op_is_write(op)) | 664 | if (op_is_discard(op)) |
| 665 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; | ||
| 666 | else if (op_is_write(op)) | ||
| 593 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; | 667 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; |
| 594 | else | 668 | else |
| 595 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; | 669 | cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; |
| @@ -706,8 +780,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, | |||
| 706 | 780 | ||
| 707 | if (!throtl) { | 781 | if (!throtl) { |
| 708 | blkg = blkg ?: q->root_blkg; | 782 | blkg = blkg ?: q->root_blkg; |
| 709 | blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, | 783 | /* |
| 710 | bio->bi_iter.bi_size); | 784 | * If the bio is flagged with BIO_QUEUE_ENTERED it means this |
| 785 | * is a split bio and we would have already accounted for the | ||
| 786 | * size of the bio. | ||
| 787 | */ | ||
| 788 | if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) | ||
| 789 | blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, | ||
| 790 | bio->bi_iter.bi_size); | ||
| 711 | blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); | 791 | blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); |
| 712 | } | 792 | } |
| 713 | 793 | ||
| @@ -715,6 +795,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, | |||
| 715 | return !throtl; | 795 | return !throtl; |
| 716 | } | 796 | } |
| 717 | 797 | ||
| 798 | static inline void blkcg_use_delay(struct blkcg_gq *blkg) | ||
| 799 | { | ||
| 800 | if (atomic_add_return(1, &blkg->use_delay) == 1) | ||
| 801 | atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); | ||
| 802 | } | ||
| 803 | |||
| 804 | static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) | ||
| 805 | { | ||
| 806 | int old = atomic_read(&blkg->use_delay); | ||
| 807 | |||
| 808 | if (old == 0) | ||
| 809 | return 0; | ||
| 810 | |||
| 811 | /* | ||
| 812 | * We do this song and dance because we can race with somebody else | ||
| 813 | * adding or removing delay. If we just did an atomic_dec we'd end up | ||
| 814 | * negative and we'd already be in trouble. We need to subtract 1 and | ||
| 815 | * then check to see if we were the last delay so we can drop the | ||
| 816 | * congestion count on the cgroup. | ||
| 817 | */ | ||
| 818 | while (old) { | ||
| 819 | int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); | ||
| 820 | if (cur == old) | ||
| 821 | break; | ||
| 822 | old = cur; | ||
| 823 | } | ||
| 824 | |||
| 825 | if (old == 0) | ||
| 826 | return 0; | ||
| 827 | if (old == 1) | ||
| 828 | atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); | ||
| 829 | return 1; | ||
| 830 | } | ||
| 831 | |||
| 832 | static inline void blkcg_clear_delay(struct blkcg_gq *blkg) | ||
| 833 | { | ||
| 834 | int old = atomic_read(&blkg->use_delay); | ||
| 835 | if (!old) | ||
| 836 | return; | ||
| 837 | /* We only want 1 person clearing the congestion count for this blkg. */ | ||
| 838 | while (old) { | ||
| 839 | int cur = atomic_cmpxchg(&blkg->use_delay, old, 0); | ||
| 840 | if (cur == old) { | ||
| 841 | atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); | ||
| 842 | break; | ||
| 843 | } | ||
| 844 | old = cur; | ||
| 845 | } | ||
| 846 | } | ||
| 847 | |||
| 848 | void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); | ||
| 849 | void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); | ||
| 850 | void blkcg_maybe_throttle_current(void); | ||
| 718 | #else /* CONFIG_BLK_CGROUP */ | 851 | #else /* CONFIG_BLK_CGROUP */ |
| 719 | 852 | ||
| 720 | struct blkcg { | 853 | struct blkcg { |
| @@ -734,9 +867,16 @@ struct blkcg_policy { | |||
| 734 | 867 | ||
| 735 | #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) | 868 | #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) |
| 736 | 869 | ||
| 870 | static inline void blkcg_maybe_throttle_current(void) { } | ||
| 871 | static inline bool blk_cgroup_congested(void) { return false; } | ||
| 872 | |||
| 737 | #ifdef CONFIG_BLOCK | 873 | #ifdef CONFIG_BLOCK |
| 738 | 874 | ||
| 875 | static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } | ||
| 876 | |||
| 739 | static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } | 877 | static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } |
| 878 | static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) | ||
| 879 | { return NULL; } | ||
| 740 | static inline int blkcg_init_queue(struct request_queue *q) { return 0; } | 880 | static inline int blkcg_init_queue(struct request_queue *q) { return 0; } |
| 741 | static inline void blkcg_drain_queue(struct request_queue *q) { } | 881 | static inline void blkcg_drain_queue(struct request_queue *q) { } |
| 742 | static inline void blkcg_exit_queue(struct request_queue *q) { } | 882 | static inline void blkcg_exit_queue(struct request_queue *q) { } |
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ca3f2c2edd85..1da59c16f637 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h | |||
| @@ -35,10 +35,12 @@ struct blk_mq_hw_ctx { | |||
| 35 | struct sbitmap ctx_map; | 35 | struct sbitmap ctx_map; |
| 36 | 36 | ||
| 37 | struct blk_mq_ctx *dispatch_from; | 37 | struct blk_mq_ctx *dispatch_from; |
| 38 | unsigned int dispatch_busy; | ||
| 38 | 39 | ||
| 39 | struct blk_mq_ctx **ctxs; | ||
| 40 | unsigned int nr_ctx; | 40 | unsigned int nr_ctx; |
| 41 | struct blk_mq_ctx **ctxs; | ||
| 41 | 42 | ||
| 43 | spinlock_t dispatch_wait_lock; | ||
| 42 | wait_queue_entry_t dispatch_wait; | 44 | wait_queue_entry_t dispatch_wait; |
| 43 | atomic_t wait_index; | 45 | atomic_t wait_index; |
| 44 | 46 | ||
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3c4f390aea4b..f6dfb30737d8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h | |||
| @@ -179,11 +179,9 @@ struct bio { | |||
| 179 | */ | 179 | */ |
| 180 | struct io_context *bi_ioc; | 180 | struct io_context *bi_ioc; |
| 181 | struct cgroup_subsys_state *bi_css; | 181 | struct cgroup_subsys_state *bi_css; |
| 182 | #ifdef CONFIG_BLK_DEV_THROTTLING_LOW | 182 | struct blkcg_gq *bi_blkg; |
| 183 | void *bi_cg_private; | ||
| 184 | struct bio_issue bi_issue; | 183 | struct bio_issue bi_issue; |
| 185 | #endif | 184 | #endif |
| 186 | #endif | ||
| 187 | union { | 185 | union { |
| 188 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 186 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
| 189 | struct bio_integrity_payload *bi_integrity; /* data integrity */ | 187 | struct bio_integrity_payload *bi_integrity; /* data integrity */ |
| @@ -329,7 +327,7 @@ enum req_flag_bits { | |||
| 329 | 327 | ||
| 330 | /* for driver use */ | 328 | /* for driver use */ |
| 331 | __REQ_DRV, | 329 | __REQ_DRV, |
| 332 | 330 | __REQ_SWAP, /* swapping request. */ | |
| 333 | __REQ_NR_BITS, /* stops here */ | 331 | __REQ_NR_BITS, /* stops here */ |
| 334 | }; | 332 | }; |
| 335 | 333 | ||
| @@ -351,6 +349,7 @@ enum req_flag_bits { | |||
| 351 | #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) | 349 | #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) |
| 352 | 350 | ||
| 353 | #define REQ_DRV (1ULL << __REQ_DRV) | 351 | #define REQ_DRV (1ULL << __REQ_DRV) |
| 352 | #define REQ_SWAP (1ULL << __REQ_SWAP) | ||
| 354 | 353 | ||
| 355 | #define REQ_FAILFAST_MASK \ | 354 | #define REQ_FAILFAST_MASK \ |
| 356 | (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) | 355 | (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) |
| @@ -358,6 +357,14 @@ enum req_flag_bits { | |||
| 358 | #define REQ_NOMERGE_FLAGS \ | 357 | #define REQ_NOMERGE_FLAGS \ |
| 359 | (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) | 358 | (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) |
| 360 | 359 | ||
| 360 | enum stat_group { | ||
| 361 | STAT_READ, | ||
| 362 | STAT_WRITE, | ||
| 363 | STAT_DISCARD, | ||
| 364 | |||
| 365 | NR_STAT_GROUPS | ||
| 366 | }; | ||
| 367 | |||
| 361 | #define bio_op(bio) \ | 368 | #define bio_op(bio) \ |
| 362 | ((bio)->bi_opf & REQ_OP_MASK) | 369 | ((bio)->bi_opf & REQ_OP_MASK) |
| 363 | #define req_op(req) \ | 370 | #define req_op(req) \ |
| @@ -395,6 +402,18 @@ static inline bool op_is_sync(unsigned int op) | |||
| 395 | (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); | 402 | (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); |
| 396 | } | 403 | } |
| 397 | 404 | ||
| 405 | static inline bool op_is_discard(unsigned int op) | ||
| 406 | { | ||
| 407 | return (op & REQ_OP_MASK) == REQ_OP_DISCARD; | ||
| 408 | } | ||
| 409 | |||
| 410 | static inline int op_stat_group(unsigned int op) | ||
| 411 | { | ||
| 412 | if (op_is_discard(op)) | ||
| 413 | return STAT_DISCARD; | ||
| 414 | return op_is_write(op); | ||
| 415 | } | ||
| 416 | |||
| 398 | typedef unsigned int blk_qc_t; | 417 | typedef unsigned int blk_qc_t; |
| 399 | #define BLK_QC_T_NONE -1U | 418 | #define BLK_QC_T_NONE -1U |
| 400 | #define BLK_QC_T_SHIFT 16 | 419 | #define BLK_QC_T_SHIFT 16 |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 79226ca8f80f..d6869e0e2b64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
| @@ -27,8 +27,6 @@ | |||
| 27 | #include <linux/percpu-refcount.h> | 27 | #include <linux/percpu-refcount.h> |
| 28 | #include <linux/scatterlist.h> | 28 | #include <linux/scatterlist.h> |
| 29 | #include <linux/blkzoned.h> | 29 | #include <linux/blkzoned.h> |
| 30 | #include <linux/seqlock.h> | ||
| 31 | #include <linux/u64_stats_sync.h> | ||
| 32 | 30 | ||
| 33 | struct module; | 31 | struct module; |
| 34 | struct scsi_ioctl_command; | 32 | struct scsi_ioctl_command; |
| @@ -42,7 +40,7 @@ struct bsg_job; | |||
| 42 | struct blkcg_gq; | 40 | struct blkcg_gq; |
| 43 | struct blk_flush_queue; | 41 | struct blk_flush_queue; |
| 44 | struct pr_ops; | 42 | struct pr_ops; |
| 45 | struct rq_wb; | 43 | struct rq_qos; |
| 46 | struct blk_queue_stats; | 44 | struct blk_queue_stats; |
| 47 | struct blk_stat_callback; | 45 | struct blk_stat_callback; |
| 48 | 46 | ||
| @@ -442,10 +440,8 @@ struct request_queue { | |||
| 442 | int nr_rqs[2]; /* # allocated [a]sync rqs */ | 440 | int nr_rqs[2]; /* # allocated [a]sync rqs */ |
| 443 | int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ | 441 | int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ |
| 444 | 442 | ||
| 445 | atomic_t shared_hctx_restart; | ||
| 446 | |||
| 447 | struct blk_queue_stats *stats; | 443 | struct blk_queue_stats *stats; |
| 448 | struct rq_wb *rq_wb; | 444 | struct rq_qos *rq_qos; |
| 449 | 445 | ||
| 450 | /* | 446 | /* |
| 451 | * If blkcg is not used, @q->root_rl serves all requests. If blkcg | 447 | * If blkcg is not used, @q->root_rl serves all requests. If blkcg |
| @@ -592,6 +588,7 @@ struct request_queue { | |||
| 592 | 588 | ||
| 593 | struct queue_limits limits; | 589 | struct queue_limits limits; |
| 594 | 590 | ||
| 591 | #ifdef CONFIG_BLK_DEV_ZONED | ||
| 595 | /* | 592 | /* |
| 596 | * Zoned block device information for request dispatch control. | 593 | * Zoned block device information for request dispatch control. |
| 597 | * nr_zones is the total number of zones of the device. This is always | 594 | * nr_zones is the total number of zones of the device. This is always |
| @@ -612,6 +609,7 @@ struct request_queue { | |||
| 612 | unsigned int nr_zones; | 609 | unsigned int nr_zones; |
| 613 | unsigned long *seq_zones_bitmap; | 610 | unsigned long *seq_zones_bitmap; |
| 614 | unsigned long *seq_zones_wlock; | 611 | unsigned long *seq_zones_wlock; |
| 612 | #endif /* CONFIG_BLK_DEV_ZONED */ | ||
| 615 | 613 | ||
| 616 | /* | 614 | /* |
| 617 | * sg stuff | 615 | * sg stuff |
| @@ -800,11 +798,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) | |||
| 800 | return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; | 798 | return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; |
| 801 | } | 799 | } |
| 802 | 800 | ||
| 803 | static inline unsigned int blk_queue_nr_zones(struct request_queue *q) | 801 | #ifdef CONFIG_BLK_DEV_ZONED |
| 804 | { | ||
| 805 | return q->nr_zones; | ||
| 806 | } | ||
| 807 | |||
| 808 | static inline unsigned int blk_queue_zone_no(struct request_queue *q, | 802 | static inline unsigned int blk_queue_zone_no(struct request_queue *q, |
| 809 | sector_t sector) | 803 | sector_t sector) |
| 810 | { | 804 | { |
| @@ -820,6 +814,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, | |||
| 820 | return false; | 814 | return false; |
| 821 | return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); | 815 | return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); |
| 822 | } | 816 | } |
| 817 | #endif /* CONFIG_BLK_DEV_ZONED */ | ||
| 823 | 818 | ||
| 824 | static inline bool rq_is_sync(struct request *rq) | 819 | static inline bool rq_is_sync(struct request *rq) |
| 825 | { | 820 | { |
| @@ -1070,6 +1065,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) | |||
| 1070 | return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; | 1065 | return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; |
| 1071 | } | 1066 | } |
| 1072 | 1067 | ||
| 1068 | #ifdef CONFIG_BLK_DEV_ZONED | ||
| 1073 | static inline unsigned int blk_rq_zone_no(struct request *rq) | 1069 | static inline unsigned int blk_rq_zone_no(struct request *rq) |
| 1074 | { | 1070 | { |
| 1075 | return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); | 1071 | return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); |
| @@ -1079,6 +1075,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq) | |||
| 1079 | { | 1075 | { |
| 1080 | return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); | 1076 | return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); |
| 1081 | } | 1077 | } |
| 1078 | #endif /* CONFIG_BLK_DEV_ZONED */ | ||
| 1082 | 1079 | ||
| 1083 | /* | 1080 | /* |
| 1084 | * Some commands like WRITE SAME have a payload or data transfer size which | 1081 | * Some commands like WRITE SAME have a payload or data transfer size which |
| @@ -1437,8 +1434,6 @@ enum blk_default_limits { | |||
| 1437 | BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, | 1434 | BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, |
| 1438 | }; | 1435 | }; |
| 1439 | 1436 | ||
| 1440 | #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) | ||
| 1441 | |||
| 1442 | static inline unsigned long queue_segment_boundary(struct request_queue *q) | 1437 | static inline unsigned long queue_segment_boundary(struct request_queue *q) |
| 1443 | { | 1438 | { |
| 1444 | return q->limits.seg_boundary_mask; | 1439 | return q->limits.seg_boundary_mask; |
| @@ -1639,15 +1634,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) | |||
| 1639 | return 0; | 1634 | return 0; |
| 1640 | } | 1635 | } |
| 1641 | 1636 | ||
| 1642 | static inline unsigned int bdev_nr_zones(struct block_device *bdev) | ||
| 1643 | { | ||
| 1644 | struct request_queue *q = bdev_get_queue(bdev); | ||
| 1645 | |||
| 1646 | if (q) | ||
| 1647 | return blk_queue_nr_zones(q); | ||
| 1648 | return 0; | ||
| 1649 | } | ||
| 1650 | |||
| 1651 | static inline int queue_dma_alignment(struct request_queue *q) | 1637 | static inline int queue_dma_alignment(struct request_queue *q) |
| 1652 | { | 1638 | { |
| 1653 | return q ? q->dma_alignment : 511; | 1639 | return q ? q->dma_alignment : 511; |
| @@ -1877,6 +1863,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req, | |||
| 1877 | bip_next->bip_vec[0].bv_offset); | 1863 | bip_next->bip_vec[0].bv_offset); |
| 1878 | } | 1864 | } |
| 1879 | 1865 | ||
| 1866 | /** | ||
| 1867 | * bio_integrity_intervals - Return number of integrity intervals for a bio | ||
| 1868 | * @bi: blk_integrity profile for device | ||
| 1869 | * @sectors: Size of the bio in 512-byte sectors | ||
| 1870 | * | ||
| 1871 | * Description: The block layer calculates everything in 512 byte | ||
| 1872 | * sectors but integrity metadata is done in terms of the data integrity | ||
| 1873 | * interval size of the storage device. Convert the block layer sectors | ||
| 1874 | * to the appropriate number of integrity intervals. | ||
| 1875 | */ | ||
| 1876 | static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, | ||
| 1877 | unsigned int sectors) | ||
| 1878 | { | ||
| 1879 | return sectors >> (bi->interval_exp - 9); | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, | ||
| 1883 | unsigned int sectors) | ||
| 1884 | { | ||
| 1885 | return bio_integrity_intervals(bi, sectors) * bi->tuple_size; | ||
| 1886 | } | ||
| 1887 | |||
| 1880 | #else /* CONFIG_BLK_DEV_INTEGRITY */ | 1888 | #else /* CONFIG_BLK_DEV_INTEGRITY */ |
| 1881 | 1889 | ||
| 1882 | struct bio; | 1890 | struct bio; |
| @@ -1950,12 +1958,24 @@ static inline bool integrity_req_gap_front_merge(struct request *req, | |||
| 1950 | return false; | 1958 | return false; |
| 1951 | } | 1959 | } |
| 1952 | 1960 | ||
| 1961 | static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, | ||
| 1962 | unsigned int sectors) | ||
| 1963 | { | ||
| 1964 | return 0; | ||
| 1965 | } | ||
| 1966 | |||
| 1967 | static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, | ||
| 1968 | unsigned int sectors) | ||
| 1969 | { | ||
| 1970 | return 0; | ||
| 1971 | } | ||
| 1972 | |||
| 1953 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ | 1973 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
| 1954 | 1974 | ||
| 1955 | struct block_device_operations { | 1975 | struct block_device_operations { |
| 1956 | int (*open) (struct block_device *, fmode_t); | 1976 | int (*open) (struct block_device *, fmode_t); |
| 1957 | void (*release) (struct gendisk *, fmode_t); | 1977 | void (*release) (struct gendisk *, fmode_t); |
| 1958 | int (*rw_page)(struct block_device *, sector_t, struct page *, bool); | 1978 | int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); |
| 1959 | int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); | 1979 | int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); |
| 1960 | int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); | 1980 | int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); |
| 1961 | unsigned int (*check_events) (struct gendisk *disk, | 1981 | unsigned int (*check_events) (struct gendisk *disk, |
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index e75dfd1f1dec..528271c60018 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | 13 | ||
| 14 | #include <linux/fs.h> /* not really needed, later.. */ | 14 | #include <linux/fs.h> /* not really needed, later.. */ |
| 15 | #include <linux/list.h> | 15 | #include <linux/list.h> |
| 16 | #include <scsi/scsi_common.h> | ||
| 16 | #include <uapi/linux/cdrom.h> | 17 | #include <uapi/linux/cdrom.h> |
| 17 | 18 | ||
| 18 | struct packet_command | 19 | struct packet_command |
| @@ -21,7 +22,7 @@ struct packet_command | |||
| 21 | unsigned char *buffer; | 22 | unsigned char *buffer; |
| 22 | unsigned int buflen; | 23 | unsigned int buflen; |
| 23 | int stat; | 24 | int stat; |
| 24 | struct request_sense *sense; | 25 | struct scsi_sense_hdr *sshdr; |
| 25 | unsigned char data_direction; | 26 | unsigned char data_direction; |
| 26 | int quiet; | 27 | int quiet; |
| 27 | int timeout; | 28 | int timeout; |
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c0e68f903011..ff20b677fb9f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
| @@ -438,6 +438,9 @@ struct cgroup { | |||
| 438 | /* used to store eBPF programs */ | 438 | /* used to store eBPF programs */ |
| 439 | struct cgroup_bpf bpf; | 439 | struct cgroup_bpf bpf; |
| 440 | 440 | ||
| 441 | /* If there is block congestion on this cgroup. */ | ||
| 442 | atomic_t congestion_count; | ||
| 443 | |||
| 441 | /* ids of the ancestors at each level including self */ | 444 | /* ids of the ancestors at each level including self */ |
| 442 | int ancestor_ids[]; | 445 | int ancestor_ids[]; |
| 443 | }; | 446 | }; |
diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6cb8a5789668..57864422a2c8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 17 | #include <linux/percpu-refcount.h> | 17 | #include <linux/percpu-refcount.h> |
| 18 | #include <linux/uuid.h> | 18 | #include <linux/uuid.h> |
| 19 | #include <linux/blk_types.h> | ||
| 19 | 20 | ||
| 20 | #ifdef CONFIG_BLOCK | 21 | #ifdef CONFIG_BLOCK |
| 21 | 22 | ||
| @@ -82,10 +83,10 @@ struct partition { | |||
| 82 | } __attribute__((packed)); | 83 | } __attribute__((packed)); |
| 83 | 84 | ||
| 84 | struct disk_stats { | 85 | struct disk_stats { |
| 85 | unsigned long sectors[2]; /* READs and WRITEs */ | 86 | unsigned long sectors[NR_STAT_GROUPS]; |
| 86 | unsigned long ios[2]; | 87 | unsigned long ios[NR_STAT_GROUPS]; |
| 87 | unsigned long merges[2]; | 88 | unsigned long merges[NR_STAT_GROUPS]; |
| 88 | unsigned long ticks[2]; | 89 | unsigned long ticks[NR_STAT_GROUPS]; |
| 89 | unsigned long io_ticks; | 90 | unsigned long io_ticks; |
| 90 | unsigned long time_in_queue; | 91 | unsigned long time_in_queue; |
| 91 | }; | 92 | }; |
| @@ -353,6 +354,11 @@ static inline void free_part_stats(struct hd_struct *part) | |||
| 353 | 354 | ||
| 354 | #endif /* CONFIG_SMP */ | 355 | #endif /* CONFIG_SMP */ |
| 355 | 356 | ||
| 357 | #define part_stat_read_accum(part, field) \ | ||
| 358 | (part_stat_read(part, field[STAT_READ]) + \ | ||
| 359 | part_stat_read(part, field[STAT_WRITE]) + \ | ||
| 360 | part_stat_read(part, field[STAT_DISCARD])) | ||
| 361 | |||
| 356 | #define part_stat_add(cpu, part, field, addnd) do { \ | 362 | #define part_stat_add(cpu, part, field, addnd) do { \ |
| 357 | __part_stat_add((cpu), (part), field, addnd); \ | 363 | __part_stat_add((cpu), (part), field, addnd); \ |
| 358 | if ((part)->partno) \ | 364 | if ((part)->partno) \ |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c6fb116e925..680d3395fc83 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
| @@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, | |||
| 317 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 317 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, |
| 318 | gfp_t gfp_mask, struct mem_cgroup **memcgp, | 318 | gfp_t gfp_mask, struct mem_cgroup **memcgp, |
| 319 | bool compound); | 319 | bool compound); |
| 320 | int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, | ||
| 321 | gfp_t gfp_mask, struct mem_cgroup **memcgp, | ||
| 322 | bool compound); | ||
| 320 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | 323 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, |
| 321 | bool lrucare, bool compound); | 324 | bool lrucare, bool compound); |
| 322 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, | 325 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, |
| @@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
| 789 | return 0; | 792 | return 0; |
| 790 | } | 793 | } |
| 791 | 794 | ||
| 795 | static inline int mem_cgroup_try_charge_delay(struct page *page, | ||
| 796 | struct mm_struct *mm, | ||
| 797 | gfp_t gfp_mask, | ||
| 798 | struct mem_cgroup **memcgp, | ||
| 799 | bool compound) | ||
| 800 | { | ||
| 801 | *memcgp = NULL; | ||
| 802 | return 0; | ||
| 803 | } | ||
| 804 | |||
| 792 | static inline void mem_cgroup_commit_charge(struct page *page, | 805 | static inline void mem_cgroup_commit_charge(struct page *page, |
| 793 | struct mem_cgroup *memcg, | 806 | struct mem_cgroup *memcg, |
| 794 | bool lrucare, bool compound) | 807 | bool lrucare, bool compound) |
diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2950ce957656..68e91ef5494c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h | |||
| @@ -242,7 +242,12 @@ struct nvme_id_ctrl { | |||
| 242 | __le32 sanicap; | 242 | __le32 sanicap; |
| 243 | __le32 hmminds; | 243 | __le32 hmminds; |
| 244 | __le16 hmmaxd; | 244 | __le16 hmmaxd; |
| 245 | __u8 rsvd338[174]; | 245 | __u8 rsvd338[4]; |
| 246 | __u8 anatt; | ||
| 247 | __u8 anacap; | ||
| 248 | __le32 anagrpmax; | ||
| 249 | __le32 nanagrpid; | ||
| 250 | __u8 rsvd352[160]; | ||
| 246 | __u8 sqes; | 251 | __u8 sqes; |
| 247 | __u8 cqes; | 252 | __u8 cqes; |
| 248 | __le16 maxcmd; | 253 | __le16 maxcmd; |
| @@ -254,11 +259,12 @@ struct nvme_id_ctrl { | |||
| 254 | __le16 awun; | 259 | __le16 awun; |
| 255 | __le16 awupf; | 260 | __le16 awupf; |
| 256 | __u8 nvscc; | 261 | __u8 nvscc; |
| 257 | __u8 rsvd531; | 262 | __u8 nwpc; |
| 258 | __le16 acwu; | 263 | __le16 acwu; |
| 259 | __u8 rsvd534[2]; | 264 | __u8 rsvd534[2]; |
| 260 | __le32 sgls; | 265 | __le32 sgls; |
| 261 | __u8 rsvd540[228]; | 266 | __le32 mnan; |
| 267 | __u8 rsvd544[224]; | ||
| 262 | char subnqn[256]; | 268 | char subnqn[256]; |
| 263 | __u8 rsvd1024[768]; | 269 | __u8 rsvd1024[768]; |
| 264 | __le32 ioccsz; | 270 | __le32 ioccsz; |
| @@ -312,7 +318,11 @@ struct nvme_id_ns { | |||
| 312 | __le16 nabspf; | 318 | __le16 nabspf; |
| 313 | __le16 noiob; | 319 | __le16 noiob; |
| 314 | __u8 nvmcap[16]; | 320 | __u8 nvmcap[16]; |
| 315 | __u8 rsvd64[40]; | 321 | __u8 rsvd64[28]; |
| 322 | __le32 anagrpid; | ||
| 323 | __u8 rsvd96[3]; | ||
| 324 | __u8 nsattr; | ||
| 325 | __u8 rsvd100[4]; | ||
| 316 | __u8 nguid[16]; | 326 | __u8 nguid[16]; |
| 317 | __u8 eui64[8]; | 327 | __u8 eui64[8]; |
| 318 | struct nvme_lbaf lbaf[16]; | 328 | struct nvme_lbaf lbaf[16]; |
| @@ -425,6 +435,32 @@ struct nvme_effects_log { | |||
| 425 | __u8 resv[2048]; | 435 | __u8 resv[2048]; |
| 426 | }; | 436 | }; |
| 427 | 437 | ||
| 438 | enum nvme_ana_state { | ||
| 439 | NVME_ANA_OPTIMIZED = 0x01, | ||
| 440 | NVME_ANA_NONOPTIMIZED = 0x02, | ||
| 441 | NVME_ANA_INACCESSIBLE = 0x03, | ||
| 442 | NVME_ANA_PERSISTENT_LOSS = 0x04, | ||
| 443 | NVME_ANA_CHANGE = 0x0f, | ||
| 444 | }; | ||
| 445 | |||
| 446 | struct nvme_ana_group_desc { | ||
| 447 | __le32 grpid; | ||
| 448 | __le32 nnsids; | ||
| 449 | __le64 chgcnt; | ||
| 450 | __u8 state; | ||
| 451 | __u8 rsvd17[15]; | ||
| 452 | __le32 nsids[]; | ||
| 453 | }; | ||
| 454 | |||
| 455 | /* flag for the log specific field of the ANA log */ | ||
| 456 | #define NVME_ANA_LOG_RGO (1 << 0) | ||
| 457 | |||
| 458 | struct nvme_ana_rsp_hdr { | ||
| 459 | __le64 chgcnt; | ||
| 460 | __le16 ngrps; | ||
| 461 | __le16 rsvd10[3]; | ||
| 462 | }; | ||
| 463 | |||
| 428 | enum { | 464 | enum { |
| 429 | NVME_SMART_CRIT_SPARE = 1 << 0, | 465 | NVME_SMART_CRIT_SPARE = 1 << 0, |
| 430 | NVME_SMART_CRIT_TEMPERATURE = 1 << 1, | 466 | NVME_SMART_CRIT_TEMPERATURE = 1 << 1, |
| @@ -444,11 +480,13 @@ enum { | |||
| 444 | enum { | 480 | enum { |
| 445 | NVME_AER_NOTICE_NS_CHANGED = 0x00, | 481 | NVME_AER_NOTICE_NS_CHANGED = 0x00, |
| 446 | NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, | 482 | NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, |
| 483 | NVME_AER_NOTICE_ANA = 0x03, | ||
| 447 | }; | 484 | }; |
| 448 | 485 | ||
| 449 | enum { | 486 | enum { |
| 450 | NVME_AEN_CFG_NS_ATTR = 1 << 8, | 487 | NVME_AEN_CFG_NS_ATTR = 1 << 8, |
| 451 | NVME_AEN_CFG_FW_ACT = 1 << 9, | 488 | NVME_AEN_CFG_FW_ACT = 1 << 9, |
| 489 | NVME_AEN_CFG_ANA_CHANGE = 1 << 11, | ||
| 452 | }; | 490 | }; |
| 453 | 491 | ||
| 454 | struct nvme_lba_range_type { | 492 | struct nvme_lba_range_type { |
| @@ -749,15 +787,22 @@ enum { | |||
| 749 | NVME_FEAT_HOST_MEM_BUF = 0x0d, | 787 | NVME_FEAT_HOST_MEM_BUF = 0x0d, |
| 750 | NVME_FEAT_TIMESTAMP = 0x0e, | 788 | NVME_FEAT_TIMESTAMP = 0x0e, |
| 751 | NVME_FEAT_KATO = 0x0f, | 789 | NVME_FEAT_KATO = 0x0f, |
| 790 | NVME_FEAT_HCTM = 0x10, | ||
| 791 | NVME_FEAT_NOPSC = 0x11, | ||
| 792 | NVME_FEAT_RRL = 0x12, | ||
| 793 | NVME_FEAT_PLM_CONFIG = 0x13, | ||
| 794 | NVME_FEAT_PLM_WINDOW = 0x14, | ||
| 752 | NVME_FEAT_SW_PROGRESS = 0x80, | 795 | NVME_FEAT_SW_PROGRESS = 0x80, |
| 753 | NVME_FEAT_HOST_ID = 0x81, | 796 | NVME_FEAT_HOST_ID = 0x81, |
| 754 | NVME_FEAT_RESV_MASK = 0x82, | 797 | NVME_FEAT_RESV_MASK = 0x82, |
| 755 | NVME_FEAT_RESV_PERSIST = 0x83, | 798 | NVME_FEAT_RESV_PERSIST = 0x83, |
| 799 | NVME_FEAT_WRITE_PROTECT = 0x84, | ||
| 756 | NVME_LOG_ERROR = 0x01, | 800 | NVME_LOG_ERROR = 0x01, |
| 757 | NVME_LOG_SMART = 0x02, | 801 | NVME_LOG_SMART = 0x02, |
| 758 | NVME_LOG_FW_SLOT = 0x03, | 802 | NVME_LOG_FW_SLOT = 0x03, |
| 759 | NVME_LOG_CHANGED_NS = 0x04, | 803 | NVME_LOG_CHANGED_NS = 0x04, |
| 760 | NVME_LOG_CMD_EFFECTS = 0x05, | 804 | NVME_LOG_CMD_EFFECTS = 0x05, |
| 805 | NVME_LOG_ANA = 0x0c, | ||
| 761 | NVME_LOG_DISC = 0x70, | 806 | NVME_LOG_DISC = 0x70, |
| 762 | NVME_LOG_RESERVATION = 0x80, | 807 | NVME_LOG_RESERVATION = 0x80, |
| 763 | NVME_FWACT_REPL = (0 << 3), | 808 | NVME_FWACT_REPL = (0 << 3), |
| @@ -765,6 +810,14 @@ enum { | |||
| 765 | NVME_FWACT_ACTV = (2 << 3), | 810 | NVME_FWACT_ACTV = (2 << 3), |
| 766 | }; | 811 | }; |
| 767 | 812 | ||
| 813 | /* NVMe Namespace Write Protect State */ | ||
| 814 | enum { | ||
| 815 | NVME_NS_NO_WRITE_PROTECT = 0, | ||
| 816 | NVME_NS_WRITE_PROTECT, | ||
| 817 | NVME_NS_WRITE_PROTECT_POWER_CYCLE, | ||
| 818 | NVME_NS_WRITE_PROTECT_PERMANENT, | ||
| 819 | }; | ||
| 820 | |||
| 768 | #define NVME_MAX_CHANGED_NAMESPACES 1024 | 821 | #define NVME_MAX_CHANGED_NAMESPACES 1024 |
| 769 | 822 | ||
| 770 | struct nvme_identify { | 823 | struct nvme_identify { |
| @@ -880,7 +933,7 @@ struct nvme_get_log_page_command { | |||
| 880 | __u64 rsvd2[2]; | 933 | __u64 rsvd2[2]; |
| 881 | union nvme_data_ptr dptr; | 934 | union nvme_data_ptr dptr; |
| 882 | __u8 lid; | 935 | __u8 lid; |
| 883 | __u8 rsvd10; | 936 | __u8 lsp; /* upper 4 bits reserved */ |
| 884 | __le16 numdl; | 937 | __le16 numdl; |
| 885 | __le16 numdu; | 938 | __le16 numdu; |
| 886 | __u16 rsvd11; | 939 | __u16 rsvd11; |
| @@ -1111,6 +1164,8 @@ enum { | |||
| 1111 | NVME_SC_SGL_INVALID_OFFSET = 0x16, | 1164 | NVME_SC_SGL_INVALID_OFFSET = 0x16, |
| 1112 | NVME_SC_SGL_INVALID_SUBTYPE = 0x17, | 1165 | NVME_SC_SGL_INVALID_SUBTYPE = 0x17, |
| 1113 | 1166 | ||
| 1167 | NVME_SC_NS_WRITE_PROTECTED = 0x20, | ||
| 1168 | |||
| 1114 | NVME_SC_LBA_RANGE = 0x80, | 1169 | NVME_SC_LBA_RANGE = 0x80, |
| 1115 | NVME_SC_CAP_EXCEEDED = 0x81, | 1170 | NVME_SC_CAP_EXCEEDED = 0x81, |
| 1116 | NVME_SC_NS_NOT_READY = 0x82, | 1171 | NVME_SC_NS_NOT_READY = 0x82, |
| @@ -1180,6 +1235,13 @@ enum { | |||
| 1180 | NVME_SC_ACCESS_DENIED = 0x286, | 1235 | NVME_SC_ACCESS_DENIED = 0x286, |
| 1181 | NVME_SC_UNWRITTEN_BLOCK = 0x287, | 1236 | NVME_SC_UNWRITTEN_BLOCK = 0x287, |
| 1182 | 1237 | ||
| 1238 | /* | ||
| 1239 | * Path-related Errors: | ||
| 1240 | */ | ||
| 1241 | NVME_SC_ANA_PERSISTENT_LOSS = 0x301, | ||
| 1242 | NVME_SC_ANA_INACCESSIBLE = 0x302, | ||
| 1243 | NVME_SC_ANA_TRANSITION = 0x303, | ||
| 1244 | |||
| 1183 | NVME_SC_DNR = 0x4000, | 1245 | NVME_SC_DNR = 0x4000, |
| 1184 | }; | 1246 | }; |
| 1185 | 1247 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index dac5086e3815..95a5018c338e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -734,6 +734,10 @@ struct task_struct { | |||
| 734 | /* disallow userland-initiated cgroup migration */ | 734 | /* disallow userland-initiated cgroup migration */ |
| 735 | unsigned no_cgroup_migration:1; | 735 | unsigned no_cgroup_migration:1; |
| 736 | #endif | 736 | #endif |
| 737 | #ifdef CONFIG_BLK_CGROUP | ||
| 738 | /* to be used once the psi infrastructure lands upstream. */ | ||
| 739 | unsigned use_memdelay:1; | ||
| 740 | #endif | ||
| 737 | 741 | ||
| 738 | unsigned long atomic_flags; /* Flags requiring atomic access. */ | 742 | unsigned long atomic_flags; /* Flags requiring atomic access. */ |
| 739 | 743 | ||
| @@ -1150,6 +1154,10 @@ struct task_struct { | |||
| 1150 | unsigned int memcg_nr_pages_over_high; | 1154 | unsigned int memcg_nr_pages_over_high; |
| 1151 | #endif | 1155 | #endif |
| 1152 | 1156 | ||
| 1157 | #ifdef CONFIG_BLK_CGROUP | ||
| 1158 | struct request_queue *throttle_queue; | ||
| 1159 | #endif | ||
| 1160 | |||
| 1153 | #ifdef CONFIG_UPROBES | 1161 | #ifdef CONFIG_UPROBES |
| 1154 | struct uprobe_task *utask; | 1162 | struct uprobe_task *utask; |
| 1155 | #endif | 1163 | #endif |
diff --git a/include/linux/swap.h b/include/linux/swap.h index c063443d8638..1a8bd05a335e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
| @@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
| 629 | 629 | ||
| 630 | return memcg->swappiness; | 630 | return memcg->swappiness; |
| 631 | } | 631 | } |
| 632 | |||
| 633 | #else | 632 | #else |
| 634 | static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) | 633 | static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) |
| 635 | { | 634 | { |
| @@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) | |||
| 637 | } | 636 | } |
| 638 | #endif | 637 | #endif |
| 639 | 638 | ||
| 639 | #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) | ||
| 640 | extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, | ||
| 641 | gfp_t gfp_mask); | ||
| 642 | #else | ||
| 643 | static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, | ||
| 644 | int node, gfp_t gfp_mask) | ||
| 645 | { | ||
| 646 | } | ||
| 647 | #endif | ||
| 648 | |||
| 640 | #ifdef CONFIG_MEMCG_SWAP | 649 | #ifdef CONFIG_MEMCG_SWAP |
| 641 | extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); | 650 | extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); |
| 642 | extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); | 651 | extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); |
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index c6aa8a3c42ed..b9626aa7e90c 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h | |||
| @@ -37,9 +37,33 @@ struct t10_pi_tuple { | |||
| 37 | #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff) | 37 | #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff) |
| 38 | #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff) | 38 | #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff) |
| 39 | 39 | ||
| 40 | static inline u32 t10_pi_ref_tag(struct request *rq) | ||
| 41 | { | ||
| 42 | #ifdef CONFIG_BLK_DEV_INTEGRITY | ||
| 43 | return blk_rq_pos(rq) >> | ||
| 44 | (rq->q->integrity.interval_exp - 9) & 0xffffffff; | ||
| 45 | #else | ||
| 46 | return -1U; | ||
| 47 | #endif | ||
| 48 | } | ||
| 49 | |||
| 40 | extern const struct blk_integrity_profile t10_pi_type1_crc; | 50 | extern const struct blk_integrity_profile t10_pi_type1_crc; |
| 41 | extern const struct blk_integrity_profile t10_pi_type1_ip; | 51 | extern const struct blk_integrity_profile t10_pi_type1_ip; |
| 42 | extern const struct blk_integrity_profile t10_pi_type3_crc; | 52 | extern const struct blk_integrity_profile t10_pi_type3_crc; |
| 43 | extern const struct blk_integrity_profile t10_pi_type3_ip; | 53 | extern const struct blk_integrity_profile t10_pi_type3_ip; |
| 44 | 54 | ||
| 55 | #ifdef CONFIG_BLK_DEV_INTEGRITY | ||
| 56 | extern void t10_pi_prepare(struct request *rq, u8 protection_type); | ||
| 57 | extern void t10_pi_complete(struct request *rq, u8 protection_type, | ||
| 58 | unsigned int intervals); | ||
| 59 | #else | ||
| 60 | static inline void t10_pi_complete(struct request *rq, u8 protection_type, | ||
| 61 | unsigned int intervals) | ||
| 62 | { | ||
| 63 | } | ||
| 64 | static inline void t10_pi_prepare(struct request *rq, u8 protection_type) | ||
| 65 | { | ||
| 66 | } | ||
| 67 | #endif | ||
| 68 | |||
| 45 | #endif | 69 | #endif |
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 4a8841963c2e..05589a3e37f4 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <linux/security.h> | 51 | #include <linux/security.h> |
| 52 | #include <linux/task_work.h> | 52 | #include <linux/task_work.h> |
| 53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
| 54 | #include <linux/blk-cgroup.h> | ||
| 54 | struct linux_binprm; | 55 | struct linux_binprm; |
| 55 | 56 | ||
| 56 | /* | 57 | /* |
| @@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) | |||
| 192 | task_work_run(); | 193 | task_work_run(); |
| 193 | 194 | ||
| 194 | mem_cgroup_handle_over_high(); | 195 | mem_cgroup_handle_over_high(); |
| 196 | blkcg_maybe_throttle_current(); | ||
| 195 | } | 197 | } |
| 196 | 198 | ||
| 197 | #endif /* <linux/tracehook.h> */ | 199 | #endif /* <linux/tracehook.h> */ |
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index aaf1e971c6a3..c891ada3c5c2 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | 4 | ||
| 5 | #include <linux/dma-mapping.h> | 5 | #include <linux/dma-mapping.h> |
| 6 | #include <linux/blkdev.h> | 6 | #include <linux/blkdev.h> |
| 7 | #include <linux/t10-pi.h> | ||
| 7 | #include <linux/list.h> | 8 | #include <linux/list.h> |
| 8 | #include <linux/types.h> | 9 | #include <linux/types.h> |
| 9 | #include <linux/timer.h> | 10 | #include <linux/timer.h> |
| @@ -14,8 +15,6 @@ | |||
| 14 | struct Scsi_Host; | 15 | struct Scsi_Host; |
| 15 | struct scsi_driver; | 16 | struct scsi_driver; |
| 16 | 17 | ||
| 17 | #include <scsi/scsi_device.h> | ||
| 18 | |||
| 19 | /* | 18 | /* |
| 20 | * MAX_COMMAND_SIZE is: | 19 | * MAX_COMMAND_SIZE is: |
| 21 | * The longest fixed-length SCSI CDB as per the SCSI standard. | 20 | * The longest fixed-length SCSI CDB as per the SCSI standard. |
| @@ -120,11 +119,11 @@ struct scsi_cmnd { | |||
| 120 | struct request *request; /* The command we are | 119 | struct request *request; /* The command we are |
| 121 | working on */ | 120 | working on */ |
| 122 | 121 | ||
| 123 | #define SCSI_SENSE_BUFFERSIZE 96 | ||
| 124 | unsigned char *sense_buffer; | 122 | unsigned char *sense_buffer; |
| 125 | /* obtained by REQUEST SENSE when | 123 | /* obtained by REQUEST SENSE when |
| 126 | * CHECK CONDITION is received on original | 124 | * CHECK CONDITION is received on original |
| 127 | * command (auto-sense) */ | 125 | * command (auto-sense). Length must be |
| 126 | * SCSI_SENSE_BUFFERSIZE bytes. */ | ||
| 128 | 127 | ||
| 129 | /* Low-level done function - can be used by low-level driver to point | 128 | /* Low-level done function - can be used by low-level driver to point |
| 130 | * to completion function. Not used by mid/upper level code. */ | 129 | * to completion function. Not used by mid/upper level code. */ |
| @@ -313,12 +312,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) | |||
| 313 | return scmd->device->sector_size; | 312 | return scmd->device->sector_size; |
| 314 | } | 313 | } |
| 315 | 314 | ||
| 316 | static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) | ||
| 317 | { | ||
| 318 | return blk_rq_pos(scmd->request) >> | ||
| 319 | (ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff; | ||
| 320 | } | ||
| 321 | |||
| 322 | static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) | 315 | static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) |
| 323 | { | 316 | { |
| 324 | return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; | 317 | return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; |
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 4c36af6edd79..202f4d6a4342 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h | |||
| @@ -17,6 +17,8 @@ struct scsi_sense_hdr; | |||
| 17 | 17 | ||
| 18 | typedef __u64 __bitwise blist_flags_t; | 18 | typedef __u64 __bitwise blist_flags_t; |
| 19 | 19 | ||
| 20 | #define SCSI_SENSE_BUFFERSIZE 96 | ||
| 21 | |||
| 20 | struct scsi_mode_data { | 22 | struct scsi_mode_data { |
| 21 | __u32 length; | 23 | __u32 length; |
| 22 | __u16 block_descriptor_length; | 24 | __u16 block_descriptor_length; |
| @@ -426,11 +428,21 @@ extern const char *scsi_device_state_name(enum scsi_device_state); | |||
| 426 | extern int scsi_is_sdev_device(const struct device *); | 428 | extern int scsi_is_sdev_device(const struct device *); |
| 427 | extern int scsi_is_target_device(const struct device *); | 429 | extern int scsi_is_target_device(const struct device *); |
| 428 | extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); | 430 | extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); |
| 429 | extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, | 431 | extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, |
| 430 | int data_direction, void *buffer, unsigned bufflen, | 432 | int data_direction, void *buffer, unsigned bufflen, |
| 431 | unsigned char *sense, struct scsi_sense_hdr *sshdr, | 433 | unsigned char *sense, struct scsi_sense_hdr *sshdr, |
| 432 | int timeout, int retries, u64 flags, | 434 | int timeout, int retries, u64 flags, |
| 433 | req_flags_t rq_flags, int *resid); | 435 | req_flags_t rq_flags, int *resid); |
| 436 | /* Make sure any sense buffer is the correct size. */ | ||
| 437 | #define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense, \ | ||
| 438 | sshdr, timeout, retries, flags, rq_flags, resid) \ | ||
| 439 | ({ \ | ||
| 440 | BUILD_BUG_ON((sense) != NULL && \ | ||
| 441 | sizeof(sense) != SCSI_SENSE_BUFFERSIZE); \ | ||
| 442 | __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, \ | ||
| 443 | sense, sshdr, timeout, retries, flags, rq_flags, \ | ||
| 444 | resid); \ | ||
| 445 | }) | ||
| 434 | static inline int scsi_execute_req(struct scsi_device *sdev, | 446 | static inline int scsi_execute_req(struct scsi_device *sdev, |
| 435 | const unsigned char *cmd, int data_direction, void *buffer, | 447 | const unsigned char *cmd, int data_direction, void *buffer, |
| 436 | unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, | 448 | unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, |
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 821f71a2e48f..8d19e02d752a 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h | |||
| @@ -195,7 +195,7 @@ struct cache_sb { | |||
| 195 | }; | 195 | }; |
| 196 | }; | 196 | }; |
| 197 | 197 | ||
| 198 | __u32 last_mount; /* time_t */ | 198 | __u32 last_mount; /* time overflow in y2106 */ |
| 199 | 199 | ||
| 200 | __u16 first_bucket; | 200 | __u16 first_bucket; |
| 201 | union { | 201 | union { |
| @@ -318,7 +318,7 @@ struct uuid_entry { | |||
| 318 | struct { | 318 | struct { |
| 319 | __u8 uuid[16]; | 319 | __u8 uuid[16]; |
| 320 | __u8 label[32]; | 320 | __u8 label[32]; |
| 321 | __u32 first_reg; | 321 | __u32 first_reg; /* time overflow in y2106 */ |
| 322 | __u32 last_reg; | 322 | __u32 last_reg; |
| 323 | __u32 invalidated; | 323 | __u32 invalidated; |
| 324 | 324 | ||
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index e3c70fe6bf0f..ff5a5db8906a 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h | |||
| @@ -117,7 +117,7 @@ struct blk_zone_report { | |||
| 117 | __u32 nr_zones; | 117 | __u32 nr_zones; |
| 118 | __u8 reserved[4]; | 118 | __u8 reserved[4]; |
| 119 | struct blk_zone zones[0]; | 119 | struct blk_zone zones[0]; |
| 120 | } __packed; | 120 | }; |
| 121 | 121 | ||
| 122 | /** | 122 | /** |
| 123 | * struct blk_zone_range - BLKRESETZONE ioctl request | 123 | * struct blk_zone_range - BLKRESETZONE ioctl request |
diff --git a/kernel/fork.c b/kernel/fork.c index 9d8d0e016fc6..33112315b5c0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -866,6 +866,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 866 | tsk->fail_nth = 0; | 866 | tsk->fail_nth = 0; |
| 867 | #endif | 867 | #endif |
| 868 | 868 | ||
| 869 | #ifdef CONFIG_BLK_CGROUP | ||
| 870 | tsk->throttle_queue = NULL; | ||
| 871 | tsk->use_memdelay = 0; | ||
| 872 | #endif | ||
| 873 | |||
| 869 | return tsk; | 874 | return tsk; |
| 870 | 875 | ||
| 871 | free_stack: | 876 | free_stack: |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 987d9a9ae283..b951aa1fac61 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
| 494 | if (!buts->buf_size || !buts->buf_nr) | 494 | if (!buts->buf_size || !buts->buf_nr) |
| 495 | return -EINVAL; | 495 | return -EINVAL; |
| 496 | 496 | ||
| 497 | if (!blk_debugfs_root) | ||
| 498 | return -ENOENT; | ||
| 499 | |||
| 497 | strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); | 500 | strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); |
| 498 | buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; | 501 | buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; |
| 499 | 502 | ||
| @@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
| 518 | 521 | ||
| 519 | ret = -ENOENT; | 522 | ret = -ENOENT; |
| 520 | 523 | ||
| 521 | if (!blk_debugfs_root) | ||
| 522 | goto err; | ||
| 523 | |||
| 524 | dir = debugfs_lookup(buts->name, blk_debugfs_root); | 524 | dir = debugfs_lookup(buts->name, blk_debugfs_root); |
| 525 | if (!dir) | 525 | if (!dir) |
| 526 | bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); | 526 | bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 25346bd99364..a9e1e093df51 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, | |||
| 552 | 552 | ||
| 553 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 553 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
| 554 | 554 | ||
| 555 | if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { | 555 | if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { |
| 556 | put_page(page); | 556 | put_page(page); |
| 557 | count_vm_event(THP_FAULT_FALLBACK); | 557 | count_vm_event(THP_FAULT_FALLBACK); |
| 558 | return VM_FAULT_FALLBACK; | 558 | return VM_FAULT_FALLBACK; |
| @@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, | |||
| 1142 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, | 1142 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, |
| 1143 | vmf->address, page_to_nid(page)); | 1143 | vmf->address, page_to_nid(page)); |
| 1144 | if (unlikely(!pages[i] || | 1144 | if (unlikely(!pages[i] || |
| 1145 | mem_cgroup_try_charge(pages[i], vma->vm_mm, | 1145 | mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, |
| 1146 | GFP_KERNEL, &memcg, false))) { | 1146 | GFP_KERNEL, &memcg, false))) { |
| 1147 | if (pages[i]) | 1147 | if (pages[i]) |
| 1148 | put_page(pages[i]); | 1148 | put_page(pages[i]); |
| @@ -1312,7 +1312,7 @@ alloc: | |||
| 1312 | goto out; | 1312 | goto out; |
| 1313 | } | 1313 | } |
| 1314 | 1314 | ||
| 1315 | if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, | 1315 | if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, |
| 1316 | huge_gfp, &memcg, true))) { | 1316 | huge_gfp, &memcg, true))) { |
| 1317 | put_page(new_page); | 1317 | put_page(new_page); |
| 1318 | split_huge_pmd(vma, vmf->pmd, vmf->address); | 1318 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2173f7e5164..b836e7f00309 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -5600,6 +5600,19 @@ out: | |||
| 5600 | return ret; | 5600 | return ret; |
| 5601 | } | 5601 | } |
| 5602 | 5602 | ||
| 5603 | int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, | ||
| 5604 | gfp_t gfp_mask, struct mem_cgroup **memcgp, | ||
| 5605 | bool compound) | ||
| 5606 | { | ||
| 5607 | struct mem_cgroup *memcg; | ||
| 5608 | int ret; | ||
| 5609 | |||
| 5610 | ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); | ||
| 5611 | memcg = *memcgp; | ||
| 5612 | mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); | ||
| 5613 | return ret; | ||
| 5614 | } | ||
| 5615 | |||
| 5603 | /** | 5616 | /** |
| 5604 | * mem_cgroup_commit_charge - commit a page charge | 5617 | * mem_cgroup_commit_charge - commit a page charge |
| 5605 | * @page: page to charge | 5618 | * @page: page to charge |
diff --git a/mm/memory.c b/mm/memory.c index 6d175057cfd0..348279ff6e51 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -2524,7 +2524,7 @@ static int wp_page_copy(struct vm_fault *vmf) | |||
| 2524 | cow_user_page(new_page, old_page, vmf->address, vma); | 2524 | cow_user_page(new_page, old_page, vmf->address, vma); |
| 2525 | } | 2525 | } |
| 2526 | 2526 | ||
| 2527 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) | 2527 | if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) |
| 2528 | goto oom_free_new; | 2528 | goto oom_free_new; |
| 2529 | 2529 | ||
| 2530 | __SetPageUptodate(new_page); | 2530 | __SetPageUptodate(new_page); |
| @@ -3024,8 +3024,8 @@ int do_swap_page(struct vm_fault *vmf) | |||
| 3024 | goto out_page; | 3024 | goto out_page; |
| 3025 | } | 3025 | } |
| 3026 | 3026 | ||
| 3027 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, | 3027 | if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, |
| 3028 | &memcg, false)) { | 3028 | &memcg, false)) { |
| 3029 | ret = VM_FAULT_OOM; | 3029 | ret = VM_FAULT_OOM; |
| 3030 | goto out_page; | 3030 | goto out_page; |
| 3031 | } | 3031 | } |
| @@ -3186,7 +3186,8 @@ static int do_anonymous_page(struct vm_fault *vmf) | |||
| 3186 | if (!page) | 3186 | if (!page) |
| 3187 | goto oom; | 3187 | goto oom; |
| 3188 | 3188 | ||
| 3189 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) | 3189 | if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, |
| 3190 | false)) | ||
| 3190 | goto oom_free_page; | 3191 | goto oom_free_page; |
| 3191 | 3192 | ||
| 3192 | /* | 3193 | /* |
| @@ -3682,7 +3683,7 @@ static int do_cow_fault(struct vm_fault *vmf) | |||
| 3682 | if (!vmf->cow_page) | 3683 | if (!vmf->cow_page) |
| 3683 | return VM_FAULT_OOM; | 3684 | return VM_FAULT_OOM; |
| 3684 | 3685 | ||
| 3685 | if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, | 3686 | if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, |
| 3686 | &vmf->memcg, false)) { | 3687 | &vmf->memcg, false)) { |
| 3687 | put_page(vmf->cow_page); | 3688 | put_page(vmf->cow_page); |
| 3688 | return VM_FAULT_OOM; | 3689 | return VM_FAULT_OOM; |
diff --git a/mm/page_io.c b/mm/page_io.c index b41cf9644585..aafd19ec1db4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
| 338 | ret = -ENOMEM; | 338 | ret = -ENOMEM; |
| 339 | goto out; | 339 | goto out; |
| 340 | } | 340 | } |
| 341 | bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); | 341 | bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); |
| 342 | bio_associate_blkcg_from_page(bio, page); | ||
| 342 | count_swpout_vm_event(page); | 343 | count_swpout_vm_event(page); |
| 343 | set_page_writeback(page); | 344 | set_page_writeback(page); |
| 344 | unlock_page(page); | 345 | unlock_page(page); |
diff --git a/mm/readahead.c b/mm/readahead.c index e273f0de3376..a59ea70527b9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/syscalls.h> | 19 | #include <linux/syscalls.h> |
| 20 | #include <linux/file.h> | 20 | #include <linux/file.h> |
| 21 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
| 22 | #include <linux/blk-cgroup.h> | ||
| 22 | 23 | ||
| 23 | #include "internal.h" | 24 | #include "internal.h" |
| 24 | 25 | ||
| @@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, | |||
| 385 | { | 386 | { |
| 386 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); | 387 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
| 387 | unsigned long max_pages = ra->ra_pages; | 388 | unsigned long max_pages = ra->ra_pages; |
| 389 | unsigned long add_pages; | ||
| 388 | pgoff_t prev_offset; | 390 | pgoff_t prev_offset; |
| 389 | 391 | ||
| 390 | /* | 392 | /* |
| @@ -474,10 +476,17 @@ readit: | |||
| 474 | * Will this read hit the readahead marker made by itself? | 476 | * Will this read hit the readahead marker made by itself? |
| 475 | * If so, trigger the readahead marker hit now, and merge | 477 | * If so, trigger the readahead marker hit now, and merge |
| 476 | * the resulted next readahead window into the current one. | 478 | * the resulted next readahead window into the current one. |
| 479 | * Take care of maximum IO pages as above. | ||
| 477 | */ | 480 | */ |
| 478 | if (offset == ra->start && ra->size == ra->async_size) { | 481 | if (offset == ra->start && ra->size == ra->async_size) { |
| 479 | ra->async_size = get_next_ra_size(ra, max_pages); | 482 | add_pages = get_next_ra_size(ra, max_pages); |
| 480 | ra->size += ra->async_size; | 483 | if (ra->size + add_pages <= max_pages) { |
| 484 | ra->async_size = add_pages; | ||
| 485 | ra->size += add_pages; | ||
| 486 | } else { | ||
| 487 | ra->size = max_pages; | ||
| 488 | ra->async_size = max_pages >> 1; | ||
| 489 | } | ||
| 481 | } | 490 | } |
| 482 | 491 | ||
| 483 | return ra_submit(ra, mapping, filp); | 492 | return ra_submit(ra, mapping, filp); |
| @@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping, | |||
| 505 | if (!ra->ra_pages) | 514 | if (!ra->ra_pages) |
| 506 | return; | 515 | return; |
| 507 | 516 | ||
| 517 | if (blk_cgroup_congested()) | ||
| 518 | return; | ||
| 519 | |||
| 508 | /* be dumb */ | 520 | /* be dumb */ |
| 509 | if (filp && (filp->f_mode & FMODE_RANDOM)) { | 521 | if (filp && (filp->f_mode & FMODE_RANDOM)) { |
| 510 | force_page_cache_readahead(mapping, filp, offset, req_size); | 522 | force_page_cache_readahead(mapping, filp, offset, req_size); |
| @@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping, | |||
| 555 | if (inode_read_congested(mapping->host)) | 567 | if (inode_read_congested(mapping->host)) |
| 556 | return; | 568 | return; |
| 557 | 569 | ||
| 570 | if (blk_cgroup_congested()) | ||
| 571 | return; | ||
| 572 | |||
| 558 | /* do read-ahead */ | 573 | /* do read-ahead */ |
| 559 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 574 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
| 560 | } | 575 | } |
diff --git a/mm/shmem.c b/mm/shmem.c index 96bcc51fb9ec..06ebe17bb924 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
| 1239 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 1239 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
| 1240 | * Charged back to the user (not to caller) when swap account is used. | 1240 | * Charged back to the user (not to caller) when swap account is used. |
| 1241 | */ | 1241 | */ |
| 1242 | error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, | 1242 | error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, |
| 1243 | false); | 1243 | &memcg, false); |
| 1244 | if (error) | 1244 | if (error) |
| 1245 | goto out; | 1245 | goto out; |
| 1246 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ | 1246 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
| @@ -1713,7 +1713,7 @@ repeat: | |||
| 1713 | goto failed; | 1713 | goto failed; |
| 1714 | } | 1714 | } |
| 1715 | 1715 | ||
| 1716 | error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, | 1716 | error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, |
| 1717 | false); | 1717 | false); |
| 1718 | if (!error) { | 1718 | if (!error) { |
| 1719 | error = shmem_add_to_page_cache(page, mapping, index, | 1719 | error = shmem_add_to_page_cache(page, mapping, index, |
| @@ -1819,7 +1819,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, | |||
| 1819 | if (sgp == SGP_WRITE) | 1819 | if (sgp == SGP_WRITE) |
| 1820 | __SetPageReferenced(page); | 1820 | __SetPageReferenced(page); |
| 1821 | 1821 | ||
| 1822 | error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, | 1822 | error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, |
| 1823 | PageTransHuge(page)); | 1823 | PageTransHuge(page)); |
| 1824 | if (error) | 1824 | if (error) |
| 1825 | goto unacct; | 1825 | goto unacct; |
| @@ -2292,7 +2292,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, | |||
| 2292 | __SetPageSwapBacked(page); | 2292 | __SetPageSwapBacked(page); |
| 2293 | __SetPageUptodate(page); | 2293 | __SetPageUptodate(page); |
| 2294 | 2294 | ||
| 2295 | ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); | 2295 | ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); |
| 2296 | if (ret) | 2296 | if (ret) |
| 2297 | goto out_release; | 2297 | goto out_release; |
| 2298 | 2298 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 18185ae4f223..8837b22c848d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -3745,6 +3745,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si) | |||
| 3745 | } | 3745 | } |
| 3746 | } | 3746 | } |
| 3747 | 3747 | ||
| 3748 | #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) | ||
| 3749 | void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, | ||
| 3750 | gfp_t gfp_mask) | ||
| 3751 | { | ||
| 3752 | struct swap_info_struct *si, *next; | ||
| 3753 | if (!(gfp_mask & __GFP_IO) || !memcg) | ||
| 3754 | return; | ||
| 3755 | |||
| 3756 | if (!blk_cgroup_congested()) | ||
| 3757 | return; | ||
| 3758 | |||
| 3759 | /* | ||
| 3760 | * We've already scheduled a throttle, avoid taking the global swap | ||
| 3761 | * lock. | ||
| 3762 | */ | ||
| 3763 | if (current->throttle_queue) | ||
| 3764 | return; | ||
| 3765 | |||
| 3766 | spin_lock(&swap_avail_lock); | ||
| 3767 | plist_for_each_entry_safe(si, next, &swap_avail_heads[node], | ||
| 3768 | avail_lists[node]) { | ||
| 3769 | if (si->bdev) { | ||
| 3770 | blkcg_schedule_throttle(bdev_get_queue(si->bdev), | ||
| 3771 | true); | ||
| 3772 | break; | ||
| 3773 | } | ||
| 3774 | } | ||
| 3775 | spin_unlock(&swap_avail_lock); | ||
| 3776 | } | ||
| 3777 | #endif | ||
| 3778 | |||
| 3748 | static int __init swapfile_init(void) | 3779 | static int __init swapfile_init(void) |
| 3749 | { | 3780 | { |
| 3750 | int nid; | 3781 | int nid; |
