aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-06-29 13:34:42 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-06-29 13:34:42 -0400
commit88793e5c774ec69351ef6b5200bb59f532e41bca (patch)
tree54c4be61777ea53fde892b71e795322c5227d16e
parent1bc5e157ed2b4f5b206155fc772d860158acd201 (diff)
parent61031952f4c89dba1065f7a5b9419badb112554c (diff)
Merge tag 'libnvdimm-for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm
Pull libnvdimm subsystem from Dan Williams: "The libnvdimm sub-system introduces, in addition to the libnvdimm-core, 4 drivers / enabling modules: NFIT: Instantiates an "nvdimm bus" with the core and registers memory devices (NVDIMMs) enumerated by the ACPI 6.0 NFIT (NVDIMM Firmware Interface table). After registering NVDIMMs the NFIT driver then registers "region" devices. A libnvdimm-region defines an access mode and the boundaries of persistent memory media. A region may span multiple NVDIMMs that are interleaved by the hardware memory controller. In turn, a libnvdimm-region can be carved into a "namespace" device and bound to the PMEM or BLK driver which will attach a Linux block device (disk) interface to the memory. PMEM: Initially merged in v4.1 this driver for contiguous spans of persistent memory address ranges is re-worked to drive PMEM-namespaces emitted by the libnvdimm-core. In this update the PMEM driver, on x86, gains the ability to assert that writes to persistent memory have been flushed all the way through the caches and buffers in the platform to persistent media. See memcpy_to_pmem() and wmb_pmem(). BLK: This new driver enables access to persistent memory media through "Block Data Windows" as defined by the NFIT. The primary difference of this driver to PMEM is that only a small window of persistent memory is mapped into system address space at any given point in time. Per-NVDIMM windows are reprogrammed at run time, per-I/O, to access different portions of the media. BLK-mode, by definition, does not support DAX. BTT: This is a library, optionally consumed by either PMEM or BLK, that converts a byte-accessible namespace into a disk with atomic sector update semantics (prevents sector tearing on crash or power loss). The sinister aspect of sector tearing is that most applications do not know they have a atomic sector dependency. At least today's disk's rarely ever tear sectors and if they do one almost certainly gets a CRC error on access. 
NVDIMMs will always tear and always silently. Until an application is audited to be robust in the presence of sector-tearing the usage of BTT is recommended. Thanks to: Ross Zwisler, Jeff Moyer, Vishal Verma, Christoph Hellwig, Ingo Molnar, Neil Brown, Boaz Harrosh, Robert Elliott, Matthew Wilcox, Andy Rudoff, Linda Knippers, Toshi Kani, Nicholas Moulin, Rafael Wysocki, and Bob Moore" * tag 'libnvdimm-for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/djbw/nvdimm: (33 commits) arch, x86: pmem api for ensuring durability of persistent memory updates libnvdimm: Add sysfs numa_node to NVDIMM devices libnvdimm: Set numa_node to NVDIMM devices acpi: Add acpi_map_pxm_to_online_node() libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only pmem: flag pmem block devices as non-rotational libnvdimm: enable iostat pmem: make_request cleanups libnvdimm, pmem: fix up max_hw_sectors libnvdimm, blk: add support for blk integrity libnvdimm, btt: add support for blk integrity fs/block_dev.c: skip rw_page if bdev has integrity libnvdimm: Non-Volatile Devices tools/testing/nvdimm: libnvdimm unit test infrastructure libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory nd_btt: atomic sector updates libnvdimm: infrastructure for btt devices libnvdimm: write blk label set libnvdimm: write pmem label set libnvdimm: blk labels and namespace instantiation ...
-rw-r--r--Documentation/nvdimm/btt.txt283
-rw-r--r--Documentation/nvdimm/nvdimm.txt808
-rw-r--r--MAINTAINERS39
-rw-r--r--arch/arm64/kernel/efi.c1
-rw-r--r--arch/ia64/kernel/efi.c4
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/boot/compressed/eboot.c4
-rw-r--r--arch/x86/include/asm/cacheflush.h72
-rw-r--r--arch/x86/include/asm/io.h6
-rw-r--r--arch/x86/include/uapi/asm/e820.h1
-rw-r--r--arch/x86/kernel/e820.c28
-rw-r--r--arch/x86/kernel/pmem.c93
-rw-r--r--arch/x86/platform/efi/efi.c3
-rw-r--r--drivers/Kconfig2
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/acpi/Kconfig26
-rw-r--r--drivers/acpi/Makefile1
-rw-r--r--drivers/acpi/nfit.c1587
-rw-r--r--drivers/acpi/nfit.h158
-rw-r--r--drivers/acpi/numa.c50
-rw-r--r--drivers/block/Kconfig12
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/nvdimm/Kconfig68
-rw-r--r--drivers/nvdimm/Makefile20
-rw-r--r--drivers/nvdimm/blk.c384
-rw-r--r--drivers/nvdimm/btt.c1479
-rw-r--r--drivers/nvdimm/btt.h185
-rw-r--r--drivers/nvdimm/btt_devs.c425
-rw-r--r--drivers/nvdimm/bus.c730
-rw-r--r--drivers/nvdimm/core.c465
-rw-r--r--drivers/nvdimm/dimm.c102
-rw-r--r--drivers/nvdimm/dimm_devs.c551
-rw-r--r--drivers/nvdimm/label.c927
-rw-r--r--drivers/nvdimm/label.h141
-rw-r--r--drivers/nvdimm/namespace_devs.c1870
-rw-r--r--drivers/nvdimm/nd-core.h83
-rw-r--r--drivers/nvdimm/nd.h220
-rw-r--r--drivers/nvdimm/pmem.c (renamed from drivers/block/pmem.c)227
-rw-r--r--drivers/nvdimm/region.c114
-rw-r--r--drivers/nvdimm/region_devs.c787
-rw-r--r--fs/block_dev.c4
-rw-r--r--include/linux/acpi.h5
-rw-r--r--include/linux/compiler.h2
-rw-r--r--include/linux/efi.h3
-rw-r--r--include/linux/libnvdimm.h151
-rw-r--r--include/linux/nd.h151
-rw-r--r--include/linux/pmem.h152
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/ndctl.h197
-rw-r--r--lib/Kconfig3
-rw-r--r--tools/testing/nvdimm/Kbuild40
-rw-r--r--tools/testing/nvdimm/Makefile7
-rw-r--r--tools/testing/nvdimm/config_check.c15
-rw-r--r--tools/testing/nvdimm/test/Kbuild8
-rw-r--r--tools/testing/nvdimm/test/iomap.c151
-rw-r--r--tools/testing/nvdimm/test/nfit.c1116
-rw-r--r--tools/testing/nvdimm/test/nfit_test.h29
57 files changed, 13842 insertions, 155 deletions
diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt
new file mode 100644
index 000000000000..b91443f577dc
--- /dev/null
+++ b/Documentation/nvdimm/btt.txt
@@ -0,0 +1,283 @@
1BTT - Block Translation Table
2=============================
3
4
51. Introduction
6---------------
7
8Persistent memory based storage is able to perform IO at byte (or more
9accurately, cache line) granularity. However, we often want to expose such
10storage as traditional block devices. The block drivers for persistent memory
11will do exactly this. However, they do not provide any atomicity guarantees.
12Traditional SSDs typically provide protection against torn sectors in hardware,
13using stored energy in capacitors to complete in-flight block writes, or perhaps
14in firmware. We don't have this luxury with persistent memory - if a write is in
15progress, and we experience a power failure, the block will contain a mix of old
16and new data. Applications may not be prepared to handle such a scenario.
17
18The Block Translation Table (BTT) provides atomic sector update semantics for
19persistent memory devices, so that applications that rely on sector writes not
20being torn can continue to do so. The BTT manifests itself as a stacked block
21device, and reserves a portion of the underlying storage for its metadata. At
22the heart of it, is an indirection table that re-maps all the blocks on the
23volume. It can be thought of as an extremely simple file system that only
24provides atomic sector updates.
25
26
272. Static Layout
28----------------
29
30The underlying storage on which a BTT can be laid out is not limited in any way.
31The BTT, however, splits the available space into chunks of up to 512 GiB,
32called "Arenas".
33
34Each arena follows the same layout for its metadata, and all references in an
35arena are internal to it (with the exception of one field that points to the
36next arena). The following depicts the "On-disk" metadata layout:
37
38
39 Backing Store +-------> Arena
40+---------------+ | +------------------+
41| | | | Arena info block |
42| Arena 0 +---+ | 4K |
43| 512G | +------------------+
44| | | |
45+---------------+ | |
46| | | |
47| Arena 1 | | Data Blocks |
48| 512G | | |
49| | | |
50+---------------+ | |
51| . | | |
52| . | | |
53| . | | |
54| | | |
55| | | |
56+---------------+ +------------------+
57 | |
58 | BTT Map |
59 | |
60 | |
61 +------------------+
62 | |
63 | BTT Flog |
64 | |
65 +------------------+
66 | Info block copy |
67 | 4K |
68 +------------------+
69
70
713. Theory of Operation
72----------------------
73
74
75a. The BTT Map
76--------------
77
78The map is a simple lookup/indirection table that maps an LBA to an internal
79block. Each map entry is 32 bits. The two most significant bits are special
80flags, and the remaining form the internal block number.
81
82Bit Description
8331 - 30 : Error and Zero flags - Used in the following way:
84 Bit Description
85 31 30
86 -----------------------------------------------------------------------
87 00 Initial state. Reads return zeroes; Premap = Postmap
88 01 Zero state: Reads return zeroes
89 10 Error state: Reads fail; Writes clear 'E' bit
90 11 Normal Block – has valid postmap
91
92
9329 - 0 : Mappings to internal 'postmap' blocks
94
95
96Some of the terminology that will be subsequently used:
97
98External LBA : LBA as made visible to upper layers.
99ABA : Arena Block Address - Block offset/number within an arena
100Premap ABA : The block offset into an arena, which was decided upon by range
101 checking the External LBA
102Postmap ABA : The block number in the "Data Blocks" area obtained after
103 indirection from the map
104nfree : The number of free blocks that are maintained at any given time.
105 This is the number of concurrent writes that can happen to the
106 arena.
107
108
109For example, after adding a BTT, we surface a disk of 1024G. We get a read for
110the external LBA at 768G. This falls into the second arena, and of the 512G
111worth of blocks that this arena contributes, this block is at 256G. Thus, the
112premap ABA is 256G. We now refer to the map, and find out the mapping for block
113'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64.
114
115
116b. The BTT Flog
117---------------
118
119The BTT provides sector atomicity by making every write an "allocating write",
120i.e. Every write goes to a "free" block. A running list of free blocks is
121maintained in the form of the BTT flog. 'Flog' is a combination of the words
122"free list" and "log". The flog contains 'nfree' entries, and an entry contains:
123
124lba : The premap ABA that is being written to
125old_map : The old postmap ABA - after 'this' write completes, this will be a
126 free block.
127new_map : The new postmap ABA. The map will be updated to reflect this
128 lba->postmap_aba mapping, but we log it here in case we have to
129 recover.
130seq : Sequence number to mark which of the 2 sections of this flog entry is
131 valid/newest. It cycles between 01->10->11->01 (binary) under normal
132 operation, with 00 indicating an uninitialized state.
133lba' : alternate lba entry
134old_map': alternate old postmap entry
135new_map': alternate new postmap entry
136seq' : alternate sequence number.
137
138Each of the above fields is 32-bit, making one entry 32 bytes. Entries are also
139padded to 64 bytes to avoid cache line sharing or aliasing. Flog updates are
140done such that for any entry being written, it:
141a. overwrites the 'old' section in the entry based on sequence numbers
142b. writes the 'new' section such that the sequence number is written last.
143
144
145c. The concept of lanes
146-----------------------
147
148While 'nfree' describes the number of concurrent IOs an arena can process
149concurrently, 'nlanes' is the number of IOs the BTT device as a whole can
150process.
151 nlanes = min(nfree, num_cpus)
152A lane number is obtained at the start of any IO, and is used for indexing into
153all the on-disk and in-memory data structures for the duration of the IO. If
154there are more CPUs than the max number of available lanes, then lanes are
155protected by spinlocks.
156
157
158d. In-memory data structure: Read Tracking Table (RTT)
159------------------------------------------------------
160
161Consider a case where we have two threads, one doing reads and the other,
162writes. We can hit a condition where the writer thread grabs a free block to do
163a new IO, but the (slow) reader thread is still reading from it. In other words,
164the reader consulted a map entry, and started reading the corresponding block. A
165writer started writing to the same external LBA, and finished the write updating
166the map for that external LBA to point to its new postmap ABA. At this point the
167internal, postmap block that the reader is (still) reading has been inserted
168into the list of free blocks. If another write comes in for the same LBA, it can
169grab this free block, and start writing to it, causing the reader to read
170incorrect data. To prevent this, we introduce the RTT.
171
172The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts
173into rtt[lane_number], the postmap ABA it is reading, and clears it after the
174read is complete. Every writer thread, after grabbing a free block, checks the
175RTT for its presence. If the postmap free block is in the RTT, it waits till the
176reader clears the RTT entry, and only then starts writing to it.
177
178
179e. In-memory data structure: map locks
180--------------------------------------
181
182Consider a case where two writer threads are writing to the same LBA. There can
183be a race in the following sequence of steps:
184
185free[lane] = map[premap_aba]
186map[premap_aba] = postmap_aba
187
188Both threads can update their respective free[lane] with the same old, freed
189postmap_aba. This has made the layout inconsistent by losing a free entry, and
190at the same time, duplicating another free entry for two lanes.
191
192To solve this, we could have a single map lock (per arena) that has to be taken
193before performing the above sequence, but we feel that could be too contentious.
194Instead we use an array of (nfree) map_locks that is indexed by
195(premap_aba modulo nfree).
196
197
198f. Reconstruction from the Flog
199-------------------------------
200
201On startup, we analyze the BTT flog to create our list of free blocks. We walk
202through all the entries, and for each lane, of the set of two possible
203'sections', we always look at the most recent one only (based on the sequence
204number). The reconstruction rules/steps are simple:
205- Read map[log_entry.lba].
206- If log_entry.new matches the map entry, then log_entry.old is free.
207- If log_entry.new does not match the map entry, then log_entry.new is free.
208 (This case can only be caused by power-fails/unsafe shutdowns)
209
210
211g. Summarizing - Read and Write flows
212-------------------------------------
213
214Read:
215
2161. Convert external LBA to arena number + pre-map ABA
2172. Get a lane (and take lane_lock)
2183. Read map to get the entry for this pre-map ABA
2194. Enter post-map ABA into RTT[lane]
2205. If TRIM flag set in map, return zeroes, and end IO (go to step 8)
2216. If ERROR flag set in map, end IO with EIO (go to step 8)
2227. Read data from this block
2238. Remove post-map ABA entry from RTT[lane]
2249. Release lane (and lane_lock)
225
226Write:
227
2281. Convert external LBA to Arena number + pre-map ABA
2292. Get a lane (and take lane_lock)
2303. Use lane to index into in-memory free list and obtain a new block, next flog
231 index, next sequence number
2324. Scan the RTT to check if free block is present, and spin/wait if it is.
2335. Write data to this free block
2346. Read map to get the existing post-map ABA entry for this pre-map ABA
2357. Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num]
2368. Write new post-map ABA into map.
2379. Write old post-map entry into the free list
23810. Calculate next sequence number and write into the free list entry
23911. Release lane (and lane_lock)
240
241
2424. Error Handling
243=================
244
245An arena would be in an error state if any of the metadata is corrupted
246irrecoverably, either due to a bug or a media error. The following conditions
247indicate an error:
248- Info block checksum does not match (and recovering from the copy also fails)
249- All internal available blocks are not uniquely and entirely addressed by the
250 sum of mapped blocks and free blocks (from the BTT flog).
251- Rebuilding free list from the flog reveals missing/duplicate/impossible
252 entries
253- A map entry is out of bounds
254
255If any of these error conditions are encountered, the arena is put into a read
256only state using a flag in the info block.
257
258
2595. In-kernel usage
260==================
261
262Any block driver that supports byte granularity IO to the storage may register
263with the BTT. It will have to provide the rw_bytes interface in its
264block_device_operations struct:
265
266 int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw);
267
268It may register with the BTT after it adds its own gendisk, using btt_init:
269
270 struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize,
271 u32 lbasize, u8 uuid[], int maxlane);
272
273Note that maxlane is the maximum amount of concurrency the driver wishes to
274allow the BTT to use.
275
276The BTT 'disk' appears as a stacked block device that grabs the underlying block
277device in the O_EXCL mode.
278
279When the driver wishes to remove the backing disk, it should similarly call
280btt_fini using the same struct btt* handle that was provided to it by btt_init.
281
282 void btt_fini(struct btt *btt);
283
diff --git a/Documentation/nvdimm/nvdimm.txt b/Documentation/nvdimm/nvdimm.txt
new file mode 100644
index 000000000000..197a0b6b0582
--- /dev/null
+++ b/Documentation/nvdimm/nvdimm.txt
@@ -0,0 +1,808 @@
1 LIBNVDIMM: Non-Volatile Devices
2 libnvdimm - kernel / libndctl - userspace helper library
3 linux-nvdimm@lists.01.org
4 v13
5
6
7 Glossary
8 Overview
9 Supporting Documents
10 Git Trees
11 LIBNVDIMM PMEM and BLK
12 Why BLK?
13 PMEM vs BLK
14 BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
15 Example NVDIMM Platform
16 LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
17 LIBNDCTL: Context
18 libndctl: instantiate a new library context example
19 LIBNVDIMM/LIBNDCTL: Bus
20 libnvdimm: control class device in /sys/class
21 libnvdimm: bus
22 libndctl: bus enumeration example
23 LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
24 libnvdimm: DIMM (NMEM)
25 libndctl: DIMM enumeration example
26 LIBNVDIMM/LIBNDCTL: Region
27 libnvdimm: region
28 libndctl: region enumeration example
29 Why Not Encode the Region Type into the Region Name?
30 How Do I Determine the Major Type of a Region?
31 LIBNVDIMM/LIBNDCTL: Namespace
32 libnvdimm: namespace
33 libndctl: namespace enumeration example
34 libndctl: namespace creation example
35 Why the Term "namespace"?
36 LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
37 libnvdimm: btt layout
38 libndctl: btt creation example
39 Summary LIBNDCTL Diagram
40
41
42Glossary
43--------
44
45PMEM: A system-physical-address range where writes are persistent. A
46block device composed of PMEM is capable of DAX. A PMEM address range
47may span an interleave of several DIMMs.
48
49BLK: A set of one or more programmable memory mapped apertures provided
50by a DIMM to access its media. This indirection precludes the
51performance benefit of interleaving, but enables DIMM-bounded failure
52modes.
53
54DPA: DIMM Physical Address, is a DIMM-relative offset. With one DIMM in
55the system there would be a 1:1 system-physical-address:DPA association.
56Once more DIMMs are added a memory controller interleave must be
57decoded to determine the DPA associated with a given
58system-physical-address. BLK capacity always has a 1:1 relationship
59with a single-DIMM's DPA range.
60
61DAX: File system extensions to bypass the page cache and block layer to
62mmap persistent memory, from a PMEM block device, directly into a
63process address space.
64
65BTT: Block Translation Table: Persistent memory is byte addressable.
66Existing software may have an expectation that the power-fail-atomicity
67of writes is at least one sector, 512 bytes. The BTT is an indirection
68table with atomic update semantics to front a PMEM/BLK block device
69driver and present arbitrary atomic sector sizes.
70
71LABEL: Metadata stored on a DIMM device that partitions and identifies
72(persistently names) storage between PMEM and BLK. It also partitions
73BLK storage to host BTTs with different parameters per BLK-partition.
74Note that traditional partition tables, GPT/MBR, are layered on top of a
75BLK or PMEM device.
76
77
78Overview
79--------
80
81The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely,
82PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM
83and BLK mode access. These three modes of operation are described by
84the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6. While the LIBNVDIMM
85implementation is generic and supports pre-NFIT platforms, it was guided
86by the superset of capabilities needed to support this ACPI 6 definition
87for NVDIMM resources. The bulk of the kernel implementation is in place
88to handle the case where DPA accessible via PMEM is aliased with DPA
89accessible via BLK. When that occurs a LABEL is needed to reserve DPA
90for exclusive access via one mode at a time.
91
92Supporting Documents
93ACPI 6: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
94NVDIMM Namespace: http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
95DSM Interface Example: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
96Driver Writer's Guide: http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf
97
98Git Trees
99LIBNVDIMM: https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git
100LIBNDCTL: https://github.com/pmem/ndctl.git
101PMEM: https://github.com/01org/prd
102
103
104LIBNVDIMM PMEM and BLK
105------------------
106
107Prior to the arrival of the NFIT, non-volatile memory was described to a
108system in various ad-hoc ways. Usually only the bare minimum was
109provided, namely, a single system-physical-address range where writes
110are expected to be durable after a system power loss. Now, the NFIT
111specification standardizes not only the description of PMEM, but also
112BLK and platform message-passing entry points for control and
113configuration.
114
115For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block
116device driver:
117
118 1. PMEM (nd_pmem.ko): Drives a system-physical-address range. This
119 range is contiguous in system memory and may be interleaved (hardware
120 memory controller striped) across multiple DIMMs. When interleaved the
121 platform may optionally provide details of which DIMMs are participating
122 in the interleave.
123
124 Note that while LIBNVDIMM describes system-physical-address ranges that may
125 alias with BLK access as ND_NAMESPACE_PMEM ranges and those without
126 alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no
127 distinction. The different device-types are an implementation detail
128 that userspace can exploit to implement policies like "only interface
129 with address ranges from certain DIMMs". It is worth noting that when
130 aliasing is present and a DIMM lacks a label, then no block device can
131 be created by default as userspace needs to do at least one allocation
132 of DPA to the PMEM range. In contrast ND_NAMESPACE_IO ranges, once
133 registered, can be immediately attached to nd_pmem.
134
135 2. BLK (nd_blk.ko): This driver performs I/O using a set of platform
136 defined apertures. A set of apertures will all access just one DIMM.
137 Multiple windows allow multiple concurrent accesses, much like
138 tagged-command-queuing, and would likely be used by different threads or
139 different CPUs.
140
141 The NFIT specification defines a standard format for a BLK-aperture, but
142 the spec also allows for vendor specific layouts, and non-NFIT BLK
143implementations may use other designs for BLK I/O. For this reason "nd_blk"
144 calls back into platform-specific code to perform the I/O. One such
145 implementation is defined in the "Driver Writer's Guide" and "DSM
146 Interface Example".
147
148
149Why BLK?
150--------
151
152While PMEM provides direct byte-addressable CPU-load/store access to
153NVDIMM storage, it does not provide the best system RAS (recovery,
154availability, and serviceability) model. An access to a corrupted
155system-physical-address causes a cpu exception while an access
156to a corrupted address through a BLK-aperture causes that block window
157to raise an error status in a register. The latter is more aligned with
158the standard error model that host-bus-adapter attached disks present.
159Also, if an administrator ever wants to replace a memory it is easier to
160service a system at DIMM module boundaries. Compare this to PMEM where
161data could be interleaved in an opaque hardware specific manner across
162several DIMMs.
163
164PMEM vs BLK
165BLK-apertures solve this RAS problem, but their presence is also the
166major contributing factor to the complexity of the ND subsystem. They
167complicate the implementation because PMEM and BLK alias in DPA space.
168Any given DIMM's DPA-range may contribute to one or more
169system-physical-address sets of interleaved DIMMs, *and* may also be
170accessed in its entirety through its BLK-aperture. Accessing a DPA
171through a system-physical-address while simultaneously accessing the
172same DPA through a BLK-aperture has undefined results. For this reason,
173DIMMs with this dual interface configuration include a DSM function to
174store/retrieve a LABEL. The LABEL effectively partitions the DPA-space
175into exclusive system-physical-address and BLK-aperture accessible
176regions. For simplicity a DIMM is allowed a PMEM "region" per each
177interleave set in which it is a member. The remaining DPA space can be
178carved into an arbitrary number of BLK devices with discontiguous
179extents.
180
181BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
182--------------------------------------------------
183
184One of the few
185reasons to allow multiple BLK namespaces per REGION is so that each
186BLK-namespace can be configured with a BTT with unique atomic sector
187sizes. While a PMEM device can host a BTT the LABEL specification does
188not provide for a sector size to be specified for a PMEM namespace.
189This is due to the expectation that the primary usage model for PMEM is
190via DAX, and the BTT is incompatible with DAX. However, for the cases
191where an application or filesystem still needs atomic sector update
192guarantees it can register a BTT on a PMEM device or partition. See
193LIBNVDIMM/NDCTL: Block Translation Table "btt"
194
195
196Example NVDIMM Platform
197-----------------------
198
199For the remainder of this document the following diagram will be
200referenced for any example sysfs layouts.
201
202
203 (a) (b) DIMM BLK-REGION
204 +-------------------+--------+--------+--------+
205+------+ | pm0.0 | blk2.0 | pm1.0 | blk2.1 | 0 region2
206| imc0 +--+- - - region0- - - +--------+ +--------+
207+--+---+ | pm0.0 | blk3.0 | pm1.0 | blk3.1 | 1 region3
208 | +-------------------+--------v v--------+
209+--+---+ | |
210| cpu0 | region1
211+--+---+ | |
212 | +----------------------------^ ^--------+
213+--+---+ | blk4.0 | pm1.0 | blk4.0 | 2 region4
214| imc1 +--+----------------------------| +--------+
215+------+ | blk5.0 | pm1.0 | blk5.0 | 3 region5
216 +----------------------------+--------+--------+
217
218In this platform we have four DIMMs and two memory controllers in one
219socket. Each unique interface (BLK or PMEM) to DPA space is identified
220by a region device with a dynamically assigned id (REGION0 - REGION5).
221
222 1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A
223 single PMEM namespace is created in the REGION0-SPA-range that spans
224 DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that
225 interleaved system-physical-address range is reclaimed as BLK-aperture
226 accessed space starting at DPA-offset (a) into each DIMM. In that
227 reclaimed space we create two BLK-aperture "namespaces" from REGION2 and
228 REGION3 where "blk2.0" and "blk3.0" are just human readable names that
229 could be set to any user-desired name in the LABEL.
230
231 2. In the last portion of DIMM0 and DIMM1 we have an interleaved
232 system-physical-address range, REGION1, that spans those two DIMMs as
233 well as DIMM2 and DIMM3. Some of REGION1 is allocated to a PMEM namespace
234 named "pm1.0" the rest is reclaimed in 4 BLK-aperture namespaces (for
235 each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
236 "blk5.0".
237
238 3. The portion of DIMM2 and DIMM3 that does not participate in the REGION1
239 interleaved system-physical-address range (i.e. the DPA address below
240 offset (b)) is also included in the "blk4.0" and "blk5.0" namespaces.
241 Note, that this example shows that BLK-aperture namespaces don't need to
242 be contiguous in DPA-space.
243
244 This bus is provided by the kernel under the device
245 /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and
246 the nfit_test.ko module is loaded. This not only tests LIBNVDIMM but the
247 acpi_nfit.ko driver as well.
248
249
250LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
251----------------------------------------------------
252
253What follows is a description of the LIBNVDIMM sysfs layout and a
254corresponding object hierarchy diagram as viewed through the LIBNDCTL
255api. The example sysfs paths and diagrams are relative to the Example
256NVDIMM Platform which is also the LIBNVDIMM bus used in the LIBNDCTL unit
257test.
258
259LIBNDCTL: Context
260Every api call in the LIBNDCTL library requires a context that holds the
261logging parameters and other library instance state. The library is
262based on the libabc template:
263https://git.kernel.org/cgit/linux/kernel/git/kay/libabc.git/
264
265LIBNDCTL: instantiate a new library context example
266
267 struct ndctl_ctx *ctx;
268
269 if (ndctl_new(&ctx) == 0)
270 return ctx;
271 else
272 return NULL;
273
274LIBNVDIMM/LIBNDCTL: Bus
275-------------------
276
277A bus has a 1:1 relationship with an NFIT. The current expectation for
278ACPI based systems is that there is only ever one platform-global NFIT.
279That said, it is trivial to register multiple NFITs, the specification
280does not preclude it. The infrastructure supports multiple busses and
281we use this capability to test multiple NFIT configurations in the
282unit test.
283
284LIBNVDIMM: control class device in /sys/class
285
286This character device accepts DSM messages to be passed to DIMM
287identified by its NFIT handle.
288
289 /sys/class/nd/ndctl0
290 |-- dev
291 |-- device -> ../../../ndbus0
292 |-- subsystem -> ../../../../../../../class/nd
293
294
295
296LIBNVDIMM: bus
297
298 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
299 struct nvdimm_bus_descriptor *nfit_desc);
300
301 /sys/devices/platform/nfit_test.0/ndbus0
302 |-- commands
303 |-- nd
304 |-- nfit
305 |-- nmem0
306 |-- nmem1
307 |-- nmem2
308 |-- nmem3
309 |-- power
310 |-- provider
311 |-- region0
312 |-- region1
313 |-- region2
314 |-- region3
315 |-- region4
316 |-- region5
317 |-- uevent
318 `-- wait_probe
319
320LIBNDCTL: bus enumeration example
321Find the bus handle that describes the bus from Example NVDIMM Platform
322
323 static struct ndctl_bus *get_bus_by_provider(struct ndctl_ctx *ctx,
324 const char *provider)
325 {
326 struct ndctl_bus *bus;
327
328 ndctl_bus_foreach(ctx, bus)
329 if (strcmp(provider, ndctl_bus_get_provider(bus)) == 0)
330 return bus;
331
332 return NULL;
333 }
334
335 bus = get_bus_by_provider(ctx, "nfit_test.0");
336
337
338LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
339---------------------------
340
341The DIMM device provides a character device for sending commands to
342hardware, and it is a container for LABELs. If the DIMM is defined by
343NFIT then an optional 'nfit' attribute sub-directory is available to add
344NFIT-specifics.
345
346Note that the kernel device name for "DIMMs" is "nmemX". The NFIT
347describes these devices via "Memory Device to System Physical Address
348Range Mapping Structure", and there is no requirement that they actually
349be physical DIMMs, so we use a more generic name.
350
351LIBNVDIMM: DIMM (NMEM)
352
353 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
354 const struct attribute_group **groups, unsigned long flags,
355 unsigned long *dsm_mask);
356
357 /sys/devices/platform/nfit_test.0/ndbus0
358 |-- nmem0
359 | |-- available_slots
360 | |-- commands
361 | |-- dev
362 | |-- devtype
363 | |-- driver -> ../../../../../bus/nd/drivers/nvdimm
364 | |-- modalias
365 | |-- nfit
366 | | |-- device
367 | | |-- format
368 | | |-- handle
369 | | |-- phys_id
370 | | |-- rev_id
371 | | |-- serial
372 | | `-- vendor
373 | |-- state
374 | |-- subsystem -> ../../../../../bus/nd
375 | `-- uevent
376 |-- nmem1
377 [..]
378
379
380LIBNDCTL: DIMM enumeration example
381
382Note, in this example we are assuming NFIT-defined DIMMs which are
383identified by an "nfit_handle", a 32-bit value where:
384Bit 3:0 DIMM number within the memory channel
385Bit 7:4 memory channel number
386Bit 11:8 memory controller ID
387Bit 15:12 socket ID (within scope of a Node controller if node controller is present)
388Bit 27:16 Node Controller ID
389Bit 31:28 Reserved
390
391 static struct ndctl_dimm *get_dimm_by_handle(struct ndctl_bus *bus,
392 unsigned int handle)
393 {
394 struct ndctl_dimm *dimm;
395
396 ndctl_dimm_foreach(bus, dimm)
397 if (ndctl_dimm_get_handle(dimm) == handle)
398 return dimm;
399
400 return NULL;
401 }
402
403 #define DIMM_HANDLE(n, s, i, c, d) \
404 (((n & 0xfff) << 16) | ((s & 0xf) << 12) | ((i & 0xf) << 8) \
405 | ((c & 0xf) << 4) | (d & 0xf))
406
407 dimm = get_dimm_by_handle(bus, DIMM_HANDLE(0, 0, 0, 0, 0));
408
409LIBNVDIMM/LIBNDCTL: Region
410----------------------
411
412A generic REGION device is registered for each PMEM range or BLK-aperture
413set. Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture
414sets on the "nfit_test.0" bus. The primary role of regions are to be a
415container of "mappings". A mapping is a tuple of <DIMM,
416DPA-start-offset, length>.
417
418LIBNVDIMM provides a built-in driver for these REGION devices. This driver
419is responsible for reconciling the aliased DPA mappings across all
420regions, parsing the LABEL, if present, and then emitting NAMESPACE
421devices with the resolved/exclusive DPA-boundaries for the nd_pmem or
422nd_blk device driver to consume.
423
424In addition to the generic attributes of "mapping"s, "interleave_ways"
425and "size" the REGION device also exports some convenience attributes.
426"nstype" indicates the integer type of namespace-device this region
427emits, "devtype" duplicates the DEVTYPE variable stored by udev at the
428'add' event, "modalias" duplicates the MODALIAS variable stored by udev
429at the 'add' event, and finally, the optional "spa_index" is provided in
430the case where the region is defined by a SPA.
431
432LIBNVDIMM: region
433
434 struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
435 struct nd_region_desc *ndr_desc);
436 struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
437 struct nd_region_desc *ndr_desc);
438
439 /sys/devices/platform/nfit_test.0/ndbus0
440 |-- region0
441 | |-- available_size
442 | |-- btt0
443 | |-- btt_seed
444 | |-- devtype
445 | |-- driver -> ../../../../../bus/nd/drivers/nd_region
446 | |-- init_namespaces
447 | |-- mapping0
448 | |-- mapping1
449 | |-- mappings
450 | |-- modalias
451 | |-- namespace0.0
452 | |-- namespace_seed
453 | |-- numa_node
454 | |-- nfit
455 | | `-- spa_index
456 | |-- nstype
457 | |-- set_cookie
458 | |-- size
459 | |-- subsystem -> ../../../../../bus/nd
460 | `-- uevent
461 |-- region1
462 [..]
463
464LIBNDCTL: region enumeration example
465
466Sample region retrieval routines based on NFIT-unique data like
467"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for
468BLK.
469
470 static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus,
471 unsigned int spa_index)
472 {
473 struct ndctl_region *region;
474
475 ndctl_region_foreach(bus, region) {
476 if (ndctl_region_get_type(region) != ND_DEVICE_REGION_PMEM)
477 continue;
478 if (ndctl_region_get_spa_index(region) == spa_index)
479 return region;
480 }
481 return NULL;
482 }
483
484 static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus,
485 unsigned int handle)
486 {
487 struct ndctl_region *region;
488
489 ndctl_region_foreach(bus, region) {
490 struct ndctl_mapping *map;
491
492 if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK)
493 continue;
494 ndctl_mapping_foreach(region, map) {
495 struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map);
496
497 if (ndctl_dimm_get_handle(dimm) == handle)
498 return region;
499 }
500 }
501 return NULL;
502 }
503
504
505Why Not Encode the Region Type into the Region Name?
506----------------------------------------------------
507
508At first glance it seems since NFIT defines just PMEM and BLK interface
509types that we should simply name REGION devices with something derived
510from those type names. However, the ND subsystem explicitly keeps the
511REGION name generic and expects userspace to always consider the
512region-attributes for 4 reasons:
513
514 1. There are already more than two REGION and "namespace" types. For
515 PMEM there are two subtypes. As mentioned previously we have PMEM where
516 the constituent DIMM devices are known and anonymous PMEM. For BLK
517 regions the NFIT specification already anticipates vendor specific
518 implementations. The exact distinction of what a region contains is in
519 the region-attributes not the region-name or the region-devtype.
520
521 2. A region with zero child-namespaces is a possible configuration. For
522 example, the NFIT allows for a DCR to be published without a
523 corresponding BLK-aperture. This equates to a DIMM that can only accept
524 control/configuration messages, but no i/o through a descendant block
525 device. Again, this "type" is advertised in the attributes ('mappings'
526 == 0) and the name does not tell you much.
527
528 3. What if a third major interface type arises in the future? Outside
529 of vendor specific implementations, it's not difficult to envision a
530 third class of interface type beyond BLK and PMEM. With a generic name
531 for the REGION level of the device-hierarchy old userspace
532 implementations can still make sense of new kernel advertised
533 region-types. Userspace can always rely on the generic region
534 attributes like "mappings", "size", etc and the expected child devices
535 named "namespace". This generic format of the device-model hierarchy
536 allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and
537 future-proof.
538
539 4. There are more robust mechanisms for determining the major type of a
540 region than a device name. See the next section, How Do I Determine the
541 Major Type of a Region?
542
543How Do I Determine the Major Type of a Region?
544----------------------------------------------
545
546Outside of the blanket recommendation of "use libndctl", or simply
547looking at the kernel header (/usr/include/linux/ndctl.h) to decode the
548"nstype" integer attribute, here are some other options.
549
550 1. module alias lookup:
551
552 The whole point of region/namespace device type differentiation is to
553 decide which block-device driver will attach to a given LIBNVDIMM namespace.
554 One can simply use the modalias to lookup the resulting module. It's
555 important to note that this method is robust in the presence of a
556 vendor-specific driver down the road. If a vendor-specific
557 implementation wants to supplant the standard nd_blk driver it can with
558 minimal impact to the rest of LIBNVDIMM.
559
560 In fact, a vendor may also want to have a vendor-specific region-driver
561 (outside of nd_region). For example, if a vendor defined its own LABEL
562 format it would need its own region driver to parse that LABEL and emit
563 the resulting namespaces. The output from module resolution is more
564 accurate than a region-name or region-devtype.
565
566 2. udev:
567
568 The kernel "devtype" is registered in the udev database
569 # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0
570 P: /devices/platform/nfit_test.0/ndbus0/region0
571 E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0
572 E: DEVTYPE=nd_pmem
573 E: MODALIAS=nd:t2
574 E: SUBSYSTEM=nd
575
576 # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4
577 P: /devices/platform/nfit_test.0/ndbus0/region4
578 E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4
579 E: DEVTYPE=nd_blk
580 E: MODALIAS=nd:t3
581 E: SUBSYSTEM=nd
582
583 ...and is available as a region attribute, but keep in mind that the
584 "devtype" does not indicate sub-type variations and scripts should
585 really be understanding the other attributes.
586
587 3. type specific attributes:
588
589 As it currently stands a BLK-aperture region will never have a
590 "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region. A
591 BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM
592 that does not allow I/O. A PMEM region with a "mappings" value of zero
593 is a simple system-physical-address range.
594
595
596LIBNVDIMM/LIBNDCTL: Namespace
597-------------------------
598
599A REGION, after resolving DPA aliasing and LABEL specified boundaries,
600surfaces one or more "namespace" devices. The arrival of a "namespace"
601device currently triggers either the nd_blk or nd_pmem driver to load
602and register a disk/block device.
603
604LIBNVDIMM: namespace
605Here is a sample layout from the three major types of NAMESPACE where
606namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid'
607attribute), namespace2.0 represents a BLK namespace (note it has a
608'sector_size' attribute), and namespace6.0 represents an anonymous
609PMEM namespace (note that it has no 'uuid' attribute due to not supporting a
610LABEL).
611
612 /sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0
613 |-- alt_name
614 |-- devtype
615 |-- dpa_extents
616 |-- force_raw
617 |-- modalias
618 |-- numa_node
619 |-- resource
620 |-- size
621 |-- subsystem -> ../../../../../../bus/nd
622 |-- type
623 |-- uevent
624 `-- uuid
625 /sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0
626 |-- alt_name
627 |-- devtype
628 |-- dpa_extents
629 |-- force_raw
630 |-- modalias
631 |-- numa_node
632 |-- sector_size
633 |-- size
634 |-- subsystem -> ../../../../../../bus/nd
635 |-- type
636 |-- uevent
637 `-- uuid
638 /sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0
639 |-- block
640 | `-- pmem0
641 |-- devtype
642 |-- driver -> ../../../../../../bus/nd/drivers/pmem
643 |-- force_raw
644 |-- modalias
645 |-- numa_node
646 |-- resource
647 |-- size
648 |-- subsystem -> ../../../../../../bus/nd
649 |-- type
650 `-- uevent
651
652LIBNDCTL: namespace enumeration example
653Namespaces are indexed relative to their parent region, example below.
654These indexes are mostly static from boot to boot, but the subsystem makes
655no guarantees in this regard. For a static namespace identifier use its
656'uuid' attribute.
657
658static struct ndctl_namespace *get_namespace_by_id(struct ndctl_region *region,
659 unsigned int id)
660{
661 struct ndctl_namespace *ndns;
662
663 ndctl_namespace_foreach(region, ndns)
664 if (ndctl_namespace_get_id(ndns) == id)
665 return ndns;
666
667 return NULL;
668}
669
670LIBNDCTL: namespace creation example
671Idle namespaces are automatically created by the kernel if a given
672region has enough available capacity to create a new namespace.
673Namespace instantiation involves finding an idle namespace and
674configuring it. For the most part the setting of namespace attributes
675can occur in any order, the only constraint is that 'uuid' must be set
676before 'size'. This enables the kernel to track DPA allocations
677internally with a static identifier.
678
679static int configure_namespace(struct ndctl_region *region,
680 struct ndctl_namespace *ndns,
681 struct namespace_parameters *parameters)
682{
683 char devname[50];
684
685 snprintf(devname, sizeof(devname), "namespace%d.%d",
686 ndctl_region_get_id(region), parameters->id);
687
688 ndctl_namespace_set_alt_name(ndns, devname);
689 /* 'uuid' must be set prior to setting size! */
690 ndctl_namespace_set_uuid(ndns, parameters->uuid);
691 ndctl_namespace_set_size(ndns, parameters->size);
692 /* unlike pmem namespaces, blk namespaces have a sector size */
693 if (parameters->lbasize)
694 ndctl_namespace_set_sector_size(ndns, parameters->lbasize);
695 ndctl_namespace_enable(ndns);
696}
697
698
699Why the Term "namespace"?
700
701 1. Why not "volume" for instance? "volume" ran the risk of confusing ND
702 as a volume manager like device-mapper.
703
704 2. The term originated to describe the sub-devices that can be created
705 within a NVME controller (see the nvme specification:
706 http://www.nvmexpress.org/specifications/), and NFIT namespaces are
707 meant to parallel the capabilities and configurability of
708 NVME-namespaces.
709
710
711LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
712---------------------------------------------
713
714A BTT (design document: http://pmem.io/2014/09/23/btt.html) is a stacked
715block device driver that fronts either the whole block device or a
716partition of a block device emitted by either a PMEM or BLK NAMESPACE.
717
718LIBNVDIMM: btt layout
719Every region will start out with at least one BTT device which is the
720seed device. To activate it set the "namespace", "uuid", and
721"sector_size" attributes and then bind the device to the nd_pmem or
722nd_blk driver depending on the region type.
723
724 /sys/devices/platform/nfit_test.1/ndbus0/region0/btt0/
725 |-- namespace
726 |-- delete
727 |-- devtype
728 |-- modalias
729 |-- numa_node
730 |-- sector_size
731 |-- subsystem -> ../../../../../bus/nd
732 |-- uevent
733 `-- uuid
734
735LIBNDCTL: btt creation example
736Similar to namespaces an idle BTT device is automatically created per
737region. Each time this "seed" btt device is configured and enabled a new
738seed is created. Creating a BTT configuration involves two steps of
739finding an idle BTT and assigning it to consume a PMEM or BLK namespace.
740
741 static struct ndctl_btt *get_idle_btt(struct ndctl_region *region)
742 {
743 struct ndctl_btt *btt;
744
745 ndctl_btt_foreach(region, btt)
746 if (!ndctl_btt_is_enabled(btt)
747 && !ndctl_btt_is_configured(btt))
748 return btt;
749
750 return NULL;
751 }
752
753 static int configure_btt(struct ndctl_region *region,
754 struct btt_parameters *parameters)
755 {
756 btt = get_idle_btt(region);
757
758 ndctl_btt_set_uuid(btt, parameters->uuid);
759 ndctl_btt_set_sector_size(btt, parameters->sector_size);
760 ndctl_btt_set_namespace(btt, parameters->ndns);
761 /* turn off raw mode device */
762 ndctl_namespace_disable(parameters->ndns);
763 /* turn on btt access */
764 ndctl_btt_enable(btt);
765 }
766
767Once instantiated a new inactive btt seed device will appear underneath
768the region.
769
770Once a "namespace" is removed from a BTT that instance of the BTT device
771will be deleted or otherwise reset to default values. This deletion is
772only at the device model level. In order to destroy a BTT the "info
773block" needs to be destroyed. Note, that to destroy a BTT the media
774needs to be written in raw mode. By default, the kernel will autodetect
775the presence of a BTT and disable raw mode. This autodetect behavior
776can be suppressed by enabling raw mode for the namespace via the
777ndctl_namespace_set_raw_mode() api.
778
779
780Summary LIBNDCTL Diagram
781------------------------
782
783For the given example above, here is the view of the objects as seen by the LIBNDCTL api:
784 +---+
785 |CTX| +---------+ +--------------+ +---------------+
786 +-+-+ +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
787 | | +---------+ +--------------+ +---------------+
788+-------+ | | +---------+ +--------------+ +---------------+
789| DIMM0 <-+ | +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" |
790+-------+ | | | +---------+ +--------------+ +---------------+
791| DIMM1 <-+ +-v--+ | +---------+ +--------------+ +---------------+
792+-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6 "blk2.0" |
793| DIMM2 <-+ +----+ | +---------+ | +--------------+ +----------------------+
794+-------+ | | +-> NAMESPACE2.1 +--> ND5 "blk2.1" | BTT2 |
795| DIMM3 <-+ | +--------------+ +----------------------+
796+-------+ | +---------+ +--------------+ +---------------+
797 +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4 "blk3.0" |
798 | +---------+ | +--------------+ +----------------------+
799 | +-> NAMESPACE3.1 +--> ND3 "blk3.1" | BTT1 |
800 | +--------------+ +----------------------+
801 | +---------+ +--------------+ +---------------+
802 +-> REGION4 +---> NAMESPACE4.0 +--> ND2 "blk4.0" |
803 | +---------+ +--------------+ +---------------+
804 | +---------+ +--------------+ +----------------------+
805 +-> REGION5 +---> NAMESPACE5.0 +--> ND1 "blk5.0" | BTT0 |
806 +---------+ +--------------+ +---------------+------+
807
808
diff --git a/MAINTAINERS b/MAINTAINERS
index 67e0b863d2af..0e6b09150aad 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6102,6 +6102,39 @@ M: Sasha Levin <sasha.levin@oracle.com>
6102S: Maintained 6102S: Maintained
6103F: tools/lib/lockdep/ 6103F: tools/lib/lockdep/
6104 6104
6105LIBNVDIMM: NON-VOLATILE MEMORY DEVICE SUBSYSTEM
6106M: Dan Williams <dan.j.williams@intel.com>
6107L: linux-nvdimm@lists.01.org
6108Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
6109S: Supported
6110F: drivers/nvdimm/*
6111F: include/linux/nd.h
6112F: include/linux/libnvdimm.h
6113F: include/uapi/linux/ndctl.h
6114
6115LIBNVDIMM BLK: MMIO-APERTURE DRIVER
6116M: Ross Zwisler <ross.zwisler@linux.intel.com>
6117L: linux-nvdimm@lists.01.org
6118Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
6119S: Supported
6120F: drivers/nvdimm/blk.c
6121F: drivers/nvdimm/region_devs.c
6122F: drivers/acpi/nfit*
6123
6124LIBNVDIMM BTT: BLOCK TRANSLATION TABLE
6125M: Vishal Verma <vishal.l.verma@intel.com>
6126L: linux-nvdimm@lists.01.org
6127Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
6128S: Supported
6129F: drivers/nvdimm/btt*
6130
6131LIBNVDIMM PMEM: PERSISTENT MEMORY DRIVER
6132M: Ross Zwisler <ross.zwisler@linux.intel.com>
6133L: linux-nvdimm@lists.01.org
6134Q: https://patchwork.kernel.org/project/linux-nvdimm/list/
6135S: Supported
6136F: drivers/nvdimm/pmem.c
6137
6105LINUX FOR IBM pSERIES (RS/6000) 6138LINUX FOR IBM pSERIES (RS/6000)
6106M: Paul Mackerras <paulus@au.ibm.com> 6139M: Paul Mackerras <paulus@au.ibm.com>
6107W: http://www.ibm.com/linux/ltc/projects/ppc 6140W: http://www.ibm.com/linux/ltc/projects/ppc
@@ -8363,12 +8396,6 @@ S: Maintained
8363F: Documentation/blockdev/ramdisk.txt 8396F: Documentation/blockdev/ramdisk.txt
8364F: drivers/block/brd.c 8397F: drivers/block/brd.c
8365 8398
8366PERSISTENT MEMORY DRIVER
8367M: Ross Zwisler <ross.zwisler@linux.intel.com>
8368L: linux-nvdimm@lists.01.org
8369S: Supported
8370F: drivers/block/pmem.c
8371
8372RANDOM NUMBER DRIVER 8399RANDOM NUMBER DRIVER
8373M: "Theodore Ts'o" <tytso@mit.edu> 8400M: "Theodore Ts'o" <tytso@mit.edu>
8374S: Maintained 8401S: Maintained
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index ab21e0d58278..9d4aa18f2a82 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -158,6 +158,7 @@ static __init int is_reserve_region(efi_memory_desc_t *md)
158 case EFI_BOOT_SERVICES_CODE: 158 case EFI_BOOT_SERVICES_CODE:
159 case EFI_BOOT_SERVICES_DATA: 159 case EFI_BOOT_SERVICES_DATA:
160 case EFI_CONVENTIONAL_MEMORY: 160 case EFI_CONVENTIONAL_MEMORY:
161 case EFI_PERSISTENT_MEMORY:
161 return 0; 162 return 0;
162 default: 163 default:
163 break; 164 break;
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 47e962f7ed5a..caae3f4e4341 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1222,6 +1222,10 @@ efi_initialize_iomem_resources(struct resource *code_resource,
1222 flags |= IORESOURCE_DISABLED; 1222 flags |= IORESOURCE_DISABLED;
1223 break; 1223 break;
1224 1224
1225 case EFI_PERSISTENT_MEMORY:
1226 name = "Persistent Memory";
1227 break;
1228
1225 case EFI_RESERVED_TYPE: 1229 case EFI_RESERVED_TYPE:
1226 case EFI_RUNTIME_SERVICES_CODE: 1230 case EFI_RUNTIME_SERVICES_CODE:
1227 case EFI_RUNTIME_SERVICES_DATA: 1231 case EFI_RUNTIME_SERVICES_DATA:
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4fcf0ade7e91..d05a42357ef0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86
27 select ARCH_HAS_ELF_RANDOMIZE 27 select ARCH_HAS_ELF_RANDOMIZE
28 select ARCH_HAS_FAST_MULTIPLIER 28 select ARCH_HAS_FAST_MULTIPLIER
29 select ARCH_HAS_GCOV_PROFILE_ALL 29 select ARCH_HAS_GCOV_PROFILE_ALL
30 select ARCH_HAS_PMEM_API
30 select ARCH_HAS_SG_CHAIN 31 select ARCH_HAS_SG_CHAIN
31 select ARCH_HAVE_NMI_SAFE_CMPXCHG 32 select ARCH_HAVE_NMI_SAFE_CMPXCHG
32 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI 33 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
@@ -1419,6 +1420,9 @@ source "mm/Kconfig"
1419 1420
1420config X86_PMEM_LEGACY 1421config X86_PMEM_LEGACY
1421 bool "Support non-standard NVDIMMs and ADR protected memory" 1422 bool "Support non-standard NVDIMMs and ADR protected memory"
1423 depends on PHYS_ADDR_T_64BIT
1424 depends on BLK_DEV
1425 select LIBNVDIMM
1422 help 1426 help
1423 Treat memory marked using the non-standard e820 type of 12 as used 1427 Treat memory marked using the non-standard e820 type of 12 as used
1424 by the Intel Sandy Bridge-EP reference BIOS as protected memory. 1428 by the Intel Sandy Bridge-EP reference BIOS as protected memory.
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 48304b89b601..2c82bd150d43 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1224,6 +1224,10 @@ static efi_status_t setup_e820(struct boot_params *params,
1224 e820_type = E820_NVS; 1224 e820_type = E820_NVS;
1225 break; 1225 break;
1226 1226
1227 case EFI_PERSISTENT_MEMORY:
1228 e820_type = E820_PMEM;
1229 break;
1230
1227 default: 1231 default:
1228 continue; 1232 continue;
1229 } 1233 }
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b6f7457d12e4..9bf3ea14b9f0 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
4/* Caches aren't brain-dead on the intel. */ 4/* Caches aren't brain-dead on the intel. */
5#include <asm-generic/cacheflush.h> 5#include <asm-generic/cacheflush.h>
6#include <asm/special_insns.h> 6#include <asm/special_insns.h>
7#include <asm/uaccess.h>
7 8
8/* 9/*
9 * The set_memory_* API can be used to change various attributes of a virtual 10 * The set_memory_* API can be used to change various attributes of a virtual
@@ -108,4 +109,75 @@ static inline int rodata_test(void)
108} 109}
109#endif 110#endif
110 111
112#ifdef ARCH_HAS_NOCACHE_UACCESS
113
114/**
115 * arch_memcpy_to_pmem - copy data to persistent memory
116 * @dst: destination buffer for the copy
117 * @src: source buffer for the copy
118 * @n: length of the copy in bytes
119 *
120 * Copy data to persistent memory media via non-temporal stores so that
121 * a subsequent arch_wmb_pmem() can flush cpu and memory controller
122 * write buffers to guarantee durability.
123 */
124static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
125 size_t n)
126{
127 int unwritten;
128
129 /*
130 * We are copying between two kernel buffers, if
131 * __copy_from_user_inatomic_nocache() returns an error (page
132 * fault) we would have already reported a general protection fault
133 * before the WARN+BUG.
134 */
135 unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
136 (void __user *) src, n);
137 if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
138 __func__, dst, src, unwritten))
139 BUG();
140}
141
142/**
143 * arch_wmb_pmem - synchronize writes to persistent memory
144 *
145 * After a series of arch_memcpy_to_pmem() operations this drains data
146 * from cpu write buffers and any platform (memory controller) buffers
147 * to ensure that written data is durable on persistent memory media.
148 */
149static inline void arch_wmb_pmem(void)
150{
151 /*
152 * wmb() to 'sfence' all previous writes such that they are
153 * architecturally visible to 'pcommit'. Note, that we've
154 * already arranged for pmem writes to avoid the cache via
155 * arch_memcpy_to_pmem().
156 */
157 wmb();
158 pcommit_sfence();
159}
160
161static inline bool __arch_has_wmb_pmem(void)
162{
163#ifdef CONFIG_X86_64
164 /*
165 * We require that wmb() be an 'sfence', that is only guaranteed on
166 * 64-bit builds
167 */
168 return static_cpu_has(X86_FEATURE_PCOMMIT);
169#else
170 return false;
171#endif
172}
173#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
174extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
175extern void arch_wmb_pmem(void);
176
177static inline bool __arch_has_wmb_pmem(void)
178{
179 return false;
180}
181#endif
182
111#endif /* _ASM_X86_CACHEFLUSH_H */ 183#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 83ec9b1d77cc..cc9c61bc1abe 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -248,6 +248,12 @@ static inline void flush_write_buffers(void)
248#endif 248#endif
249} 249}
250 250
251static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
252 unsigned long size)
253{
254 return (void __force __pmem *) ioremap_cache(offset, size);
255}
256
251#endif /* __KERNEL__ */ 257#endif /* __KERNEL__ */
252 258
253extern void native_io_delay(void); 259extern void native_io_delay(void);
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 960a8a9dc4ab..0f457e6eab18 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -32,6 +32,7 @@
32#define E820_ACPI 3 32#define E820_ACPI 3
33#define E820_NVS 4 33#define E820_NVS 4
34#define E820_UNUSABLE 5 34#define E820_UNUSABLE 5
35#define E820_PMEM 7
35 36
36/* 37/*
37 * This is a non-standardized way to represent ADR or NVDIMM regions that 38 * This is a non-standardized way to represent ADR or NVDIMM regions that
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c8dda42cb6a3..a102564d08eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type)
149 case E820_UNUSABLE: 149 case E820_UNUSABLE:
150 printk(KERN_CONT "unusable"); 150 printk(KERN_CONT "unusable");
151 break; 151 break;
152 case E820_PMEM:
152 case E820_PRAM: 153 case E820_PRAM:
153 printk(KERN_CONT "persistent (type %u)", type); 154 printk(KERN_CONT "persistent (type %u)", type);
154 break; 155 break;
@@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type)
918 case E820_ACPI: return "ACPI Tables"; 919 case E820_ACPI: return "ACPI Tables";
919 case E820_NVS: return "ACPI Non-volatile Storage"; 920 case E820_NVS: return "ACPI Non-volatile Storage";
920 case E820_UNUSABLE: return "Unusable memory"; 921 case E820_UNUSABLE: return "Unusable memory";
921 case E820_PRAM: return "Persistent RAM"; 922 case E820_PRAM: return "Persistent Memory (legacy)";
923 case E820_PMEM: return "Persistent Memory";
922 default: return "reserved"; 924 default: return "reserved";
923 } 925 }
924} 926}
925 927
928static bool do_mark_busy(u32 type, struct resource *res)
929{
930 /* this is the legacy bios/dos rom-shadow + mmio region */
931 if (res->start < (1ULL<<20))
932 return true;
933
934 /*
935 * Treat persistent memory like device memory, i.e. reserve it
936 * for exclusive use of a driver
937 */
938 switch (type) {
939 case E820_RESERVED:
940 case E820_PRAM:
941 case E820_PMEM:
942 return false;
943 default:
944 return true;
945 }
946}
947
926/* 948/*
927 * Mark e820 reserved areas as busy for the resource manager. 949 * Mark e820 reserved areas as busy for the resource manager.
928 */ 950 */
@@ -952,9 +974,7 @@ void __init e820_reserve_resources(void)
952 * pci device BAR resource and insert them later in 974 * pci device BAR resource and insert them later in
953 * pcibios_resource_survey() 975 * pcibios_resource_survey()
954 */ 976 */
955 if (((e820.map[i].type != E820_RESERVED) && 977 if (do_mark_busy(e820.map[i].type, res)) {
956 (e820.map[i].type != E820_PRAM)) ||
957 res->start < (1ULL<<20)) {
958 res->flags |= IORESOURCE_BUSY; 978 res->flags |= IORESOURCE_BUSY;
959 insert_resource(&iomem_resource, res); 979 insert_resource(&iomem_resource, res);
960 } 980 }
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 3420c874ddc5..64f90f53bb85 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -1,53 +1,82 @@
1/* 1/*
2 * Copyright (c) 2015, Christoph Hellwig. 2 * Copyright (c) 2015, Christoph Hellwig.
3 * Copyright (c) 2015, Intel Corporation.
3 */ 4 */
4#include <linux/memblock.h>
5#include <linux/platform_device.h> 5#include <linux/platform_device.h>
6#include <linux/slab.h> 6#include <linux/libnvdimm.h>
7#include <linux/module.h>
7#include <asm/e820.h> 8#include <asm/e820.h>
8#include <asm/page_types.h>
9#include <asm/setup.h>
10 9
11static __init void register_pmem_device(struct resource *res) 10static void e820_pmem_release(struct device *dev)
12{ 11{
13 struct platform_device *pdev; 12 struct nvdimm_bus *nvdimm_bus = dev->platform_data;
14 int error;
15 13
16 pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); 14 if (nvdimm_bus)
17 if (!pdev) 15 nvdimm_bus_unregister(nvdimm_bus);
18 return; 16}
19 17
20 error = platform_device_add_resources(pdev, res, 1); 18static struct platform_device e820_pmem = {
21 if (error) 19 .name = "e820_pmem",
22 goto out_put_pdev; 20 .id = -1,
21 .dev = {
22 .release = e820_pmem_release,
23 },
24};
23 25
24 error = platform_device_add(pdev); 26static const struct attribute_group *e820_pmem_attribute_groups[] = {
25 if (error) 27 &nvdimm_bus_attribute_group,
26 goto out_put_pdev; 28 NULL,
27 return; 29};
28 30
29out_put_pdev: 31static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
30 dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); 32 &nd_region_attribute_group,
31 platform_device_put(pdev); 33 &nd_device_attribute_group,
32} 34 NULL,
35};
33 36
34static __init int register_pmem_devices(void) 37static __init int register_e820_pmem(void)
35{ 38{
36 int i; 39 static struct nvdimm_bus_descriptor nd_desc;
40 struct device *dev = &e820_pmem.dev;
41 struct nvdimm_bus *nvdimm_bus;
42 int rc, i;
43
44 rc = platform_device_register(&e820_pmem);
45 if (rc)
46 return rc;
47
48 nd_desc.attr_groups = e820_pmem_attribute_groups;
49 nd_desc.provider_name = "e820";
50 nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
51 if (!nvdimm_bus)
52 goto err;
53 dev->platform_data = nvdimm_bus;
37 54
38 for (i = 0; i < e820.nr_map; i++) { 55 for (i = 0; i < e820.nr_map; i++) {
39 struct e820entry *ei = &e820.map[i]; 56 struct e820entry *ei = &e820.map[i];
57 struct resource res = {
58 .flags = IORESOURCE_MEM,
59 .start = ei->addr,
60 .end = ei->addr + ei->size - 1,
61 };
62 struct nd_region_desc ndr_desc;
63
64 if (ei->type != E820_PRAM)
65 continue;
40 66
41 if (ei->type == E820_PRAM) { 67 memset(&ndr_desc, 0, sizeof(ndr_desc));
42 struct resource res = { 68 ndr_desc.res = &res;
43 .flags = IORESOURCE_MEM, 69 ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
44 .start = ei->addr, 70 ndr_desc.numa_node = NUMA_NO_NODE;
45 .end = ei->addr + ei->size - 1, 71 if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
46 }; 72 goto err;
47 register_pmem_device(&res);
48 }
49 } 73 }
50 74
51 return 0; 75 return 0;
76
77 err:
78 dev_err(dev, "failed to register legacy persistent memory ranges\n");
79 platform_device_unregister(&e820_pmem);
80 return -ENXIO;
52} 81}
53device_initcall(register_pmem_devices); 82device_initcall(register_e820_pmem);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index c1c382c58c60..cfba30f27392 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -174,6 +174,9 @@ static void __init do_add_efi_memmap(void)
174 case EFI_UNUSABLE_MEMORY: 174 case EFI_UNUSABLE_MEMORY:
175 e820_type = E820_UNUSABLE; 175 e820_type = E820_UNUSABLE;
176 break; 176 break;
177 case EFI_PERSISTENT_MEMORY:
178 e820_type = E820_PMEM;
179 break;
177 default: 180 default:
178 /* 181 /*
179 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE 182 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
diff --git a/drivers/Kconfig b/drivers/Kconfig
index c0cc96bab9e7..6e973b8e3a3b 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -182,4 +182,6 @@ source "drivers/thunderbolt/Kconfig"
182 182
183source "drivers/android/Kconfig" 183source "drivers/android/Kconfig"
184 184
185source "drivers/nvdimm/Kconfig"
186
185endmenu 187endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 9a02fb7c5106..b64b49f6e01b 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
64 64
65obj-$(CONFIG_PARPORT) += parport/ 65obj-$(CONFIG_PARPORT) += parport/
66obj-y += base/ block/ misc/ mfd/ nfc/ 66obj-y += base/ block/ misc/ mfd/ nfc/
67obj-$(CONFIG_LIBNVDIMM) += nvdimm/
67obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ 68obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
68obj-$(CONFIG_NUBUS) += nubus/ 69obj-$(CONFIG_NUBUS) += nubus/
69obj-y += macintosh/ 70obj-y += macintosh/
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 35da507411a0..f15db002be8e 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -386,6 +386,32 @@ config ACPI_REDUCED_HARDWARE_ONLY
386 386
387 If you are unsure what to do, do not enable this option. 387 If you are unsure what to do, do not enable this option.
388 388
389config ACPI_NFIT
390 tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
391 depends on PHYS_ADDR_T_64BIT
392 depends on BLK_DEV
393 select LIBNVDIMM
394 help
395 Infrastructure to probe ACPI 6 compliant platforms for
396 NVDIMMs (NFIT) and register a libnvdimm device tree. In
397 addition to storage devices this also enables libnvdimm to pass
398 ACPI._DSM messages for platform/dimm configuration.
399
400 To compile this driver as a module, choose M here:
401 the module will be called nfit.
402
403config ACPI_NFIT_DEBUG
404 bool "NFIT DSM debug"
405 depends on ACPI_NFIT
406 depends on DYNAMIC_DEBUG
407 default n
408 help
409 Enabling this option causes the nfit driver to dump the
410 input and output buffers of _DSM operations on the ACPI0012
411 device and its children. This can be very verbose, so leave
412 it disabled unless you are debugging a hardware / firmware
413 issue.
414
389source "drivers/acpi/apei/Kconfig" 415source "drivers/acpi/apei/Kconfig"
390 416
391config ACPI_EXTLOG 417config ACPI_EXTLOG
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 73d840bef455..8321430d7f24 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT) += pci_slot.o
68obj-$(CONFIG_ACPI_PROCESSOR) += processor.o 68obj-$(CONFIG_ACPI_PROCESSOR) += processor.o
69obj-y += container.o 69obj-y += container.o
70obj-$(CONFIG_ACPI_THERMAL) += thermal.o 70obj-$(CONFIG_ACPI_THERMAL) += thermal.o
71obj-$(CONFIG_ACPI_NFIT) += nfit.o
71obj-y += acpi_memhotplug.o 72obj-y += acpi_memhotplug.o
72obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o 73obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
73obj-$(CONFIG_ACPI_BATTERY) += battery.o 74obj-$(CONFIG_ACPI_BATTERY) += battery.o
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
new file mode 100644
index 000000000000..2161fa178c8d
--- /dev/null
+++ b/drivers/acpi/nfit.c
@@ -0,0 +1,1587 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/list_sort.h>
14#include <linux/libnvdimm.h>
15#include <linux/module.h>
16#include <linux/mutex.h>
17#include <linux/ndctl.h>
18#include <linux/list.h>
19#include <linux/acpi.h>
20#include <linux/sort.h>
21#include <linux/io.h>
22#include "nfit.h"
23
24/*
25 * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is
26 * irrelevant.
27 */
28#include <asm-generic/io-64-nonatomic-hi-lo.h>
29
30static bool force_enable_dimms;
31module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR);
32MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status");
33
34static u8 nfit_uuid[NFIT_UUID_MAX][16];
35
36const u8 *to_nfit_uuid(enum nfit_uuids id)
37{
38 return nfit_uuid[id];
39}
40EXPORT_SYMBOL(to_nfit_uuid);
41
42static struct acpi_nfit_desc *to_acpi_nfit_desc(
43 struct nvdimm_bus_descriptor *nd_desc)
44{
45 return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
46}
47
48static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
49{
50 struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
51
52 /*
53 * If provider == 'ACPI.NFIT' we can assume 'dev' is a struct
54 * acpi_device.
55 */
56 if (!nd_desc->provider_name
57 || strcmp(nd_desc->provider_name, "ACPI.NFIT") != 0)
58 return NULL;
59
60 return to_acpi_device(acpi_desc->dev);
61}
62
63static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
64 struct nvdimm *nvdimm, unsigned int cmd, void *buf,
65 unsigned int buf_len)
66{
67 struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
68 const struct nd_cmd_desc *desc = NULL;
69 union acpi_object in_obj, in_buf, *out_obj;
70 struct device *dev = acpi_desc->dev;
71 const char *cmd_name, *dimm_name;
72 unsigned long dsm_mask;
73 acpi_handle handle;
74 const u8 *uuid;
75 u32 offset;
76 int rc, i;
77
78 if (nvdimm) {
79 struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
80 struct acpi_device *adev = nfit_mem->adev;
81
82 if (!adev)
83 return -ENOTTY;
84 dimm_name = nvdimm_name(nvdimm);
85 cmd_name = nvdimm_cmd_name(cmd);
86 dsm_mask = nfit_mem->dsm_mask;
87 desc = nd_cmd_dimm_desc(cmd);
88 uuid = to_nfit_uuid(NFIT_DEV_DIMM);
89 handle = adev->handle;
90 } else {
91 struct acpi_device *adev = to_acpi_dev(acpi_desc);
92
93 cmd_name = nvdimm_bus_cmd_name(cmd);
94 dsm_mask = nd_desc->dsm_mask;
95 desc = nd_cmd_bus_desc(cmd);
96 uuid = to_nfit_uuid(NFIT_DEV_BUS);
97 handle = adev->handle;
98 dimm_name = "bus";
99 }
100
101 if (!desc || (cmd && (desc->out_num + desc->in_num == 0)))
102 return -ENOTTY;
103
104 if (!test_bit(cmd, &dsm_mask))
105 return -ENOTTY;
106
107 in_obj.type = ACPI_TYPE_PACKAGE;
108 in_obj.package.count = 1;
109 in_obj.package.elements = &in_buf;
110 in_buf.type = ACPI_TYPE_BUFFER;
111 in_buf.buffer.pointer = buf;
112 in_buf.buffer.length = 0;
113
114 /* libnvdimm has already validated the input envelope */
115 for (i = 0; i < desc->in_num; i++)
116 in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc,
117 i, buf);
118
119 if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
120 dev_dbg(dev, "%s:%s cmd: %s input length: %d\n", __func__,
121 dimm_name, cmd_name, in_buf.buffer.length);
122 print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
123 4, in_buf.buffer.pointer, min_t(u32, 128,
124 in_buf.buffer.length), true);
125 }
126
127 out_obj = acpi_evaluate_dsm(handle, uuid, 1, cmd, &in_obj);
128 if (!out_obj) {
129 dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
130 cmd_name);
131 return -EINVAL;
132 }
133
134 if (out_obj->package.type != ACPI_TYPE_BUFFER) {
135 dev_dbg(dev, "%s:%s unexpected output object type cmd: %s type: %d\n",
136 __func__, dimm_name, cmd_name, out_obj->type);
137 rc = -EINVAL;
138 goto out;
139 }
140
141 if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
142 dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__,
143 dimm_name, cmd_name, out_obj->buffer.length);
144 print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
145 4, out_obj->buffer.pointer, min_t(u32, 128,
146 out_obj->buffer.length), true);
147 }
148
149 for (i = 0, offset = 0; i < desc->out_num; i++) {
150 u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf,
151 (u32 *) out_obj->buffer.pointer);
152
153 if (offset + out_size > out_obj->buffer.length) {
154 dev_dbg(dev, "%s:%s output object underflow cmd: %s field: %d\n",
155 __func__, dimm_name, cmd_name, i);
156 break;
157 }
158
159 if (in_buf.buffer.length + offset + out_size > buf_len) {
160 dev_dbg(dev, "%s:%s output overrun cmd: %s field: %d\n",
161 __func__, dimm_name, cmd_name, i);
162 rc = -ENXIO;
163 goto out;
164 }
165 memcpy(buf + in_buf.buffer.length + offset,
166 out_obj->buffer.pointer + offset, out_size);
167 offset += out_size;
168 }
169 if (offset + in_buf.buffer.length < buf_len) {
170 if (i >= 1) {
171 /*
172 * status valid, return the number of bytes left
173 * unfilled in the output buffer
174 */
175 rc = buf_len - offset - in_buf.buffer.length;
176 } else {
177 dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n",
178 __func__, dimm_name, cmd_name, buf_len,
179 offset);
180 rc = -ENXIO;
181 }
182 } else
183 rc = 0;
184
185 out:
186 ACPI_FREE(out_obj);
187
188 return rc;
189}
190
191static const char *spa_type_name(u16 type)
192{
193 static const char *to_name[] = {
194 [NFIT_SPA_VOLATILE] = "volatile",
195 [NFIT_SPA_PM] = "pmem",
196 [NFIT_SPA_DCR] = "dimm-control-region",
197 [NFIT_SPA_BDW] = "block-data-window",
198 [NFIT_SPA_VDISK] = "volatile-disk",
199 [NFIT_SPA_VCD] = "volatile-cd",
200 [NFIT_SPA_PDISK] = "persistent-disk",
201 [NFIT_SPA_PCD] = "persistent-cd",
202
203 };
204
205 if (type > NFIT_SPA_PCD)
206 return "unknown";
207
208 return to_name[type];
209}
210
211static int nfit_spa_type(struct acpi_nfit_system_address *spa)
212{
213 int i;
214
215 for (i = 0; i < NFIT_UUID_MAX; i++)
216 if (memcmp(to_nfit_uuid(i), spa->range_guid, 16) == 0)
217 return i;
218 return -1;
219}
220
221static bool add_spa(struct acpi_nfit_desc *acpi_desc,
222 struct acpi_nfit_system_address *spa)
223{
224 struct device *dev = acpi_desc->dev;
225 struct nfit_spa *nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa),
226 GFP_KERNEL);
227
228 if (!nfit_spa)
229 return false;
230 INIT_LIST_HEAD(&nfit_spa->list);
231 nfit_spa->spa = spa;
232 list_add_tail(&nfit_spa->list, &acpi_desc->spas);
233 dev_dbg(dev, "%s: spa index: %d type: %s\n", __func__,
234 spa->range_index,
235 spa_type_name(nfit_spa_type(spa)));
236 return true;
237}
238
239static bool add_memdev(struct acpi_nfit_desc *acpi_desc,
240 struct acpi_nfit_memory_map *memdev)
241{
242 struct device *dev = acpi_desc->dev;
243 struct nfit_memdev *nfit_memdev = devm_kzalloc(dev,
244 sizeof(*nfit_memdev), GFP_KERNEL);
245
246 if (!nfit_memdev)
247 return false;
248 INIT_LIST_HEAD(&nfit_memdev->list);
249 nfit_memdev->memdev = memdev;
250 list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs);
251 dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n",
252 __func__, memdev->device_handle, memdev->range_index,
253 memdev->region_index);
254 return true;
255}
256
257static bool add_dcr(struct acpi_nfit_desc *acpi_desc,
258 struct acpi_nfit_control_region *dcr)
259{
260 struct device *dev = acpi_desc->dev;
261 struct nfit_dcr *nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr),
262 GFP_KERNEL);
263
264 if (!nfit_dcr)
265 return false;
266 INIT_LIST_HEAD(&nfit_dcr->list);
267 nfit_dcr->dcr = dcr;
268 list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs);
269 dev_dbg(dev, "%s: dcr index: %d windows: %d\n", __func__,
270 dcr->region_index, dcr->windows);
271 return true;
272}
273
274static bool add_bdw(struct acpi_nfit_desc *acpi_desc,
275 struct acpi_nfit_data_region *bdw)
276{
277 struct device *dev = acpi_desc->dev;
278 struct nfit_bdw *nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw),
279 GFP_KERNEL);
280
281 if (!nfit_bdw)
282 return false;
283 INIT_LIST_HEAD(&nfit_bdw->list);
284 nfit_bdw->bdw = bdw;
285 list_add_tail(&nfit_bdw->list, &acpi_desc->bdws);
286 dev_dbg(dev, "%s: bdw dcr: %d windows: %d\n", __func__,
287 bdw->region_index, bdw->windows);
288 return true;
289}
290
291static bool add_idt(struct acpi_nfit_desc *acpi_desc,
292 struct acpi_nfit_interleave *idt)
293{
294 struct device *dev = acpi_desc->dev;
295 struct nfit_idt *nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt),
296 GFP_KERNEL);
297
298 if (!nfit_idt)
299 return false;
300 INIT_LIST_HEAD(&nfit_idt->list);
301 nfit_idt->idt = idt;
302 list_add_tail(&nfit_idt->list, &acpi_desc->idts);
303 dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__,
304 idt->interleave_index, idt->line_count);
305 return true;
306}
307
308static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table,
309 const void *end)
310{
311 struct device *dev = acpi_desc->dev;
312 struct acpi_nfit_header *hdr;
313 void *err = ERR_PTR(-ENOMEM);
314
315 if (table >= end)
316 return NULL;
317
318 hdr = table;
319 switch (hdr->type) {
320 case ACPI_NFIT_TYPE_SYSTEM_ADDRESS:
321 if (!add_spa(acpi_desc, table))
322 return err;
323 break;
324 case ACPI_NFIT_TYPE_MEMORY_MAP:
325 if (!add_memdev(acpi_desc, table))
326 return err;
327 break;
328 case ACPI_NFIT_TYPE_CONTROL_REGION:
329 if (!add_dcr(acpi_desc, table))
330 return err;
331 break;
332 case ACPI_NFIT_TYPE_DATA_REGION:
333 if (!add_bdw(acpi_desc, table))
334 return err;
335 break;
336 case ACPI_NFIT_TYPE_INTERLEAVE:
337 if (!add_idt(acpi_desc, table))
338 return err;
339 break;
340 case ACPI_NFIT_TYPE_FLUSH_ADDRESS:
341 dev_dbg(dev, "%s: flush\n", __func__);
342 break;
343 case ACPI_NFIT_TYPE_SMBIOS:
344 dev_dbg(dev, "%s: smbios\n", __func__);
345 break;
346 default:
347 dev_err(dev, "unknown table '%d' parsing nfit\n", hdr->type);
348 break;
349 }
350
351 return table + hdr->length;
352}
353
354static void nfit_mem_find_spa_bdw(struct acpi_nfit_desc *acpi_desc,
355 struct nfit_mem *nfit_mem)
356{
357 u32 device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
358 u16 dcr = nfit_mem->dcr->region_index;
359 struct nfit_spa *nfit_spa;
360
361 list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
362 u16 range_index = nfit_spa->spa->range_index;
363 int type = nfit_spa_type(nfit_spa->spa);
364 struct nfit_memdev *nfit_memdev;
365
366 if (type != NFIT_SPA_BDW)
367 continue;
368
369 list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
370 if (nfit_memdev->memdev->range_index != range_index)
371 continue;
372 if (nfit_memdev->memdev->device_handle != device_handle)
373 continue;
374 if (nfit_memdev->memdev->region_index != dcr)
375 continue;
376
377 nfit_mem->spa_bdw = nfit_spa->spa;
378 return;
379 }
380 }
381
382 dev_dbg(acpi_desc->dev, "SPA-BDW not found for SPA-DCR %d\n",
383 nfit_mem->spa_dcr->range_index);
384 nfit_mem->bdw = NULL;
385}
386
387static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
388 struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa)
389{
390 u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
391 struct nfit_memdev *nfit_memdev;
392 struct nfit_dcr *nfit_dcr;
393 struct nfit_bdw *nfit_bdw;
394 struct nfit_idt *nfit_idt;
395 u16 idt_idx, range_index;
396
397 list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) {
398 if (nfit_dcr->dcr->region_index != dcr)
399 continue;
400 nfit_mem->dcr = nfit_dcr->dcr;
401 break;
402 }
403
404 if (!nfit_mem->dcr) {
405 dev_dbg(acpi_desc->dev, "SPA %d missing:%s%s\n",
406 spa->range_index, __to_nfit_memdev(nfit_mem)
407 ? "" : " MEMDEV", nfit_mem->dcr ? "" : " DCR");
408 return -ENODEV;
409 }
410
411 /*
412 * We've found enough to create an nvdimm, optionally
413 * find an associated BDW
414 */
415 list_add(&nfit_mem->list, &acpi_desc->dimms);
416
417 list_for_each_entry(nfit_bdw, &acpi_desc->bdws, list) {
418 if (nfit_bdw->bdw->region_index != dcr)
419 continue;
420 nfit_mem->bdw = nfit_bdw->bdw;
421 break;
422 }
423
424 if (!nfit_mem->bdw)
425 return 0;
426
427 nfit_mem_find_spa_bdw(acpi_desc, nfit_mem);
428
429 if (!nfit_mem->spa_bdw)
430 return 0;
431
432 range_index = nfit_mem->spa_bdw->range_index;
433 list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
434 if (nfit_memdev->memdev->range_index != range_index ||
435 nfit_memdev->memdev->region_index != dcr)
436 continue;
437 nfit_mem->memdev_bdw = nfit_memdev->memdev;
438 idt_idx = nfit_memdev->memdev->interleave_index;
439 list_for_each_entry(nfit_idt, &acpi_desc->idts, list) {
440 if (nfit_idt->idt->interleave_index != idt_idx)
441 continue;
442 nfit_mem->idt_bdw = nfit_idt->idt;
443 break;
444 }
445 break;
446 }
447
448 return 0;
449}
450
451static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
452 struct acpi_nfit_system_address *spa)
453{
454 struct nfit_mem *nfit_mem, *found;
455 struct nfit_memdev *nfit_memdev;
456 int type = nfit_spa_type(spa);
457 u16 dcr;
458
459 switch (type) {
460 case NFIT_SPA_DCR:
461 case NFIT_SPA_PM:
462 break;
463 default:
464 return 0;
465 }
466
467 list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
468 int rc;
469
470 if (nfit_memdev->memdev->range_index != spa->range_index)
471 continue;
472 found = NULL;
473 dcr = nfit_memdev->memdev->region_index;
474 list_for_each_entry(nfit_mem, &acpi_desc->dimms, list)
475 if (__to_nfit_memdev(nfit_mem)->region_index == dcr) {
476 found = nfit_mem;
477 break;
478 }
479
480 if (found)
481 nfit_mem = found;
482 else {
483 nfit_mem = devm_kzalloc(acpi_desc->dev,
484 sizeof(*nfit_mem), GFP_KERNEL);
485 if (!nfit_mem)
486 return -ENOMEM;
487 INIT_LIST_HEAD(&nfit_mem->list);
488 }
489
490 if (type == NFIT_SPA_DCR) {
491 struct nfit_idt *nfit_idt;
492 u16 idt_idx;
493
494 /* multiple dimms may share a SPA when interleaved */
495 nfit_mem->spa_dcr = spa;
496 nfit_mem->memdev_dcr = nfit_memdev->memdev;
497 idt_idx = nfit_memdev->memdev->interleave_index;
498 list_for_each_entry(nfit_idt, &acpi_desc->idts, list) {
499 if (nfit_idt->idt->interleave_index != idt_idx)
500 continue;
501 nfit_mem->idt_dcr = nfit_idt->idt;
502 break;
503 }
504 } else {
505 /*
506 * A single dimm may belong to multiple SPA-PM
507 * ranges, record at least one in addition to
508 * any SPA-DCR range.
509 */
510 nfit_mem->memdev_pmem = nfit_memdev->memdev;
511 }
512
513 if (found)
514 continue;
515
516 rc = nfit_mem_add(acpi_desc, nfit_mem, spa);
517 if (rc)
518 return rc;
519 }
520
521 return 0;
522}
523
524static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b)
525{
526 struct nfit_mem *a = container_of(_a, typeof(*a), list);
527 struct nfit_mem *b = container_of(_b, typeof(*b), list);
528 u32 handleA, handleB;
529
530 handleA = __to_nfit_memdev(a)->device_handle;
531 handleB = __to_nfit_memdev(b)->device_handle;
532 if (handleA < handleB)
533 return -1;
534 else if (handleA > handleB)
535 return 1;
536 return 0;
537}
538
539static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
540{
541 struct nfit_spa *nfit_spa;
542
543 /*
544 * For each SPA-DCR or SPA-PMEM address range find its
545 * corresponding MEMDEV(s). From each MEMDEV find the
546 * corresponding DCR. Then, if we're operating on a SPA-DCR,
547 * try to find a SPA-BDW and a corresponding BDW that references
548 * the DCR. Throw it all into an nfit_mem object. Note, that
549 * BDWs are optional.
550 */
551 list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
552 int rc;
553
554 rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa);
555 if (rc)
556 return rc;
557 }
558
559 list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp);
560
561 return 0;
562}
563
564static ssize_t revision_show(struct device *dev,
565 struct device_attribute *attr, char *buf)
566{
567 struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
568 struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
569 struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
570
571 return sprintf(buf, "%d\n", acpi_desc->nfit->header.revision);
572}
573static DEVICE_ATTR_RO(revision);
574
575static struct attribute *acpi_nfit_attributes[] = {
576 &dev_attr_revision.attr,
577 NULL,
578};
579
580static struct attribute_group acpi_nfit_attribute_group = {
581 .name = "nfit",
582 .attrs = acpi_nfit_attributes,
583};
584
585const struct attribute_group *acpi_nfit_attribute_groups[] = {
586 &nvdimm_bus_attribute_group,
587 &acpi_nfit_attribute_group,
588 NULL,
589};
590EXPORT_SYMBOL_GPL(acpi_nfit_attribute_groups);
591
592static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev)
593{
594 struct nvdimm *nvdimm = to_nvdimm(dev);
595 struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
596
597 return __to_nfit_memdev(nfit_mem);
598}
599
600static struct acpi_nfit_control_region *to_nfit_dcr(struct device *dev)
601{
602 struct nvdimm *nvdimm = to_nvdimm(dev);
603 struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
604
605 return nfit_mem->dcr;
606}
607
608static ssize_t handle_show(struct device *dev,
609 struct device_attribute *attr, char *buf)
610{
611 struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev);
612
613 return sprintf(buf, "%#x\n", memdev->device_handle);
614}
615static DEVICE_ATTR_RO(handle);
616
617static ssize_t phys_id_show(struct device *dev,
618 struct device_attribute *attr, char *buf)
619{
620 struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev);
621
622 return sprintf(buf, "%#x\n", memdev->physical_id);
623}
624static DEVICE_ATTR_RO(phys_id);
625
626static ssize_t vendor_show(struct device *dev,
627 struct device_attribute *attr, char *buf)
628{
629 struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
630
631 return sprintf(buf, "%#x\n", dcr->vendor_id);
632}
633static DEVICE_ATTR_RO(vendor);
634
635static ssize_t rev_id_show(struct device *dev,
636 struct device_attribute *attr, char *buf)
637{
638 struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
639
640 return sprintf(buf, "%#x\n", dcr->revision_id);
641}
642static DEVICE_ATTR_RO(rev_id);
643
644static ssize_t device_show(struct device *dev,
645 struct device_attribute *attr, char *buf)
646{
647 struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
648
649 return sprintf(buf, "%#x\n", dcr->device_id);
650}
651static DEVICE_ATTR_RO(device);
652
653static ssize_t format_show(struct device *dev,
654 struct device_attribute *attr, char *buf)
655{
656 struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
657
658 return sprintf(buf, "%#x\n", dcr->code);
659}
660static DEVICE_ATTR_RO(format);
661
662static ssize_t serial_show(struct device *dev,
663 struct device_attribute *attr, char *buf)
664{
665 struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
666
667 return sprintf(buf, "%#x\n", dcr->serial_number);
668}
669static DEVICE_ATTR_RO(serial);
670
671static ssize_t flags_show(struct device *dev,
672 struct device_attribute *attr, char *buf)
673{
674 u16 flags = to_nfit_memdev(dev)->flags;
675
676 return sprintf(buf, "%s%s%s%s%s\n",
677 flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "",
678 flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "",
679 flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "",
680 flags & ACPI_NFIT_MEM_ARMED ? "arm " : "",
681 flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart " : "");
682}
683static DEVICE_ATTR_RO(flags);
684
685static struct attribute *acpi_nfit_dimm_attributes[] = {
686 &dev_attr_handle.attr,
687 &dev_attr_phys_id.attr,
688 &dev_attr_vendor.attr,
689 &dev_attr_device.attr,
690 &dev_attr_format.attr,
691 &dev_attr_serial.attr,
692 &dev_attr_rev_id.attr,
693 &dev_attr_flags.attr,
694 NULL,
695};
696
697static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
698 struct attribute *a, int n)
699{
700 struct device *dev = container_of(kobj, struct device, kobj);
701
702 if (to_nfit_dcr(dev))
703 return a->mode;
704 else
705 return 0;
706}
707
708static struct attribute_group acpi_nfit_dimm_attribute_group = {
709 .name = "nfit",
710 .attrs = acpi_nfit_dimm_attributes,
711 .is_visible = acpi_nfit_dimm_attr_visible,
712};
713
714static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = {
715 &nvdimm_attribute_group,
716 &nd_device_attribute_group,
717 &acpi_nfit_dimm_attribute_group,
718 NULL,
719};
720
721static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc,
722 u32 device_handle)
723{
724 struct nfit_mem *nfit_mem;
725
726 list_for_each_entry(nfit_mem, &acpi_desc->dimms, list)
727 if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle)
728 return nfit_mem->nvdimm;
729
730 return NULL;
731}
732
733static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
734 struct nfit_mem *nfit_mem, u32 device_handle)
735{
736 struct acpi_device *adev, *adev_dimm;
737 struct device *dev = acpi_desc->dev;
738 const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM);
739 unsigned long long sta;
740 int i, rc = -ENODEV;
741 acpi_status status;
742
743 nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en;
744 adev = to_acpi_dev(acpi_desc);
745 if (!adev)
746 return 0;
747
748 adev_dimm = acpi_find_child_device(adev, device_handle, false);
749 nfit_mem->adev = adev_dimm;
750 if (!adev_dimm) {
751 dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n",
752 device_handle);
753 return force_enable_dimms ? 0 : -ENODEV;
754 }
755
756 status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta);
757 if (status == AE_NOT_FOUND) {
758 dev_dbg(dev, "%s missing _STA, assuming enabled...\n",
759 dev_name(&adev_dimm->dev));
760 rc = 0;
761 } else if (ACPI_FAILURE(status))
762 dev_err(dev, "%s failed to retrieve_STA, disabling...\n",
763 dev_name(&adev_dimm->dev));
764 else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0)
765 dev_info(dev, "%s disabled by firmware\n",
766 dev_name(&adev_dimm->dev));
767 else
768 rc = 0;
769
770 for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++)
771 if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i))
772 set_bit(i, &nfit_mem->dsm_mask);
773
774 return force_enable_dimms ? 0 : rc;
775}
776
777static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
778{
779 struct nfit_mem *nfit_mem;
780 int dimm_count = 0;
781
782 list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
783 struct nvdimm *nvdimm;
784 unsigned long flags = 0;
785 u32 device_handle;
786 u16 mem_flags;
787 int rc;
788
789 device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
790 nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle);
791 if (nvdimm) {
792 /*
793 * If for some reason we find multiple DCRs the
794 * first one wins
795 */
796 dev_err(acpi_desc->dev, "duplicate DCR detected: %s\n",
797 nvdimm_name(nvdimm));
798 continue;
799 }
800
801 if (nfit_mem->bdw && nfit_mem->memdev_pmem)
802 flags |= NDD_ALIASING;
803
804 mem_flags = __to_nfit_memdev(nfit_mem)->flags;
805 if (mem_flags & ACPI_NFIT_MEM_ARMED)
806 flags |= NDD_UNARMED;
807
808 rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
809 if (rc)
810 continue;
811
812 nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
813 acpi_nfit_dimm_attribute_groups,
814 flags, &nfit_mem->dsm_mask);
815 if (!nvdimm)
816 return -ENOMEM;
817
818 nfit_mem->nvdimm = nvdimm;
819 dimm_count++;
820
821 if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
822 continue;
823
824 dev_info(acpi_desc->dev, "%s: failed: %s%s%s%s\n",
825 nvdimm_name(nvdimm),
826 mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "",
827 mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "",
828 mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "",
829 mem_flags & ACPI_NFIT_MEM_ARMED ? "arm " : "");
830
831 }
832
833 return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count);
834}
835
836static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
837{
838 struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
839 const u8 *uuid = to_nfit_uuid(NFIT_DEV_BUS);
840 struct acpi_device *adev;
841 int i;
842
843 adev = to_acpi_dev(acpi_desc);
844 if (!adev)
845 return;
846
847 for (i = ND_CMD_ARS_CAP; i <= ND_CMD_ARS_STATUS; i++)
848 if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i))
849 set_bit(i, &nd_desc->dsm_mask);
850}
851
852static ssize_t range_index_show(struct device *dev,
853 struct device_attribute *attr, char *buf)
854{
855 struct nd_region *nd_region = to_nd_region(dev);
856 struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
857
858 return sprintf(buf, "%d\n", nfit_spa->spa->range_index);
859}
860static DEVICE_ATTR_RO(range_index);
861
862static struct attribute *acpi_nfit_region_attributes[] = {
863 &dev_attr_range_index.attr,
864 NULL,
865};
866
867static struct attribute_group acpi_nfit_region_attribute_group = {
868 .name = "nfit",
869 .attrs = acpi_nfit_region_attributes,
870};
871
872static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
873 &nd_region_attribute_group,
874 &nd_mapping_attribute_group,
875 &nd_device_attribute_group,
876 &nd_numa_attribute_group,
877 &acpi_nfit_region_attribute_group,
878 NULL,
879};
880
881/* enough info to uniquely specify an interleave set */
882struct nfit_set_info {
883 struct nfit_set_info_map {
884 u64 region_offset;
885 u32 serial_number;
886 u32 pad;
887 } mapping[0];
888};
889
890static size_t sizeof_nfit_set_info(int num_mappings)
891{
892 return sizeof(struct nfit_set_info)
893 + num_mappings * sizeof(struct nfit_set_info_map);
894}
895
896static int cmp_map(const void *m0, const void *m1)
897{
898 const struct nfit_set_info_map *map0 = m0;
899 const struct nfit_set_info_map *map1 = m1;
900
901 return memcmp(&map0->region_offset, &map1->region_offset,
902 sizeof(u64));
903}
904
905/* Retrieve the nth entry referencing this spa */
906static struct acpi_nfit_memory_map *memdev_from_spa(
907 struct acpi_nfit_desc *acpi_desc, u16 range_index, int n)
908{
909 struct nfit_memdev *nfit_memdev;
910
911 list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list)
912 if (nfit_memdev->memdev->range_index == range_index)
913 if (n-- == 0)
914 return nfit_memdev->memdev;
915 return NULL;
916}
917
918static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
919 struct nd_region_desc *ndr_desc,
920 struct acpi_nfit_system_address *spa)
921{
922 int i, spa_type = nfit_spa_type(spa);
923 struct device *dev = acpi_desc->dev;
924 struct nd_interleave_set *nd_set;
925 u16 nr = ndr_desc->num_mappings;
926 struct nfit_set_info *info;
927
928 if (spa_type == NFIT_SPA_PM || spa_type == NFIT_SPA_VOLATILE)
929 /* pass */;
930 else
931 return 0;
932
933 nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
934 if (!nd_set)
935 return -ENOMEM;
936
937 info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL);
938 if (!info)
939 return -ENOMEM;
940 for (i = 0; i < nr; i++) {
941 struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
942 struct nfit_set_info_map *map = &info->mapping[i];
943 struct nvdimm *nvdimm = nd_mapping->nvdimm;
944 struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
945 struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc,
946 spa->range_index, i);
947
948 if (!memdev || !nfit_mem->dcr) {
949 dev_err(dev, "%s: failed to find DCR\n", __func__);
950 return -ENODEV;
951 }
952
953 map->region_offset = memdev->region_offset;
954 map->serial_number = nfit_mem->dcr->serial_number;
955 }
956
957 sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map),
958 cmp_map, NULL);
959 nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
960 ndr_desc->nd_set = nd_set;
961 devm_kfree(dev, info);
962
963 return 0;
964}
965
966static u64 to_interleave_offset(u64 offset, struct nfit_blk_mmio *mmio)
967{
968 struct acpi_nfit_interleave *idt = mmio->idt;
969 u32 sub_line_offset, line_index, line_offset;
970 u64 line_no, table_skip_count, table_offset;
971
972 line_no = div_u64_rem(offset, mmio->line_size, &sub_line_offset);
973 table_skip_count = div_u64_rem(line_no, mmio->num_lines, &line_index);
974 line_offset = idt->line_offset[line_index]
975 * mmio->line_size;
976 table_offset = table_skip_count * mmio->table_size;
977
978 return mmio->base_offset + line_offset + table_offset + sub_line_offset;
979}
980
981static u64 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
982{
983 struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
984 u64 offset = nfit_blk->stat_offset + mmio->size * bw;
985
986 if (mmio->num_lines)
987 offset = to_interleave_offset(offset, mmio);
988
989 return readq(mmio->base + offset);
990}
991
/*
 * Program the block control window for an upcoming aperture transfer.
 * The 64-bit command word is packed as the shifts/masks below show:
 *   bits  0..47  device-physical address in cache-line units
 *   bits 48..55  transfer length in cache-line units
 *   bit  56      direction (non-zero = write)
 */
static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
		resource_size_t dpa, unsigned int len, unsigned int write)
{
	u64 cmd, offset;
	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];

	enum {
		BCW_OFFSET_MASK = (1ULL << 48)-1,
		BCW_LEN_SHIFT = 48,
		BCW_LEN_MASK = (1ULL << 8) - 1,
		BCW_CMD_SHIFT = 56,
	};

	/* convert address and length to cache-line granularity, then pack */
	cmd = (dpa >> L1_CACHE_SHIFT) & BCW_OFFSET_MASK;
	len = len >> L1_CACHE_SHIFT;
	cmd |= ((u64) len & BCW_LEN_MASK) << BCW_LEN_SHIFT;
	cmd |= ((u64) write) << BCW_CMD_SHIFT;

	/* each block window 'bw' has its own command register */
	offset = nfit_blk->cmd_offset + mmio->size * bw;
	if (mmio->num_lines)
		offset = to_interleave_offset(offset, mmio);

	writeq(cmd, mmio->base + offset);
	/* FIXME: conditionally perform read-back if mandated by firmware */
}
1017
/*
 * Perform one aperture-sized (or smaller) transfer between @iobuf and
 * the DIMM at device-physical address @dpa, via the block window that
 * @lane owns.  Returns 0 on success, or -EIO if the window's status
 * register is non-zero after the copy.
 */
static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
		resource_size_t dpa, void *iobuf, size_t len, int rw,
		unsigned int lane)
{
	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
	unsigned int copied = 0;
	u64 base_offset;
	int rc;

	/* select this lane's slice of the aperture space */
	base_offset = nfit_blk->bdw_offset + dpa % L1_CACHE_BYTES
		+ lane * mmio->size;
	/* TODO: non-temporal access, flush hints, cache management etc... */
	write_blk_ctl(nfit_blk, lane, dpa, len, rw);
	while (len) {
		unsigned int c;
		u64 offset;

		if (mmio->num_lines) {
			u32 line_offset;

			/* clamp the chunk so it never crosses an interleave line */
			offset = to_interleave_offset(base_offset + copied,
					mmio);
			div_u64_rem(offset, mmio->line_size, &line_offset);
			c = min_t(size_t, len, mmio->line_size - line_offset);
		} else {
			/*
			 * NOTE(review): bdw_offset is already folded into
			 * base_offset above, so it appears to be added twice
			 * on this non-interleaved path -- confirm against the
			 * BDW programming model.
			 */
			offset = base_offset + nfit_blk->bdw_offset;
			c = len;
		}

		if (rw)
			memcpy(mmio->aperture + offset, iobuf + copied, c);
		else
			memcpy(iobuf + copied, mmio->aperture + offset, c);

		copied += c;
		len -= c;
	}
	/* non-zero status means the hardware flagged the transfer */
	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
	return rc;
}
1058
1059static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr,
1060 resource_size_t dpa, void *iobuf, u64 len, int rw)
1061{
1062 struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
1063 struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
1064 struct nd_region *nd_region = nfit_blk->nd_region;
1065 unsigned int lane, copied = 0;
1066 int rc = 0;
1067
1068 lane = nd_region_acquire_lane(nd_region);
1069 while (len) {
1070 u64 c = min(len, mmio->size);
1071
1072 rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied,
1073 iobuf + copied, c, rw, lane);
1074 if (rc)
1075 break;
1076
1077 copied += c;
1078 len -= c;
1079 }
1080 nd_region_release_lane(nd_region, lane);
1081
1082 return rc;
1083}
1084
/*
 * kref release for a SPA mapping: tear down in the reverse order of
 * __nfit_spa_map() -- iounmap before release_mem_region -- then unlink
 * and free.  Runs under spa_map_mutex (asserted below), since kref_put
 * is only called from nfit_spa_unmap() with the mutex held.
 */
static void nfit_spa_mapping_release(struct kref *kref)
{
	struct nfit_spa_mapping *spa_map = to_spa_map(kref);
	struct acpi_nfit_system_address *spa = spa_map->spa;
	struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc;

	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
	dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index);
	iounmap(spa_map->iomem);
	release_mem_region(spa->address, spa->length);
	list_del(&spa_map->list);
	kfree(spa_map);
}
1098
1099static struct nfit_spa_mapping *find_spa_mapping(
1100 struct acpi_nfit_desc *acpi_desc,
1101 struct acpi_nfit_system_address *spa)
1102{
1103 struct nfit_spa_mapping *spa_map;
1104
1105 WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
1106 list_for_each_entry(spa_map, &acpi_desc->spa_maps, list)
1107 if (spa_map->spa == spa)
1108 return spa_map;
1109
1110 return NULL;
1111}
1112
1113static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc,
1114 struct acpi_nfit_system_address *spa)
1115{
1116 struct nfit_spa_mapping *spa_map;
1117
1118 mutex_lock(&acpi_desc->spa_map_mutex);
1119 spa_map = find_spa_mapping(acpi_desc, spa);
1120
1121 if (spa_map)
1122 kref_put(&spa_map->kref, nfit_spa_mapping_release);
1123 mutex_unlock(&acpi_desc->spa_map_mutex);
1124}
1125
/*
 * Find-or-create the mapping for @spa: on a hit, take an extra
 * reference and return the existing mapping; on a miss, reserve the
 * physical range and ioremap it.  Returns the mapped base, or NULL on
 * allocation/reservation/map failure.  Caller holds spa_map_mutex.
 */
static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
		struct acpi_nfit_system_address *spa)
{
	resource_size_t start = spa->address;
	resource_size_t n = spa->length;
	struct nfit_spa_mapping *spa_map;
	struct resource *res;

	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));

	spa_map = find_spa_mapping(acpi_desc, spa);
	if (spa_map) {
		/* already mapped for another region device: share it */
		kref_get(&spa_map->kref);
		return spa_map->iomem;
	}

	spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL);
	if (!spa_map)
		return NULL;

	INIT_LIST_HEAD(&spa_map->list);
	spa_map->spa = spa;
	kref_init(&spa_map->kref);
	spa_map->acpi_desc = acpi_desc;

	res = request_mem_region(start, n, dev_name(acpi_desc->dev));
	if (!res)
		goto err_mem;

	/* TODO: cacheability based on the spa type */
	spa_map->iomem = ioremap_nocache(start, n);
	if (!spa_map->iomem)
		goto err_map;

	list_add_tail(&spa_map->list, &acpi_desc->spa_maps);
	return spa_map->iomem;

	/* unwind in reverse order of acquisition */
 err_map:
	release_mem_region(start, n);
 err_mem:
	kfree(spa_map);
	return NULL;
}
1169
/**
 * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges
 * @acpi_desc: NFIT-descriptor that provided the spa table entry
 * @spa: spa table to map
 *
 * In the case where block-data-window apertures and
 * dimm-control-regions are interleaved they will end up sharing a
 * single request_mem_region() + ioremap() for the address range. In
 * the style of devm nfit_spa_map() mappings are automatically dropped
 * when all region devices referencing the same mapping are disabled /
 * unbound.
 */
static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
		struct acpi_nfit_system_address *spa)
{
	void __iomem *iomem;

	/* serialize lookup/creation against nfit_spa_unmap() */
	mutex_lock(&acpi_desc->spa_map_mutex);
	iomem = __nfit_spa_map(acpi_desc, spa);
	mutex_unlock(&acpi_desc->spa_map_mutex);

	return iomem;
}
1193
1194static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio,
1195 struct acpi_nfit_interleave *idt, u16 interleave_ways)
1196{
1197 if (idt) {
1198 mmio->num_lines = idt->line_count;
1199 mmio->line_size = idt->line_size;
1200 if (interleave_ways == 0)
1201 return -ENXIO;
1202 mmio->table_size = mmio->num_lines * interleave_ways
1203 * mmio->line_size;
1204 }
1205
1206 return 0;
1207}
1208
1209static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
1210 struct device *dev)
1211{
1212 struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
1213 struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
1214 struct nd_blk_region *ndbr = to_nd_blk_region(dev);
1215 struct nfit_blk_mmio *mmio;
1216 struct nfit_blk *nfit_blk;
1217 struct nfit_mem *nfit_mem;
1218 struct nvdimm *nvdimm;
1219 int rc;
1220
1221 nvdimm = nd_blk_region_to_dimm(ndbr);
1222 nfit_mem = nvdimm_provider_data(nvdimm);
1223 if (!nfit_mem || !nfit_mem->dcr || !nfit_mem->bdw) {
1224 dev_dbg(dev, "%s: missing%s%s%s\n", __func__,
1225 nfit_mem ? "" : " nfit_mem",
1226 nfit_mem->dcr ? "" : " dcr",
1227 nfit_mem->bdw ? "" : " bdw");
1228 return -ENXIO;
1229 }
1230
1231 nfit_blk = devm_kzalloc(dev, sizeof(*nfit_blk), GFP_KERNEL);
1232 if (!nfit_blk)
1233 return -ENOMEM;
1234 nd_blk_region_set_provider_data(ndbr, nfit_blk);
1235 nfit_blk->nd_region = to_nd_region(dev);
1236
1237 /* map block aperture memory */
1238 nfit_blk->bdw_offset = nfit_mem->bdw->offset;
1239 mmio = &nfit_blk->mmio[BDW];
1240 mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw);
1241 if (!mmio->base) {
1242 dev_dbg(dev, "%s: %s failed to map bdw\n", __func__,
1243 nvdimm_name(nvdimm));
1244 return -ENOMEM;
1245 }
1246 mmio->size = nfit_mem->bdw->size;
1247 mmio->base_offset = nfit_mem->memdev_bdw->region_offset;
1248 mmio->idt = nfit_mem->idt_bdw;
1249 mmio->spa = nfit_mem->spa_bdw;
1250 rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_bdw,
1251 nfit_mem->memdev_bdw->interleave_ways);
1252 if (rc) {
1253 dev_dbg(dev, "%s: %s failed to init bdw interleave\n",
1254 __func__, nvdimm_name(nvdimm));
1255 return rc;
1256 }
1257
1258 /* map block control memory */
1259 nfit_blk->cmd_offset = nfit_mem->dcr->command_offset;
1260 nfit_blk->stat_offset = nfit_mem->dcr->status_offset;
1261 mmio = &nfit_blk->mmio[DCR];
1262 mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr);
1263 if (!mmio->base) {
1264 dev_dbg(dev, "%s: %s failed to map dcr\n", __func__,
1265 nvdimm_name(nvdimm));
1266 return -ENOMEM;
1267 }
1268 mmio->size = nfit_mem->dcr->window_size;
1269 mmio->base_offset = nfit_mem->memdev_dcr->region_offset;
1270 mmio->idt = nfit_mem->idt_dcr;
1271 mmio->spa = nfit_mem->spa_dcr;
1272 rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_dcr,
1273 nfit_mem->memdev_dcr->interleave_ways);
1274 if (rc) {
1275 dev_dbg(dev, "%s: %s failed to init dcr interleave\n",
1276 __func__, nvdimm_name(nvdimm));
1277 return rc;
1278 }
1279
1280 if (mmio->line_size == 0)
1281 return 0;
1282
1283 if ((u32) nfit_blk->cmd_offset % mmio->line_size
1284 + 8 > mmio->line_size) {
1285 dev_dbg(dev, "cmd_offset crosses interleave boundary\n");
1286 return -ENXIO;
1287 } else if ((u32) nfit_blk->stat_offset % mmio->line_size
1288 + 8 > mmio->line_size) {
1289 dev_dbg(dev, "stat_offset crosses interleave boundary\n");
1290 return -ENXIO;
1291 }
1292
1293 return 0;
1294}
1295
1296static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
1297 struct device *dev)
1298{
1299 struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
1300 struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
1301 struct nd_blk_region *ndbr = to_nd_blk_region(dev);
1302 struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
1303 int i;
1304
1305 if (!nfit_blk)
1306 return; /* never enabled */
1307
1308 /* auto-free BLK spa mappings */
1309 for (i = 0; i < 2; i++) {
1310 struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i];
1311
1312 if (mmio->base)
1313 nfit_spa_unmap(acpi_desc, mmio->spa);
1314 }
1315 nd_blk_region_set_provider_data(ndbr, NULL);
1316 /* devm will free nfit_blk */
1317}
1318
/*
 * Populate one nd_mapping from a memdev entry.  PMEM/VOLATILE ranges
 * only fill in the mapping (region creation happens in the caller);
 * DCR ranges additionally register a BLK region right here, one per
 * memdev.  SPA types other than PM/VOLATILE/DCR are silently ignored.
 */
static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
		struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
		struct acpi_nfit_memory_map *memdev,
		struct acpi_nfit_system_address *spa)
{
	struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc,
			memdev->device_handle);
	struct nd_blk_region_desc *ndbr_desc;
	struct nfit_mem *nfit_mem;
	int blk_valid = 0;

	if (!nvdimm) {
		dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n",
				spa->range_index, memdev->device_handle);
		return -ENODEV;
	}

	nd_mapping->nvdimm = nvdimm;
	switch (nfit_spa_type(spa)) {
	case NFIT_SPA_PM:
	case NFIT_SPA_VOLATILE:
		nd_mapping->start = memdev->address;
		nd_mapping->size = memdev->region_size;
		break;
	case NFIT_SPA_DCR:
		nfit_mem = nvdimm_provider_data(nvdimm);
		if (!nfit_mem || !nfit_mem->bdw) {
			/* not fatal: region is still created, just empty */
			dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n",
					spa->range_index, nvdimm_name(nvdimm));
		} else {
			nd_mapping->size = nfit_mem->bdw->capacity;
			nd_mapping->start = nfit_mem->bdw->start_address;
			ndr_desc->num_lanes = nfit_mem->bdw->windows;
			blk_valid = 1;
		}

		/* with no bdw, blk_valid == 0 registers zero mappings */
		ndr_desc->nd_mapping = nd_mapping;
		ndr_desc->num_mappings = blk_valid;
		ndbr_desc = to_blk_region_desc(ndr_desc);
		ndbr_desc->enable = acpi_nfit_blk_region_enable;
		ndbr_desc->disable = acpi_nfit_blk_region_disable;
		ndbr_desc->do_io = acpi_desc->blk_do_io;
		if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc))
			return -ENOMEM;
		break;
	}

	return 0;
}
1368
/*
 * Translate one SPA table entry into a libnvdimm region: collect every
 * memdev that references the range into nd_mappings, compute the
 * interleave-set cookie, then create a pmem or volatile region (BLK
 * regions are created per-memdev inside acpi_nfit_init_mapping()).
 */
static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
		struct nfit_spa *nfit_spa)
{
	/*
	 * NOTE(review): 'static' keeps this large array off the stack,
	 * which presumes region registration is serialized -- confirm
	 * there are no concurrent callers.
	 */
	static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS];
	struct acpi_nfit_system_address *spa = nfit_spa->spa;
	struct nd_blk_region_desc ndbr_desc;
	struct nd_region_desc *ndr_desc;
	struct nfit_memdev *nfit_memdev;
	struct nvdimm_bus *nvdimm_bus;
	struct resource res;
	int count = 0, rc;

	/* a zero range index is treated as invalid and skipped */
	if (spa->range_index == 0) {
		dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n",
				__func__);
		return 0;
	}

	memset(&res, 0, sizeof(res));
	memset(&nd_mappings, 0, sizeof(nd_mappings));
	memset(&ndbr_desc, 0, sizeof(ndbr_desc));
	res.start = spa->address;
	res.end = res.start + spa->length - 1;
	ndr_desc = &ndbr_desc.ndr_desc;
	ndr_desc->res = &res;
	ndr_desc->provider_data = nfit_spa;
	ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
	if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
		ndr_desc->numa_node = acpi_map_pxm_to_online_node(
				spa->proximity_domain);
	else
		ndr_desc->numa_node = NUMA_NO_NODE;

	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
		struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
		struct nd_mapping *nd_mapping;

		if (memdev->range_index != spa->range_index)
			continue;
		if (count >= ND_MAX_MAPPINGS) {
			dev_err(acpi_desc->dev, "spa%d exceeds max mappings %d\n",
					spa->range_index, ND_MAX_MAPPINGS);
			return -ENXIO;
		}
		nd_mapping = &nd_mappings[count++];
		rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc,
				memdev, spa);
		if (rc)
			return rc;
	}

	ndr_desc->nd_mapping = nd_mappings;
	ndr_desc->num_mappings = count;
	/* compute the interleave-set cookie for namespace validation */
	rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa);
	if (rc)
		return rc;

	nvdimm_bus = acpi_desc->nvdimm_bus;
	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
		if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
			return -ENOMEM;
	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
		if (!nvdimm_volatile_region_create(nvdimm_bus, ndr_desc))
			return -ENOMEM;
	}
	return 0;
}
1436
1437static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
1438{
1439 struct nfit_spa *nfit_spa;
1440
1441 list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
1442 int rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
1443
1444 if (rc)
1445 return rc;
1446 }
1447 return 0;
1448}
1449
/**
 * acpi_nfit_init - parse the NFIT and register dimms and regions
 * @acpi_desc: descriptor with ->dev and ->nfit already populated
 * @sz: total size in bytes of the NFIT table
 *
 * Walks the sub-tables that follow the NFIT header, assembles the
 * per-dimm state, then registers dimms and regions on the nvdimm bus.
 * Returns 0 on success or a negative errno.
 */
int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
{
	struct device *dev = acpi_desc->dev;
	const void *end;
	u8 *data;
	int rc;

	INIT_LIST_HEAD(&acpi_desc->spa_maps);
	INIT_LIST_HEAD(&acpi_desc->spas);
	INIT_LIST_HEAD(&acpi_desc->dcrs);
	INIT_LIST_HEAD(&acpi_desc->bdws);
	INIT_LIST_HEAD(&acpi_desc->idts);
	INIT_LIST_HEAD(&acpi_desc->memdevs);
	INIT_LIST_HEAD(&acpi_desc->dimms);
	mutex_init(&acpi_desc->spa_map_mutex);

	/* skip the fixed header, then consume sub-tables until end or error */
	data = (u8 *) acpi_desc->nfit;
	end = data + sz;
	data += sizeof(struct acpi_table_nfit);
	while (!IS_ERR_OR_NULL(data))
		data = add_table(acpi_desc, data, end);

	if (IS_ERR(data)) {
		dev_dbg(dev, "%s: nfit table parsing error: %ld\n", __func__,
				PTR_ERR(data));
		return PTR_ERR(data);
	}

	if (nfit_mem_init(acpi_desc) != 0)
		return -ENOMEM;

	acpi_nfit_init_dsms(acpi_desc);

	rc = acpi_nfit_register_dimms(acpi_desc);
	if (rc)
		return rc;

	return acpi_nfit_register_regions(acpi_desc);
}
EXPORT_SYMBOL_GPL(acpi_nfit_init);
1490
/*
 * ACPI driver .add: locate the NFIT, allocate the descriptor (devm),
 * register the nvdimm bus, then parse and enumerate.  On init failure
 * the bus is unregistered here, so .remove only sees the success path.
 */
static int acpi_nfit_add(struct acpi_device *adev)
{
	struct nvdimm_bus_descriptor *nd_desc;
	struct acpi_nfit_desc *acpi_desc;
	struct device *dev = &adev->dev;
	struct acpi_table_header *tbl;
	acpi_status status = AE_OK;
	acpi_size sz;
	int rc;

	status = acpi_get_table_with_size("NFIT", 0, &tbl, &sz);
	if (ACPI_FAILURE(status)) {
		dev_err(dev, "failed to find NFIT\n");
		return -ENXIO;
	}

	acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
	if (!acpi_desc)
		return -ENOMEM;

	dev_set_drvdata(dev, acpi_desc);
	acpi_desc->dev = dev;
	acpi_desc->nfit = (struct acpi_table_nfit *) tbl;
	/* indirection so BLK i/o can be intercepted (e.g. by test code) */
	acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io;
	nd_desc = &acpi_desc->nd_desc;
	nd_desc->provider_name = "ACPI.NFIT";
	nd_desc->ndctl = acpi_nfit_ctl;
	nd_desc->attr_groups = acpi_nfit_attribute_groups;

	acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc);
	if (!acpi_desc->nvdimm_bus)
		return -ENXIO;

	rc = acpi_nfit_init(acpi_desc, sz);
	if (rc) {
		nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
		return rc;
	}
	return 0;
}
1531
/* ACPI driver .remove: tear down the nvdimm bus; devm frees the rest. */
static int acpi_nfit_remove(struct acpi_device *adev)
{
	struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);

	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
	return 0;
}
1539
/* "ACPI0012" is the NVDIMM root device this driver binds to */
static const struct acpi_device_id acpi_nfit_ids[] = {
	{ "ACPI0012", 0 },
	{ "", 0 },
};
MODULE_DEVICE_TABLE(acpi, acpi_nfit_ids);

static struct acpi_driver acpi_nfit_driver = {
	.name = KBUILD_MODNAME,
	.ids = acpi_nfit_ids,
	.ops = {
		.add = acpi_nfit_add,
		.remove = acpi_nfit_remove,
	},
};
1554
/*
 * Module init: sanity-check that the ACPICA structures still match the
 * expected on-wire NFIT sub-table sizes, pre-parse the well-known
 * UUIDs used for SPA-type classification, and register the driver.
 */
static __init int nfit_init(void)
{
	BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40);
	BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 56);
	BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48);
	BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 20);
	BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 9);
	BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80);
	BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40);

	/* convert the UUID string constants into binary form once */
	acpi_str_to_uuid(UUID_VOLATILE_MEMORY, nfit_uuid[NFIT_SPA_VOLATILE]);
	acpi_str_to_uuid(UUID_PERSISTENT_MEMORY, nfit_uuid[NFIT_SPA_PM]);
	acpi_str_to_uuid(UUID_CONTROL_REGION, nfit_uuid[NFIT_SPA_DCR]);
	acpi_str_to_uuid(UUID_DATA_REGION, nfit_uuid[NFIT_SPA_BDW]);
	acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_VDISK]);
	acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_CD, nfit_uuid[NFIT_SPA_VCD]);
	acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_PDISK]);
	acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_CD, nfit_uuid[NFIT_SPA_PCD]);
	acpi_str_to_uuid(UUID_NFIT_BUS, nfit_uuid[NFIT_DEV_BUS]);
	acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]);

	return acpi_bus_register_driver(&acpi_nfit_driver);
}
1578
/* Module exit: unregister the ACPI driver (mirrors nfit_init()). */
static __exit void nfit_exit(void)
{
	acpi_bus_unregister_driver(&acpi_nfit_driver);
}
1583
1584module_init(nfit_init);
1585module_exit(nfit_exit);
1586MODULE_LICENSE("GPL v2");
1587MODULE_AUTHOR("Intel Corporation");
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
new file mode 100644
index 000000000000..81f2e8c5a79c
--- /dev/null
+++ b/drivers/acpi/nfit.h
@@ -0,0 +1,158 @@
1/*
2 * NVDIMM Firmware Interface Table - NFIT
3 *
4 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of version 2 of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15#ifndef __NFIT_H__
16#define __NFIT_H__
17#include <linux/libnvdimm.h>
18#include <linux/types.h>
19#include <linux/uuid.h>
20#include <linux/acpi.h>
21#include <acpi/acuuid.h>
22
23#define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
24#define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
25#define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
26 | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
27 | ACPI_NFIT_MEM_ARMED)
28
29enum nfit_uuids {
30 NFIT_SPA_VOLATILE,
31 NFIT_SPA_PM,
32 NFIT_SPA_DCR,
33 NFIT_SPA_BDW,
34 NFIT_SPA_VDISK,
35 NFIT_SPA_VCD,
36 NFIT_SPA_PDISK,
37 NFIT_SPA_PCD,
38 NFIT_DEV_BUS,
39 NFIT_DEV_DIMM,
40 NFIT_UUID_MAX,
41};
42
43struct nfit_spa {
44 struct acpi_nfit_system_address *spa;
45 struct list_head list;
46};
47
48struct nfit_dcr {
49 struct acpi_nfit_control_region *dcr;
50 struct list_head list;
51};
52
53struct nfit_bdw {
54 struct acpi_nfit_data_region *bdw;
55 struct list_head list;
56};
57
58struct nfit_idt {
59 struct acpi_nfit_interleave *idt;
60 struct list_head list;
61};
62
63struct nfit_memdev {
64 struct acpi_nfit_memory_map *memdev;
65 struct list_head list;
66};
67
68/* assembled tables for a given dimm/memory-device */
69struct nfit_mem {
70 struct nvdimm *nvdimm;
71 struct acpi_nfit_memory_map *memdev_dcr;
72 struct acpi_nfit_memory_map *memdev_pmem;
73 struct acpi_nfit_memory_map *memdev_bdw;
74 struct acpi_nfit_control_region *dcr;
75 struct acpi_nfit_data_region *bdw;
76 struct acpi_nfit_system_address *spa_dcr;
77 struct acpi_nfit_system_address *spa_bdw;
78 struct acpi_nfit_interleave *idt_dcr;
79 struct acpi_nfit_interleave *idt_bdw;
80 struct list_head list;
81 struct acpi_device *adev;
82 unsigned long dsm_mask;
83};
84
85struct acpi_nfit_desc {
86 struct nvdimm_bus_descriptor nd_desc;
87 struct acpi_table_nfit *nfit;
88 struct mutex spa_map_mutex;
89 struct list_head spa_maps;
90 struct list_head memdevs;
91 struct list_head dimms;
92 struct list_head spas;
93 struct list_head dcrs;
94 struct list_head bdws;
95 struct list_head idts;
96 struct nvdimm_bus *nvdimm_bus;
97 struct device *dev;
98 unsigned long dimm_dsm_force_en;
99 int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
100 void *iobuf, u64 len, int rw);
101};
102
103enum nd_blk_mmio_selector {
104 BDW,
105 DCR,
106};
107
108struct nfit_blk {
109 struct nfit_blk_mmio {
110 union {
111 void __iomem *base;
112 void *aperture;
113 };
114 u64 size;
115 u64 base_offset;
116 u32 line_size;
117 u32 num_lines;
118 u32 table_size;
119 struct acpi_nfit_interleave *idt;
120 struct acpi_nfit_system_address *spa;
121 } mmio[2];
122 struct nd_region *nd_region;
123 u64 bdw_offset; /* post interleave offset */
124 u64 stat_offset;
125 u64 cmd_offset;
126};
127
128struct nfit_spa_mapping {
129 struct acpi_nfit_desc *acpi_desc;
130 struct acpi_nfit_system_address *spa;
131 struct list_head list;
132 struct kref kref;
133 void __iomem *iomem;
134};
135
136static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
137{
138 return container_of(kref, struct nfit_spa_mapping, kref);
139}
140
/* Prefer the DCR-flavored memdev when present, else fall back to pmem. */
static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
		struct nfit_mem *nfit_mem)
{
	if (nfit_mem->memdev_dcr)
		return nfit_mem->memdev_dcr;
	return nfit_mem->memdev_pmem;
}
148
149static inline struct acpi_nfit_desc *to_acpi_desc(
150 struct nvdimm_bus_descriptor *nd_desc)
151{
152 return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
153}
154
155const u8 *to_nfit_uuid(enum nfit_uuids id);
156int acpi_nfit_init(struct acpi_nfit_desc *nfit, acpi_size sz);
157extern const struct attribute_group *acpi_nfit_attribute_groups[];
158#endif /* __NFIT_H__ */
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 1333cbdc3ea2..acaa3b4ea504 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -29,6 +29,8 @@
29#include <linux/errno.h> 29#include <linux/errno.h>
30#include <linux/acpi.h> 30#include <linux/acpi.h>
31#include <linux/numa.h> 31#include <linux/numa.h>
32#include <linux/nodemask.h>
33#include <linux/topology.h>
32 34
33#define PREFIX "ACPI: " 35#define PREFIX "ACPI: "
34 36
@@ -70,7 +72,12 @@ static void __acpi_map_pxm_to_node(int pxm, int node)
70 72
71int acpi_map_pxm_to_node(int pxm) 73int acpi_map_pxm_to_node(int pxm)
72{ 74{
73 int node = pxm_to_node_map[pxm]; 75 int node;
76
77 if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
78 return NUMA_NO_NODE;
79
80 node = pxm_to_node_map[pxm];
74 81
75 if (node == NUMA_NO_NODE) { 82 if (node == NUMA_NO_NODE) {
76 if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) 83 if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
@@ -83,6 +90,45 @@ int acpi_map_pxm_to_node(int pxm)
83 return node; 90 return node;
84} 91}
85 92
93/**
94 * acpi_map_pxm_to_online_node - Map proximity ID to online node
95 * @pxm: ACPI proximity ID
96 *
97 * This is similar to acpi_map_pxm_to_node(), but always returns an online
98 * node. When the mapped node from a given proximity ID is offline, it
99 * looks up the node distance table and returns the nearest online node.
100 *
101 * ACPI device drivers, which are called after the NUMA initialization has
102 * completed in the kernel, can call this interface to obtain their device
103 * NUMA topology from ACPI tables. Such drivers do not have to deal with
104 * offline nodes. A node may be offline when a device proximity ID is
105 * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
106 * "numa=off" on x86.
107 */
108int acpi_map_pxm_to_online_node(int pxm)
109{
110 int node, n, dist, min_dist;
111
112 node = acpi_map_pxm_to_node(pxm);
113
114 if (node == NUMA_NO_NODE)
115 node = 0;
116
117 if (!node_online(node)) {
118 min_dist = INT_MAX;
119 for_each_online_node(n) {
120 dist = node_distance(node, n);
121 if (dist < min_dist) {
122 min_dist = dist;
123 node = n;
124 }
125 }
126 }
127
128 return node;
129}
130EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
131
86static void __init 132static void __init
87acpi_table_print_srat_entry(struct acpi_subtable_header *header) 133acpi_table_print_srat_entry(struct acpi_subtable_header *header)
88{ 134{
@@ -328,8 +374,6 @@ int acpi_get_node(acpi_handle handle)
328 int pxm; 374 int pxm;
329 375
330 pxm = acpi_get_pxm(handle); 376 pxm = acpi_get_pxm(handle);
331 if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
332 return NUMA_NO_NODE;
333 377
334 return acpi_map_pxm_to_node(pxm); 378 return acpi_map_pxm_to_node(pxm);
335} 379}
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 3ccef9eba6f9..1b8094d4d7af 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,18 +404,6 @@ config BLK_DEV_RAM_DAX
404 and will prevent RAM block device backing store memory from being 404 and will prevent RAM block device backing store memory from being
405 allocated from highmem (only a problem for highmem systems). 405 allocated from highmem (only a problem for highmem systems).
406 406
407config BLK_DEV_PMEM
408 tristate "Persistent memory block device support"
409 depends on HAS_IOMEM
410 help
411 Saying Y here will allow you to use a contiguous range of reserved
412 memory as one or more persistent block devices.
413
414 To compile this driver as a module, choose M here: the module will be
415 called 'pmem'.
416
417 If unsure, say N.
418
419config CDROM_PKTCDVD 407config CDROM_PKTCDVD
420 tristate "Packet writing on CD/DVD media" 408 tristate "Packet writing on CD/DVD media"
421 depends on !UML 409 depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 9cc6c18a1c7e..02b688d1438d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,7 +14,6 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o
14obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o 14obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
15obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o 15obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
16obj-$(CONFIG_BLK_DEV_RAM) += brd.o 16obj-$(CONFIG_BLK_DEV_RAM) += brd.o
17obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
18obj-$(CONFIG_BLK_DEV_LOOP) += loop.o 17obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
19obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o 18obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o
20obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o 19obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
new file mode 100644
index 000000000000..72226acb5c0f
--- /dev/null
+++ b/drivers/nvdimm/Kconfig
@@ -0,0 +1,68 @@
1menuconfig LIBNVDIMM
2 tristate "NVDIMM (Non-Volatile Memory Device) Support"
3 depends on PHYS_ADDR_T_64BIT
4 depends on BLK_DEV
5 help
6 Generic support for non-volatile memory devices including
7 ACPI-6-NFIT defined resources. On platforms that define an
8 NFIT, or otherwise can discover NVDIMM resources, a libnvdimm
9 bus is registered to advertise PMEM (persistent memory)
10 namespaces (/dev/pmemX) and BLK (sliding mmio window(s))
11 namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a
12 memory resource that may span multiple DIMMs and support DAX
13 (see CONFIG_DAX). A BLK namespace refers to an NVDIMM control
14 region which exposes an mmio register set for windowed access
15 mode to non-volatile memory.
16
17if LIBNVDIMM
18
19config BLK_DEV_PMEM
20 tristate "PMEM: Persistent memory block device support"
21 default LIBNVDIMM
22 depends on HAS_IOMEM
23 select ND_BTT if BTT
24 help
25 Memory ranges for PMEM are described by either an NFIT
26 (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a
27 non-standard OEM-specific E820 memory type (type-12, see
28 CONFIG_X86_PMEM_LEGACY), or it is manually specified by the
29 'memmap=nn[KMG]!ss[KMG]' kernel command line (see
30 Documentation/kernel-parameters.txt). This driver converts
31 these persistent memory ranges into block devices that are
32 capable of DAX (direct-access) file system mappings. See
33 Documentation/nvdimm/nvdimm.txt for more details.
34
35 Say Y if you want to use an NVDIMM
36
37config ND_BLK
38 tristate "BLK: Block data window (aperture) device support"
39 default LIBNVDIMM
40 select ND_BTT if BTT
41 help
42 Support NVDIMMs, or other devices, that implement a BLK-mode
43 access capability. BLK-mode access uses memory-mapped-i/o
44 apertures to access persistent media.
45
46 Say Y if your platform firmware emits an ACPI.NFIT table
47 (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode
48 capabilities.
49
50config ND_BTT
51 tristate
52
53config BTT
54 bool "BTT: Block Translation Table (atomic sector updates)"
55 default y if LIBNVDIMM
56 help
57 The Block Translation Table (BTT) provides atomic sector
58 update semantics for persistent memory devices, so that
59 applications that rely on sector writes not being torn (a
60 guarantee that typical disks provide) can continue to do so.
61 The BTT manifests itself as an alternate personality for an
62 NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX,
63 ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys,
64 etc...).
65
66 Select Y if unsure
67
68endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
new file mode 100644
index 000000000000..594bb97c867a
--- /dev/null
+++ b/drivers/nvdimm/Makefile
@@ -0,0 +1,20 @@
1obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
2obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
3obj-$(CONFIG_ND_BTT) += nd_btt.o
4obj-$(CONFIG_ND_BLK) += nd_blk.o
5
6nd_pmem-y := pmem.o
7
8nd_btt-y := btt.o
9
10nd_blk-y := blk.o
11
12libnvdimm-y := core.o
13libnvdimm-y += bus.o
14libnvdimm-y += dimm_devs.o
15libnvdimm-y += dimm.o
16libnvdimm-y += region_devs.o
17libnvdimm-y += region.o
18libnvdimm-y += namespace_devs.o
19libnvdimm-y += label.o
20libnvdimm-$(CONFIG_BTT) += btt_devs.o
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
new file mode 100644
index 000000000000..4f97b248c236
--- /dev/null
+++ b/drivers/nvdimm/blk.c
@@ -0,0 +1,384 @@
1/*
2 * NVDIMM Block Window Driver
3 * Copyright (c) 2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/blkdev.h>
16#include <linux/fs.h>
17#include <linux/genhd.h>
18#include <linux/module.h>
19#include <linux/moduleparam.h>
20#include <linux/nd.h>
21#include <linux/sizes.h>
22#include "nd.h"
23
/* Per-namespace driver state for one BLK-mode disk */
struct nd_blk_device {
	struct request_queue *queue;
	struct gendisk *disk;
	struct nd_namespace_blk *nsblk;	/* backing BLK namespace */
	struct nd_blk_region *ndbr;	/* region supplying the do_io() aperture op */
	size_t disk_size;		/* raw namespace capacity in bytes */
	u32 sector_size;		/* logical block size exported to the block layer */
	u32 internal_lbasize;		/* nsblk->lbasize rounded up for alignment */
};

/* dynamically allocated block major, assigned in nd_blk_init() */
static int nd_blk_major;
35
/*
 * Bytes of per-sector metadata: the namespace lbasize minus the data
 * sector size exported to the block layer (0 means no metadata).
 */
static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev)
{
	return blk_dev->nsblk->lbasize - blk_dev->sector_size;
}
40
/*
 * Translate a linear namespace offset into an absolute device offset by
 * walking the namespace's (possibly discontiguous) resource ranges.
 * Returns SIZE_MAX if the request is out of range or straddles a range
 * boundary.
 */
static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
		resource_size_t ns_offset, unsigned int len)
{
	int i;

	for (i = 0; i < nsblk->num_resources; i++) {
		if (ns_offset < resource_size(nsblk->res[i])) {
			/* a single request must not cross a resource boundary */
			if (ns_offset + len > resource_size(nsblk->res[i])) {
				dev_WARN_ONCE(&nsblk->common.dev, 1,
					"illegal request\n");
				return SIZE_MAX;
			}
			return nsblk->res[i]->start + ns_offset;
		}
		ns_offset -= resource_size(nsblk->res[i]);
	}

	dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n");
	return SIZE_MAX;
}
61
#ifdef CONFIG_BLK_DEV_INTEGRITY
/*
 * Transfer the per-sector metadata for 'lba' between the bio's
 * integrity payload and the namespace.  The metadata lives after the
 * data bytes of the internal lba.  Returns 0, or the first error from
 * the region's do_io() op.
 */
static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
				struct bio_integrity_payload *bip, u64 lba,
				int rw)
{
	unsigned int len = nd_blk_meta_size(blk_dev);
	resource_size_t dev_offset, ns_offset;
	struct nd_namespace_blk *nsblk;
	struct nd_blk_region *ndbr;
	int err = 0;

	nsblk = blk_dev->nsblk;
	ndbr = blk_dev->ndbr;
	/* metadata sits immediately after the data within the internal lba */
	ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size;
	dev_offset = to_dev_offset(nsblk, ns_offset, len);
	if (dev_offset == SIZE_MAX)
		return -EIO;

	while (len) {
		unsigned int cur_len;
		struct bio_vec bv;
		void *iobuf;

		bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
		/*
		 * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
		 * .bv_offset already adjusted for iter->bi_bvec_done, and we
		 * can use those directly
		 */

		cur_len = min(len, bv.bv_len);
		iobuf = kmap_atomic(bv.bv_page);
		err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset,
				cur_len, rw);
		kunmap_atomic(iobuf);
		if (err)
			return err;

		len -= cur_len;
		dev_offset += cur_len;
		bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len);
	}

	return err;
}

#else /* CONFIG_BLK_DEV_INTEGRITY */
/* no block-layer integrity support: metadata transfer is a no-op */
static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
				struct bio_integrity_payload *bip, u64 lba,
				int rw)
{
	return 0;
}
#endif
116
/*
 * Transfer the data (and, when an integrity payload is present, the
 * per-sector metadata) for one bio_vec.  Without a payload the whole
 * bvec is issued as a single do_io(); with one, the transfer is split
 * per logical sector so data and metadata stay paired.
 */
static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
		struct bio_integrity_payload *bip, struct page *page,
		unsigned int len, unsigned int off, int rw,
		sector_t sector)
{
	struct nd_blk_region *ndbr = blk_dev->ndbr;
	resource_size_t dev_offset, ns_offset;
	int err = 0;
	void *iobuf;
	u64 lba;

	while (len) {
		unsigned int cur_len;

		/*
		 * If we don't have an integrity payload, we don't have to
		 * split the bvec into sectors, as this would cause unnecessary
		 * Block Window setup/move steps. the do_io routine is capable
		 * of handling len <= PAGE_SIZE.
		 */
		cur_len = bip ? min(len, blk_dev->sector_size) : len;

		/* map the 512-byte sector index onto the exported sector size */
		lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size);
		ns_offset = lba * blk_dev->internal_lbasize;
		dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len);
		if (dev_offset == SIZE_MAX)
			return -EIO;

		iobuf = kmap_atomic(page);
		err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw);
		kunmap_atomic(iobuf);
		if (err)
			return err;

		if (bip) {
			err = nd_blk_rw_integrity(blk_dev, bip, lba, rw);
			if (err)
				return err;
		}
		len -= cur_len;
		off += cur_len;
		/* advance by one exported logical sector */
		sector += blk_dev->sector_size >> SECTOR_SHIFT;
	}

	return err;
}
163
/*
 * make_request entry point: iterate the bio's segments and issue each
 * through nd_blk_do_bvec().  I/O is synchronous; the bio is completed
 * with the first error encountered, if any.
 */
static void nd_blk_make_request(struct request_queue *q, struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct gendisk *disk = bdev->bd_disk;
	struct bio_integrity_payload *bip;
	struct nd_blk_device *blk_dev;
	struct bvec_iter iter;
	unsigned long start;
	struct bio_vec bvec;
	int err = 0, rw;
	bool do_acct;

	/*
	 * bio_integrity_enabled also checks if the bio already has an
	 * integrity payload attached. If it does, we *don't* do a
	 * bio_integrity_prep here - the payload has been generated by
	 * another kernel subsystem, and we just pass it through.
	 */
	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		err = -EIO;
		goto out;
	}

	bip = bio_integrity(bio);
	blk_dev = disk->private_data;
	rw = bio_data_dir(bio);
	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		unsigned int len = bvec.bv_len;

		BUG_ON(len > PAGE_SIZE);
		err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len,
					bvec.bv_offset, rw, iter.bi_sector);
		if (err) {
			dev_info(&blk_dev->nsblk->common.dev,
					"io error in %s sector %lld, len %d,\n",
					(rw == READ) ? "READ" : "WRITE",
					(unsigned long long) iter.bi_sector, len);
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

 out:
	bio_endio(bio, err);
}
211
212static int nd_blk_rw_bytes(struct nd_namespace_common *ndns,
213 resource_size_t offset, void *iobuf, size_t n, int rw)
214{
215 struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim);
216 struct nd_namespace_blk *nsblk = blk_dev->nsblk;
217 struct nd_blk_region *ndbr = blk_dev->ndbr;
218 resource_size_t dev_offset;
219
220 dev_offset = to_dev_offset(nsblk, offset, n);
221
222 if (unlikely(offset + n > blk_dev->disk_size)) {
223 dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
224 return -EFAULT;
225 }
226
227 if (dev_offset == SIZE_MAX)
228 return -EIO;
229
230 return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw);
231}
232
static const struct block_device_operations nd_blk_fops = {
	.owner = THIS_MODULE,
	/* pick up capacity/integrity changes on rescan */
	.revalidate_disk = nvdimm_revalidate_disk,
};
237
/*
 * Allocate the request queue and gendisk for a BLK namespace and make
 * it live.  The disk is added with zero capacity first so the optional
 * integrity profile can be registered before the real capacity is set
 * and the disk revalidated.  Unwinds the queue/disk on failure.
 */
static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
		struct nd_blk_device *blk_dev)
{
	resource_size_t available_disk_size;
	struct gendisk *disk;
	u64 internal_nlba;

	/* usable capacity: whole internal lbas, data bytes only */
	internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize);
	available_disk_size = internal_nlba * blk_dev->sector_size;

	blk_dev->queue = blk_alloc_queue(GFP_KERNEL);
	if (!blk_dev->queue)
		return -ENOMEM;

	blk_queue_make_request(blk_dev->queue, nd_blk_make_request);
	blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX);
	blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY);
	blk_queue_logical_block_size(blk_dev->queue, blk_dev->sector_size);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue);

	disk = blk_dev->disk = alloc_disk(0);
	if (!disk) {
		blk_cleanup_queue(blk_dev->queue);
		return -ENOMEM;
	}

	disk->driverfs_dev = &ndns->dev;
	disk->major = nd_blk_major;
	disk->first_minor = 0;
	disk->fops = &nd_blk_fops;
	disk->private_data = blk_dev;
	disk->queue = blk_dev->queue;
	disk->flags = GENHD_FL_EXT_DEVT;
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
	set_capacity(disk, 0);
	add_disk(disk);

	/* register an integrity profile if the lbasize carries metadata */
	if (nd_blk_meta_size(blk_dev)) {
		int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev));

		if (rc) {
			del_gendisk(disk);
			put_disk(disk);
			blk_cleanup_queue(blk_dev->queue);
			return rc;
		}
	}

	set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
	revalidate_disk(disk);
	return 0;
}
290
/*
 * Driver probe: bind either a BTT stacked on this namespace or a raw
 * BLK disk.  A successful nd_btt_probe() means a BTT instance was
 * registered, so this probe deliberately fails with -ENXIO and the
 * device comes back through the BTT personality.
 */
static int nd_blk_probe(struct device *dev)
{
	struct nd_namespace_common *ndns;
	struct nd_namespace_blk *nsblk;
	struct nd_blk_device *blk_dev;
	int rc;

	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL);
	if (!blk_dev)
		return -ENOMEM;

	nsblk = to_nd_namespace_blk(&ndns->dev);
	blk_dev->disk_size = nvdimm_namespace_capacity(ndns);
	blk_dev->ndbr = to_nd_blk_region(dev->parent);
	blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev);
	/* pad each lba so sector + metadata lands on an aligned boundary */
	blk_dev->internal_lbasize = roundup(nsblk->lbasize,
						INT_LBASIZE_ALIGNMENT);
	blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 4096 : 512);
	dev_set_drvdata(dev, blk_dev);

	ndns->rw_bytes = nd_blk_rw_bytes;
	if (is_nd_btt(dev))
		rc = nvdimm_namespace_attach_btt(ndns);
	else if (nd_btt_probe(ndns, blk_dev) == 0) {
		/* we'll come back as btt-blk */
		rc = -ENXIO;
	} else
		rc = nd_blk_attach_disk(ndns, blk_dev);
	if (rc)
		kfree(blk_dev);
	return rc;
}
327
/* Tear down the gendisk and its queue (reverse of nd_blk_attach_disk) */
static void nd_blk_detach_disk(struct nd_blk_device *blk_dev)
{
	del_gendisk(blk_dev->disk);
	put_disk(blk_dev->disk);
	blk_cleanup_queue(blk_dev->queue);
}
334
/* Driver remove: detach whichever personality (BTT or raw disk) probe bound */
static int nd_blk_remove(struct device *dev)
{
	struct nd_blk_device *blk_dev = dev_get_drvdata(dev);

	if (is_nd_btt(dev))
		nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
	else
		nd_blk_detach_disk(blk_dev);
	kfree(blk_dev);

	return 0;
}
347
static struct nd_device_driver nd_blk_driver = {
	.probe = nd_blk_probe,
	.remove = nd_blk_remove,
	.drv = {
		.name = "nd_blk",
	},
	/* bind to BLK-mode namespace devices on the nvdimm bus */
	.type = ND_DRIVER_NAMESPACE_BLK,
};
356
/* Module init: reserve a dynamic block major, then register on the nd bus */
static int __init nd_blk_init(void)
{
	int rc;

	rc = register_blkdev(0, "nd_blk");
	if (rc < 0)
		return rc;

	nd_blk_major = rc;
	rc = nd_driver_register(&nd_blk_driver);

	/* undo the major registration if bus registration failed */
	if (rc < 0)
		unregister_blkdev(nd_blk_major, "nd_blk");

	return rc;
}
373
/* Module exit: unwind nd_blk_init() in reverse order */
static void __exit nd_blk_exit(void)
{
	driver_unregister(&nd_blk_driver.drv);
	unregister_blkdev(nd_blk_major, "nd_blk");
}

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK);
module_init(nd_blk_init);
module_exit(nd_blk_exit);
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
new file mode 100644
index 000000000000..411c7b2bb37a
--- /dev/null
+++ b/drivers/nvdimm/btt.c
@@ -0,0 +1,1479 @@
1/*
2 * Block Translation Table
3 * Copyright (c) 2014-2015, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#include <linux/highmem.h>
15#include <linux/debugfs.h>
16#include <linux/blkdev.h>
17#include <linux/module.h>
18#include <linux/device.h>
19#include <linux/mutex.h>
20#include <linux/hdreg.h>
21#include <linux/genhd.h>
22#include <linux/sizes.h>
23#include <linux/ndctl.h>
24#include <linux/fs.h>
25#include <linux/nd.h>
26#include "btt.h"
27#include "nd.h"
28
/* selector for btt_log_read(): fetch the newer or the older slot of a pair */
enum log_ent_request {
	LOG_NEW_ENT = 0,
	LOG_OLD_ENT
};

/* dynamically allocated block major for BTT disks */
static int btt_major;
35
/*
 * Read 'n' bytes at an arena-relative 'offset' from the backing
 * namespace via its rw_bytes op.
 */
static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
		void *buf, size_t n)
{
	struct nd_btt *nd_btt = arena->nd_btt;
	struct nd_namespace_common *ndns = nd_btt->ndns;

	/* arena offsets are 4K from the base of the device */
	offset += SZ_4K;
	return nvdimm_read_bytes(ndns, offset, buf, n);
}
46
/*
 * Write 'n' bytes at an arena-relative 'offset' to the backing
 * namespace via its rw_bytes op.
 */
static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
		void *buf, size_t n)
{
	struct nd_btt *nd_btt = arena->nd_btt;
	struct nd_namespace_common *ndns = nd_btt->ndns;

	/* arena offsets are 4K from the base of the device */
	offset += SZ_4K;
	return nvdimm_write_bytes(ndns, offset, buf, n);
}
57
/*
 * Persist the arena info block; the backup copy (info2off) is written
 * before the primary (infooff).
 */
static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
{
	int ret;

	ret = arena_write_bytes(arena, arena->info2off, super,
			sizeof(struct btt_sb));
	if (ret)
		return ret;

	return arena_write_bytes(arena, arena->infooff, super,
			sizeof(struct btt_sb));
}
70
/* Read the primary arena info block into 'super' */
static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
{
	WARN_ON(!super);
	return arena_read_bytes(arena, arena->infooff, super,
			sizeof(struct btt_sb));
}
77
/*
 * 'raw' version of btt_map write
 * Assumptions:
 *   mapping is in little-endian
 *   mapping contains 'E' and 'Z' flags as desired
 */
static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping)
{
	/* map entries are fixed-size and indexed directly by premap lba */
	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);

	WARN_ON(lba >= arena->external_nlba);
	return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE);
}
91
92static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
93 u32 z_flag, u32 e_flag)
94{
95 u32 ze;
96 __le32 mapping_le;
97
98 /*
99 * This 'mapping' is supposed to be just the LBA mapping, without
100 * any flags set, so strip the flag bits.
101 */
102 mapping &= MAP_LBA_MASK;
103
104 ze = (z_flag << 1) + e_flag;
105 switch (ze) {
106 case 0:
107 /*
108 * We want to set neither of the Z or E flags, and
109 * in the actual layout, this means setting the bit
110 * positions of both to '1' to indicate a 'normal'
111 * map entry
112 */
113 mapping |= MAP_ENT_NORMAL;
114 break;
115 case 1:
116 mapping |= (1 << MAP_ERR_SHIFT);
117 break;
118 case 2:
119 mapping |= (1 << MAP_TRIM_SHIFT);
120 break;
121 default:
122 /*
123 * The case where Z and E are both sent in as '1' could be
124 * construed as a valid 'normal' case, but we decide not to,
125 * to avoid confusion
126 */
127 WARN_ONCE(1, "Invalid use of Z and E flags\n");
128 return -EIO;
129 }
130
131 mapping_le = cpu_to_le32(mapping);
132 return __btt_map_write(arena, lba, mapping_le);
133}
134
/*
 * Read the map entry for 'lba' and decode its Z/E flag bits.
 * *mapping receives the postmap block (identity mapping for an
 * untouched entry); *trim and *error, when non-NULL, receive the
 * decoded flag state.
 */
static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
		int *trim, int *error)
{
	int ret;
	__le32 in;
	u32 raw_mapping, postmap, ze, z_flag, e_flag;
	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);

	WARN_ON(lba >= arena->external_nlba);

	ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE);
	if (ret)
		return ret;

	raw_mapping = le32_to_cpu(in);

	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
	ze = (z_flag << 1) + e_flag;
	postmap = raw_mapping & MAP_LBA_MASK;

	/* Reuse the {z,e}_flag variables for *trim and *error */
	z_flag = 0;
	e_flag = 0;

	switch (ze) {
	case 0:
		/* Initial state. Return postmap = premap */
		*mapping = lba;
		break;
	case 1:
		/* only the error bit set */
		*mapping = postmap;
		e_flag = 1;
		break;
	case 2:
		/* only the trim/zero bit set */
		*mapping = postmap;
		z_flag = 1;
		break;
	case 3:
		/* both bits set: established 'normal' mapping */
		*mapping = postmap;
		break;
	default:
		return -EIO;
	}

	if (trim)
		*trim = z_flag;
	if (error)
		*error = e_flag;

	return ret;
}
187
/* Read both log slots of 'lane' into ent[0..1] with one contiguous read */
static int btt_log_read_pair(struct arena_info *arena, u32 lane,
			struct log_entry *ent)
{
	WARN_ON(!ent);
	return arena_read_bytes(arena,
			arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
			2 * LOG_ENT_SIZE);
}
196
/* parent directory for all per-BTT debugfs entries */
static struct dentry *debugfs_root;

/* Expose an arena's layout fields under <bttN>/arena<idx>/ in debugfs */
static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
				int idx)
{
	char dirname[32];
	struct dentry *d;

	/* If for some reason, parent bttN was not created, exit */
	if (!parent)
		return;

	snprintf(dirname, 32, "arena%d", idx);
	d = debugfs_create_dir(dirname, parent);
	if (IS_ERR_OR_NULL(d))
		return;
	a->debugfs_dir = d;

	debugfs_create_x64("size", S_IRUGO, d, &a->size);
	debugfs_create_x64("external_lba_start", S_IRUGO, d,
				&a->external_lba_start);
	debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba);
	debugfs_create_u32("internal_lbasize", S_IRUGO, d,
				&a->internal_lbasize);
	debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba);
	debugfs_create_u32("external_lbasize", S_IRUGO, d,
				&a->external_lbasize);
	debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree);
	debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major);
	debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor);
	debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff);
	debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff);
	debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff);
	debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff);
	debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
	debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
	debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
}
235
/* Create the per-BTT debugfs directory and one subdirectory per arena */
static void btt_debugfs_init(struct btt *btt)
{
	int i = 0;
	struct arena_info *arena;

	btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev),
			debugfs_root);
	if (IS_ERR_OR_NULL(btt->debugfs_dir))
		return;

	list_for_each_entry(arena, &btt->arena_list, list) {
		arena_debugfs_init(arena, btt->debugfs_dir, i);
		i++;
	}
}
251
252/*
253 * This function accepts two log entries, and uses the
254 * sequence number to find the 'older' entry.
255 * It also updates the sequence number in this old entry to
256 * make it the 'new' one if the mark_flag is set.
257 * Finally, it returns which of the entries was the older one.
258 *
259 * TODO The logic feels a bit kludge-y. make it better..
260 */
261static int btt_log_get_old(struct log_entry *ent)
262{
263 int old;
264
265 /*
266 * the first ever time this is seen, the entry goes into [0]
267 * the next time, the following logic works out to put this
268 * (next) entry into [1]
269 */
270 if (ent[0].seq == 0) {
271 ent[0].seq = cpu_to_le32(1);
272 return 0;
273 }
274
275 if (ent[0].seq == ent[1].seq)
276 return -EINVAL;
277 if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5)
278 return -EINVAL;
279
280 if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) {
281 if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1)
282 old = 0;
283 else
284 old = 1;
285 } else {
286 if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1)
287 old = 1;
288 else
289 old = 0;
290 }
291
292 return old;
293}
294
/* Resolve an arena to its owning nd_btt's device, for dev_* logging */
static struct device *to_dev(struct arena_info *arena)
{
	return &arena->nd_btt->dev;
}
299
/*
 * This function copies the desired (old/new) log entry into ent if
 * it is not NULL. It returns the sub-slot number (0 or 1)
 * where the desired log entry was found. Negative return values
 * indicate errors.
 */
static int btt_log_read(struct arena_info *arena, u32 lane,
			struct log_entry *ent, int old_flag)
{
	int ret;
	int old_ent, ret_ent;
	struct log_entry log[2];

	ret = btt_log_read_pair(arena, lane, log);
	if (ret)
		return -EIO;

	old_ent = btt_log_get_old(log);
	if (old_ent < 0 || old_ent > 1) {
		/* NOTE(review): seq fields print as raw little-endian words */
		dev_info(to_dev(arena),
				"log corruption (%d): lane %d seq [%d, %d]\n",
				old_ent, lane, log[0].seq, log[1].seq);
		/* TODO set error state? */
		return -EIO;
	}

	/* the 'new' entry is simply the slot that is not the old one */
	ret_ent = (old_flag ? old_ent : (1 - old_ent));

	if (ent != NULL)
		memcpy(ent, &log[ret_ent], LOG_ENT_SIZE);

	return ret_ent;
}
333
/*
 * This function commits a log entry to media
 * It does _not_ prepare the freelist entry for the next write
 * btt_flog_write is the wrapper for updating the freelist elements
 */
static int __btt_log_write(struct arena_info *arena, u32 lane,
			u32 sub, struct log_entry *ent)
{
	int ret;
	/*
	 * Ignore the padding in log_entry for calculating log_half.
	 * The entry is 'committed' when we write the sequence number,
	 * and we want to ensure that that is the last thing written.
	 * We don't bother writing the padding as that would be extra
	 * media wear and write amplification
	 */
	unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2;
	u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE);
	void *src = ent;

	/* split the 16B write into atomic, durable halves */
	ret = arena_write_bytes(arena, ns_off, src, log_half);
	if (ret)
		return ret;

	/* second half carries the seq number, committing the entry */
	ns_off += log_half;
	src += log_half;
	return arena_write_bytes(arena, ns_off, src, log_half);
}
363
/*
 * Commit a log entry, then advance the lane's freelist state: flip the
 * sub-slot for the next write, bump the cyclic sequence number (1..3),
 * and recycle the just-superseded block as the lane's free block.
 */
static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
			struct log_entry *ent)
{
	int ret;

	ret = __btt_log_write(arena, lane, sub, ent);
	if (ret)
		return ret;

	/* prepare the next free entry */
	arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
	if (++(arena->freelist[lane].seq) == 4)
		arena->freelist[lane].seq = 1;
	arena->freelist[lane].block = le32_to_cpu(ent->old_map);

	return ret;
}
381
/*
 * This function initializes the BTT map to the initial state, which is
 * all-zeroes, and indicates an identity mapping
 */
static int btt_map_init(struct arena_info *arena)
{
	int ret = -EINVAL;
	void *zerobuf;
	size_t offset = 0;
	size_t chunk_size = SZ_2M;
	size_t mapsize = arena->logoff - arena->mapoff;

	zerobuf = kzalloc(chunk_size, GFP_KERNEL);
	if (!zerobuf)
		return -ENOMEM;

	/* zero the map area in 2M chunks, yielding between chunks */
	while (mapsize) {
		size_t size = min(mapsize, chunk_size);

		ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
				size);
		if (ret)
			goto free;

		offset += size;
		mapsize -= size;
		cond_resched();
	}

 free:
	kfree(zerobuf);
	return ret;
}
415
/*
 * This function initializes the BTT log with 'fake' entries pointing
 * to the initial reserved set of blocks as being free
 */
static int btt_log_init(struct arena_info *arena)
{
	int ret;
	u32 i;
	struct log_entry log, zerolog;

	memset(&zerolog, 0, sizeof(zerolog));

	for (i = 0; i < arena->nfree; i++) {
		/* lane i's free block starts as internal block external_nlba + i */
		log.lba = cpu_to_le32(i);
		log.old_map = cpu_to_le32(arena->external_nlba + i);
		log.new_map = cpu_to_le32(arena->external_nlba + i);
		log.seq = cpu_to_le32(LOG_SEQ_INIT);
		ret = __btt_log_write(arena, i, 0, &log);
		if (ret)
			return ret;
		/* the second slot of each pair starts out zeroed */
		ret = __btt_log_write(arena, i, 1, &zerolog);
		if (ret)
			return ret;
	}

	return 0;
}
443
/*
 * Rebuild the in-memory freelist from the on-media log, and repair any
 * map entry whose update was interrupted after the flog commit but
 * before the map write completed.
 */
static int btt_freelist_init(struct arena_info *arena)
{
	int old, new, ret;
	u32 i, map_entry;
	struct log_entry log_new, log_old;

	arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
			GFP_KERNEL);
	if (!arena->freelist)
		return -ENOMEM;

	for (i = 0; i < arena->nfree; i++) {
		old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT);
		if (old < 0)
			return old;

		new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT);
		if (new < 0)
			return new;

		/* sub points to the next one to be overwritten */
		arena->freelist[i].sub = 1 - new;
		arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
		arena->freelist[i].block = le32_to_cpu(log_new.old_map);

		/* This implies a newly created or untouched flog entry */
		if (log_new.old_map == log_new.new_map)
			continue;

		/* Check if map recovery is needed */
		ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
				NULL, NULL);
		if (ret)
			return ret;
		if ((le32_to_cpu(log_new.new_map) != map_entry) &&
				(le32_to_cpu(log_new.old_map) == map_entry)) {
			/*
			 * Last transaction wrote the flog, but wasn't able
			 * to complete the map write. So fix up the map.
			 */
			ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
					le32_to_cpu(log_new.new_map), 0, 0);
			if (ret)
				return ret;
		}

	}

	return 0;
}
494
495static int btt_rtt_init(struct arena_info *arena)
496{
497 arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
498 if (arena->rtt == NULL)
499 return -ENOMEM;
500
501 return 0;
502}
503
/* Allocate and initialize the per-lane map-update spinlocks */
static int btt_maplocks_init(struct arena_info *arena)
{
	u32 i;

	arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
			GFP_KERNEL);
	if (!arena->map_locks)
		return -ENOMEM;

	for (i = 0; i < arena->nfree; i++)
		spin_lock_init(&arena->map_locks[i].lock);

	return 0;
}
518
/*
 * Allocate an arena_info and, when 'size' is non-zero, compute the
 * on-media layout (info blocks, data, map, log) for a fresh arena of
 * that size at 'arena_off'.  With size == 0 only the bare struct is
 * returned (used while discovering existing arenas).  Returns NULL on
 * allocation failure.
 */
static struct arena_info *alloc_arena(struct btt *btt, size_t size,
				size_t start, size_t arena_off)
{
	struct arena_info *arena;
	u64 logsize, mapsize, datasize;
	u64 available = size;

	arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL);
	if (!arena)
		return NULL;
	arena->nd_btt = btt->nd_btt;

	if (!size)
		return arena;

	arena->size = size;
	arena->external_lba_start = start;
	arena->external_lbasize = btt->lbasize;
	arena->internal_lbasize = roundup(arena->external_lbasize,
			INT_LBASIZE_ALIGNMENT);
	arena->nfree = BTT_DEFAULT_NFREE;
	arena->version_major = 1;
	arena->version_minor = 1;

	/* round the usable space down to a whole number of pages */
	if (available % BTT_PG_SIZE)
		available -= (available % BTT_PG_SIZE);

	/* Two pages are reserved for the super block and its copy */
	available -= 2 * BTT_PG_SIZE;

	/* The log takes a fixed amount of space based on nfree */
	logsize = roundup(2 * arena->nfree * sizeof(struct log_entry),
			BTT_PG_SIZE);
	available -= logsize;

	/* Calculate optimal split between map and data area */
	arena->internal_nlba = div_u64(available - BTT_PG_SIZE,
			arena->internal_lbasize + MAP_ENT_SIZE);
	arena->external_nlba = arena->internal_nlba - arena->nfree;

	mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE);
	datasize = available - mapsize;

	/* 'Absolute' values, relative to start of storage space */
	arena->infooff = arena_off;
	arena->dataoff = arena->infooff + BTT_PG_SIZE;
	arena->mapoff = arena->dataoff + datasize;
	arena->logoff = arena->mapoff + mapsize;
	arena->info2off = arena->logoff + logsize;
	return arena;
}
570
/* Unlink and free every arena on the btt's list, including its tables */
static void free_arenas(struct btt *btt)
{
	struct arena_info *arena, *next;

	list_for_each_entry_safe(arena, next, &btt->arena_list, list) {
		list_del(&arena->list);
		kfree(arena->rtt);
		kfree(arena->map_locks);
		kfree(arena->freelist);
		debugfs_remove_recursive(arena->debugfs_dir);
		kfree(arena);
	}
}
584
/*
 * This function checks if the metadata layout is valid and error free:
 * matching uuid, intact checksum, and matching external lbasize.
 * Returns 1 for a valid arena, 0 otherwise.
 */
static int arena_is_valid(struct arena_info *arena, struct btt_sb *super,
				u8 *uuid, u32 lbasize)
{
	u64 checksum;

	if (memcmp(super->uuid, uuid, 16))
		return 0;

	/* the checksum is computed with the checksum field itself zeroed */
	checksum = le64_to_cpu(super->checksum);
	super->checksum = 0;
	if (checksum != nd_btt_sb_checksum(super))
		return 0;
	super->checksum = cpu_to_le64(checksum);

	if (lbasize != le32_to_cpu(super->external_lbasize))
		return 0;

	/* TODO: figure out action for this */
	if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0)
		dev_info(to_dev(arena), "Found arena with an error flag\n");

	return 1;
}
611
/*
 * This function reads an existing valid btt superblock and
 * populates the corresponding arena_info struct
 */
static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
				u64 arena_off)
{
	arena->internal_nlba = le32_to_cpu(super->internal_nlba);
	arena->internal_lbasize = le32_to_cpu(super->internal_lbasize);
	arena->external_nlba = le32_to_cpu(super->external_nlba);
	arena->external_lbasize = le32_to_cpu(super->external_lbasize);
	arena->nfree = le32_to_cpu(super->nfree);
	arena->version_major = le16_to_cpu(super->version_major);
	arena->version_minor = le16_to_cpu(super->version_minor);

	/* on-media offsets are arena-relative; convert to absolute */
	arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off +
			le64_to_cpu(super->nextoff));
	arena->infooff = arena_off;
	arena->dataoff = arena_off + le64_to_cpu(super->dataoff);
	arena->mapoff = arena_off + le64_to_cpu(super->mapoff);
	arena->logoff = arena_off + le64_to_cpu(super->logoff);
	arena->info2off = arena_off + le64_to_cpu(super->info2off);

	/* the last arena (nextoff == 0) runs to the end of its info2 block */
	arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) :
			(arena->info2off - arena->infooff + BTT_PG_SIZE);

	arena->flags = le32_to_cpu(super->flags);
}
640
641static int discover_arenas(struct btt *btt)
642{
643 int ret = 0;
644 struct arena_info *arena;
645 struct btt_sb *super;
646 size_t remaining = btt->rawsize;
647 u64 cur_nlba = 0;
648 size_t cur_off = 0;
649 int num_arenas = 0;
650
651 super = kzalloc(sizeof(*super), GFP_KERNEL);
652 if (!super)
653 return -ENOMEM;
654
655 while (remaining) {
656 /* Alloc memory for arena */
657 arena = alloc_arena(btt, 0, 0, 0);
658 if (!arena) {
659 ret = -ENOMEM;
660 goto out_super;
661 }
662
663 arena->infooff = cur_off;
664 ret = btt_info_read(arena, super);
665 if (ret)
666 goto out;
667
668 if (!arena_is_valid(arena, super, btt->nd_btt->uuid,
669 btt->lbasize)) {
670 if (remaining == btt->rawsize) {
671 btt->init_state = INIT_NOTFOUND;
672 dev_info(to_dev(arena), "No existing arenas\n");
673 goto out;
674 } else {
675 dev_info(to_dev(arena),
676 "Found corrupted metadata!\n");
677 ret = -ENODEV;
678 goto out;
679 }
680 }
681
682 arena->external_lba_start = cur_nlba;
683 parse_arena_meta(arena, super, cur_off);
684
685 ret = btt_freelist_init(arena);
686 if (ret)
687 goto out;
688
689 ret = btt_rtt_init(arena);
690 if (ret)
691 goto out;
692
693 ret = btt_maplocks_init(arena);
694 if (ret)
695 goto out;
696
697 list_add_tail(&arena->list, &btt->arena_list);
698
699 remaining -= arena->size;
700 cur_off += arena->size;
701 cur_nlba += arena->external_nlba;
702 num_arenas++;
703
704 if (arena->nextoff == 0)
705 break;
706 }
707 btt->num_arenas = num_arenas;
708 btt->nlba = cur_nlba;
709 btt->init_state = INIT_READY;
710
711 kfree(super);
712 return ret;
713
714 out:
715 kfree(arena);
716 free_arenas(btt);
717 out_super:
718 kfree(super);
719 return ret;
720}
721
/*
 * Carve the raw namespace into maximal-size arenas; a final remainder
 * smaller than ARENA_MIN_SIZE is left unused.  nextoff chains each
 * arena to the next, with 0 marking the last one.
 */
static int create_arenas(struct btt *btt)
{
	size_t remaining = btt->rawsize;
	size_t cur_off = 0;

	while (remaining) {
		struct arena_info *arena;
		size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining);

		remaining -= arena_size;
		if (arena_size < ARENA_MIN_SIZE)
			break;

		arena = alloc_arena(btt, arena_size, btt->nlba, cur_off);
		if (!arena) {
			free_arenas(btt);
			return -ENOMEM;
		}
		btt->nlba += arena->external_nlba;
		/* chain to the next arena unless the remainder is unusable */
		if (remaining >= ARENA_MIN_SIZE)
			arena->nextoff = arena->size;
		else
			arena->nextoff = 0;
		cur_off += arena_size;
		list_add_tail(&arena->list, &btt->arena_list);
	}

	return 0;
}
751
752/*
753 * This function completes arena initialization by writing
754 * all the metadata.
755 * It is only called for an uninitialized arena when a write
756 * to that arena occurs for the first time.
757 */
758static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid)
759{
760 int ret;
761 struct btt_sb *super;
762
763 ret = btt_map_init(arena);
764 if (ret)
765 return ret;
766
767 ret = btt_log_init(arena);
768 if (ret)
769 return ret;
770
771 super = kzalloc(sizeof(struct btt_sb), GFP_NOIO);
772 if (!super)
773 return -ENOMEM;
774
775 strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
776 memcpy(super->uuid, uuid, 16);
777 super->flags = cpu_to_le32(arena->flags);
778 super->version_major = cpu_to_le16(arena->version_major);
779 super->version_minor = cpu_to_le16(arena->version_minor);
780 super->external_lbasize = cpu_to_le32(arena->external_lbasize);
781 super->external_nlba = cpu_to_le32(arena->external_nlba);
782 super->internal_lbasize = cpu_to_le32(arena->internal_lbasize);
783 super->internal_nlba = cpu_to_le32(arena->internal_nlba);
784 super->nfree = cpu_to_le32(arena->nfree);
785 super->infosize = cpu_to_le32(sizeof(struct btt_sb));
786 super->nextoff = cpu_to_le64(arena->nextoff);
787 /*
788 * Subtract arena->infooff (arena start) so numbers are relative
789 * to 'this' arena
790 */
791 super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff);
792 super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff);
793 super->logoff = cpu_to_le64(arena->logoff - arena->infooff);
794 super->info2off = cpu_to_le64(arena->info2off - arena->infooff);
795
796 super->flags = 0;
797 super->checksum = cpu_to_le64(nd_btt_sb_checksum(super));
798
799 ret = btt_info_write(arena, super);
800
801 kfree(super);
802 return ret;
803}
804
805/*
806 * This function completes the initialization for the BTT namespace
807 * such that it is ready to accept IOs
808 */
809static int btt_meta_init(struct btt *btt)
810{
811 int ret = 0;
812 struct arena_info *arena;
813
814 mutex_lock(&btt->init_lock);
815 list_for_each_entry(arena, &btt->arena_list, list) {
816 ret = btt_arena_write_layout(arena, btt->nd_btt->uuid);
817 if (ret)
818 goto unlock;
819
820 ret = btt_freelist_init(arena);
821 if (ret)
822 goto unlock;
823
824 ret = btt_rtt_init(arena);
825 if (ret)
826 goto unlock;
827
828 ret = btt_maplocks_init(arena);
829 if (ret)
830 goto unlock;
831 }
832
833 btt->init_state = INIT_READY;
834
835 unlock:
836 mutex_unlock(&btt->init_lock);
837 return ret;
838}
839
/* Bytes of per-sector metadata carried by each LBA (0 when lbasize == sector_size) */
static u32 btt_meta_size(struct btt *btt)
{
	return btt->lbasize - btt->sector_size;
}
844
845/*
846 * This function calculates the arena in which the given LBA lies
847 * by doing a linear walk. This is acceptable since we expect only
848 * a few arenas. If we have backing devices that get much larger,
849 * we can construct a balanced binary tree of arenas at init time
850 * so that this range search becomes faster.
851 */
852static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap,
853 struct arena_info **arena)
854{
855 struct arena_info *arena_list;
856 __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);
857
858 list_for_each_entry(arena_list, &btt->arena_list, list) {
859 if (lba < arena_list->external_nlba) {
860 *arena = arena_list;
861 *premap = lba;
862 return 0;
863 }
864 lba -= arena_list->external_nlba;
865 }
866
867 return -EIO;
868}
869
870/*
871 * The following (lock_map, unlock_map) are mostly just to improve
872 * readability, since they index into an array of locks
873 */
/* Take the striped map lock covering map entry 'premap' */
static void lock_map(struct arena_info *arena, u32 premap)
		__acquires(&arena->map_locks[idx].lock)
{
	/* one lock per L1 cache line of map entries, striped over nfree locks */
	u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;

	spin_lock(&arena->map_locks[idx].lock);
}
881
/* Release the striped map lock taken by lock_map() for the same 'premap' */
static void unlock_map(struct arena_info *arena, u32 premap)
		__releases(&arena->map_locks[idx].lock)
{
	/* must compute the same index as lock_map */
	u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;

	spin_unlock(&arena->map_locks[idx].lock);
}
889
/* Convert an internal (post-map) block number into a byte offset in the namespace */
static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
{
	return arena->dataoff + ((u64)lba * arena->internal_lbasize);
}
894
895static int btt_data_read(struct arena_info *arena, struct page *page,
896 unsigned int off, u32 lba, u32 len)
897{
898 int ret;
899 u64 nsoff = to_namespace_offset(arena, lba);
900 void *mem = kmap_atomic(page);
901
902 ret = arena_read_bytes(arena, nsoff, mem + off, len);
903 kunmap_atomic(mem);
904
905 return ret;
906}
907
908static int btt_data_write(struct arena_info *arena, u32 lba,
909 struct page *page, unsigned int off, u32 len)
910{
911 int ret;
912 u64 nsoff = to_namespace_offset(arena, lba);
913 void *mem = kmap_atomic(page);
914
915 ret = arena_write_bytes(arena, nsoff, mem + off, len);
916 kunmap_atomic(mem);
917
918 return ret;
919}
920
921static void zero_fill_data(struct page *page, unsigned int off, u32 len)
922{
923 void *mem = kmap_atomic(page);
924
925 memset(mem + off, 0, len);
926 kunmap_atomic(mem);
927}
928
929#ifdef CONFIG_BLK_DEV_INTEGRITY
/*
 * Transfer the per-sector integrity metadata for internal block
 * 'postmap' to/from the bio's integrity payload.  The metadata lives
 * on media immediately after the sector data of each internal block.
 * @rw: non-zero => write metadata to media, zero => read it back.
 * Returns 0 on success or the error from the underlying transfer.
 */
static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
			struct arena_info *arena, u32 postmap, int rw)
{
	unsigned int len = btt_meta_size(btt);
	u64 meta_nsoff;
	int ret = 0;

	/* nothing to do without an integrity payload */
	if (bip == NULL)
		return 0;

	/* metadata starts right after this block's sector data */
	meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size;

	while (len) {
		unsigned int cur_len;
		struct bio_vec bv;
		void *mem;

		bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
		/*
		 * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
		 * .bv_offset already adjusted for iter->bi_bvec_done, and we
		 * can use those directly
		 */

		cur_len = min(len, bv.bv_len);
		mem = kmap_atomic(bv.bv_page);
		if (rw)
			ret = arena_write_bytes(arena, meta_nsoff,
					mem + bv.bv_offset, cur_len);
		else
			ret = arena_read_bytes(arena, meta_nsoff,
					mem + bv.bv_offset, cur_len);

		kunmap_atomic(mem);
		if (ret)
			return ret;

		/* advance both the media offset and the payload iterator */
		len -= cur_len;
		meta_nsoff += cur_len;
		bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len);
	}

	return ret;
}
974
975#else /* CONFIG_BLK_DEV_INTEGRITY */
/* Stub: integrity metadata is a no-op when blk-integrity is compiled out */
static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
			struct arena_info *arena, u32 postmap, int rw)
{
	return 0;
}
981#endif
982
/*
 * Read 'len' bytes at external 'sector' into 'page' at offset 'off',
 * one BTT sector at a time.  For each sector: resolve the external LBA
 * to an arena + pre-map LBA, look up the post-map block, advertise it
 * in this lane's Read Tracking Table slot so concurrent writers stall
 * on that block, then re-read the map to close the lookup/RTT race
 * before transferring data (and integrity metadata, if any).
 */
static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
			struct page *page, unsigned int off, sector_t sector,
			unsigned int len)
{
	int ret = 0;
	int t_flag, e_flag;
	struct arena_info *arena = NULL;
	u32 lane = 0, premap, postmap;

	while (len) {
		u32 cur_len;

		lane = nd_region_acquire_lane(btt->nd_region);

		ret = lba_to_arena(btt, sector, &premap, &arena);
		if (ret)
			goto out_lane;

		cur_len = min(btt->sector_size, len);

		ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag);
		if (ret)
			goto out_lane;

		/*
		 * We loop to make sure that the post map LBA didn't change
		 * from under us between writing the RTT and doing the actual
		 * read.
		 */
		while (1) {
			u32 new_map;

			/* trim/zero flag: the block reads as zeroes */
			if (t_flag) {
				zero_fill_data(page, off, cur_len);
				goto out_lane;
			}

			/* error flag set on this map entry */
			if (e_flag) {
				ret = -EIO;
				goto out_lane;
			}

			arena->rtt[lane] = RTT_VALID | postmap;
			/*
			 * Barrier to make sure this write is not reordered
			 * to do the verification map_read before the RTT store
			 */
			barrier();

			ret = btt_map_read(arena, premap, &new_map, &t_flag,
						&e_flag);
			if (ret)
				goto out_rtt;

			if (postmap == new_map)
				break;

			/* mapping changed underneath us; retry with new block */
			postmap = new_map;
		}

		ret = btt_data_read(arena, page, off, postmap, cur_len);
		if (ret)
			goto out_rtt;

		if (bip) {
			ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
			if (ret)
				goto out_rtt;
		}

		/* done with this block; clear the RTT slot and the lane */
		arena->rtt[lane] = RTT_INVALID;
		nd_region_release_lane(btt->nd_region, lane);

		len -= cur_len;
		off += cur_len;
		sector += btt->sector_size >> SECTOR_SHIFT;
	}

	return 0;

 out_rtt:
	arena->rtt[lane] = RTT_INVALID;
 out_lane:
	nd_region_release_lane(btt->nd_region, lane);
	return ret;
}
1069
/*
 * Write 'len' bytes from 'page'/'off' to external 'sector', one BTT
 * sector at a time.  Each sector is written to this lane's current
 * free block, then the map entry is repointed after logging a flog
 * (free-list/log) record - giving single-sector power-fail atomicity.
 */
static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
			sector_t sector, struct page *page, unsigned int off,
			unsigned int len)
{
	int ret = 0;
	struct arena_info *arena = NULL;
	u32 premap = 0, old_postmap, new_postmap, lane = 0, i;
	struct log_entry log;
	int sub;

	while (len) {
		u32 cur_len;

		lane = nd_region_acquire_lane(btt->nd_region);

		ret = lba_to_arena(btt, sector, &premap, &arena);
		if (ret)
			goto out_lane;
		cur_len = min(btt->sector_size, len);

		/* refuse writes to an arena flagged as being in error */
		if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) {
			ret = -EIO;
			goto out_lane;
		}

		new_postmap = arena->freelist[lane].block;

		/* Wait if the new block is being read from */
		for (i = 0; i < arena->nfree; i++)
			while (arena->rtt[i] == (RTT_VALID | new_postmap))
				cpu_relax();


		/* sanity check the free block against the arena geometry */
		if (new_postmap >= arena->internal_nlba) {
			ret = -EIO;
			goto out_lane;
		}

		/* write data (and metadata) before publishing the mapping */
		ret = btt_data_write(arena, new_postmap, page, off, cur_len);
		if (ret)
			goto out_lane;

		if (bip) {
			ret = btt_rw_integrity(btt, bip, arena, new_postmap,
						WRITE);
			if (ret)
				goto out_lane;
		}

		lock_map(arena, premap);
		ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL);
		if (ret)
			goto out_map;
		if (old_postmap >= arena->internal_nlba) {
			ret = -EIO;
			goto out_map;
		}

		/* log the map transition, then update the map itself */
		log.lba = cpu_to_le32(premap);
		log.old_map = cpu_to_le32(old_postmap);
		log.new_map = cpu_to_le32(new_postmap);
		log.seq = cpu_to_le32(arena->freelist[lane].seq);
		sub = arena->freelist[lane].sub;
		ret = btt_flog_write(arena, lane, sub, &log);
		if (ret)
			goto out_map;

		ret = btt_map_write(arena, premap, new_postmap, 0, 0);
		if (ret)
			goto out_map;

		unlock_map(arena, premap);
		nd_region_release_lane(btt->nd_region, lane);

		len -= cur_len;
		off += cur_len;
		sector += btt->sector_size >> SECTOR_SHIFT;
	}

	return 0;

 out_map:
	unlock_map(arena, premap);
 out_lane:
	nd_region_release_lane(btt->nd_region, lane);
	return ret;
}
1157
1158static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
1159 struct page *page, unsigned int len, unsigned int off,
1160 int rw, sector_t sector)
1161{
1162 int ret;
1163
1164 if (rw == READ) {
1165 ret = btt_read_pg(btt, bip, page, off, sector, len);
1166 flush_dcache_page(page);
1167 } else {
1168 flush_dcache_page(page);
1169 ret = btt_write_pg(btt, bip, sector, page, off, len);
1170 }
1171
1172 return ret;
1173}
1174
/*
 * Bio entry point for the BTT block device.  Walks the bio segment by
 * segment through btt_do_bvec and completes it with the first error
 * encountered (0 on full success).
 */
static void btt_make_request(struct request_queue *q, struct bio *bio)
{
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct btt *btt = q->queuedata;
	struct bvec_iter iter;
	unsigned long start;
	struct bio_vec bvec;
	int err = 0, rw;
	bool do_acct;

	/*
	 * bio_integrity_enabled also checks if the bio already has an
	 * integrity payload attached. If it does, we *don't* do a
	 * bio_integrity_prep here - the payload has been generated by
	 * another kernel subsystem, and we just pass it through.
	 */
	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		err = -EIO;
		goto out;
	}

	do_acct = nd_iostat_start(bio, &start);
	rw = bio_data_dir(bio);
	bio_for_each_segment(bvec, bio, iter) {
		unsigned int len = bvec.bv_len;

		BUG_ON(len > PAGE_SIZE);
		/* Make sure len is in multiples of sector size. */
		/* XXX is this right? */
		BUG_ON(len < btt->sector_size);
		BUG_ON(len % btt->sector_size);

		err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
				rw, iter.bi_sector);
		if (err) {
			dev_info(&btt->nd_btt->dev,
					"io error in %s sector %lld, len %d,\n",
					(rw == READ) ? "READ" : "WRITE",
					(unsigned long long) iter.bi_sector, len);
			/* abort remaining segments; bio completes with err */
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

 out:
	bio_endio(bio, err);
}
1223
1224static int btt_rw_page(struct block_device *bdev, sector_t sector,
1225 struct page *page, int rw)
1226{
1227 struct btt *btt = bdev->bd_disk->private_data;
1228
1229 btt_do_bvec(btt, NULL, page, PAGE_CACHE_SIZE, 0, rw, sector);
1230 page_endio(page, rw & WRITE, 0);
1231 return 0;
1232}
1233
1234
1235static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
1236{
1237 /* some standard values */
1238 geo->heads = 1 << 6;
1239 geo->sectors = 1 << 5;
1240 geo->cylinders = get_capacity(bd->bd_disk) >> 11;
1241 return 0;
1242}
1243
/* Block device operations for the BTT disk; rw_page enables page-based I/O */
static const struct block_device_operations btt_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		btt_rw_page,
	.getgeo =		btt_getgeo,
	.revalidate_disk =	nvdimm_revalidate_disk,
};
1250
/*
 * Create and register the request queue and gendisk that expose this
 * BTT instance as a block device, registering blk-integrity when
 * per-sector metadata is configured.  Returns 0 or a negative errno.
 */
static int btt_blk_init(struct btt *btt)
{
	struct nd_btt *nd_btt = btt->nd_btt;
	struct nd_namespace_common *ndns = nd_btt->ndns;

	/* create a new disk and request queue for btt */
	btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
	if (!btt->btt_queue)
		return -ENOMEM;

	btt->btt_disk = alloc_disk(0);
	if (!btt->btt_disk) {
		blk_cleanup_queue(btt->btt_queue);
		return -ENOMEM;
	}

	nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
	btt->btt_disk->driverfs_dev = &btt->nd_btt->dev;
	btt->btt_disk->major = btt_major;
	btt->btt_disk->first_minor = 0;
	btt->btt_disk->fops = &btt_fops;
	btt->btt_disk->private_data = btt;
	btt->btt_disk->queue = btt->btt_queue;
	btt->btt_disk->flags = GENHD_FL_EXT_DEVT;

	blk_queue_make_request(btt->btt_queue, btt_make_request);
	blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
	blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
	blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue);
	btt->btt_queue->queuedata = btt;

	/*
	 * Add the disk with zero capacity first; the real capacity is
	 * only announced after integrity registration succeeds.
	 */
	set_capacity(btt->btt_disk, 0);
	add_disk(btt->btt_disk);
	if (btt_meta_size(btt)) {
		int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));

		if (rc) {
			del_gendisk(btt->btt_disk);
			put_disk(btt->btt_disk);
			blk_cleanup_queue(btt->btt_queue);
			return rc;
		}
	}
	/* capacity in 512-byte units: nlba BTT sectors of sector_size bytes */
	set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
	revalidate_disk(btt->btt_disk);

	return 0;
}
1300
/* Tear down the block interface created by btt_blk_init, in reverse order */
static void btt_blk_cleanup(struct btt *btt)
{
	blk_integrity_unregister(btt->btt_disk);
	del_gendisk(btt->btt_disk);
	put_disk(btt->btt_disk);
	blk_cleanup_queue(btt->btt_queue);
}
1308
1309/**
1310 * btt_init - initialize a block translation table for the given device
1311 * @nd_btt: device with BTT geometry and backing device info
1312 * @rawsize: raw size in bytes of the backing device
1313 * @lbasize: lba size of the backing device
1314 * @uuid: A uuid for the backing device - this is stored on media
1315 * @maxlane: maximum number of parallel requests the device can handle
1316 *
1317 * Initialize a Block Translation Table on a backing device to provide
1318 * single sector power fail atomicity.
1319 *
1320 * Context:
1321 * Might sleep.
1322 *
1323 * Returns:
1324 * Pointer to a new struct btt on success, NULL on failure.
1325 */
1326static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
1327 u32 lbasize, u8 *uuid, struct nd_region *nd_region)
1328{
1329 int ret;
1330 struct btt *btt;
1331 struct device *dev = &nd_btt->dev;
1332
1333 btt = kzalloc(sizeof(struct btt), GFP_KERNEL);
1334 if (!btt)
1335 return NULL;
1336
1337 btt->nd_btt = nd_btt;
1338 btt->rawsize = rawsize;
1339 btt->lbasize = lbasize;
1340 btt->sector_size = ((lbasize >= 4096) ? 4096 : 512);
1341 INIT_LIST_HEAD(&btt->arena_list);
1342 mutex_init(&btt->init_lock);
1343 btt->nd_region = nd_region;
1344
1345 ret = discover_arenas(btt);
1346 if (ret) {
1347 dev_err(dev, "init: error in arena_discover: %d\n", ret);
1348 goto out_free;
1349 }
1350
1351 if (btt->init_state != INIT_READY && nd_region->ro) {
1352 dev_info(dev, "%s is read-only, unable to init btt metadata\n",
1353 dev_name(&nd_region->dev));
1354 goto out_free;
1355 } else if (btt->init_state != INIT_READY) {
1356 btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
1357 ((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
1358 dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
1359 btt->num_arenas, rawsize);
1360
1361 ret = create_arenas(btt);
1362 if (ret) {
1363 dev_info(dev, "init: create_arenas: %d\n", ret);
1364 goto out_free;
1365 }
1366
1367 ret = btt_meta_init(btt);
1368 if (ret) {
1369 dev_err(dev, "init: error in meta_init: %d\n", ret);
1370 goto out_free;
1371 }
1372 }
1373
1374 ret = btt_blk_init(btt);
1375 if (ret) {
1376 dev_err(dev, "init: error in blk_init: %d\n", ret);
1377 goto out_free;
1378 }
1379
1380 btt_debugfs_init(btt);
1381
1382 return btt;
1383
1384 out_free:
1385 kfree(btt);
1386 return NULL;
1387}
1388
1389/**
1390 * btt_fini - de-initialize a BTT
1391 * @btt: the BTT handle that was generated by btt_init
1392 *
1393 * De-initialize a Block Translation Table on device removal
1394 *
1395 * Context:
1396 * Might sleep.
1397 */
1398static void btt_fini(struct btt *btt)
1399{
1400 if (btt) {
1401 btt_blk_cleanup(btt);
1402 free_arenas(btt);
1403 debugfs_remove_recursive(btt->debugfs_dir);
1404 kfree(btt);
1405 }
1406}
1407
1408int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
1409{
1410 struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
1411 struct nd_region *nd_region;
1412 struct btt *btt;
1413 size_t rawsize;
1414
1415 if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize)
1416 return -ENODEV;
1417
1418 rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K;
1419 if (rawsize < ARENA_MIN_SIZE) {
1420 return -ENXIO;
1421 }
1422 nd_region = to_nd_region(nd_btt->dev.parent);
1423 btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
1424 nd_region);
1425 if (!btt)
1426 return -ENOMEM;
1427 nd_btt->btt = btt;
1428
1429 return 0;
1430}
1431EXPORT_SYMBOL(nvdimm_namespace_attach_btt);
1432
1433int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns)
1434{
1435 struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
1436 struct btt *btt = nd_btt->btt;
1437
1438 btt_fini(btt);
1439 nd_btt->btt = NULL;
1440
1441 return 0;
1442}
1443EXPORT_SYMBOL(nvdimm_namespace_detach_btt);
1444
1445static int __init nd_btt_init(void)
1446{
1447 int rc;
1448
1449 BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
1450
1451 btt_major = register_blkdev(0, "btt");
1452 if (btt_major < 0)
1453 return btt_major;
1454
1455 debugfs_root = debugfs_create_dir("btt", NULL);
1456 if (IS_ERR_OR_NULL(debugfs_root)) {
1457 rc = -ENXIO;
1458 goto err_debugfs;
1459 }
1460
1461 return 0;
1462
1463 err_debugfs:
1464 unregister_blkdev(btt_major, "btt");
1465
1466 return rc;
1467}
1468
static void __exit nd_btt_exit(void)
{
	/* reverse of nd_btt_init: debugfs first, then the blkdev major */
	debugfs_remove_recursive(debugfs_root);
	unregister_blkdev(btt_major, "btt");
}
1474
1475MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
1476MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>");
1477MODULE_LICENSE("GPL v2");
1478module_init(nd_btt_init);
1479module_exit(nd_btt_exit);
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
new file mode 100644
index 000000000000..75b0d80a6bd9
--- /dev/null
+++ b/drivers/nvdimm/btt.h
@@ -0,0 +1,185 @@
1/*
2 * Block Translation Table library
3 * Copyright (c) 2014-2015, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15#ifndef _LINUX_BTT_H
16#define _LINUX_BTT_H
17
18#include <linux/types.h>
19
#define BTT_SIG_LEN 16
#define BTT_SIG "BTT_ARENA_INFO\0"	/* 15 chars + NUL == BTT_SIG_LEN */
#define MAP_ENT_SIZE 4			/* bytes per on-media map entry */
#define MAP_TRIM_SHIFT 31		/* map entry flag: trimmed/zero block */
#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT)
#define MAP_ERR_SHIFT 30		/* map entry flag: block in error */
#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
#define MAP_ENT_NORMAL 0xC0000000	/* both flag bits (31, 30) set */
#define LOG_ENT_SIZE sizeof(struct log_entry)
#define ARENA_MIN_SIZE (1UL << 24)	/* 16 MB */
#define ARENA_MAX_SIZE (1ULL << 39)	/* 512 GB */
#define RTT_VALID (1UL << 31)		/* read tracking table slot in use */
#define RTT_INVALID 0
#define BTT_PG_SIZE 4096
#define BTT_DEFAULT_NFREE ND_MAX_LANES	/* free blocks == max I/O lanes */
#define LOG_SEQ_INIT 1			/* initial log sequence number */

#define IB_FLAG_ERROR 0x00000001	/* info block: arena in error state */
#define IB_FLAG_ERROR_MASK 0x00000001
40
/* Lazy-initialization state of a BTT instance (btt->init_state) */
enum btt_init_state {
	INIT_UNCHECKED = 0,	/* on-media layout not yet examined */
	INIT_NOTFOUND,		/* no valid layout found; metadata not written */
	INIT_READY		/* arenas initialized, ready to accept I/O */
};
46
/* One on-media log (flog) entry recording a map update for crash recovery */
struct log_entry {
	__le32 lba;		/* pre-map LBA whose mapping changed */
	__le32 old_map;		/* previous post-map block */
	__le32 new_map;		/* new post-map block */
	__le32 seq;		/* sequence number for slot selection */
	__le64 padding[2];
};
54
/*
 * On-media BTT info block (superblock), one per arena plus a backup at
 * info2off.  Padded to exactly 4K (checked by BUILD_BUG_ON in
 * nd_btt_init).  Offsets are relative to the arena's info block; all
 * multi-byte fields are little-endian.
 */
struct btt_sb {
	u8 signature[BTT_SIG_LEN];	/* BTT_SIG */
	u8 uuid[16];
	u8 parent_uuid[16];
	__le32 flags;			/* IB_FLAG_* */
	__le16 version_major;
	__le16 version_minor;
	__le32 external_lbasize;
	__le32 external_nlba;
	__le32 internal_lbasize;
	__le32 internal_nlba;
	__le32 nfree;
	__le32 infosize;		/* sizeof(struct btt_sb) */
	__le64 nextoff;			/* offset to next arena; 0 if last */
	__le64 dataoff;
	__le64 mapoff;
	__le64 logoff;
	__le64 info2off;		/* offset to the backup info block */
	u8 padding[3968];		/* pad structure to 4K */
	__le64 checksum;		/* computed by nd_btt_sb_checksum() */
};
76
/* In-memory free-block tracking, one entry per I/O lane */
struct free_entry {
	u32 block;	/* free block the next write on this lane will use */
	u8 sub;		/* log sub-slot to write the next flog entry to */
	u8 seq;		/* current log sequence number for this lane */
};
82
/* Spinlock padded to a cacheline to avoid false sharing between locks */
struct aligned_lock {
	union {
		spinlock_t lock;
		u8 cacheline_padding[L1_CACHE_BYTES];
	};
};
89
90/**
91 * struct arena_info - handle for an arena
92 * @size: Size in bytes this arena occupies on the raw device.
93 * This includes arena metadata.
94 * @external_lba_start: The first external LBA in this arena.
95 * @internal_nlba: Number of internal blocks available in the arena
96 * including nfree reserved blocks
97 * @internal_lbasize: Internal and external lba sizes may be different as
98 * we can round up 'odd' external lbasizes such as 520B
99 * to be aligned.
100 * @external_nlba: Number of blocks contributed by the arena to the number
101 * reported to upper layers. (internal_nlba - nfree)
102 * @external_lbasize: LBA size as exposed to upper layers.
103 * @nfree: A reserve number of 'free' blocks that is used to
104 * handle incoming writes.
105 * @version_major: Metadata layout version major.
106 * @version_minor: Metadata layout version minor.
107 * @nextoff: Offset in bytes to the start of the next arena.
108 * @infooff: Offset in bytes to the info block of this arena.
109 * @dataoff: Offset in bytes to the data area of this arena.
110 * @mapoff: Offset in bytes to the map area of this arena.
111 * @logoff: Offset in bytes to the log area of this arena.
112 * @info2off: Offset in bytes to the backup info block of this arena.
113 * @freelist: Pointer to in-memory list of free blocks
114 * @rtt: Pointer to in-memory "Read Tracking Table"
115 * @map_locks: Spinlocks protecting concurrent map writes
116 * @nd_btt: Pointer to parent nd_btt structure.
117 * @list: List head for list of arenas
118 * @debugfs_dir: Debugfs dentry
119 * @flags: Arena flags - may signify error states.
120 *
121 * arena_info is a per-arena handle. Once an arena is narrowed down for an
122 * IO, this struct is passed around for the duration of the IO.
123 */
struct arena_info {
	u64 size;			/* Total bytes for this arena */
	u64 external_lba_start;
	u32 internal_nlba;
	u32 internal_lbasize;
	u32 external_nlba;
	u32 external_lbasize;
	u32 nfree;
	u16 version_major;
	u16 version_minor;
	/* Byte offsets to the different on-media structures */
	u64 nextoff;
	u64 infooff;
	u64 dataoff;
	u64 mapoff;
	u64 logoff;
	u64 info2off;
	/* Pointers to other in-memory structures for this arena */
	struct free_entry *freelist;	/* one entry per lane */
	u32 *rtt;			/* nfree slots, indexed by lane */
	struct aligned_lock *map_locks;	/* nfree striped map-write locks */
	struct nd_btt *nd_btt;
	struct list_head list;		/* link on btt->arena_list */
	struct dentry *debugfs_dir;
	/* Arena flags */
	u32 flags;			/* IB_FLAG_* error state */
};
151
152/**
153 * struct btt - handle for a BTT instance
154 * @btt_disk: Pointer to the gendisk for BTT device
155 * @btt_queue: Pointer to the request queue for the BTT device
156 * @arena_list: Head of the list of arenas
157 * @debugfs_dir: Debugfs dentry
158 * @nd_btt: Parent nd_btt struct
159 * @nlba: Number of logical blocks exposed to the upper layers
160 * after removing the amount of space needed by metadata
161 * @rawsize: Total size in bytes of the available backing device
162 * @lbasize: LBA size as requested and presented to upper layers.
163 * This is sector_size + size of any metadata.
164 * @sector_size: The Linux sector size - 512 or 4096
165 * @lanes: Per-lane spinlocks
166 * @init_lock: Mutex used for the BTT initialization
167 * @init_state: Flag describing the initialization state for the BTT
168 * @num_arenas: Number of arenas in the BTT instance
169 */
struct btt {
	struct gendisk *btt_disk;
	struct request_queue *btt_queue;
	struct list_head arena_list;
	struct dentry *debugfs_dir;
	struct nd_btt *nd_btt;
	u64 nlba;
	unsigned long long rawsize;
	u32 lbasize;
	u32 sector_size;
	struct nd_region *nd_region;	/* parent region; provides I/O lanes */
	struct mutex init_lock;
	int init_state;			/* enum btt_init_state */
	int num_arenas;
};
185#endif
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
new file mode 100644
index 000000000000..6ac8c0fea3ec
--- /dev/null
+++ b/drivers/nvdimm/btt_devs.c
@@ -0,0 +1,425 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/blkdev.h>
14#include <linux/device.h>
15#include <linux/genhd.h>
16#include <linux/sizes.h>
17#include <linux/slab.h>
18#include <linux/fs.h>
19#include <linux/mm.h>
20#include "nd-core.h"
21#include "btt.h"
22#include "nd.h"
23
/*
 * Drop this btt's claim on its backing namespace.  Caller must hold
 * the namespace's device lock and guarantee the claim is currently
 * held by this btt (both asserted via dev_WARN_ONCE).
 */
static void __nd_btt_detach_ndns(struct nd_btt *nd_btt)
{
	struct nd_namespace_common *ndns = nd_btt->ndns;

	dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
			|| ndns->claim != &nd_btt->dev,
			"%s: invalid claim\n", __func__);
	ndns->claim = NULL;
	nd_btt->ndns = NULL;
	/* drop the reference taken when the claim was established */
	put_device(&ndns->dev);
}
35
/* Locked wrapper for __nd_btt_detach_ndns; no-op if nothing is attached */
static void nd_btt_detach_ndns(struct nd_btt *nd_btt)
{
	struct nd_namespace_common *ndns = nd_btt->ndns;

	if (!ndns)
		return;
	/* hold a local reference so ndns outlives dropping the claim */
	get_device(&ndns->dev);
	device_lock(&ndns->dev);
	__nd_btt_detach_ndns(nd_btt);
	device_unlock(&ndns->dev);
	put_device(&ndns->dev);
}
48
/*
 * Claim @ndns for this btt.  Caller must hold the namespace's device
 * lock.  Returns false if the namespace is already claimed.
 */
static bool __nd_btt_attach_ndns(struct nd_btt *nd_btt,
		struct nd_namespace_common *ndns)
{
	if (ndns->claim)
		return false;
	dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
			|| nd_btt->ndns,
			"%s: invalid claim\n", __func__);
	ndns->claim = &nd_btt->dev;
	nd_btt->ndns = ndns;
	/* reference dropped by __nd_btt_detach_ndns */
	get_device(&ndns->dev);
	return true;
}
62
63static bool nd_btt_attach_ndns(struct nd_btt *nd_btt,
64 struct nd_namespace_common *ndns)
65{
66 bool claimed;
67
68 device_lock(&ndns->dev);
69 claimed = __nd_btt_attach_ndns(nd_btt, ndns);
70 device_unlock(&ndns->dev);
71 return claimed;
72}
73
/* Device release callback: undo everything __nd_btt_create set up */
static void nd_btt_release(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct nd_btt *nd_btt = to_nd_btt(dev);

	dev_dbg(dev, "%s\n", __func__);
	nd_btt_detach_ndns(nd_btt);
	ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
	kfree(nd_btt->uuid);
	kfree(nd_btt);
}
85
/* Device type shared by all nd_btt devices; enables is_nd_btt() checks */
static struct device_type nd_btt_device_type = {
	.name = "nd_btt",
	.release = nd_btt_release,
};
90
/* True if @dev is an nd_btt device (identified by its device_type) */
bool is_nd_btt(struct device *dev)
{
	return dev->type == &nd_btt_device_type;
}
95EXPORT_SYMBOL(is_nd_btt);
96
/* Convert a generic device to its containing nd_btt (warns on misuse) */
struct nd_btt *to_nd_btt(struct device *dev)
{
	struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev);

	WARN_ON(!is_nd_btt(dev));
	return nd_btt;
}
104EXPORT_SYMBOL(to_nd_btt);
105
/* Supported lba sizes (sector size plus optional metadata); 0-terminated */
static const unsigned long btt_lbasize_supported[] = { 512, 520, 528,
	4096, 4104, 4160, 4224, 0 };
108
/* sysfs: show the supported/selected sector size via nd_sector_size_show */
static ssize_t sector_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);

	return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf);
}
116
/*
 * sysfs: select one of btt_lbasize_supported as the btt lba size.
 * Takes device_lock then nvdimm_bus_lock around the update.
 */
static ssize_t sector_size_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	ssize_t rc;

	device_lock(dev);
	nvdimm_bus_lock(dev);
	rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize,
			btt_lbasize_supported);
	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
	nvdimm_bus_unlock(dev);
	device_unlock(dev);

	/* 0 from nd_sector_size_store means the full write was consumed */
	return rc ? rc : len;
}
134static DEVICE_ATTR_RW(sector_size);
135
136static ssize_t uuid_show(struct device *dev,
137 struct device_attribute *attr, char *buf)
138{
139 struct nd_btt *nd_btt = to_nd_btt(dev);
140
141 if (nd_btt->uuid)
142 return sprintf(buf, "%pUb\n", nd_btt->uuid);
143 return sprintf(buf, "\n");
144}
145
/* sysfs: set or clear the btt uuid (parsing/validation in nd_uuid_store) */
static ssize_t uuid_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	ssize_t rc;

	device_lock(dev);
	rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len);
	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
	device_unlock(dev);

	/* 0 from nd_uuid_store means the full write was consumed */
	return rc ? rc : len;
}
160static DEVICE_ATTR_RW(uuid);
161
/* sysfs: show the name of the claimed backing namespace ("" when none) */
static ssize_t namespace_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	ssize_t rc;

	/* bus lock stabilizes nd_btt->ndns against concurrent (de)attach */
	nvdimm_bus_lock(dev);
	rc = sprintf(buf, "%s\n", nd_btt->ndns
			? dev_name(&nd_btt->ndns->dev) : "");
	nvdimm_bus_unlock(dev);
	return rc;
}
174
/* device_find_child() helper: match a child device by exact name */
static int namespace_match(struct device *dev, void *data)
{
	const char *name = data;

	return !strcmp(name, dev_name(dev));
}
181
182static bool is_nd_btt_idle(struct device *dev)
183{
184 struct nd_region *nd_region = to_nd_region(dev->parent);
185 struct nd_btt *nd_btt = to_nd_btt(dev);
186
187 if (nd_region->btt_seed == dev || nd_btt->ndns || dev->driver)
188 return false;
189 return true;
190}
191
/*
 * Set or clear this btt's backing namespace by device name.  Writing
 * "" detaches (unregistering an idle btt, otherwise resetting its
 * config); writing a "namespace..." name attaches that namespace if it
 * is unclaimed and large enough.  Caller holds the nvdimm bus lock and
 * the device lock (see namespace_store).
 */
static ssize_t __namespace_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_btt *nd_btt = to_nd_btt(dev);
	struct nd_namespace_common *ndns;
	struct device *found;
	char *name;

	/* no reconfiguration while the btt driver is attached */
	if (dev->driver) {
		dev_dbg(dev, "%s: -EBUSY\n", __func__);
		return -EBUSY;
	}

	name = kstrndup(buf, len, GFP_KERNEL);
	if (!name)
		return -ENOMEM;
	strim(name);

	if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0)
		/* pass */;
	else {
		/*
		 * NOTE(review): 'len' (size_t) carries negative errnos from
		 * here on; it converts back to a negative ssize_t on return
		 */
		len = -EINVAL;
		goto out;
	}

	ndns = nd_btt->ndns;
	if (strcmp(name, "") == 0) {
		/* detach the namespace and destroy / reset the btt device */
		nd_btt_detach_ndns(nd_btt);
		if (is_nd_btt_idle(dev))
			nd_device_unregister(dev, ND_ASYNC);
		else {
			nd_btt->lbasize = 0;
			kfree(nd_btt->uuid);
			nd_btt->uuid = NULL;
		}
		goto out;
	} else if (ndns) {
		dev_dbg(dev, "namespace already set to: %s\n",
				dev_name(&ndns->dev));
		len = -EBUSY;
		goto out;
	}

	found = device_find_child(dev->parent, name, namespace_match);
	if (!found) {
		dev_dbg(dev, "'%s' not found under %s\n", name,
				dev_name(dev->parent));
		len = -ENODEV;
		goto out;
	}

	ndns = to_ndns(found);
	/* need room for the info block plus a minimum-size arena */
	if (__nvdimm_namespace_capacity(ndns) < SZ_16M) {
		dev_dbg(dev, "%s too small to host btt\n", name);
		len = -ENXIO;
		goto out_attach;
	}

	WARN_ON_ONCE(!is_nvdimm_bus_locked(&nd_btt->dev));
	if (!nd_btt_attach_ndns(nd_btt, ndns)) {
		dev_dbg(dev, "%s already claimed\n",
				dev_name(&ndns->dev));
		len = -EBUSY;
	}

 out_attach:
	put_device(&ndns->dev); /* from device_find_child */
 out:
	kfree(name);
	return len;
}
264
/*
 * sysfs 'namespace' attribute writer: take the bus lock (to serialize
 * against probe/reconfiguration) and the device lock, then delegate to
 * __namespace_store().  Lock order (bus before device) must match the
 * rest of the subsystem.
 */
static ssize_t namespace_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	ssize_t rc;

	nvdimm_bus_lock(dev);
	device_lock(dev);
	rc = __namespace_store(dev, attr, buf, len);
	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
	device_unlock(dev);
	nvdimm_bus_unlock(dev);

	return rc;
}
static DEVICE_ATTR_RW(namespace);
281
/* sysfs attributes specific to btt devices */
static struct attribute *nd_btt_attributes[] = {
	&dev_attr_sector_size.attr,
	&dev_attr_namespace.attr,
	&dev_attr_uuid.attr,
	NULL,
};

static struct attribute_group nd_btt_attribute_group = {
	.attrs = nd_btt_attributes,
};

/* btt-specific attrs plus the generic nd-bus and numa attribute groups */
static const struct attribute_group *nd_btt_attribute_groups[] = {
	&nd_btt_attribute_group,
	&nd_device_attribute_group,
	&nd_numa_attribute_group,
	NULL,
};
299
/*
 * Allocate and initialize (but do not register) a btt device under
 * @nd_region, optionally claiming @ndns.  Returns the initialized device,
 * or NULL on allocation failure or if @ndns is already claimed.  Caller
 * is responsible for registering (or put_device() on) the result.
 */
static struct device *__nd_btt_create(struct nd_region *nd_region,
		unsigned long lbasize, u8 *uuid,
		struct nd_namespace_common *ndns)
{
	struct nd_btt *nd_btt;
	struct device *dev;

	nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL);
	if (!nd_btt)
		return NULL;

	nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL);
	if (nd_btt->id < 0) {
		kfree(nd_btt);
		return NULL;
	}

	nd_btt->lbasize = lbasize;
	/* NOTE(review): a kmemdup() failure here silently leaves the uuid
	 * NULL rather than failing creation — presumably acceptable for a
	 * seed device; confirm callers tolerate a NULL uuid */
	if (uuid)
		uuid = kmemdup(uuid, 16, GFP_KERNEL);
	nd_btt->uuid = uuid;
	dev = &nd_btt->dev;
	dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id);
	dev->parent = &nd_region->dev;
	dev->type = &nd_btt_device_type;
	dev->groups = nd_btt_attribute_groups;
	device_initialize(&nd_btt->dev);
	if (ndns && !__nd_btt_attach_ndns(nd_btt, ndns)) {
		dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n",
				__func__, dev_name(ndns->claim));
		/* after device_initialize(), put_device() frees via release */
		put_device(dev);
		return NULL;
	}
	return dev;
}
335
336struct device *nd_btt_create(struct nd_region *nd_region)
337{
338 struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL);
339
340 if (dev)
341 __nd_device_register(dev);
342 return dev;
343}
344
345/*
346 * nd_btt_sb_checksum: compute checksum for btt info block
347 *
348 * Returns a fletcher64 checksum of everything in the given info block
349 * except the last field (since that's where the checksum lives).
350 */
351u64 nd_btt_sb_checksum(struct btt_sb *btt_sb)
352{
353 u64 sum;
354 __le64 sum_save;
355
356 sum_save = btt_sb->checksum;
357 btt_sb->checksum = 0;
358 sum = nd_fletcher64(btt_sb, sizeof(*btt_sb), 1);
359 btt_sb->checksum = sum_save;
360 return sum;
361}
362EXPORT_SYMBOL(nd_btt_sb_checksum);
363
/*
 * Look for a valid btt info block 4K into @ndns and, if found, populate
 * @nd_btt from it and register the btt device.  Returns 0 on success,
 * -ENODEV when no valid btt is present, -ENXIO on read/size errors, or
 * -ENOMEM.
 */
static int __nd_btt_probe(struct nd_btt *nd_btt,
		struct nd_namespace_common *ndns, struct btt_sb *btt_sb)
{
	u64 checksum;

	if (!btt_sb || !ndns || !nd_btt)
		return -ENODEV;

	if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb)))
		return -ENXIO;

	if (nvdimm_namespace_capacity(ndns) < SZ_16M)
		return -ENXIO;

	if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
		return -ENODEV;

	/*
	 * The on-media checksum is computed over the block with its
	 * checksum field zeroed, so zero it for verification and restore
	 * the little-endian value afterwards.
	 */
	checksum = le64_to_cpu(btt_sb->checksum);
	btt_sb->checksum = 0;
	if (checksum != nd_btt_sb_checksum(btt_sb))
		return -ENODEV;
	btt_sb->checksum = cpu_to_le64(checksum);

	nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize);
	nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL);
	if (!nd_btt->uuid)
		return -ENOMEM;

	__nd_device_register(&nd_btt->dev);

	return 0;
}
396
397int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
398{
399 int rc;
400 struct device *dev;
401 struct btt_sb *btt_sb;
402 struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
403
404 if (ndns->force_raw)
405 return -ENODEV;
406
407 nvdimm_bus_lock(&ndns->dev);
408 dev = __nd_btt_create(nd_region, 0, NULL, ndns);
409 nvdimm_bus_unlock(&ndns->dev);
410 if (!dev)
411 return -ENOMEM;
412 dev_set_drvdata(dev, drvdata);
413 btt_sb = kzalloc(sizeof(*btt_sb), GFP_KERNEL);
414 rc = __nd_btt_probe(to_nd_btt(dev), ndns, btt_sb);
415 kfree(btt_sb);
416 dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__,
417 rc == 0 ? dev_name(dev) : "<none>");
418 if (rc < 0) {
419 __nd_btt_detach_ndns(to_nd_btt(dev));
420 put_device(dev);
421 }
422
423 return rc;
424}
425EXPORT_SYMBOL(nd_btt_probe);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
new file mode 100644
index 000000000000..8eb22c0ca7ce
--- /dev/null
+++ b/drivers/nvdimm/bus.c
@@ -0,0 +1,730 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14#include <linux/vmalloc.h>
15#include <linux/uaccess.h>
16#include <linux/module.h>
17#include <linux/blkdev.h>
18#include <linux/fcntl.h>
19#include <linux/async.h>
20#include <linux/genhd.h>
21#include <linux/ndctl.h>
22#include <linux/sched.h>
23#include <linux/slab.h>
24#include <linux/fs.h>
25#include <linux/io.h>
26#include <linux/mm.h>
27#include <linux/nd.h>
28#include "nd-core.h"
29#include "nd.h"
30
31int nvdimm_major;
32static int nvdimm_bus_major;
33static struct class *nd_class;
34
35static int to_nd_device_type(struct device *dev)
36{
37 if (is_nvdimm(dev))
38 return ND_DEVICE_DIMM;
39 else if (is_nd_pmem(dev))
40 return ND_DEVICE_REGION_PMEM;
41 else if (is_nd_blk(dev))
42 return ND_DEVICE_REGION_BLK;
43 else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent))
44 return nd_region_to_nstype(to_nd_region(dev->parent));
45
46 return 0;
47}
48
/* Emit a MODALIAS uevent and pin region devices to their NUMA node. */
static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	/*
	 * Ensure that region devices always have their numa node set as
	 * early as possible.
	 */
	if (is_nd_pmem(dev) || is_nd_blk(dev))
		set_dev_node(dev, to_nd_region(dev)->numa_node);
	return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT,
			to_nd_device_type(dev));
}
60
/* A driver matches when the device's type bit is set in the driver mask. */
static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
{
	struct nd_device_driver *nd_drv = to_nd_device_driver(drv);

	return test_bit(to_nd_device_type(dev), &nd_drv->type);
}
67
68static struct module *to_bus_provider(struct device *dev)
69{
70 /* pin bus providers while regions are enabled */
71 if (is_nd_pmem(dev) || is_nd_blk(dev)) {
72 struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
73
74 return nvdimm_bus->module;
75 }
76 return NULL;
77}
78
/* Bump the in-flight probe count under the bus lock; paired with
 * nvdimm_bus_probe_end(). */
static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus)
{
	nvdimm_bus_lock(&nvdimm_bus->dev);
	nvdimm_bus->probe_active++;
	nvdimm_bus_unlock(&nvdimm_bus->dev);
}

/* Drop the in-flight probe count and wake waiters (see
 * wait_nvdimm_bus_probe_idle()) when it reaches zero. */
static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus)
{
	nvdimm_bus_lock(&nvdimm_bus->dev);
	if (--nvdimm_bus->probe_active == 0)
		wake_up(&nvdimm_bus->probe_wait);
	nvdimm_bus_unlock(&nvdimm_bus->dev);
}
93
/*
 * Bus ->probe(): pin the bus-provider module for the lifetime of an
 * enabled region, account the probe for wait_nvdimm_bus_probe_idle(),
 * and invoke the driver's probe.  The module reference is released on
 * failure here, or in nvdimm_bus_remove() on success.
 */
static int nvdimm_bus_probe(struct device *dev)
{
	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
	struct module *provider = to_bus_provider(dev);
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
	int rc;

	if (!try_module_get(provider))
		return -ENXIO;

	nvdimm_bus_probe_start(nvdimm_bus);
	rc = nd_drv->probe(dev);
	if (rc == 0)
		nd_region_probe_success(nvdimm_bus, dev);
	else
		nd_region_disable(nvdimm_bus, dev);
	nvdimm_bus_probe_end(nvdimm_bus);

	dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
			dev_name(dev), rc);

	if (rc != 0)
		module_put(provider);
	return rc;
}
119
/*
 * Bus ->remove(): run the driver's remove, mark the region disabled, and
 * drop the provider-module reference taken in nvdimm_bus_probe().
 */
static int nvdimm_bus_remove(struct device *dev)
{
	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
	struct module *provider = to_bus_provider(dev);
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
	int rc;

	rc = nd_drv->remove(dev);
	nd_region_disable(nvdimm_bus, dev);

	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
			dev_name(dev), rc);
	module_put(provider);
	return rc;
}
135
/* The "nd" bus: hosts dimm, region, and namespace devices. */
static struct bus_type nvdimm_bus_type = {
	.name = "nd",
	.uevent = nvdimm_bus_uevent,
	.match = nvdimm_bus_match,
	.probe = nvdimm_bus_probe,
	.remove = nvdimm_bus_remove,
};
143
/* All async nd device registration/unregistration runs in this domain. */
static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain);

/* Flush all in-flight async nd device add/delete work. */
void nd_synchronize(void)
{
	async_synchronize_full_domain(&nd_async_domain);
}
EXPORT_SYMBOL_GPL(nd_synchronize);
151
/*
 * Async completion for __nd_device_register().  On device_add() failure
 * an extra put_device() drops the caller's reference so the device is
 * torn down; the final put_device() always balances the get_device()
 * taken when the work was scheduled.
 */
static void nd_async_device_register(void *d, async_cookie_t cookie)
{
	struct device *dev = d;

	if (device_add(dev) != 0) {
		dev_err(dev, "%s: failed\n", __func__);
		put_device(dev);
	}
	put_device(dev);
}

/* Async completion for nd_device_unregister(ND_ASYNC). */
static void nd_async_device_unregister(void *d, async_cookie_t cookie)
{
	struct device *dev = d;

	/* flush bus operations before delete */
	nvdimm_bus_lock(dev);
	nvdimm_bus_unlock(dev);

	device_unregister(dev);
	put_device(dev); /* balances the get_device() at schedule time */
}
174
/*
 * Queue an already-initialized device for asynchronous device_add() on
 * the nd bus.  The reference taken here is dropped by
 * nd_async_device_register().
 */
void __nd_device_register(struct device *dev)
{
	dev->bus = &nvdimm_bus_type;
	get_device(dev);
	async_schedule_domain(nd_async_device_register, dev,
			&nd_async_domain);
}

/* Initialize and asynchronously register a device on the nd bus. */
void nd_device_register(struct device *dev)
{
	device_initialize(dev);
	__nd_device_register(dev);
}
EXPORT_SYMBOL(nd_device_register);
189
/*
 * Unregister an nd device either asynchronously (ND_ASYNC, safe from
 * contexts that hold the bus lock) or synchronously (ND_SYNC, which
 * first flushes all pending async registrations).
 */
void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
{
	switch (mode) {
	case ND_ASYNC:
		get_device(dev); /* dropped in nd_async_device_unregister() */
		async_schedule_domain(nd_async_device_unregister, dev,
				&nd_async_domain);
		break;
	case ND_SYNC:
		nd_synchronize();
		device_unregister(dev);
		break;
	}
}
EXPORT_SYMBOL(nd_device_unregister);
205
206/**
207 * __nd_driver_register() - register a region or a namespace driver
208 * @nd_drv: driver to register
209 * @owner: automatically set by nd_driver_register() macro
210 * @mod_name: automatically set by nd_driver_register() macro
211 */
212int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
213 const char *mod_name)
214{
215 struct device_driver *drv = &nd_drv->drv;
216
217 if (!nd_drv->type) {
218 pr_debug("driver type bitmask not set (%pf)\n",
219 __builtin_return_address(0));
220 return -EINVAL;
221 }
222
223 if (!nd_drv->probe || !nd_drv->remove) {
224 pr_debug("->probe() and ->remove() must be specified\n");
225 return -EINVAL;
226 }
227
228 drv->bus = &nvdimm_bus_type;
229 drv->owner = owner;
230 drv->mod_name = mod_name;
231
232 return driver_register(drv);
233}
234EXPORT_SYMBOL(__nd_driver_register);
235
236int nvdimm_revalidate_disk(struct gendisk *disk)
237{
238 struct device *dev = disk->driverfs_dev;
239 struct nd_region *nd_region = to_nd_region(dev->parent);
240 const char *pol = nd_region->ro ? "only" : "write";
241
242 if (nd_region->ro == get_disk_ro(disk))
243 return 0;
244
245 dev_info(dev, "%s read-%s, marking %s read-%s\n",
246 dev_name(&nd_region->dev), pol, disk->disk_name, pol);
247 set_disk_ro(disk, nd_region->ro);
248
249 return 0;
250
251}
252EXPORT_SYMBOL(nvdimm_revalidate_disk);
253
/* sysfs 'modalias': ND_DEVICE_MODALIAS_FMT rendering of the device type. */
static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n",
			to_nd_device_type(dev));
}
static DEVICE_ATTR_RO(modalias);

/* sysfs 'devtype': the device_type name string. */
static ssize_t devtype_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sprintf(buf, "%s\n", dev->type->name);
}
static DEVICE_ATTR_RO(devtype);
268
static struct attribute *nd_device_attributes[] = {
	&dev_attr_modalias.attr,
	&dev_attr_devtype.attr,
	NULL,
};

/**
 * nd_device_attribute_group - generic attributes for all devices on an nd bus
 */
struct attribute_group nd_device_attribute_group = {
	.attrs = nd_device_attributes,
};
EXPORT_SYMBOL_GPL(nd_device_attribute_group);
282
/* sysfs 'numa_node': the NUMA node the device is associated with. */
static ssize_t numa_node_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", dev_to_node(dev));
}
static DEVICE_ATTR_RO(numa_node);

static struct attribute *nd_numa_attributes[] = {
	&dev_attr_numa_node.attr,
	NULL,
};

/* Hide the numa_node attribute entirely on !CONFIG_NUMA kernels. */
static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a,
		int n)
{
	if (!IS_ENABLED(CONFIG_NUMA))
		return 0;

	return a->mode;
}

/**
 * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus
 */
struct attribute_group nd_numa_attribute_group = {
	.attrs = nd_numa_attributes,
	.is_visible = nd_numa_attr_visible,
};
EXPORT_SYMBOL_GPL(nd_numa_attribute_group);
312
313int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
314{
315 dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
316 struct device *dev;
317
318 dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus,
319 "ndctl%d", nvdimm_bus->id);
320
321 if (IS_ERR(dev)) {
322 dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n",
323 nvdimm_bus->id, PTR_ERR(dev));
324 return PTR_ERR(dev);
325 }
326 return 0;
327}
328
/* Tear down the /dev/ndctl<N> node created by nvdimm_bus_create_ndctl(). */
void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus)
{
	device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id));
}
333
/*
 * Per-DIMM command payload descriptors.  in_sizes/out_sizes give the
 * fixed size of each field; UINT_MAX marks a variable-length field whose
 * size is resolved at runtime by nd_cmd_in_size()/nd_cmd_out_size().
 */
static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = {
	[ND_CMD_IMPLEMENTED] = { },
	[ND_CMD_SMART] = {
		.out_num = 2,
		.out_sizes = { 4, 8, },
	},
	[ND_CMD_SMART_THRESHOLD] = {
		.out_num = 2,
		.out_sizes = { 4, 8, },
	},
	[ND_CMD_DIMM_FLAGS] = {
		.out_num = 2,
		.out_sizes = { 4, 4 },
	},
	[ND_CMD_GET_CONFIG_SIZE] = {
		.out_num = 3,
		.out_sizes = { 4, 4, 4, },
	},
	[ND_CMD_GET_CONFIG_DATA] = {
		.in_num = 2,
		.in_sizes = { 4, 4, },
		.out_num = 2,
		.out_sizes = { 4, UINT_MAX, },
	},
	[ND_CMD_SET_CONFIG_DATA] = {
		.in_num = 3,
		.in_sizes = { 4, 4, UINT_MAX, },
		.out_num = 1,
		.out_sizes = { 4, },
	},
	[ND_CMD_VENDOR] = {
		.in_num = 3,
		.in_sizes = { 4, 4, UINT_MAX, },
		.out_num = 3,
		.out_sizes = { 4, 4, UINT_MAX, },
	},
};
371
372const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd)
373{
374 if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs))
375 return &__nd_cmd_dimm_descs[cmd];
376 return NULL;
377}
378EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc);
379
/* Bus-level (address-range-scrub) command payload descriptors; UINT_MAX
 * marks a variable-length field (see nd_cmd_out_size()). */
static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
	[ND_CMD_IMPLEMENTED] = { },
	[ND_CMD_ARS_CAP] = {
		.in_num = 2,
		.in_sizes = { 8, 8, },
		.out_num = 2,
		.out_sizes = { 4, 4, },
	},
	[ND_CMD_ARS_START] = {
		.in_num = 4,
		.in_sizes = { 8, 8, 2, 6, },
		.out_num = 1,
		.out_sizes = { 4, },
	},
	[ND_CMD_ARS_STATUS] = {
		.out_num = 2,
		.out_sizes = { 4, UINT_MAX, },
	},
};
399
400const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd)
401{
402 if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs))
403 return &__nd_cmd_bus_descs[cmd];
404 return NULL;
405}
406EXPORT_SYMBOL_GPL(nd_cmd_bus_desc);
407
/*
 * Resolve the size of input field @idx for @cmd.  Fixed-size fields come
 * straight from the descriptor; variable-length fields (marked UINT_MAX)
 * are sized from the header already copied into @buf.  Returns UINT_MAX
 * when the size cannot be determined.
 */
u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
		const struct nd_cmd_desc *desc, int idx, void *buf)
{
	if (idx >= desc->in_num)
		return UINT_MAX;

	if (desc->in_sizes[idx] < UINT_MAX)
		return desc->in_sizes[idx];

	if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) {
		struct nd_cmd_set_config_hdr *hdr = buf;

		return hdr->in_length;
	} else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) {
		struct nd_cmd_vendor_hdr *hdr = buf;

		return hdr->in_length;
	}

	return UINT_MAX;
}
EXPORT_SYMBOL_GPL(nd_cmd_in_size);
430
/*
 * Resolve the size of output field @idx for @cmd.  Variable-length
 * fields are sized from previously parsed input/output fields; ARS
 * status output is capped at ND_CMD_ARS_STATUS_MAX.  Returns UINT_MAX
 * when the size cannot be determined.
 */
u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
		const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
		const u32 *out_field)
{
	if (idx >= desc->out_num)
		return UINT_MAX;

	if (desc->out_sizes[idx] < UINT_MAX)
		return desc->out_sizes[idx];

	if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1)
		return in_field[1];
	else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2)
		return out_field[1];
	else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 1)
		return ND_CMD_ARS_STATUS_MAX;

	return UINT_MAX;
}
EXPORT_SYMBOL_GPL(nd_cmd_out_size);
451
/*
 * Wait for all in-flight driver probes on the bus to finish.  Called
 * with the bus lock held; the lock is dropped while sleeping and
 * re-taken before re-checking, so probe_active must be re-read each
 * iteration.
 */
void wait_nvdimm_bus_probe_idle(struct device *dev)
{
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);

	do {
		if (nvdimm_bus->probe_active == 0)
			break;
		nvdimm_bus_unlock(&nvdimm_bus->dev);
		wait_event(nvdimm_bus->probe_wait,
				nvdimm_bus->probe_active == 0);
		nvdimm_bus_lock(&nvdimm_bus->dev);
	} while (true);
}
465
/* set_config requires an idle interleave set */
static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd)
{
	struct nvdimm_bus *nvdimm_bus;

	/* only SET_CONFIG_DATA to a dimm needs gating */
	if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
		return 0;

	/* flush in-flight probes, then refuse if the dimm is still active */
	nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev);
	wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev);

	if (atomic_read(&nvdimm->busy))
		return -EBUSY;
	return 0;
}
481
/*
 * Common ioctl implementation for bus-level (@nvdimm == NULL) and
 * per-dimm commands.  Walks the command's field descriptors to size the
 * variable-length user buffer, copies it in, dispatches to the bus
 * provider's ->ndctl(), and copies the result back.
 *
 * The static in_env/out_env scratch buffers are safe only because both
 * callers (nd_ioctl()/nvdimm_ioctl()) invoke this under
 * nvdimm_bus_list_mutex, serializing all ioctls.
 *
 * NOTE(review): the envelope headers are fetched from userspace twice
 * (once into in_env for sizing, again as part of the full buf copy) —
 * a concurrent writer could change the length between fetches; confirm
 * the provider validates buf_len against the payload it parses.
 */
static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
		int read_only, unsigned int ioctl_cmd, unsigned long arg)
{
	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
	size_t buf_len = 0, in_len = 0, out_len = 0;
	static char out_env[ND_CMD_MAX_ENVELOPE];
	static char in_env[ND_CMD_MAX_ENVELOPE];
	const struct nd_cmd_desc *desc = NULL;
	unsigned int cmd = _IOC_NR(ioctl_cmd);
	void __user *p = (void __user *) arg;
	struct device *dev = &nvdimm_bus->dev;
	const char *cmd_name, *dimm_name;
	unsigned long dsm_mask;
	void *buf;
	int rc, i;

	if (nvdimm) {
		desc = nd_cmd_dimm_desc(cmd);
		cmd_name = nvdimm_cmd_name(cmd);
		dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0;
		dimm_name = dev_name(&nvdimm->dev);
	} else {
		desc = nd_cmd_bus_desc(cmd);
		cmd_name = nvdimm_bus_cmd_name(cmd);
		dsm_mask = nd_desc->dsm_mask;
		dimm_name = "bus";
	}

	/* reject unknown commands and ones the provider does not support */
	if (!desc || (desc->out_num + desc->in_num == 0) ||
			!test_bit(cmd, &dsm_mask))
		return -ENOTTY;

	/* fail write commands (when read-only) */
	if (read_only)
		switch (ioctl_cmd) {
		case ND_IOCTL_VENDOR:
		case ND_IOCTL_SET_CONFIG_DATA:
		case ND_IOCTL_ARS_START:
			dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
					nvdimm ? nvdimm_cmd_name(cmd)
					: nvdimm_bus_cmd_name(cmd));
			return -EPERM;
		default:
			break;
		}

	/* process an input envelope */
	for (i = 0; i < desc->in_num; i++) {
		u32 in_size, copy;

		in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env);
		if (in_size == UINT_MAX) {
			dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n",
					__func__, dimm_name, cmd_name, i);
			return -ENXIO;
		}
		if (!access_ok(VERIFY_READ, p + in_len, in_size))
			return -EFAULT;
		/* only cache up to ND_CMD_MAX_ENVELOPE bytes for sizing */
		if (in_len < sizeof(in_env))
			copy = min_t(u32, sizeof(in_env) - in_len, in_size);
		else
			copy = 0;
		if (copy && copy_from_user(&in_env[in_len], p + in_len, copy))
			return -EFAULT;
		in_len += in_size;
	}

	/* process an output envelope */
	for (i = 0; i < desc->out_num; i++) {
		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i,
				(u32 *) in_env, (u32 *) out_env);
		u32 copy;

		if (out_size == UINT_MAX) {
			dev_dbg(dev, "%s:%s unknown output size cmd: %s field: %d\n",
					__func__, dimm_name, cmd_name, i);
			return -EFAULT;
		}
		if (!access_ok(VERIFY_WRITE, p + in_len + out_len, out_size))
			return -EFAULT;
		if (out_len < sizeof(out_env))
			copy = min_t(u32, sizeof(out_env) - out_len, out_size);
		else
			copy = 0;
		if (copy && copy_from_user(&out_env[out_len],
					p + in_len + out_len, copy))
			return -EFAULT;
		out_len += out_size;
	}

	buf_len = out_len + in_len;
	if (!access_ok(VERIFY_WRITE, p, sizeof(buf_len)))
		return -EFAULT;

	if (buf_len > ND_IOCTL_MAX_BUFLEN) {
		dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
				dimm_name, cmd_name, buf_len,
				ND_IOCTL_MAX_BUFLEN);
		return -EINVAL;
	}

	buf = vmalloc(buf_len);
	if (!buf)
		return -ENOMEM;

	if (copy_from_user(buf, p, buf_len)) {
		rc = -EFAULT;
		goto out;
	}

	/* serialize against namespace/region reconfiguration */
	nvdimm_bus_lock(&nvdimm_bus->dev);
	rc = nd_cmd_clear_to_send(nvdimm, cmd);
	if (rc)
		goto out_unlock;

	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len);
	if (rc < 0)
		goto out_unlock;
	if (copy_to_user(p, buf, buf_len))
		rc = -EFAULT;
 out_unlock:
	nvdimm_bus_unlock(&nvdimm_bus->dev);
 out:
	vfree(buf);
	return rc;
}
608
/*
 * ioctl entry for /dev/ndctl<N> (bus-level commands).  The minor number
 * stashed in file->private_data by nd_open() selects the bus; the list
 * mutex is held across __nd_ioctl() to keep the bus alive and serialize
 * the static envelope buffers.
 */
static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	long id = (long) file->private_data;
	int rc = -ENXIO, read_only;
	struct nvdimm_bus *nvdimm_bus;

	read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
	mutex_lock(&nvdimm_bus_list_mutex);
	list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
		if (nvdimm_bus->id == id) {
			rc = __nd_ioctl(nvdimm_bus, NULL, read_only, cmd, arg);
			break;
		}
	}
	mutex_unlock(&nvdimm_bus_list_mutex);

	return rc;
}
627
628static int match_dimm(struct device *dev, void *data)
629{
630 long id = (long) data;
631
632 if (is_nvdimm(dev)) {
633 struct nvdimm *nvdimm = to_nvdimm(dev);
634
635 return nvdimm->id == id;
636 }
637
638 return 0;
639}
640
/*
 * ioctl entry for /dev/nmem<N> (per-dimm commands).  Scans every bus for
 * a dimm whose id matches the device minor (stored in private_data by
 * nd_open()); drops the child reference once the command completes.
 */
static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	int rc = -ENXIO, read_only;
	struct nvdimm_bus *nvdimm_bus;

	read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
	mutex_lock(&nvdimm_bus_list_mutex);
	list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
		struct device *dev = device_find_child(&nvdimm_bus->dev,
				file->private_data, match_dimm);
		struct nvdimm *nvdimm;

		if (!dev)
			continue;

		nvdimm = to_nvdimm(dev);
		rc = __nd_ioctl(nvdimm_bus, nvdimm, read_only, cmd, arg);
		put_device(dev); /* from device_find_child */
		break;
	}
	mutex_unlock(&nvdimm_bus_list_mutex);

	return rc;
}
665
/* Stash the char-device minor so the ioctl handlers can find their
 * bus/dimm; shared by both fops tables below. */
static int nd_open(struct inode *inode, struct file *file)
{
	long minor = iminor(inode);

	file->private_data = (void *) minor;
	return 0;
}
673
/* fops for /dev/ndctl<N> — bus-level commands */
static const struct file_operations nvdimm_bus_fops = {
	.owner = THIS_MODULE,
	.open = nd_open,
	.unlocked_ioctl = nd_ioctl,
	.compat_ioctl = nd_ioctl,
	.llseek = noop_llseek,
};

/* fops for per-dimm character devices */
static const struct file_operations nvdimm_fops = {
	.owner = THIS_MODULE,
	.open = nd_open,
	.unlocked_ioctl = nvdimm_ioctl,
	.compat_ioctl = nvdimm_ioctl,
	.llseek = noop_llseek,
};
689
690int __init nvdimm_bus_init(void)
691{
692 int rc;
693
694 rc = bus_register(&nvdimm_bus_type);
695 if (rc)
696 return rc;
697
698 rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops);
699 if (rc < 0)
700 goto err_bus_chrdev;
701 nvdimm_bus_major = rc;
702
703 rc = register_chrdev(0, "dimmctl", &nvdimm_fops);
704 if (rc < 0)
705 goto err_dimm_chrdev;
706 nvdimm_major = rc;
707
708 nd_class = class_create(THIS_MODULE, "nd");
709 if (IS_ERR(nd_class))
710 goto err_class;
711
712 return 0;
713
714 err_class:
715 unregister_chrdev(nvdimm_major, "dimmctl");
716 err_dimm_chrdev:
717 unregister_chrdev(nvdimm_bus_major, "ndctl");
718 err_bus_chrdev:
719 bus_unregister(&nvdimm_bus_type);
720
721 return rc;
722}
723
/* Tear down everything registered by nvdimm_bus_init(), reverse order. */
void nvdimm_bus_exit(void)
{
	class_destroy(nd_class);
	unregister_chrdev(nvdimm_bus_major, "ndctl");
	unregister_chrdev(nvdimm_major, "dimmctl");
	bus_unregister(&nvdimm_bus_type);
}
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
new file mode 100644
index 000000000000..cb62ec6a12d0
--- /dev/null
+++ b/drivers/nvdimm/core.c
@@ -0,0 +1,465 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/libnvdimm.h>
14#include <linux/export.h>
15#include <linux/module.h>
16#include <linux/blkdev.h>
17#include <linux/device.h>
18#include <linux/ctype.h>
19#include <linux/ndctl.h>
20#include <linux/mutex.h>
21#include <linux/slab.h>
22#include "nd-core.h"
23#include "nd.h"
24
25LIST_HEAD(nvdimm_bus_list);
26DEFINE_MUTEX(nvdimm_bus_list_mutex);
27static DEFINE_IDA(nd_ida);
28
/*
 * Take the reconfiguration mutex of the bus that @dev hangs off of.
 * Silently a no-op when @dev is not on an nd bus (walk fails) — callers
 * rely on that for early/teardown paths.
 */
void nvdimm_bus_lock(struct device *dev)
{
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);

	if (!nvdimm_bus)
		return;
	mutex_lock(&nvdimm_bus->reconfig_mutex);
}
EXPORT_SYMBOL(nvdimm_bus_lock);

/* Counterpart of nvdimm_bus_lock(); no-op off-bus as well. */
void nvdimm_bus_unlock(struct device *dev)
{
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);

	if (!nvdimm_bus)
		return;
	mutex_unlock(&nvdimm_bus->reconfig_mutex);
}
EXPORT_SYMBOL(nvdimm_bus_unlock);

/* Lockdep-light assertion helper: is the bus reconfig mutex held? */
bool is_nvdimm_bus_locked(struct device *dev)
{
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);

	if (!nvdimm_bus)
		return false;
	return mutex_is_locked(&nvdimm_bus->reconfig_mutex);
}
EXPORT_SYMBOL(is_nvdimm_bus_locked);
58
59u64 nd_fletcher64(void *addr, size_t len, bool le)
60{
61 u32 *buf = addr;
62 u32 lo32 = 0;
63 u64 hi32 = 0;
64 int i;
65
66 for (i = 0; i < len / sizeof(u32); i++) {
67 lo32 += le ? le32_to_cpu((__le32) buf[i]) : buf[i];
68 hi32 += lo32;
69 }
70
71 return hi32 << 32 | lo32;
72}
73EXPORT_SYMBOL_GPL(nd_fletcher64);
74
75static void nvdimm_bus_release(struct device *dev)
76{
77 struct nvdimm_bus *nvdimm_bus;
78
79 nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
80 ida_simple_remove(&nd_ida, nvdimm_bus->id);
81 kfree(nvdimm_bus);
82}
83
/* Container-of cast with a type check via the known release function. */
struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
{
	struct nvdimm_bus *nvdimm_bus;

	nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
	WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release);
	return nvdimm_bus;
}
EXPORT_SYMBOL_GPL(to_nvdimm_bus);

/* Accessor for providers; keeps struct nvdimm_bus opaque outside core. */
struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus)
{
	/* struct nvdimm_bus definition is private to libnvdimm */
	return nvdimm_bus->nd_desc;
}
EXPORT_SYMBOL_GPL(to_nd_desc);
100
101struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
102{
103 struct device *dev;
104
105 for (dev = nd_dev; dev; dev = dev->parent)
106 if (dev->release == nvdimm_bus_release)
107 break;
108 dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
109 if (dev)
110 return to_nvdimm_bus(dev);
111 return NULL;
112}
113
/* Accept the separators permitted between uuid byte pairs. */
static bool is_uuid_sep(char sep)
{
	switch (sep) {
	case '\n':
	case '-':
	case ':':
	case '\0':
		return true;
	default:
		return false;
	}
}
120
/*
 * Parse 16 uuid bytes from @buf into @uuid_out.  Each byte is two hex
 * digits, optionally followed by a single separator accepted by
 * is_uuid_sep().  Returns 0 or -EINVAL; @uuid_out is only written on
 * full success.
 */
static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf,
		size_t len)
{
	const char *str = buf;
	u8 uuid[16];
	int i;

	for (i = 0; i < 16; i++) {
		if (!isxdigit(str[0]) || !isxdigit(str[1])) {
			dev_dbg(dev, "%s: pos: %d buf[%zd]: %c buf[%zd]: %c\n",
					__func__, i, str - buf, str[0],
					str + 1 - buf, str[1]);
			return -EINVAL;
		}

		uuid[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
		str += 2;
		if (is_uuid_sep(*str))
			str++;
	}

	/* commit only after the whole string validated */
	memcpy(uuid_out, uuid, sizeof(uuid));
	return 0;
}
145
/**
 * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes
 * @dev: container device for the uuid property
 * @uuid_out: uuid buffer to replace
 * @buf: raw sysfs buffer to parse
 * @len: length of @buf
 *
 * Enforce that uuids can only be changed while the device is disabled
 * (driver detached).  On success the old uuid (if any) is freed and
 * replaced with a fresh allocation; on -ENOMEM *@uuid_out is left NULL.
 * LOCKING: expects device_lock() is held on entry
 */
int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
		size_t len)
{
	u8 uuid[16];
	int rc;

	if (dev->driver)
		return -EBUSY;

	rc = nd_uuid_parse(dev, uuid, buf, len);
	if (rc)
		return rc;

	kfree(*uuid_out);
	*uuid_out = kmemdup(uuid, sizeof(uuid), GFP_KERNEL);
	if (!(*uuid_out))
		return -ENOMEM;

	return 0;
}
176
/*
 * Render a zero-terminated list of supported sector sizes into @buf,
 * bracketing the currently selected one, e.g. "512 [4096] \n".
 * Returns the number of bytes written.
 */
ssize_t nd_sector_size_show(unsigned long current_lbasize,
		const unsigned long *supported, char *buf)
{
	ssize_t off = 0;
	int i;

	for (i = 0; supported[i]; i++) {
		if (supported[i] == current_lbasize)
			off += sprintf(buf + off, "[%ld] ", supported[i]);
		else
			off += sprintf(buf + off, "%ld ", supported[i]);
	}
	off += sprintf(buf + off, "\n");
	return off;
}
191
192ssize_t nd_sector_size_store(struct device *dev, const char *buf,
193 unsigned long *current_lbasize, const unsigned long *supported)
194{
195 unsigned long lbasize;
196 int rc, i;
197
198 if (dev->driver)
199 return -EBUSY;
200
201 rc = kstrtoul(buf, 0, &lbasize);
202 if (rc)
203 return rc;
204
205 for (i = 0; supported[i]; i++)
206 if (lbasize == supported[i])
207 break;
208
209 if (supported[i]) {
210 *current_lbasize = lbasize;
211 return 0;
212 } else {
213 return -EINVAL;
214 }
215}
216
/*
 * Begin block-layer accounting for @bio against the whole-disk
 * partition; records the start jiffies in *@start for nd_iostat_end().
 */
void __nd_iostat_start(struct bio *bio, unsigned long *start)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	int cpu = part_stat_lock();

	*start = jiffies;
	part_round_stats(cpu, &disk->part0);
	part_stat_inc(cpu, &disk->part0, ios[rw]);
	part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
	part_inc_in_flight(&disk->part0, rw);
	part_stat_unlock();
}
EXPORT_SYMBOL(__nd_iostat_start);

/* Complete the accounting started by __nd_iostat_start(). */
void nd_iostat_end(struct bio *bio, unsigned long start)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	unsigned long duration = jiffies - start;
	const int rw = bio_data_dir(bio);
	int cpu = part_stat_lock();

	part_stat_add(cpu, &disk->part0, ticks[rw], duration);
	part_round_stats(cpu, &disk->part0);
	part_dec_in_flight(&disk->part0, rw);
	part_stat_unlock();
}
EXPORT_SYMBOL(nd_iostat_end);
245
/* sysfs 'commands': space-separated names of the bus commands the
 * provider advertises in its dsm_mask. */
static ssize_t commands_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	int cmd, len = 0;
	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;

	for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG)
		len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd));
	len += sprintf(buf + len, "\n");
	return len;
}
static DEVICE_ATTR_RO(commands);
259
260static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus)
261{
262 struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
263 struct device *parent = nvdimm_bus->dev.parent;
264
265 if (nd_desc->provider_name)
266 return nd_desc->provider_name;
267 else if (parent)
268 return dev_name(parent);
269 else
270 return "unknown";
271}
272
/* sysfs 'provider': see nvdimm_bus_provider(). */
static ssize_t provider_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);

	return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus));
}
static DEVICE_ATTR_RO(provider);
281
/* Lock/unlock each namespace to drain any in-flight probe holding it. */
static int flush_namespaces(struct device *dev, void *data)
{
	device_lock(dev);
	device_unlock(dev);
	return 0;
}

/* Same drain for regions/dimms, then recurse into their namespaces. */
static int flush_regions_dimms(struct device *dev, void *data)
{
	device_lock(dev);
	device_unlock(dev);
	device_for_each_child(dev, NULL, flush_namespaces);
	return 0;
}

/*
 * sysfs 'wait_probe': reading blocks until all async registration and
 * in-flight probing under this bus has settled, then returns "1".
 */
static ssize_t wait_probe_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	nd_synchronize();
	device_for_each_child(dev, NULL, flush_regions_dimms);
	return sprintf(buf, "1\n");
}
static DEVICE_ATTR_RO(wait_probe);
305
306static struct attribute *nvdimm_bus_attributes[] = {
307 &dev_attr_commands.attr,
308 &dev_attr_wait_probe.attr,
309 &dev_attr_provider.attr,
310 NULL,
311};
312
313struct attribute_group nvdimm_bus_attribute_group = {
314 .attrs = nvdimm_bus_attributes,
315};
316EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
317
318struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
319 struct nvdimm_bus_descriptor *nd_desc, struct module *module)
320{
321 struct nvdimm_bus *nvdimm_bus;
322 int rc;
323
324 nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
325 if (!nvdimm_bus)
326 return NULL;
327 INIT_LIST_HEAD(&nvdimm_bus->list);
328 init_waitqueue_head(&nvdimm_bus->probe_wait);
329 nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
330 mutex_init(&nvdimm_bus->reconfig_mutex);
331 if (nvdimm_bus->id < 0) {
332 kfree(nvdimm_bus);
333 return NULL;
334 }
335 nvdimm_bus->nd_desc = nd_desc;
336 nvdimm_bus->module = module;
337 nvdimm_bus->dev.parent = parent;
338 nvdimm_bus->dev.release = nvdimm_bus_release;
339 nvdimm_bus->dev.groups = nd_desc->attr_groups;
340 dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
341 rc = device_register(&nvdimm_bus->dev);
342 if (rc) {
343 dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
344 goto err;
345 }
346
347 rc = nvdimm_bus_create_ndctl(nvdimm_bus);
348 if (rc)
349 goto err;
350
351 mutex_lock(&nvdimm_bus_list_mutex);
352 list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
353 mutex_unlock(&nvdimm_bus_list_mutex);
354
355 return nvdimm_bus;
356 err:
357 put_device(&nvdimm_bus->dev);
358 return NULL;
359}
360EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
361
362static int child_unregister(struct device *dev, void *data)
363{
364 /*
365 * the singular ndctl class device per bus needs to be
366 * "device_destroy"ed, so skip it here
367 *
368 * i.e. remove classless children
369 */
370 if (dev->class)
371 /* pass */;
372 else
373 nd_device_unregister(dev, ND_SYNC);
374 return 0;
375}
376
/*
 * nvdimm_bus_unregister - tear down a bus from __nvdimm_bus_register()
 *
 * Remove the bus from the global list first so it can no longer be
 * looked up, drain in-flight async registrations, unregister all child
 * devices, then drop the bus device itself.
 */
void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
{
	/* tolerate NULL so error paths can call this unconditionally */
	if (!nvdimm_bus)
		return;

	mutex_lock(&nvdimm_bus_list_mutex);
	list_del_init(&nvdimm_bus->list);
	mutex_unlock(&nvdimm_bus_list_mutex);

	nd_synchronize();
	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
	nvdimm_bus_destroy_ndctl(nvdimm_bus);

	device_unregister(&nvdimm_bus->dev);
}
EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
393
#ifdef CONFIG_BLK_DEV_INTEGRITY
/* no-op generate/verify hook: the metadata is opaque to the block layer */
static int nd_pi_nop_generate_verify(struct blk_integrity_iter *iter)
{
	return 0;
}

/*
 * nd_integrity_init - attach a pass-through integrity profile to @disk
 * @meta_size: bytes of per-sector metadata to carry (0 = nothing to do)
 *
 * Registers a profile whose generate/verify hooks do nothing, which
 * simply lets the block layer transport @meta_size bytes of metadata
 * alongside each sector.  Returns 0 or the blk_integrity_register()
 * error.
 */
int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
{
	struct blk_integrity integrity = {
		.name = "ND-PI-NOP",
		.generate_fn = nd_pi_nop_generate_verify,
		.verify_fn = nd_pi_nop_generate_verify,
		.tuple_size = meta_size,
		.tag_size = meta_size,
	};
	int ret;

	if (meta_size == 0)
		return 0;

	ret = blk_integrity_register(disk, &integrity);
	if (ret)
		return ret;

	/* restrict integrity payloads to one segment per request */
	blk_queue_max_integrity_segments(disk->queue, 1);

	return 0;
}
EXPORT_SYMBOL(nd_integrity_init);

#else /* CONFIG_BLK_DEV_INTEGRITY */
/* integrity compiled out: succeed and carry no per-sector metadata */
int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
{
	return 0;
}
EXPORT_SYMBOL(nd_integrity_init);

#endif
432
/* module init: bring up bus, dimm, and region sub-components in order */
static __init int libnvdimm_init(void)
{
	int rc;

	rc = nvdimm_bus_init();
	if (rc)
		return rc;
	rc = nvdimm_init();
	if (rc)
		goto err_dimm;
	rc = nd_region_init();
	if (rc)
		goto err_region;
	return 0;
	/* unwind in reverse registration order */
 err_region:
	nvdimm_exit();
 err_dimm:
	nvdimm_bus_exit();
	return rc;
}

static __exit void libnvdimm_exit(void)
{
	/* every provider must have unregistered its bus before unload */
	WARN_ON(!list_empty(&nvdimm_bus_list));
	nd_region_exit();
	nvdimm_exit();
	nvdimm_bus_exit();
}

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Intel Corporation");
subsys_initcall(libnvdimm_init);
module_exit(libnvdimm_exit);
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
new file mode 100644
index 000000000000..71d12bb67339
--- /dev/null
+++ b/drivers/nvdimm/dimm.c
@@ -0,0 +1,102 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/vmalloc.h>
14#include <linux/module.h>
15#include <linux/device.h>
16#include <linux/sizes.h>
17#include <linux/ndctl.h>
18#include <linux/slab.h>
19#include <linux/mm.h>
20#include <linux/nd.h>
21#include "label.h"
22#include "nd.h"
23
/*
 * nvdimm_probe - bind the dimm driver: cache and validate the label area
 *
 * Allocates the per-dimm driver data (ndd), reads the config data area
 * from the device, then validates the on-media label index under the
 * bus lock.  ndd is reference counted; the device reference taken here
 * is dropped by nvdimm_drvdata_release().
 */
static int nvdimm_probe(struct device *dev)
{
	struct nvdimm_drvdata *ndd;
	int rc;

	ndd = kzalloc(sizeof(*ndd), GFP_KERNEL);
	if (!ndd)
		return -ENOMEM;

	dev_set_drvdata(dev, ndd);
	ndd->dpa.name = dev_name(dev);
	/* no valid namespace index is known yet */
	ndd->ns_current = -1;
	ndd->ns_next = -1;
	ndd->dpa.start = 0;
	ndd->dpa.end = -1;
	ndd->dev = dev;
	/* paired with put_device() in nvdimm_drvdata_release() */
	get_device(dev);
	kref_init(&ndd->kref);

	rc = nvdimm_init_nsarea(ndd);
	if (rc)
		goto err;

	rc = nvdimm_init_config_data(ndd);
	if (rc)
		goto err;

	dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size);

	/* label-area updates are serialized by the bus lock */
	nvdimm_bus_lock(dev);
	ndd->ns_current = nd_label_validate(ndd);
	ndd->ns_next = nd_label_next_nsindex(ndd->ns_current);
	nd_label_copy(ndd, to_next_namespace_index(ndd),
			to_current_namespace_index(ndd));
	rc = nd_label_reserve_dpa(ndd);
	nvdimm_bus_unlock(dev);

	if (rc)
		goto err;

	return 0;

 err:
	put_ndd(ndd);
	return rc;
}
70
static int nvdimm_remove(struct device *dev)
{
	struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);

	/* clear drvdata under the bus lock so lookups see NULL atomically */
	nvdimm_bus_lock(dev);
	dev_set_drvdata(dev, NULL);
	nvdimm_bus_unlock(dev);
	/* drop the probe reference; frees ndd once the last user is done */
	put_ndd(ndd);

	return 0;
}

static struct nd_device_driver nvdimm_driver = {
	.probe = nvdimm_probe,
	.remove = nvdimm_remove,
	.drv = {
		.name = "nvdimm",
	},
	.type = ND_DRIVER_DIMM,
};

int __init nvdimm_init(void)
{
	return nd_driver_register(&nvdimm_driver);
}

void nvdimm_exit(void)
{
	driver_unregister(&nvdimm_driver.drv);
}

MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
new file mode 100644
index 000000000000..c05eb807d674
--- /dev/null
+++ b/drivers/nvdimm/dimm_devs.c
@@ -0,0 +1,551 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14#include <linux/vmalloc.h>
15#include <linux/device.h>
16#include <linux/ndctl.h>
17#include <linux/slab.h>
18#include <linux/io.h>
19#include <linux/fs.h>
20#include <linux/mm.h>
21#include "nd-core.h"
22#include "label.h"
23#include "nd.h"
24
25static DEFINE_IDA(dimm_ida);
26
/*
 * Retrieve bus and dimm handle and return if this bus supports
 * get_config_data commands
 */
static int __validate_dimm(struct nvdimm_drvdata *ndd)
{
	struct nvdimm *nvdimm;

	if (!ndd)
		return -EINVAL;

	nvdimm = to_nvdimm(ndd->dev);

	/* the provider must advertise the label-read command */
	if (!nvdimm->dsm_mask)
		return -ENXIO;
	if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask))
		return -ENXIO;

	return 0;
}

/* wrapper that debug-logs validation failures with the caller's address */
static int validate_dimm(struct nvdimm_drvdata *ndd)
{
	int rc = __validate_dimm(ndd);

	if (rc && ndd)
		dev_dbg(ndd->dev, "%pf: %s error: %d\n",
				__builtin_return_address(0), __func__, rc);
	return rc;
}
57
/**
 * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area
 * @ndd: dimm driver-data to initialize
 *
 * Issues ND_CMD_GET_CONFIG_SIZE to learn the size and maximum transfer
 * unit of the label storage area.  The result is cached in ndd->nsarea;
 * a non-zero config_size means the query already succeeded.
 */
int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
{
	struct nd_cmd_get_config_size *cmd = &ndd->nsarea;
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
	struct nvdimm_bus_descriptor *nd_desc;
	int rc = validate_dimm(ndd);

	if (rc)
		return rc;

	if (cmd->config_size)
		return 0; /* already valid */

	memset(cmd, 0, sizeof(*cmd));
	nd_desc = nvdimm_bus->nd_desc;
	return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
			ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd));
}
80
81int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
82{
83 struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
84 struct nd_cmd_get_config_data_hdr *cmd;
85 struct nvdimm_bus_descriptor *nd_desc;
86 int rc = validate_dimm(ndd);
87 u32 max_cmd_size, config_size;
88 size_t offset;
89
90 if (rc)
91 return rc;
92
93 if (ndd->data)
94 return 0;
95
96 if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0
97 || ndd->nsarea.config_size < ND_LABEL_MIN_SIZE) {
98 dev_dbg(ndd->dev, "failed to init config data area: (%d:%d)\n",
99 ndd->nsarea.max_xfer, ndd->nsarea.config_size);
100 return -ENXIO;
101 }
102
103 ndd->data = kmalloc(ndd->nsarea.config_size, GFP_KERNEL);
104 if (!ndd->data)
105 ndd->data = vmalloc(ndd->nsarea.config_size);
106
107 if (!ndd->data)
108 return -ENOMEM;
109
110 max_cmd_size = min_t(u32, PAGE_SIZE, ndd->nsarea.max_xfer);
111 cmd = kzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL);
112 if (!cmd)
113 return -ENOMEM;
114
115 nd_desc = nvdimm_bus->nd_desc;
116 for (config_size = ndd->nsarea.config_size, offset = 0;
117 config_size; config_size -= cmd->in_length,
118 offset += cmd->in_length) {
119 cmd->in_length = min(config_size, max_cmd_size);
120 cmd->in_offset = offset;
121 rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
122 ND_CMD_GET_CONFIG_DATA, cmd,
123 cmd->in_length + sizeof(*cmd));
124 if (rc || cmd->status) {
125 rc = -ENXIO;
126 break;
127 }
128 memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length);
129 }
130 dev_dbg(ndd->dev, "%s: len: %zu rc: %d\n", __func__, offset, rc);
131 kfree(cmd);
132
133 return rc;
134}
135
/*
 * nvdimm_set_config_data - write @len bytes at @offset in the label area
 *
 * Splits the write into max_xfer-bounded ND_CMD_SET_CONFIG_DATA calls.
 * Each command buffer is laid out as header + payload + trailing
 * 4-byte status word.
 */
int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
		void *buf, size_t len)
{
	int rc = validate_dimm(ndd);
	size_t max_cmd_size, buf_offset;
	struct nd_cmd_set_config_hdr *cmd;
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;

	if (rc)
		return rc;

	/* the config area must have been read/cached first */
	if (!ndd->data)
		return -ENXIO;

	/* reject writes that run past the end of the config area */
	if (offset + len > ndd->nsarea.config_size)
		return -ENXIO;

	max_cmd_size = min_t(u32, PAGE_SIZE, len);
	max_cmd_size = min_t(u32, max_cmd_size, ndd->nsarea.max_xfer);
	cmd = kzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL);
	if (!cmd)
		return -ENOMEM;

	for (buf_offset = 0; len; len -= cmd->in_length,
			buf_offset += cmd->in_length) {
		size_t cmd_size;
		u32 *status;

		cmd->in_offset = offset + buf_offset;
		cmd->in_length = min(max_cmd_size, len);
		memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length);

		/* status is output in the last 4-bytes of the command buffer */
		cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32);
		status = ((void *) cmd) + cmd_size - sizeof(u32);

		rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
				ND_CMD_SET_CONFIG_DATA, cmd, cmd_size);
		if (rc || *status) {
			/* keep the transport rc; map status-only to -ENXIO */
			rc = rc ? rc : -ENXIO;
			break;
		}
	}
	kfree(cmd);

	return rc;
}
184
/* device release: reclaim the dimm id and memory once the last ref drops */
static void nvdimm_release(struct device *dev)
{
	struct nvdimm *nvdimm = to_nvdimm(dev);

	ida_simple_remove(&dimm_ida, nvdimm->id);
	kfree(nvdimm);
}

static struct device_type nvdimm_device_type = {
	.name = "nvdimm",
	.release = nvdimm_release,
};

/* type check used before container_of() conversions */
bool is_nvdimm(struct device *dev)
{
	return dev->type == &nvdimm_device_type;
}

struct nvdimm *to_nvdimm(struct device *dev)
{
	struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev);

	/* warn (but still convert) if handed a non-dimm device */
	WARN_ON(!is_nvdimm(dev));
	return nvdimm;
}
EXPORT_SYMBOL_GPL(to_nvdimm);

/* a BLK region has a single mapping; return its backing dimm */
struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr)
{
	struct nd_region *nd_region = &ndbr->nd_region;
	struct nd_mapping *nd_mapping = &nd_region->mapping[0];

	return nd_mapping->nvdimm;
}
EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm);

/* fetch the mapping's label driver-data; only stable under the bus lock */
struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping)
{
	struct nvdimm *nvdimm = nd_mapping->nvdimm;

	WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev));

	return dev_get_drvdata(&nvdimm->dev);
}
EXPORT_SYMBOL(to_ndd);
230
231void nvdimm_drvdata_release(struct kref *kref)
232{
233 struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref);
234 struct device *dev = ndd->dev;
235 struct resource *res, *_r;
236
237 dev_dbg(dev, "%s\n", __func__);
238
239 nvdimm_bus_lock(dev);
240 for_each_dpa_resource_safe(ndd, res, _r)
241 nvdimm_free_dpa(ndd, res);
242 nvdimm_bus_unlock(dev);
243
244 if (ndd->data && is_vmalloc_addr(ndd->data))
245 vfree(ndd->data);
246 else
247 kfree(ndd->data);
248 kfree(ndd);
249 put_device(dev);
250}
251
void get_ndd(struct nvdimm_drvdata *ndd)
{
	kref_get(&ndd->kref);
}

/* drop a reference; NULL is tolerated for error-path convenience */
void put_ndd(struct nvdimm_drvdata *ndd)
{
	if (ndd)
		kref_put(&ndd->kref, nvdimm_drvdata_release);
}

const char *nvdimm_name(struct nvdimm *nvdimm)
{
	return dev_name(&nvdimm->dev);
}
EXPORT_SYMBOL_GPL(nvdimm_name);

/* opaque per-dimm cookie supplied by the bus provider at create time */
void *nvdimm_provider_data(struct nvdimm *nvdimm)
{
	if (nvdimm)
		return nvdimm->provider_data;
	return NULL;
}
EXPORT_SYMBOL_GPL(nvdimm_provider_data);
276
277static ssize_t commands_show(struct device *dev,
278 struct device_attribute *attr, char *buf)
279{
280 struct nvdimm *nvdimm = to_nvdimm(dev);
281 int cmd, len = 0;
282
283 if (!nvdimm->dsm_mask)
284 return sprintf(buf, "\n");
285
286 for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG)
287 len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd));
288 len += sprintf(buf + len, "\n");
289 return len;
290}
291static DEVICE_ATTR_RO(commands);
292
/* sysfs: "active" while any namespace backed by this dimm is in use */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvdimm *nvdimm = to_nvdimm(dev);

	/*
	 * The state may be in the process of changing, userspace should
	 * quiesce probing if it wants a static answer
	 */
	nvdimm_bus_lock(dev);
	nvdimm_bus_unlock(dev);
	return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy)
			? "active" : "idle");
}
static DEVICE_ATTR_RO(state);

/* sysfs: free label slots, holding one back as the update scratch slot */
static ssize_t available_slots_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
	ssize_t rc;
	u32 nfree;

	if (!ndd)
		return -ENXIO;

	nvdimm_bus_lock(dev);
	nfree = nd_label_nfree(ndd);
	/* guard the reserved-slot decrement against u32 underflow */
	if (nfree - 1 > nfree) {
		dev_WARN_ONCE(dev, 1, "we ate our last label?\n");
		nfree = 0;
	} else
		nfree--;
	rc = sprintf(buf, "%d\n", nfree);
	nvdimm_bus_unlock(dev);
	return rc;
}
static DEVICE_ATTR_RO(available_slots);

static struct attribute *nvdimm_attributes[] = {
	&dev_attr_state.attr,
	&dev_attr_commands.attr,
	&dev_attr_available_slots.attr,
	NULL,
};

struct attribute_group nvdimm_attribute_group = {
	.attrs = nvdimm_attributes,
};
EXPORT_SYMBOL_GPL(nvdimm_attribute_group);
343
/**
 * nvdimm_create - allocate and register a dimm device on @nvdimm_bus
 * @nvdimm_bus: parent bus
 * @provider_data: opaque cookie, retrievable via nvdimm_provider_data()
 * @groups: provider-specific sysfs attribute groups
 * @flags: dimm state flags
 * @dsm_mask: bitmask of ND_CMD_* commands this dimm supports
 *
 * Returns NULL on allocation or id failure.  Registration is async via
 * nd_device_register(); teardown goes through nvdimm_release().
 */
struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
		const struct attribute_group **groups, unsigned long flags,
		unsigned long *dsm_mask)
{
	struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
	struct device *dev;

	if (!nvdimm)
		return NULL;

	nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL);
	if (nvdimm->id < 0) {
		kfree(nvdimm);
		return NULL;
	}
	nvdimm->provider_data = provider_data;
	nvdimm->flags = flags;
	nvdimm->dsm_mask = dsm_mask;
	atomic_set(&nvdimm->busy, 0);
	dev = &nvdimm->dev;
	dev_set_name(dev, "nmem%d", nvdimm->id);
	dev->parent = &nvdimm_bus->dev;
	dev->type = &nvdimm_device_type;
	/* the dimm id doubles as the char-dev minor number */
	dev->devt = MKDEV(nvdimm_major, nvdimm->id);
	dev->groups = groups;
	nd_device_register(dev);

	return nvdimm;
}
EXPORT_SYMBOL_GPL(nvdimm_create);
374
375/**
376 * nd_blk_available_dpa - account the unused dpa of BLK region
377 * @nd_mapping: container of dpa-resource-root + labels
378 *
379 * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges.
380 */
381resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping)
382{
383 struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
384 resource_size_t map_end, busy = 0, available;
385 struct resource *res;
386
387 if (!ndd)
388 return 0;
389
390 map_end = nd_mapping->start + nd_mapping->size - 1;
391 for_each_dpa_resource(ndd, res)
392 if (res->start >= nd_mapping->start && res->start < map_end) {
393 resource_size_t end = min(map_end, res->end);
394
395 busy += end - res->start + 1;
396 } else if (res->end >= nd_mapping->start
397 && res->end <= map_end) {
398 busy += res->end - nd_mapping->start;
399 } else if (nd_mapping->start > res->start
400 && nd_mapping->start < res->end) {
401 /* total eclipse of the BLK region mapping */
402 busy += nd_mapping->size;
403 }
404
405 available = map_end - nd_mapping->start + 1;
406 if (busy < available)
407 return available - busy;
408 return 0;
409}
410
/**
 * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa
 * @nd_region: constrain available space check to this reference region
 * @nd_mapping: container of dpa-resource-root + labels
 * @overlap: calculate available space assuming this level of overlap
 *
 * Validate that a PMEM label, if present, aligns with the start of an
 * interleave set and truncate the available size at the lowest BLK
 * overlap point.
 *
 * The expectation is that this routine is called multiple times as it
 * probes for the largest BLK encroachment for any single member DIMM of
 * the interleave set. Once that value is determined the PMEM-limit for
 * the set can be established.
 */
resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
		struct nd_mapping *nd_mapping, resource_size_t *overlap)
{
	resource_size_t map_start, map_end, busy = 0, available, blk_start;
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	struct resource *res;
	const char *reason;

	if (!ndd)
		return 0;

	map_start = nd_mapping->start;
	map_end = map_start + nd_mapping->size - 1;
	/* start from the assumption that BLK encroaches *overlap bytes
	 * from the top of the mapping */
	blk_start = max(map_start, map_end + 1 - *overlap);
	for_each_dpa_resource(ndd, res)
		if (res->start >= map_start && res->start < map_end) {
			/* allocation begins inside the mapping */
			if (strncmp(res->name, "blk", 3) == 0)
				blk_start = min(blk_start, res->start);
			else if (res->start != map_start) {
				reason = "misaligned to iset";
				goto err;
			} else {
				if (busy) {
					reason = "duplicate overlapping PMEM reservations?";
					goto err;
				}
				busy += resource_size(res);
				continue;
			}
		} else if (res->end >= map_start && res->end <= map_end) {
			if (strncmp(res->name, "blk", 3) == 0) {
				/*
				 * If a BLK allocation overlaps the start of
				 * PMEM the entire interleave set may now only
				 * be used for BLK.
				 */
				blk_start = map_start;
			} else {
				reason = "misaligned to iset";
				goto err;
			}
		} else if (map_start > res->start && map_start < res->end) {
			/* total eclipse of the mapping */
			busy += nd_mapping->size;
			blk_start = map_start;
		}

	/* report how far BLK reaches down into the mapping */
	*overlap = map_end + 1 - blk_start;
	available = blk_start - map_start;
	if (busy < available)
		return available - busy;
	return 0;

 err:
	/*
	 * Something is wrong, PMEM must align with the start of the
	 * interleave set, and there can only be one allocation per set.
	 */
	nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason);
	return 0;
}
487
/* release one dpa reservation; caller must hold the bus lock */
void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res)
{
	WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev));
	/* the resource name was kmemdup'ed in nvdimm_allocate_dpa() */
	kfree(res->name);
	__release_region(&ndd->dpa, res->start, resource_size(res));
}

/* reserve [start, start + n) in the dimm's dpa resource tree */
struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
		struct nd_label_id *label_id, resource_size_t start,
		resource_size_t n)
{
	/* duplicate the label id so the resource owns its name's lifetime */
	char *name = kmemdup(label_id, sizeof(*label_id), GFP_KERNEL);
	struct resource *res;

	if (!name)
		return NULL;

	WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev));
	res = __request_region(&ndd->dpa, start, n, name, 0);
	if (!res)
		kfree(name);
	return res;
}

/**
 * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id
 * @ndd: container of dpa-resource-root + labels
 * @label_id: dpa resource name of the form {pmem|blk}-<human readable uuid>
 */
resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
		struct nd_label_id *label_id)
{
	resource_size_t allocated = 0;
	struct resource *res;

	for_each_dpa_resource(ndd, res)
		if (strcmp(res->name, label_id->id) == 0)
			allocated += resource_size(res);

	return allocated;
}
529
/* device_for_each_child() callback: tally children that are dimms */
static int count_dimms(struct device *dev, void *c)
{
	int *count = c;

	if (is_nvdimm(dev))
		(*count)++;
	return 0;
}

/*
 * nvdimm_bus_check_dimm_count - verify all expected dimms registered
 *
 * Returns -ENXIO when the number of registered dimm children does not
 * match the provider's expectation (i.e. a registration failed).
 */
int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count)
{
	int count = 0;
	/* Flush any possible dimm registration failures */
	nd_synchronize();

	device_for_each_child(&nvdimm_bus->dev, &count, count_dimms);
	dev_dbg(&nvdimm_bus->dev, "%s: count: %d\n", __func__, count);
	if (count != dimm_count)
		return -ENXIO;
	return 0;
}
EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count);
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
new file mode 100644
index 000000000000..96526dcfdd37
--- /dev/null
+++ b/drivers/nvdimm/label.c
@@ -0,0 +1,927 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/device.h>
14#include <linux/ndctl.h>
15#include <linux/slab.h>
16#include <linux/io.h>
17#include <linux/nd.h>
18#include "nd-core.h"
19#include "label.h"
20#include "nd.h"
21
22static u32 best_seq(u32 a, u32 b)
23{
24 a &= NSINDEX_SEQ_MASK;
25 b &= NSINDEX_SEQ_MASK;
26
27 if (a == 0 || a == b)
28 return b;
29 else if (b == 0)
30 return a;
31 else if (nd_inc_seq(a) == b)
32 return b;
33 else
34 return a;
35}
36
/*
 * sizeof_namespace_index - size of one on-media namespace index block
 *
 * Budgets ~1 byte of index overhead per label (the 1/129 split below)
 * and rounds down to NSINDEX_ALIGN.  The result is cached in
 * ndd->nsindex_size.
 */
size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
{
	u32 index_span;

	if (ndd->nsindex_size)
		return ndd->nsindex_size;

	/*
	 * The minimum index space is 512 bytes, with that amount of
	 * index we can describe ~1400 labels which is less than a byte
	 * of overhead per label. Round up to a byte of overhead per
	 * label and determine the size of the index region. Yes, this
	 * starts to waste space at larger config_sizes, but it's
	 * unlikely we'll ever see anything but 128K.
	 */
	index_span = ndd->nsarea.config_size / 129;
	index_span /= NSINDEX_ALIGN * 2;
	ndd->nsindex_size = index_span * NSINDEX_ALIGN;

	return ndd->nsindex_size;
}

/* worst-case slot count, using the same 1/129 overhead budget as above */
int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
{
	return ndd->nsarea.config_size / 129;
}
63
/*
 * nd_label_validate - find the currently-active namespace index block
 *
 * Returns 0 or 1 (the valid/best index), or -1 when neither index
 * block passes validation (uninitialized or corrupt label area).
 */
int nd_label_validate(struct nvdimm_drvdata *ndd)
{
	/*
	 * On media label format consists of two index blocks followed
	 * by an array of labels. None of these structures are ever
	 * updated in place. A sequence number tracks the current
	 * active index and the next one to write, while labels are
	 * written to free slots.
	 *
	 *     +------------+
	 *     |            |
	 *     |  nsindex0  |
	 *     |            |
	 *     +------------+
	 *     |            |
	 *     |  nsindex1  |
	 *     |            |
	 *     +------------+
	 *     |   label0   |
	 *     +------------+
	 *     |   label1   |
	 *     +------------+
	 *     |            |
	 *      ....nslot...
	 *     |            |
	 *     +------------+
	 *     |   labelN   |
	 *     +------------+
	 */
	struct nd_namespace_index *nsindex[] = {
		to_namespace_index(ndd, 0),
		to_namespace_index(ndd, 1),
	};
	const int num_index = ARRAY_SIZE(nsindex);
	struct device *dev = ndd->dev;
	bool valid[2] = { 0 };
	int i, num_valid = 0;
	u32 seq;

	for (i = 0; i < num_index; i++) {
		u32 nslot;
		u8 sig[NSINDEX_SIG_LEN];
		u64 sum_save, sum, size;

		memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN);
		if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) {
			dev_dbg(dev, "%s: nsindex%d signature invalid\n",
					__func__, i);
			continue;
		}

		/* the checksum is computed with the checksum field zeroed */
		sum_save = __le64_to_cpu(nsindex[i]->checksum);
		nsindex[i]->checksum = __cpu_to_le64(0);
		sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1);
		nsindex[i]->checksum = __cpu_to_le64(sum_save);
		if (sum != sum_save) {
			dev_dbg(dev, "%s: nsindex%d checksum invalid\n",
					__func__, i);
			continue;
		}

		seq = __le32_to_cpu(nsindex[i]->seq);
		if ((seq & NSINDEX_SEQ_MASK) == 0) {
			dev_dbg(dev, "%s: nsindex%d sequence: %#x invalid\n",
					__func__, i, seq);
			continue;
		}

		/* sanity check the index against expected values */
		if (__le64_to_cpu(nsindex[i]->myoff)
				!= i * sizeof_namespace_index(ndd)) {
			dev_dbg(dev, "%s: nsindex%d myoff: %#llx invalid\n",
					__func__, i, (unsigned long long)
					__le64_to_cpu(nsindex[i]->myoff));
			continue;
		}
		if (__le64_to_cpu(nsindex[i]->otheroff)
				!= (!i) * sizeof_namespace_index(ndd)) {
			dev_dbg(dev, "%s: nsindex%d otheroff: %#llx invalid\n",
					__func__, i, (unsigned long long)
					__le64_to_cpu(nsindex[i]->otheroff));
			continue;
		}

		size = __le64_to_cpu(nsindex[i]->mysize);
		if (size > sizeof_namespace_index(ndd)
				|| size < sizeof(struct nd_namespace_index)) {
			dev_dbg(dev, "%s: nsindex%d mysize: %#llx invalid\n",
					__func__, i, size);
			continue;
		}

		/* two indexes plus nslot labels must fit the config area */
		nslot = __le32_to_cpu(nsindex[i]->nslot);
		if (nslot * sizeof(struct nd_namespace_label)
				+ 2 * sizeof_namespace_index(ndd)
				> ndd->nsarea.config_size) {
			dev_dbg(dev, "%s: nsindex%d nslot: %u invalid, config_size: %#x\n",
					__func__, i, nslot,
					ndd->nsarea.config_size);
			continue;
		}
		valid[i] = true;
		num_valid++;
	}

	switch (num_valid) {
	case 0:
		break;
	case 1:
		for (i = 0; i < num_index; i++)
			if (valid[i])
				return i;
		/* can't have num_valid > 0 but valid[] = { false, false } */
		WARN_ON(1);
		break;
	default:
		/* pick the best index... */
		seq = best_seq(__le32_to_cpu(nsindex[0]->seq),
				__le32_to_cpu(nsindex[1]->seq));
		if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK))
			return 1;
		else
			return 0;
		break;
	}

	return -1;
}
191
/* copy one full index block; silently skip when either end is missing */
void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
		struct nd_namespace_index *src)
{
	if (!dst || !src)
		return;

	memcpy(dst, src, sizeof_namespace_index(ndd));
}
202
/* labels start immediately after the two namespace index blocks */
static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd)
{
	void *base = to_namespace_index(ndd, 0);

	return base + 2 * sizeof_namespace_index(ndd);
}

/* convert a label pointer back to its slot number */
static int to_slot(struct nvdimm_drvdata *ndd,
		struct nd_namespace_label *nd_label)
{
	return nd_label - nd_label_base(ndd);
}

/* iterate clear bits (i.e. in-use label slots) of a little-endian bitmap */
#define for_each_clear_bit_le(bit, addr, size) \
	for ((bit) = find_next_zero_bit_le((addr), (size), 0);  \
	     (bit) < (size);                                    \
	     (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1))
220
/**
 * preamble_index - common variable initialization for nd_label_* routines
 * @ndd: dimm container for the relevant label set
 * @idx: namespace_index index
 * @nsindex_out: on return set to the currently active namespace index
 * @free: on return set to the free label bitmap in the index
 * @nslot: on return set to the number of slots in the label space
 *
 * Returns false when the requested index block is not available (e.g.
 * no valid label area was discovered at probe time).
 */
static bool preamble_index(struct nvdimm_drvdata *ndd, int idx,
		struct nd_namespace_index **nsindex_out,
		unsigned long **free, u32 *nslot)
{
	struct nd_namespace_index *nsindex;

	nsindex = to_namespace_index(ndd, idx);
	if (nsindex == NULL)
		return false;

	*free = (unsigned long *) nsindex->free;
	*nslot = __le32_to_cpu(nsindex->nslot);
	*nsindex_out = nsindex;

	return true;
}

/* format "pmem-<uuid>" or "blk-<uuid>" into @label_id */
char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags)
{
	if (!label_id || !uuid)
		return NULL;
	snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb",
			flags & NSLABEL_FLAG_LOCAL ? "blk" : "pmem", uuid);
	return label_id->id;
}
254
/* preamble against the currently-active index block */
static bool preamble_current(struct nvdimm_drvdata *ndd,
		struct nd_namespace_index **nsindex,
		unsigned long **free, u32 *nslot)
{
	return preamble_index(ndd, ndd->ns_current, nsindex,
			free, nslot);
}

/* preamble against the index block that will be written next */
static bool preamble_next(struct nvdimm_drvdata *ndd,
		struct nd_namespace_index **nsindex,
		unsigned long **free, u32 *nslot)
{
	return preamble_index(ndd, ndd->ns_next, nsindex,
			free, nslot);
}

static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot)
{
	/* check that we are written where we expect to be written */
	if (slot != __le32_to_cpu(nd_label->slot))
		return false;

	/* check that DPA allocations are page aligned */
	if ((__le64_to_cpu(nd_label->dpa)
				| __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
		return false;

	return true;
}
284
/*
 * nd_label_reserve_dpa - claim dpa for every active label on this dimm
 *
 * Walk the active (clear) slots of the current index and insert a
 * matching reservation into the dimm's dpa resource tree so later
 * allocations cannot collide with existing namespaces.
 */
int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
{
	struct nd_namespace_index *nsindex;
	unsigned long *free;
	u32 nslot, slot;

	if (!preamble_current(ndd, &nsindex, &free, &nslot))
		return 0; /* no label, nothing to reserve */

	for_each_clear_bit_le(slot, free, nslot) {
		struct nd_namespace_label *nd_label;
		struct nd_region *nd_region = NULL;
		u8 label_uuid[NSLABEL_UUID_LEN];
		struct nd_label_id label_id;
		struct resource *res;
		u32 flags;

		nd_label = nd_label_base(ndd) + slot;

		if (!slot_valid(nd_label, slot))
			continue;

		memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
		flags = __le32_to_cpu(nd_label->flags);
		nd_label_gen_id(&label_id, label_uuid, flags);
		res = nvdimm_allocate_dpa(ndd, &label_id,
				__le64_to_cpu(nd_label->dpa),
				__le64_to_cpu(nd_label->rawsize));
		nd_dbg_dpa(nd_region, ndd, res, "reserve\n");
		if (!res)
			return -EBUSY;
	}

	return 0;
}
320
/*
 * nd_label_active_count() - count valid labels in the current index
 * @ndd: dimm driver-data holding the cached label area
 *
 * A slot that is marked in-use in the bitmap but fails slot_valid() is
 * only logged and skipped, not repaired here.
 *
 * Returns the number of valid active labels, or 0 when no current
 * index exists.
 */
int nd_label_active_count(struct nvdimm_drvdata *ndd)
{
	struct nd_namespace_index *nsindex;
	unsigned long *free;
	u32 nslot, slot;
	int count = 0;

	if (!preamble_current(ndd, &nsindex, &free, &nslot))
		return 0;

	for_each_clear_bit_le(slot, free, nslot) {
		struct nd_namespace_label *nd_label;

		nd_label = nd_label_base(ndd) + slot;

		if (!slot_valid(nd_label, slot)) {
			u32 label_slot = __le32_to_cpu(nd_label->slot);
			u64 size = __le64_to_cpu(nd_label->rawsize);
			u64 dpa = __le64_to_cpu(nd_label->dpa);

			dev_dbg(ndd->dev,
				"%s: slot%d invalid slot: %d dpa: %llx size: %llx\n",
					__func__, slot, label_slot, dpa, size);
			continue;
		}
		count++;
	}
	return count;
}
350
351struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n)
352{
353 struct nd_namespace_index *nsindex;
354 unsigned long *free;
355 u32 nslot, slot;
356
357 if (!preamble_current(ndd, &nsindex, &free, &nslot))
358 return NULL;
359
360 for_each_clear_bit_le(slot, free, nslot) {
361 struct nd_namespace_label *nd_label;
362
363 nd_label = nd_label_base(ndd) + slot;
364 if (!slot_valid(nd_label, slot))
365 continue;
366
367 if (n-- == 0)
368 return nd_label_base(ndd) + slot;
369 }
370
371 return NULL;
372}
373
374u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd)
375{
376 struct nd_namespace_index *nsindex;
377 unsigned long *free;
378 u32 nslot, slot;
379
380 if (!preamble_next(ndd, &nsindex, &free, &nslot))
381 return UINT_MAX;
382
383 WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
384
385 slot = find_next_bit_le(free, nslot, 0);
386 if (slot == nslot)
387 return UINT_MAX;
388
389 clear_bit_le(slot, free);
390
391 return slot;
392}
393
394bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot)
395{
396 struct nd_namespace_index *nsindex;
397 unsigned long *free;
398 u32 nslot;
399
400 if (!preamble_next(ndd, &nsindex, &free, &nslot))
401 return false;
402
403 WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
404
405 if (slot < nslot)
406 return !test_and_set_bit_le(slot, free);
407 return false;
408}
409
410u32 nd_label_nfree(struct nvdimm_drvdata *ndd)
411{
412 struct nd_namespace_index *nsindex;
413 unsigned long *free;
414 u32 nslot;
415
416 WARN_ON(!is_nvdimm_bus_locked(ndd->dev));
417
418 if (!preamble_next(ndd, &nsindex, &free, &nslot))
419 return nvdimm_num_label_slots(ndd);
420
421 return bitmap_weight(free, nslot);
422}
423
/*
 * nd_label_write_index() - format, checksum, and persist an index block
 * @ndd: dimm driver-data holding the cached label area
 * @index: which of the two index blocks (0 or 1) to write
 * @seq: sequence number to stamp into the block
 * @flags: ND_NSINDEX_INIT to (re)initialize the free-slot bitmap
 *
 * All offsets written into the block are relative to the start of the
 * label area. On a non-INIT write the block being written must be the
 * staged 'next' index; after a successful write the roles rotate so it
 * becomes 'current'.
 *
 * Returns 0 on success or the negative error from
 * nvdimm_set_config_data().
 */
static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq,
		unsigned long flags)
{
	struct nd_namespace_index *nsindex;
	unsigned long offset;
	u64 checksum;
	u32 nslot;
	int rc;

	nsindex = to_namespace_index(ndd, index);
	if (flags & ND_NSINDEX_INIT)
		nslot = nvdimm_num_label_slots(ndd);
	else
		nslot = __le32_to_cpu(nsindex->nslot);

	memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN);
	nsindex->flags = __cpu_to_le32(0);
	nsindex->seq = __cpu_to_le32(seq);
	offset = (unsigned long) nsindex
		- (unsigned long) to_namespace_index(ndd, 0);
	nsindex->myoff = __cpu_to_le64(offset);
	nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd));
	offset = (unsigned long) to_namespace_index(ndd,
			nd_label_next_nsindex(index))
		- (unsigned long) to_namespace_index(ndd, 0);
	nsindex->otheroff = __cpu_to_le64(offset);
	offset = (unsigned long) nd_label_base(ndd)
		- (unsigned long) to_namespace_index(ndd, 0);
	nsindex->labeloff = __cpu_to_le64(offset);
	nsindex->nslot = __cpu_to_le32(nslot);
	nsindex->major = __cpu_to_le16(1);
	nsindex->minor = __cpu_to_le16(1);
	/* checksum field must be zero while the fletcher64 is computed */
	nsindex->checksum = __cpu_to_le64(0);
	if (flags & ND_NSINDEX_INIT) {
		unsigned long *free = (unsigned long *) nsindex->free;
		u32 nfree = ALIGN(nslot, BITS_PER_LONG);
		int last_bits, i;

		/* mark all slots free, then clear the pad bits past nslot */
		memset(nsindex->free, 0xff, nfree / 8);
		for (i = 0, last_bits = nfree - nslot; i < last_bits; i++)
			clear_bit_le(nslot + i, free);
	}
	checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1);
	nsindex->checksum = __cpu_to_le64(checksum);
	rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff),
			nsindex, sizeof_namespace_index(ndd));
	if (rc < 0)
		return rc;

	if (flags & ND_NSINDEX_INIT)
		return 0;

	/* copy the index we just wrote to the new 'next' */
	WARN_ON(index != ndd->ns_next);
	nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex);
	ndd->ns_current = nd_label_next_nsindex(ndd->ns_current);
	ndd->ns_next = nd_label_next_nsindex(ndd->ns_next);
	WARN_ON(ndd->ns_current == ndd->ns_next);

	return 0;
}
485
/* Byte offset of @nd_label from the start of the label area. */
static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd,
		struct nd_namespace_label *nd_label)
{
	unsigned long base = (unsigned long) to_namespace_index(ndd, 0);

	return (unsigned long) nd_label - base;
}
492
/*
 * __pmem_label_update() - write one dimm's label for a pmem namespace
 * @nd_region: parent region supplying the interleave-set cookie
 * @nd_mapping: the dimm mapping being updated
 * @nspm: pmem namespace carrying uuid/alt_name/size
 * @pos: this dimm's position within the interleave set
 *
 * Sequence: allocate a slot in the staging index, populate and write
 * the label, garbage-collect the prior label's slot, then commit by
 * writing the rotated index. The in-memory labels[0] pointer is only
 * updated after the index write succeeds.
 */
static int __pmem_label_update(struct nd_region *nd_region,
		struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm,
		int pos)
{
	u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize;
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	struct nd_namespace_label *victim_label;
	struct nd_namespace_label *nd_label;
	struct nd_namespace_index *nsindex;
	unsigned long *free;
	u32 nslot, slot;
	size_t offset;
	int rc;

	if (!preamble_next(ndd, &nsindex, &free, &nslot))
		return -ENXIO;

	/* allocate and write the label to the staging (next) index */
	slot = nd_label_alloc_slot(ndd);
	if (slot == UINT_MAX)
		return -ENXIO;
	dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot);

	nd_label = nd_label_base(ndd) + slot;
	memset(nd_label, 0, sizeof(struct nd_namespace_label));
	memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN);
	if (nspm->alt_name)
		memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN);
	/* UPDATING flag is visible until the index commit below lands */
	nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_UPDATING);
	nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings);
	nd_label->position = __cpu_to_le16(pos);
	nd_label->isetcookie = __cpu_to_le64(cookie);
	/* the namespace size is split evenly across the mappings */
	rawsize = div_u64(resource_size(&nspm->nsio.res),
			nd_region->ndr_mappings);
	nd_label->rawsize = __cpu_to_le64(rawsize);
	nd_label->dpa = __cpu_to_le64(nd_mapping->start);
	nd_label->slot = __cpu_to_le32(slot);

	/* update label */
	offset = nd_label_offset(ndd, nd_label);
	rc = nvdimm_set_config_data(ndd, offset, nd_label,
			sizeof(struct nd_namespace_label));
	if (rc < 0)
		return rc;

	/* Garbage collect the previous label */
	victim_label = nd_mapping->labels[0];
	if (victim_label) {
		slot = to_slot(ndd, victim_label);
		nd_label_free_slot(ndd, slot);
		dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
	}

	/* update index */
	rc = nd_label_write_index(ndd, ndd->ns_next,
			nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
	if (rc < 0)
		return rc;

	nd_mapping->labels[0] = nd_label;

	return 0;
}
556
/*
 * del_label() - drop entry @l from the NULL-terminated labels array
 * @nd_mapping: mapping whose in-memory label list is edited
 * @l: index of the entry to remove
 *
 * Shifts the remaining entries down by one. Only the tracking array is
 * touched; the on-dimm slot is not freed here (@slot is computed solely
 * for the debug message).
 */
static void del_label(struct nd_mapping *nd_mapping, int l)
{
	struct nd_namespace_label *next_label, *nd_label;
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	unsigned int slot;
	int j;

	nd_label = nd_mapping->labels[l];
	slot = to_slot(ndd, nd_label);
	dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot);

	for (j = l; (next_label = nd_mapping->labels[j + 1]); j++)
		nd_mapping->labels[j] = next_label;
	nd_mapping->labels[j] = NULL;
}
572
573static bool is_old_resource(struct resource *res, struct resource **list, int n)
574{
575 int i;
576
577 if (res->flags & DPA_RESOURCE_ADJUSTED)
578 return false;
579 for (i = 0; i < n; i++)
580 if (res == list[i])
581 return true;
582 return false;
583}
584
585static struct resource *to_resource(struct nvdimm_drvdata *ndd,
586 struct nd_namespace_label *nd_label)
587{
588 struct resource *res;
589
590 for_each_dpa_resource(ndd, res) {
591 if (res->start != __le64_to_cpu(nd_label->dpa))
592 continue;
593 if (resource_size(res) != __le64_to_cpu(nd_label->rawsize))
594 continue;
595 return res;
596 }
597
598 return NULL;
599}
600
601/*
602 * 1/ Account all the labels that can be freed after this update
603 * 2/ Allocate and write the label to the staging (next) index
604 * 3/ Record the resources in the namespace device
605 */
/*
 * __blk_label_update() - rewrite the label set for a blk namespace
 * @nd_region: parent region
 * @nd_mapping: the single dimm mapping backing this blk namespace
 * @nsblk: namespace whose dpa resources drive the new label set
 * @num_labels: capacity of nd_mapping->labels (from init_labels())
 *
 * Success path ends at 'out' with rc == 0; any failure after tracking
 * structures are mutated goes through 'abort', which restores the
 * staged index from 'current' and puts back the old resource list.
 */
static int __blk_label_update(struct nd_region *nd_region,
		struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk,
		int num_labels)
{
	int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO;
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	struct nd_namespace_label *nd_label;
	struct nd_namespace_index *nsindex;
	unsigned long *free, *victim_map = NULL;
	struct resource *res, **old_res_list;
	struct nd_label_id label_id;
	u8 uuid[NSLABEL_UUID_LEN];
	u32 nslot, slot;

	if (!preamble_next(ndd, &nsindex, &free, &nslot))
		return -ENXIO;

	old_res_list = nsblk->res;
	nfree = nd_label_nfree(ndd);
	old_num_resources = nsblk->num_resources;
	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);

	/*
	 * We need to loop over the old resources a few times, which seems a
	 * bit inefficient, but we need to know that we have the label
	 * space before we start mutating the tracking structures.
	 * Otherwise the recovery method of last resort for userspace is
	 * disable and re-enable the parent region.
	 */
	/* count new resources that will need freshly written labels */
	alloc = 0;
	for_each_dpa_resource(ndd, res) {
		if (strcmp(res->name, label_id.id) != 0)
			continue;
		if (!is_old_resource(res, old_res_list, old_num_resources))
			alloc++;
	}

	victims = 0;
	if (old_num_resources) {
		/* convert old local-label-map to dimm-slot victim-map */
		victim_map = kcalloc(BITS_TO_LONGS(nslot), sizeof(long),
				GFP_KERNEL);
		if (!victim_map)
			return -ENOMEM;

		/* mark unused labels for garbage collection */
		for_each_clear_bit_le(slot, free, nslot) {
			nd_label = nd_label_base(ndd) + slot;
			memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
			if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
				continue;
			res = to_resource(ndd, nd_label);
			if (res && is_old_resource(res, old_res_list,
					old_num_resources))
				continue;
			slot = to_slot(ndd, nd_label);
			set_bit(slot, victim_map);
			victims++;
		}
	}

	/* don't allow updates that consume the last label */
	if (nfree - alloc < 0 || nfree - alloc + victims < 1) {
		dev_info(&nsblk->common.dev, "insufficient label space\n");
		kfree(victim_map);
		return -ENOSPC;
	}
	/* from here on we need to abort on error */


	/* assign all resources to the namespace before writing the labels */
	nsblk->res = NULL;
	nsblk->num_resources = 0;
	for_each_dpa_resource(ndd, res) {
		if (strcmp(res->name, label_id.id) != 0)
			continue;
		if (!nsblk_add_resource(nd_region, ndd, nsblk, res->start)) {
			rc = -ENOMEM;
			goto abort;
		}
	}

	/* write a label for every resource that is not a carry-over */
	for (i = 0; i < nsblk->num_resources; i++) {
		size_t offset;

		res = nsblk->res[i];
		if (is_old_resource(res, old_res_list, old_num_resources))
			continue; /* carry-over */
		slot = nd_label_alloc_slot(ndd);
		if (slot == UINT_MAX)
			goto abort;
		dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot);

		nd_label = nd_label_base(ndd) + slot;
		memset(nd_label, 0, sizeof(struct nd_namespace_label));
		memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN);
		if (nsblk->alt_name)
			memcpy(nd_label->name, nsblk->alt_name,
					NSLABEL_NAME_LEN);
		nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL);
		nd_label->nlabel = __cpu_to_le16(0); /* N/A */
		nd_label->position = __cpu_to_le16(0); /* N/A */
		nd_label->isetcookie = __cpu_to_le64(0); /* N/A */
		nd_label->dpa = __cpu_to_le64(res->start);
		nd_label->rawsize = __cpu_to_le64(resource_size(res));
		nd_label->lbasize = __cpu_to_le64(nsblk->lbasize);
		nd_label->slot = __cpu_to_le32(slot);

		/* update label */
		offset = nd_label_offset(ndd, nd_label);
		rc = nvdimm_set_config_data(ndd, offset, nd_label,
				sizeof(struct nd_namespace_label));
		if (rc < 0)
			goto abort;
	}

	/* free up now unused slots in the new index */
	for_each_set_bit(slot, victim_map, victim_map ? nslot : 0) {
		dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
		nd_label_free_slot(ndd, slot);
	}

	/* update index */
	rc = nd_label_write_index(ndd, ndd->ns_next,
			nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
	if (rc)
		goto abort;

	/*
	 * Now that the on-dimm labels are up to date, fix up the tracking
	 * entries in nd_mapping->labels
	 */
	nlabel = 0;
	for_each_label(l, nd_label, nd_mapping->labels) {
		nlabel++;
		memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
		if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
			continue;
		nlabel--;
		del_label(nd_mapping, l);
		l--; /* retry with the new label at this index */
	}
	if (nlabel + nsblk->num_resources > num_labels) {
		/*
		 * Bug, we can't end up with more resources than
		 * available labels
		 */
		WARN_ON_ONCE(1);
		rc = -ENXIO;
		goto out;
	}

	/* append every active label for this uuid after the survivors */
	for_each_clear_bit_le(slot, free, nslot) {
		nd_label = nd_label_base(ndd) + slot;
		memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
		if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
			continue;
		res = to_resource(ndd, nd_label);
		res->flags &= ~DPA_RESOURCE_ADJUSTED;
		dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n",
				l, slot);
		nd_mapping->labels[l++] = nd_label;
	}
	nd_mapping->labels[l] = NULL;

 out:
	kfree(old_res_list);
	kfree(victim_map);
	return rc;

 abort:
	/*
	 * 1/ repair the allocated label bitmap in the index
	 * 2/ restore the resource list
	 */
	nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd));
	kfree(nsblk->res);
	nsblk->res = old_res_list;
	nsblk->num_resources = old_num_resources;
	old_res_list = NULL;
	goto out;
}
788
/*
 * init_labels() - size the in-memory label array and seed empty indexes
 * @nd_mapping: mapping whose labels array is (re)allocated
 * @num_labels: number of label pointers the caller will need
 *
 * Grows nd_mapping->labels to hold @num_labels entries plus the NULL
 * terminator, keeping any existing entries so they can be garbage
 * collected later. If the dimm has no valid index blocks yet, both are
 * written from scratch with ND_NSINDEX_INIT.
 *
 * Returns max(num_labels, old_num_labels) on success, negative errno
 * on allocation or index-write failure.
 */
static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
{
	int i, l, old_num_labels = 0;
	struct nd_namespace_index *nsindex;
	struct nd_namespace_label *nd_label;
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	/* +1 slot for the NULL terminator */
	size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *);

	for_each_label(l, nd_label, nd_mapping->labels)
		old_num_labels++;

	/*
	 * We need to preserve all the old labels for the mapping so
	 * they can be garbage collected after writing the new labels.
	 */
	if (num_labels > old_num_labels) {
		struct nd_namespace_label **labels;

		labels = krealloc(nd_mapping->labels, size, GFP_KERNEL);
		if (!labels)
			return -ENOMEM;
		nd_mapping->labels = labels;
	}
	if (!nd_mapping->labels)
		return -ENOMEM;

	for (i = old_num_labels; i <= num_labels; i++)
		nd_mapping->labels[i] = NULL;

	/* valid index blocks already present? nothing more to initialize */
	if (ndd->ns_current == -1 || ndd->ns_next == -1)
		/* pass */;
	else
		return max(num_labels, old_num_labels);

	nsindex = to_namespace_index(ndd, 0);
	memset(nsindex, 0, ndd->nsarea.config_size);
	for (i = 0; i < 2; i++) {
		/* seq 0 and 2 so index 1 starts as the older block */
		int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT);

		if (rc)
			return rc;
	}
	ndd->ns_next = 1;
	ndd->ns_current = 0;

	return max(num_labels, old_num_labels);
}
836
/*
 * del_labels() - retire every label matching @uuid on this mapping
 * @nd_mapping: the dimm mapping to scrub
 * @uuid: namespace uuid whose labels are deleted (NULL is a no-op)
 *
 * Frees the matching slots in the staging index, removes the tracking
 * entries, and commits with a single index write at the end.
 *
 * Returns 0 when there is nothing to do, otherwise the result of
 * nd_label_write_index().
 */
static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid)
{
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	struct nd_namespace_label *nd_label;
	struct nd_namespace_index *nsindex;
	u8 label_uuid[NSLABEL_UUID_LEN];
	int l, num_freed = 0;
	unsigned long *free;
	u32 nslot, slot;

	if (!uuid)
		return 0;

	/* no index || no labels == nothing to delete */
	if (!preamble_next(ndd, &nsindex, &free, &nslot)
			|| !nd_mapping->labels)
		return 0;

	for_each_label(l, nd_label, nd_mapping->labels) {
		memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN);
		if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0)
			continue;
		slot = to_slot(ndd, nd_label);
		nd_label_free_slot(ndd, slot);
		dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
		del_label(nd_mapping, l);
		num_freed++;
		l--; /* retry with new label at this index */
	}

	if (num_freed > l) {
		/*
		 * num_freed will only ever be > l when we delete the last
		 * label
		 */
		kfree(nd_mapping->labels);
		nd_mapping->labels = NULL;
		dev_dbg(ndd->dev, "%s: no more labels\n", __func__);
	}

	return nd_label_write_index(ndd, ndd->ns_next,
			nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
}
880
881int nd_pmem_namespace_label_update(struct nd_region *nd_region,
882 struct nd_namespace_pmem *nspm, resource_size_t size)
883{
884 int i;
885
886 for (i = 0; i < nd_region->ndr_mappings; i++) {
887 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
888 int rc;
889
890 if (size == 0) {
891 rc = del_labels(nd_mapping, nspm->uuid);
892 if (rc)
893 return rc;
894 continue;
895 }
896
897 rc = init_labels(nd_mapping, 1);
898 if (rc < 0)
899 return rc;
900
901 rc = __pmem_label_update(nd_region, nd_mapping, nspm, i);
902 if (rc)
903 return rc;
904 }
905
906 return 0;
907}
908
909int nd_blk_namespace_label_update(struct nd_region *nd_region,
910 struct nd_namespace_blk *nsblk, resource_size_t size)
911{
912 struct nd_mapping *nd_mapping = &nd_region->mapping[0];
913 struct resource *res;
914 int count = 0;
915
916 if (size == 0)
917 return del_labels(nd_mapping, nsblk->uuid);
918
919 for_each_dpa_resource(to_ndd(nd_mapping), res)
920 count++;
921
922 count = init_labels(nd_mapping, count);
923 if (count < 0)
924 return count;
925
926 return __blk_label_update(nd_region, nd_mapping, nsblk, count);
927}
diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h
new file mode 100644
index 000000000000..a59ef6eef2a3
--- /dev/null
+++ b/drivers/nvdimm/label.h
@@ -0,0 +1,141 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#ifndef __LABEL_H__
14#define __LABEL_H__
15
16#include <linux/ndctl.h>
17#include <linux/sizes.h>
18#include <linux/io.h>
19
/* constants for the on-media namespace-label and BTT formats */
enum {
	NSINDEX_SIG_LEN = 16,
	NSINDEX_ALIGN = 256,
	NSINDEX_SEQ_MASK = 0x3,
	NSLABEL_UUID_LEN = 16,
	NSLABEL_NAME_LEN = 64,
	NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */
	NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */
	NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */
	NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */
	BTT_ALIGN = 4096, /* all btt structures */
	BTTINFO_SIG_LEN = 16,
	BTTINFO_UUID_LEN = 16,
	BTTINFO_FLAG_ERROR = 0x1, /* error state (read-only) */
	BTTINFO_MAJOR_VERSION = 1,
	ND_LABEL_MIN_SIZE = 512 * 129, /* see sizeof_namespace_index() */
	ND_LABEL_ID_SIZE = 50,
	ND_NSINDEX_INIT = 0x1,
};
39
40static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0";
41
/**
 * struct nd_namespace_index - label set superblock
 * @sig: NAMESPACE_INDEX\0
 * @flags: placeholder
 * @seq: sequence number for this index
 * @myoff: offset of this index in label area
 * @mysize: size of this index struct
 * @otheroff: offset of other index
 * @labeloff: offset of first label slot
 * @nslot: total number of label slots
 * @major: label area major version
 * @minor: label area minor version
 * @checksum: fletcher64 of all fields
 * @free[0]: bitmap, nslot bits
 *
 * The size of free[] is rounded up so the total struct size is a
 * multiple of NSINDEX_ALIGN bytes. Any bits this allocates beyond
 * nslot bits must be zero.
 */
struct nd_namespace_index {
	u8 sig[NSINDEX_SIG_LEN];
	__le32 flags;
	__le32 seq;
	__le64 myoff;
	__le64 mysize;
	__le64 otheroff;
	__le64 labeloff;
	__le32 nslot;
	__le16 major;
	__le16 minor;
	__le64 checksum;
	u8 free[0];
};
75
/**
 * struct nd_namespace_label - namespace superblock
 * @uuid: UUID per RFC 4122
 * @name: optional name (NULL-terminated)
 * @flags: see NSLABEL_FLAG_*
 * @nlabel: num labels to describe this ns
 * @position: labels position in set
 * @isetcookie: interleave set cookie
 * @lbasize: LBA size in bytes or 0 for pmem
 * @dpa: DPA of NVM range on this DIMM
 * @rawsize: size of namespace
 * @slot: slot of this label in label area
 * @unused: must be zero
 *
 * This is the on-media layout; all multi-byte fields are stored
 * little-endian (__le*) and converted at each access.
 */
struct nd_namespace_label {
	u8 uuid[NSLABEL_UUID_LEN];
	u8 name[NSLABEL_NAME_LEN];
	__le32 flags;
	__le16 nlabel;
	__le16 position;
	__le64 isetcookie;
	__le64 lbasize;
	__le64 dpa;
	__le64 rawsize;
	__le32 slot;
	__le32 unused;
};
103
/**
 * struct nd_label_id - identifier string for dpa allocation
 * @id: "{blk|pmem}-<namespace uuid>"
 *
 * Fixed-size buffer; ND_LABEL_ID_SIZE covers the prefix, dash, and a
 * textual uuid plus the terminating NUL.
 */
struct nd_label_id {
	char id[ND_LABEL_ID_SIZE];
};
111
/*
 * Toggle between the two index blocks: 0 -> 1 -> 0. An invalid
 * (negative) index stays invalid.
 */
static inline int nd_label_next_nsindex(int index)
{
	return index < 0 ? -1 : (index + 1) % 2;
}
123
124struct nvdimm_drvdata;
125int nd_label_validate(struct nvdimm_drvdata *ndd);
126void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
127 struct nd_namespace_index *src);
128size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd);
129int nd_label_active_count(struct nvdimm_drvdata *ndd);
130struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n);
131u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd);
132bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot);
133u32 nd_label_nfree(struct nvdimm_drvdata *ndd);
134struct nd_region;
135struct nd_namespace_pmem;
136struct nd_namespace_blk;
137int nd_pmem_namespace_label_update(struct nd_region *nd_region,
138 struct nd_namespace_pmem *nspm, resource_size_t size);
139int nd_blk_namespace_label_update(struct nd_region *nd_region,
140 struct nd_namespace_blk *nsblk, resource_size_t size);
141#endif /* __LABEL_H__ */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
new file mode 100644
index 000000000000..fef0dd80d4ad
--- /dev/null
+++ b/drivers/nvdimm/namespace_devs.c
@@ -0,0 +1,1870 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/module.h>
14#include <linux/device.h>
15#include <linux/slab.h>
16#include <linux/nd.h>
17#include "nd-core.h"
18#include "nd.h"
19
/* device_type release handler: free the io namespace object */
static void namespace_io_release(struct device *dev)
{
	kfree(to_nd_namespace_io(dev));
}
26
27static void namespace_pmem_release(struct device *dev)
28{
29 struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
30
31 kfree(nspm->alt_name);
32 kfree(nspm->uuid);
33 kfree(nspm);
34}
35
/* device_type release handler: free the blk namespace and its ida slot */
static void namespace_blk_release(struct device *dev)
{
	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
	struct nd_region *nd_region = to_nd_region(dev->parent);

	/* id < 0 means the region ida slot was never claimed */
	if (nsblk->id >= 0)
		ida_simple_remove(&nd_region->ns_ida, nsblk->id);
	kfree(nsblk->alt_name);
	kfree(nsblk->uuid);
	kfree(nsblk->res);
	kfree(nsblk);
}
48
/*
 * Per-flavor device types; ->type is also how is_namespace_*() below
 * discriminate a namespace device.
 */
static struct device_type namespace_io_device_type = {
	.name = "nd_namespace_io",
	.release = namespace_io_release,
};

static struct device_type namespace_pmem_device_type = {
	.name = "nd_namespace_pmem",
	.release = namespace_pmem_release,
};

static struct device_type namespace_blk_device_type = {
	.name = "nd_namespace_blk",
	.release = namespace_blk_release,
};
63
64static bool is_namespace_pmem(struct device *dev)
65{
66 return dev ? dev->type == &namespace_pmem_device_type : false;
67}
68
69static bool is_namespace_blk(struct device *dev)
70{
71 return dev ? dev->type == &namespace_blk_device_type : false;
72}
73
74static bool is_namespace_io(struct device *dev)
75{
76 return dev ? dev->type == &namespace_io_device_type : false;
77}
78
/*
 * nvdimm_namespace_disk_name() - format the block-device name for @ndns
 * @ndns: the namespace to name
 * @name: caller-supplied buffer the name is sprintf'd into
 *
 * "pmem<region>" for pmem/io namespaces, "ndblk<region>.<id>" for blk;
 * an "s" suffix is appended when a BTT has claimed the namespace.
 *
 * Returns @name, or NULL for an unrecognized namespace type.
 */
const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
		char *name)
{
	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
	const char *suffix = "";

	if (ndns->claim && is_nd_btt(ndns->claim))
		suffix = "s";

	if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev))
		sprintf(name, "pmem%d%s", nd_region->id, suffix);
	else if (is_namespace_blk(&ndns->dev)) {
		struct nd_namespace_blk *nsblk;

		nsblk = to_nd_namespace_blk(&ndns->dev);
		sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix);
	} else {
		return NULL;
	}

	return name;
}
101EXPORT_SYMBOL(nvdimm_namespace_disk_name);
102
/* sysfs 'nstype': numeric namespace type of the parent region */
static ssize_t nstype_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);

	return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
}
110static DEVICE_ATTR_RO(nstype);
111
/*
 * __alt_name_store() - set the namespace's optional alternate name
 * @dev: pmem or blk namespace device
 * @buf: raw sysfs input
 * @len: input length
 *
 * Trims whitespace and stores the result in a freshly allocated
 * NSLABEL_NAME_LEN buffer. Rejects busy (attached/claimed) namespaces
 * and names that do not fit in a label's name field.
 *
 * Returns @len on success or a negative errno.
 */
static ssize_t __alt_name_store(struct device *dev, const char *buf,
		const size_t len)
{
	char *input, *pos, *alt_name, **ns_altname;
	ssize_t rc;

	if (is_namespace_pmem(dev)) {
		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);

		ns_altname = &nspm->alt_name;
	} else if (is_namespace_blk(dev)) {
		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);

		ns_altname = &nsblk->alt_name;
	} else
		return -ENXIO;

	if (dev->driver || to_ndns(dev)->claim)
		return -EBUSY;

	/*
	 * NOTE(review): copies len + 1 bytes from @buf — assumes the
	 * sysfs input buffer extends one byte past @len; confirm against
	 * the sysfs store contract.
	 */
	input = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!input)
		return -ENOMEM;

	input[len] = '\0';
	pos = strim(input);
	if (strlen(pos) + 1 > NSLABEL_NAME_LEN) {
		rc = -EINVAL;
		goto out;
	}

	alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL);
	if (!alt_name) {
		rc = -ENOMEM;
		goto out;
	}
	kfree(*ns_altname);
	*ns_altname = alt_name;
	sprintf(*ns_altname, "%s", pos);
	rc = len;

out:
	kfree(input);
	return rc;
}
157
158static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
159{
160 struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
161 struct nd_mapping *nd_mapping = &nd_region->mapping[0];
162 struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
163 struct nd_label_id label_id;
164 resource_size_t size = 0;
165 struct resource *res;
166
167 if (!nsblk->uuid)
168 return 0;
169 nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
170 for_each_dpa_resource(ndd, res)
171 if (strcmp(res->name, label_id.id) == 0)
172 size += resource_size(res);
173 return size;
174}
175
/*
 * __nd_namespace_blk_validate() - is this blk namespace consistent?
 * @nsblk: namespace to check
 *
 * Validates, under the nvdimm bus lock, that the dimm's dpa resources
 * matching this namespace's label-id agree with nsblk->res: no pending
 * (unlabeled) adjustments, equal counts, and no stale entries in the
 * namespace's own list.
 */
static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
{
	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	struct nd_label_id label_id;
	struct resource *res;
	int count, i;

	if (!nsblk->uuid || !nsblk->lbasize || !ndd)
		return false;

	count = 0;
	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
	for_each_dpa_resource(ndd, res) {
		if (strcmp(res->name, label_id.id) != 0)
			continue;
		/*
		 * Resources with unacknoweldged adjustments indicate a
		 * failure to update labels
		 */
		if (res->flags & DPA_RESOURCE_ADJUSTED)
			return false;
		count++;
	}

	/* These values match after a successful label update */
	if (count != nsblk->num_resources)
		return false;

	for (i = 0; i < nsblk->num_resources; i++) {
		struct resource *found = NULL;

		for_each_dpa_resource(ndd, res)
			if (res == nsblk->res[i]) {
				found = res;
				break;
			}
		/* stale resource */
		if (!found)
			return false;
	}

	return true;
}
221
/* Locked wrapper around __nd_namespace_blk_validate(). */
resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
{
	resource_size_t size;

	nvdimm_bus_lock(&nsblk->common.dev);
	/*
	 * NOTE(review): __nd_namespace_blk_validate() returns bool, so
	 * 'size' here is 0 or 1, not a byte count, despite the
	 * resource_size_t return type — confirm callers only test
	 * truthiness, or return the validated size instead.
	 */
	size = __nd_namespace_blk_validate(nsblk);
	nvdimm_bus_unlock(&nsblk->common.dev);

	return size;
}
232EXPORT_SYMBOL(nd_namespace_blk_validate);
233
234
/*
 * nd_namespace_label_update() - flush a namespace's config to its labels
 * @nd_region: parent region
 * @dev: a pmem or blk namespace device
 *
 * No-op (returns 0) when the namespace is active/claimed or not yet
 * fully configured; otherwise dispatches to the pmem or blk label
 * writer. A zero size with a uuid present means "delete allocation".
 */
static int nd_namespace_label_update(struct nd_region *nd_region,
		struct device *dev)
{
	dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim,
			"namespace must be idle during label update\n");
	if (dev->driver || to_ndns(dev)->claim)
		return 0;

	/*
	 * Only allow label writes that will result in a valid namespace
	 * or deletion of an existing namespace.
	 */
	if (is_namespace_pmem(dev)) {
		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
		resource_size_t size = resource_size(&nspm->nsio.res);

		if (size == 0 && nspm->uuid)
			/* delete allocation */;
		else if (!nspm->uuid)
			return 0;

		return nd_pmem_namespace_label_update(nd_region, nspm, size);
	} else if (is_namespace_blk(dev)) {
		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
		resource_size_t size = nd_namespace_blk_size(nsblk);

		if (size == 0 && nsblk->uuid)
			/* delete allocation */;
		else if (!nsblk->uuid || !nsblk->lbasize)
			return 0;

		return nd_blk_namespace_label_update(nd_region, nsblk, size);
	} else
		return -ENXIO;
}
270
/*
 * sysfs 'alt_name' store: update the name and push it to the labels.
 * Lock order is device_lock -> nvdimm_bus_lock, with probe quiesced
 * before touching namespace state.
 */
static ssize_t alt_name_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	ssize_t rc;

	device_lock(dev);
	nvdimm_bus_lock(dev);
	wait_nvdimm_bus_probe_idle(dev);
	rc = __alt_name_store(dev, buf, len);
	if (rc >= 0)
		rc = nd_namespace_label_update(nd_region, dev);
	dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc);
	nvdimm_bus_unlock(dev);
	device_unlock(dev);

	return rc < 0 ? rc : len;
}
289
290static ssize_t alt_name_show(struct device *dev,
291 struct device_attribute *attr, char *buf)
292{
293 char *ns_altname;
294
295 if (is_namespace_pmem(dev)) {
296 struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
297
298 ns_altname = nspm->alt_name;
299 } else if (is_namespace_blk(dev)) {
300 struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
301
302 ns_altname = nsblk->alt_name;
303 } else
304 return -ENXIO;
305
306 return sprintf(buf, "%s\n", ns_altname ? ns_altname : "");
307}
308static DEVICE_ATTR_RW(alt_name);
309
/*
 * scan_free() - release @n bytes of @label_id's dpa on one dimm
 * @nd_region: region for debug context
 * @nd_mapping: dimm mapping whose allocations are shrunk
 * @label_id: identifies which namespace's resources to free
 * @n: bytes to release
 *
 * Repeatedly finds the last (highest) matching resource: deletes it
 * outright while @n still covers it, otherwise shrinks it in place and
 * marks it DPA_RESOURCE_ADJUSTED so the label update knows to rewrite
 * it. BLK shrinks from the low end to keep BLK at high DPA.
 */
static int scan_free(struct nd_region *nd_region,
		struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
		resource_size_t n)
{
	bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	int rc = 0;

	while (n) {
		struct resource *res, *last;
		resource_size_t new_start;

		/* find the last (highest-dpa) resource for this label-id */
		last = NULL;
		for_each_dpa_resource(ndd, res)
			if (strcmp(res->name, label_id->id) == 0)
				last = res;
		res = last;
		if (!res)
			return 0;

		if (n >= resource_size(res)) {
			n -= resource_size(res);
			nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc);
			nvdimm_free_dpa(ndd, res);
			/* retry with last resource deleted */
			continue;
		}

		/*
		 * Keep BLK allocations relegated to high DPA as much as
		 * possible
		 */
		if (is_blk)
			new_start = res->start + n;
		else
			new_start = res->start;

		rc = adjust_resource(res, new_start, resource_size(res) - n);
		if (rc == 0)
			res->flags |= DPA_RESOURCE_ADJUSTED;
		nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc);
		break;
	}

	return rc;
}
356
357/**
358 * shrink_dpa_allocation - for each dimm in region free n bytes for label_id
359 * @nd_region: the set of dimms to reclaim @n bytes from
360 * @label_id: unique identifier for the namespace consuming this dpa range
361 * @n: number of bytes per-dimm to release
362 *
363 * Assumes resources are ordered. Starting from the end try to
364 * adjust_resource() the allocation to @n, but if @n is larger than the
365 * allocation delete it and find the 'new' last allocation in the label
366 * set.
367 */
368static int shrink_dpa_allocation(struct nd_region *nd_region,
369 struct nd_label_id *label_id, resource_size_t n)
370{
371 int i;
372
373 for (i = 0; i < nd_region->ndr_mappings; i++) {
374 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
375 int rc;
376
377 rc = scan_free(nd_region, nd_mapping, label_id, n);
378 if (rc)
379 return rc;
380 }
381
382 return 0;
383}
384
385static resource_size_t init_dpa_allocation(struct nd_label_id *label_id,
386 struct nd_region *nd_region, struct nd_mapping *nd_mapping,
387 resource_size_t n)
388{
389 bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
390 struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
391 resource_size_t first_dpa;
392 struct resource *res;
393 int rc = 0;
394
395 /* allocate blk from highest dpa first */
396 if (is_blk)
397 first_dpa = nd_mapping->start + nd_mapping->size - n;
398 else
399 first_dpa = nd_mapping->start;
400
401 /* first resource allocation for this label-id or dimm */
402 res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n);
403 if (!res)
404 rc = -EBUSY;
405
406 nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc);
407 return rc ? n : 0;
408}
409
410static bool space_valid(bool is_pmem, bool is_reserve,
411 struct nd_label_id *label_id, struct resource *res)
412{
413 /*
414 * For BLK-space any space is valid, for PMEM-space, it must be
415 * contiguous with an existing allocation unless we are
416 * reserving pmem.
417 */
418 if (is_reserve || !is_pmem)
419 return true;
420 if (!res || strcmp(res->name, label_id->id) == 0)
421 return true;
422 return false;
423}
424
/* where, relative to an existing resource, a candidate free range sits */
enum alloc_loc {
	ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER,
};
428
/*
 * scan_allocate() - satisfy (up to) @n bytes of allocation for @label_id
 *
 * Walk this dimm's DPA resources looking for free space before,
 * between, and after existing allocations.  Outside of a
 * "pmem-reserve" pass, PMEM must remain a single contiguous range, so
 * free space only qualifies when it abuts an allocation with the same
 * label-id (see space_valid()); existing allocations are then grown in
 * place.  New BLK allocations are placed at the high end of a free
 * range.  Whenever the resource list changes the scan restarts.
 *
 * Returns the number of bytes that could NOT be allocated (0 on full
 * success, @n when nothing was allocated).
 */
static resource_size_t scan_allocate(struct nd_region *nd_region,
		struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
		resource_size_t n)
{
	resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1;
	bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	const resource_size_t to_allocate = n;
	struct resource *res;
	int first;

 retry:
	first = 0;
	for_each_dpa_resource(ndd, res) {
		resource_size_t allocate, available = 0, free_start, free_end;
		struct resource *next = res->sibling, *new_res = NULL;
		enum alloc_loc loc = ALLOC_ERR;
		const char *action;
		int rc = 0;

		/* ignore resources outside this nd_mapping */
		if (res->start > mapping_end)
			continue;
		if (res->end < nd_mapping->start)
			continue;

		/* space at the beginning of the mapping */
		if (!first++ && res->start > nd_mapping->start) {
			free_start = nd_mapping->start;
			available = res->start - free_start;
			if (space_valid(is_pmem, is_reserve, label_id, NULL))
				loc = ALLOC_BEFORE;
		}

		/* space between allocations */
		if (!loc && next) {
			free_start = res->start + resource_size(res);
			free_end = min(mapping_end, next->start - 1);
			if (space_valid(is_pmem, is_reserve, label_id, res)
					&& free_start < free_end) {
				available = free_end + 1 - free_start;
				loc = ALLOC_MID;
			}
		}

		/* space at the end of the mapping */
		if (!loc && !next) {
			free_start = res->start + resource_size(res);
			free_end = mapping_end;
			if (space_valid(is_pmem, is_reserve, label_id, res)
					&& free_start < free_end) {
				available = free_end + 1 - free_start;
				loc = ALLOC_AFTER;
			}
		}

		if (!loc || !available)
			continue;
		allocate = min(available, n);
		switch (loc) {
		case ALLOC_BEFORE:
			if (strcmp(res->name, label_id->id) == 0) {
				/* adjust current resource up */
				if (is_pmem && !is_reserve)
					return n;
				rc = adjust_resource(res, res->start - allocate,
						resource_size(res) + allocate);
				action = "cur grow up";
			} else
				action = "allocate";
			break;
		case ALLOC_MID:
			if (strcmp(next->name, label_id->id) == 0) {
				/* adjust next resource up */
				if (is_pmem && !is_reserve)
					return n;
				rc = adjust_resource(next, next->start
						- allocate, resource_size(next)
						+ allocate);
				new_res = next;
				action = "next grow up";
			} else if (strcmp(res->name, label_id->id) == 0) {
				action = "grow down";
			} else
				action = "allocate";
			break;
		case ALLOC_AFTER:
			if (strcmp(res->name, label_id->id) == 0)
				action = "grow down";
			else
				action = "allocate";
			break;
		default:
			return n;
		}

		if (strcmp(action, "allocate") == 0) {
			/* BLK allocate bottom up */
			if (!is_pmem)
				free_start += available - allocate;
			else if (!is_reserve && free_start != nd_mapping->start)
				return n;

			new_res = nvdimm_allocate_dpa(ndd, label_id,
					free_start, allocate);
			if (!new_res)
				rc = -EBUSY;
		} else if (strcmp(action, "grow down") == 0) {
			/* adjust current resource down */
			rc = adjust_resource(res, res->start, resource_size(res)
					+ allocate);
			if (rc == 0)
				res->flags |= DPA_RESOURCE_ADJUSTED;
		}

		if (!new_res)
			new_res = res;

		nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n",
				action, loc, rc);

		if (rc)
			return n;

		n -= allocate;
		if (n) {
			/*
			 * Retry scan with newly inserted resources.
			 * For example, if we did an ALLOC_BEFORE
			 * insertion there may also have been space
			 * available for an ALLOC_AFTER insertion, so we
			 * need to check this same resource again
			 */
			goto retry;
		} else
			return 0;
	}

	/*
	 * If we allocated nothing in the BLK case it may be because we are in
	 * an initial "pmem-reserve pass". Only do an initial BLK allocation
	 * when none of the DPA space is reserved.
	 */
	if ((is_pmem || !ndd->dpa.child) && n == to_allocate)
		return init_dpa_allocation(label_id, nd_region, nd_mapping, n);
	return n;
}
577
/*
 * merge_dpa() - coalesce adjacent same-label BLK allocations on a dimm
 *
 * PMEM label-ids are skipped (their allocations are kept contiguous by
 * scan_allocate()).  Whenever a resource and its sibling both carry
 * @label_id and are DPA-adjacent, the sibling is deleted and the first
 * resource is extended to cover both; the scan restarts until no merge
 * candidates remain.
 */
static int merge_dpa(struct nd_region *nd_region,
		struct nd_mapping *nd_mapping, struct nd_label_id *label_id)
{
	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
	struct resource *res;

	if (strncmp("pmem", label_id->id, 4) == 0)
		return 0;
 retry:
	for_each_dpa_resource(ndd, res) {
		int rc;
		struct resource *next = res->sibling;
		resource_size_t end = res->start + resource_size(res);

		/* only merge two adjacent resources with matching names */
		if (!next || strcmp(res->name, label_id->id) != 0
				|| strcmp(next->name, label_id->id) != 0
				|| end != next->start)
			continue;
		end += resource_size(next);
		nvdimm_free_dpa(ndd, next);
		rc = adjust_resource(res, res->start, end - res->start);
		nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc);
		if (rc)
			return rc;
		res->flags |= DPA_RESOURCE_ADJUSTED;
		/* the resource list changed under us, rescan from the top */
		goto retry;
	}

	return 0;
}
608
609static int __reserve_free_pmem(struct device *dev, void *data)
610{
611 struct nvdimm *nvdimm = data;
612 struct nd_region *nd_region;
613 struct nd_label_id label_id;
614 int i;
615
616 if (!is_nd_pmem(dev))
617 return 0;
618
619 nd_region = to_nd_region(dev);
620 if (nd_region->ndr_mappings == 0)
621 return 0;
622
623 memset(&label_id, 0, sizeof(label_id));
624 strcat(label_id.id, "pmem-reserve");
625 for (i = 0; i < nd_region->ndr_mappings; i++) {
626 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
627 resource_size_t n, rem = 0;
628
629 if (nd_mapping->nvdimm != nvdimm)
630 continue;
631
632 n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem);
633 if (n == 0)
634 return 0;
635 rem = scan_allocate(nd_region, nd_mapping, &label_id, n);
636 dev_WARN_ONCE(&nd_region->dev, rem,
637 "pmem reserve underrun: %#llx of %#llx bytes\n",
638 (unsigned long long) n - rem,
639 (unsigned long long) n);
640 return rem ? -ENXIO : 0;
641 }
642
643 return 0;
644}
645
646static void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
647 struct nd_mapping *nd_mapping)
648{
649 struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
650 struct resource *res, *_res;
651
652 for_each_dpa_resource_safe(ndd, res, _res)
653 if (strcmp(res->name, "pmem-reserve") == 0)
654 nvdimm_free_dpa(ndd, res);
655}
656
657static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
658 struct nd_mapping *nd_mapping)
659{
660 struct nvdimm *nvdimm = nd_mapping->nvdimm;
661 int rc;
662
663 rc = device_for_each_child(&nvdimm_bus->dev, nvdimm,
664 __reserve_free_pmem);
665 if (rc)
666 release_free_pmem(nvdimm_bus, nd_mapping);
667 return rc;
668}
669
670/**
671 * grow_dpa_allocation - for each dimm allocate n bytes for @label_id
672 * @nd_region: the set of dimms to allocate @n more bytes from
673 * @label_id: unique identifier for the namespace consuming this dpa range
674 * @n: number of bytes per-dimm to add to the existing allocation
675 *
676 * Assumes resources are ordered. For BLK regions, first consume
677 * BLK-only available DPA free space, then consume PMEM-aliased DPA
678 * space starting at the highest DPA. For PMEM regions start
679 * allocations from the start of an interleave set and end at the first
680 * BLK allocation or the end of the interleave set, whichever comes
681 * first.
682 */
static int grow_dpa_allocation(struct nd_region *nd_region,
		struct nd_label_id *label_id, resource_size_t n)
{
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
	int i;

	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
		resource_size_t rem = n;
		int rc, j;

		/*
		 * In the BLK case try once with all unallocated PMEM
		 * reserved, and once without
		 */
		for (j = is_pmem; j < 2; j++) {
			/* j == 0 only for BLK: first pass fences off pmem */
			bool blk_only = j == 0;

			if (blk_only) {
				rc = reserve_free_pmem(nvdimm_bus, nd_mapping);
				if (rc)
					return rc;
			}
			rem = scan_allocate(nd_region, nd_mapping,
					label_id, rem);
			if (blk_only)
				release_free_pmem(nvdimm_bus, nd_mapping);

			/* try again and allow encroachments into PMEM */
			if (rem == 0)
				break;
		}

		dev_WARN_ONCE(&nd_region->dev, rem,
				"allocation underrun: %#llx of %#llx bytes\n",
				(unsigned long long) n - rem,
				(unsigned long long) n);
		if (rem)
			return -ENXIO;

		/* coalesce any now-adjacent extents for this label-id */
		rc = merge_dpa(nd_region, nd_mapping, label_id);
		if (rc)
			return rc;
	}

	return 0;
}
731
732static void nd_namespace_pmem_set_size(struct nd_region *nd_region,
733 struct nd_namespace_pmem *nspm, resource_size_t size)
734{
735 struct resource *res = &nspm->nsio.res;
736
737 res->start = nd_region->ndr_start;
738 res->end = nd_region->ndr_start + size - 1;
739}
740
/*
 * __size_store() - (re)size a pmem or blk namespace
 * @dev: namespace device
 * @val: requested total namespace size in bytes (0 deletes the allocation)
 *
 * Called with the device lock and nvdimm_bus lock held (see
 * size_store()).  @val must be (4K * ndr_mappings)-aligned; the
 * per-dimm allocation for this namespace's label-id is then shrunk or
 * grown to @val / ndr_mappings bytes.
 */
static ssize_t __size_store(struct device *dev, unsigned long long val)
{
	resource_size_t allocated = 0, available = 0;
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct nd_mapping *nd_mapping;
	struct nvdimm_drvdata *ndd;
	struct nd_label_id label_id;
	u32 flags = 0, remainder;
	u8 *uuid = NULL;
	int rc, i;

	/* refuse to resize while the namespace is driver-bound or claimed */
	if (dev->driver || to_ndns(dev)->claim)
		return -EBUSY;

	if (is_namespace_pmem(dev)) {
		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);

		uuid = nspm->uuid;
	} else if (is_namespace_blk(dev)) {
		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);

		uuid = nsblk->uuid;
		flags = NSLABEL_FLAG_LOCAL;
	}

	/*
	 * We need a uuid for the allocation-label and dimm(s) on which
	 * to store the label.
	 */
	if (!uuid || nd_region->ndr_mappings == 0)
		return -ENXIO;

	div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
	if (remainder) {
		dev_dbg(dev, "%llu is not %dK aligned\n", val,
				(SZ_4K * nd_region->ndr_mappings) / SZ_1K);
		return -EINVAL;
	}

	nd_label_gen_id(&label_id, uuid, flags);
	for (i = 0; i < nd_region->ndr_mappings; i++) {
		nd_mapping = &nd_region->mapping[i];
		ndd = to_ndd(nd_mapping);

		/*
		 * All dimms in an interleave set, or the base dimm for a blk
		 * region, need to be enabled for the size to be changed.
		 */
		if (!ndd)
			return -ENXIO;

		allocated += nvdimm_allocated_dpa(ndd, &label_id);
	}
	available = nd_region_available_dpa(nd_region);

	if (val > available + allocated)
		return -ENOSPC;

	if (val == allocated)
		return 0;

	/* convert region totals into per-dimm quantities */
	val = div_u64(val, nd_region->ndr_mappings);
	allocated = div_u64(allocated, nd_region->ndr_mappings);
	if (val < allocated)
		rc = shrink_dpa_allocation(nd_region, &label_id,
				allocated - val);
	else
		rc = grow_dpa_allocation(nd_region, &label_id, val - allocated);

	if (rc)
		return rc;

	if (is_namespace_pmem(dev)) {
		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);

		nd_namespace_pmem_set_size(nd_region, nspm,
				val * nd_region->ndr_mappings);
	} else if (is_namespace_blk(dev)) {
		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);

		/*
		 * Try to delete the namespace if we deleted all of its
		 * allocation, this is not the seed device for the
		 * region, and it is not actively claimed by a btt
		 * instance.
		 */
		if (val == 0 && nd_region->ns_seed != dev
				&& !nsblk->common.claim)
			nd_device_unregister(dev, ND_ASYNC);
	}

	return rc;
}
834
/*
 * sysfs 'size' attribute store: parse the requested size, resize under
 * the device and bus locks, then write the updated labels.  Setting
 * size to zero deletes the namespace's uuid.
 */
static ssize_t size_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	unsigned long long val;
	u8 **uuid = NULL;
	int rc;

	rc = kstrtoull(buf, 0, &val);
	if (rc)
		return rc;

	device_lock(dev);
	nvdimm_bus_lock(dev);
	/* let in-flight namespace probing settle before resizing */
	wait_nvdimm_bus_probe_idle(dev);
	rc = __size_store(dev, val);
	if (rc >= 0)
		rc = nd_namespace_label_update(nd_region, dev);

	if (is_namespace_pmem(dev)) {
		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);

		uuid = &nspm->uuid;
	} else if (is_namespace_blk(dev)) {
		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);

		uuid = &nsblk->uuid;
	}

	if (rc == 0 && val == 0 && uuid) {
		/* setting size zero == 'delete namespace' */
		kfree(*uuid);
		*uuid = NULL;
	}

	dev_dbg(dev, "%s: %llx %s (%d)\n", __func__, val, rc < 0
			? "fail" : "success", rc);

	nvdimm_bus_unlock(dev);
	device_unlock(dev);

	return rc < 0 ? rc : len;
}
878
879resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
880{
881 struct device *dev = &ndns->dev;
882
883 if (is_namespace_pmem(dev)) {
884 struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
885
886 return resource_size(&nspm->nsio.res);
887 } else if (is_namespace_blk(dev)) {
888 return nd_namespace_blk_size(to_nd_namespace_blk(dev));
889 } else if (is_namespace_io(dev)) {
890 struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
891
892 return resource_size(&nsio->res);
893 } else
894 WARN_ONCE(1, "unknown namespace type\n");
895 return 0;
896}
897
898resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
899{
900 resource_size_t size;
901
902 nvdimm_bus_lock(&ndns->dev);
903 size = __nvdimm_namespace_capacity(ndns);
904 nvdimm_bus_unlock(&ndns->dev);
905
906 return size;
907}
908EXPORT_SYMBOL(nvdimm_namespace_capacity);
909
910static ssize_t size_show(struct device *dev,
911 struct device_attribute *attr, char *buf)
912{
913 return sprintf(buf, "%llu\n", (unsigned long long)
914 nvdimm_namespace_capacity(to_ndns(dev)));
915}
916static DEVICE_ATTR(size, S_IRUGO, size_show, size_store);
917
918static ssize_t uuid_show(struct device *dev,
919 struct device_attribute *attr, char *buf)
920{
921 u8 *uuid;
922
923 if (is_namespace_pmem(dev)) {
924 struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
925
926 uuid = nspm->uuid;
927 } else if (is_namespace_blk(dev)) {
928 struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
929
930 uuid = nsblk->uuid;
931 } else
932 return -ENXIO;
933
934 if (uuid)
935 return sprintf(buf, "%pUb\n", uuid);
936 return sprintf(buf, "\n");
937}
938
939/**
940 * namespace_update_uuid - check for a unique uuid and whether we're "renaming"
 * @nd_region: parent region so we can update all dimms in the set
942 * @dev: namespace type for generating label_id
943 * @new_uuid: incoming uuid
944 * @old_uuid: reference to the uuid storage location in the namespace object
945 */
static int namespace_update_uuid(struct nd_region *nd_region,
		struct device *dev, u8 *new_uuid, u8 **old_uuid)
{
	u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0;
	struct nd_label_id old_label_id;
	struct nd_label_id new_label_id;
	int i;

	if (!nd_is_uuid_unique(dev, new_uuid))
		return -EINVAL;

	/* no previous uuid: nothing to rename, just take ownership */
	if (*old_uuid == NULL)
		goto out;

	/*
	 * If we've already written a label with this uuid, then it's
	 * too late to rename because we can't reliably update the uuid
	 * without losing the old namespace. Userspace must delete this
	 * namespace to abandon the old uuid.
	 */
	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];

		/*
		 * This check by itself is sufficient because old_uuid
		 * would be NULL above if this uuid did not exist in the
		 * currently written set.
		 *
		 * FIXME: can we delete uuid with zero dpa allocated?
		 */
		if (nd_mapping->labels)
			return -EBUSY;
	}

	/* rename this uuid's in-memory dpa resources to the new label-id */
	nd_label_gen_id(&old_label_id, *old_uuid, flags);
	nd_label_gen_id(&new_label_id, new_uuid, flags);
	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
		struct resource *res;

		for_each_dpa_resource(ndd, res)
			if (strcmp(res->name, old_label_id.id) == 0)
				sprintf((void *) res->name, "%s",
						new_label_id.id);
	}
	kfree(*old_uuid);
 out:
	/* ownership of @new_uuid transfers to the namespace object */
	*old_uuid = new_uuid;
	return 0;
}
997
/*
 * sysfs 'uuid' attribute store: parse a new uuid and apply it via
 * namespace_update_uuid() under the device and bus locks.  On success
 * the parsed uuid buffer is owned by the namespace; on failure it is
 * freed here.
 */
static ssize_t uuid_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	u8 *uuid = NULL;
	ssize_t rc = 0;
	u8 **ns_uuid;

	if (is_namespace_pmem(dev)) {
		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);

		ns_uuid = &nspm->uuid;
	} else if (is_namespace_blk(dev)) {
		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);

		ns_uuid = &nsblk->uuid;
	} else
		return -ENXIO;

	device_lock(dev);
	nvdimm_bus_lock(dev);
	wait_nvdimm_bus_probe_idle(dev);
	/* a claimed namespace cannot change uuid */
	if (to_ndns(dev)->claim)
		rc = -EBUSY;
	if (rc >= 0)
		rc = nd_uuid_store(dev, &uuid, buf, len);
	if (rc >= 0)
		rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid);
	if (rc >= 0)
		rc = nd_namespace_label_update(nd_region, dev);
	else
		kfree(uuid);
	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
	nvdimm_bus_unlock(dev);
	device_unlock(dev);

	return rc < 0 ? rc : len;
}
1037static DEVICE_ATTR_RW(uuid);
1038
1039static ssize_t resource_show(struct device *dev,
1040 struct device_attribute *attr, char *buf)
1041{
1042 struct resource *res;
1043
1044 if (is_namespace_pmem(dev)) {
1045 struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
1046
1047 res = &nspm->nsio.res;
1048 } else if (is_namespace_io(dev)) {
1049 struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
1050
1051 res = &nsio->res;
1052 } else
1053 return -ENXIO;
1054
1055 /* no address to convey if the namespace has no allocation */
1056 if (resource_size(res) == 0)
1057 return -ENXIO;
1058 return sprintf(buf, "%#llx\n", (unsigned long long) res->start);
1059}
1060static DEVICE_ATTR_RO(resource);
1061
/* supported blk-namespace sector sizes; zero-terminated list */
static const unsigned long ns_lbasize_supported[] = { 512, 520, 528,
	4096, 4104, 4160, 4224, 0 };
1064
1065static ssize_t sector_size_show(struct device *dev,
1066 struct device_attribute *attr, char *buf)
1067{
1068 struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
1069
1070 if (!is_namespace_blk(dev))
1071 return -ENXIO;
1072
1073 return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf);
1074}
1075
1076static ssize_t sector_size_store(struct device *dev,
1077 struct device_attribute *attr, const char *buf, size_t len)
1078{
1079 struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
1080 struct nd_region *nd_region = to_nd_region(dev->parent);
1081 ssize_t rc = 0;
1082
1083 if (!is_namespace_blk(dev))
1084 return -ENXIO;
1085
1086 device_lock(dev);
1087 nvdimm_bus_lock(dev);
1088 if (to_ndns(dev)->claim)
1089 rc = -EBUSY;
1090 if (rc >= 0)
1091 rc = nd_sector_size_store(dev, buf, &nsblk->lbasize,
1092 ns_lbasize_supported);
1093 if (rc >= 0)
1094 rc = nd_namespace_label_update(nd_region, dev);
1095 dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__,
1096 rc, rc < 0 ? "tried" : "wrote", buf,
1097 buf[len - 1] == '\n' ? "" : "\n");
1098 nvdimm_bus_unlock(dev);
1099 device_unlock(dev);
1100
1101 return rc ? rc : len;
1102}
1103static DEVICE_ATTR_RW(sector_size);
1104
1105static ssize_t dpa_extents_show(struct device *dev,
1106 struct device_attribute *attr, char *buf)
1107{
1108 struct nd_region *nd_region = to_nd_region(dev->parent);
1109 struct nd_label_id label_id;
1110 int count = 0, i;
1111 u8 *uuid = NULL;
1112 u32 flags = 0;
1113
1114 nvdimm_bus_lock(dev);
1115 if (is_namespace_pmem(dev)) {
1116 struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
1117
1118 uuid = nspm->uuid;
1119 flags = 0;
1120 } else if (is_namespace_blk(dev)) {
1121 struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
1122
1123 uuid = nsblk->uuid;
1124 flags = NSLABEL_FLAG_LOCAL;
1125 }
1126
1127 if (!uuid)
1128 goto out;
1129
1130 nd_label_gen_id(&label_id, uuid, flags);
1131 for (i = 0; i < nd_region->ndr_mappings; i++) {
1132 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
1133 struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
1134 struct resource *res;
1135
1136 for_each_dpa_resource(ndd, res)
1137 if (strcmp(res->name, label_id.id) == 0)
1138 count++;
1139 }
1140 out:
1141 nvdimm_bus_unlock(dev);
1142
1143 return sprintf(buf, "%d\n", count);
1144}
1145static DEVICE_ATTR_RO(dpa_extents);
1146
1147static ssize_t holder_show(struct device *dev,
1148 struct device_attribute *attr, char *buf)
1149{
1150 struct nd_namespace_common *ndns = to_ndns(dev);
1151 ssize_t rc;
1152
1153 device_lock(dev);
1154 rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : "");
1155 device_unlock(dev);
1156
1157 return rc;
1158}
1159static DEVICE_ATTR_RO(holder);
1160
1161static ssize_t force_raw_store(struct device *dev,
1162 struct device_attribute *attr, const char *buf, size_t len)
1163{
1164 bool force_raw;
1165 int rc = strtobool(buf, &force_raw);
1166
1167 if (rc)
1168 return rc;
1169
1170 to_ndns(dev)->force_raw = force_raw;
1171 return len;
1172}
1173
1174static ssize_t force_raw_show(struct device *dev,
1175 struct device_attribute *attr, char *buf)
1176{
1177 return sprintf(buf, "%d\n", to_ndns(dev)->force_raw);
1178}
1179static DEVICE_ATTR_RW(force_raw);
1180
/* attributes shared by all namespace types; namespace_visible() filters */
static struct attribute *nd_namespace_attributes[] = {
	&dev_attr_nstype.attr,
	&dev_attr_size.attr,
	&dev_attr_uuid.attr,
	&dev_attr_holder.attr,
	&dev_attr_resource.attr,
	&dev_attr_alt_name.attr,
	&dev_attr_force_raw.attr,
	&dev_attr_sector_size.attr,
	&dev_attr_dpa_extents.attr,
	NULL,
};
1193
1194static umode_t namespace_visible(struct kobject *kobj,
1195 struct attribute *a, int n)
1196{
1197 struct device *dev = container_of(kobj, struct device, kobj);
1198
1199 if (a == &dev_attr_resource.attr) {
1200 if (is_namespace_blk(dev))
1201 return 0;
1202 return a->mode;
1203 }
1204
1205 if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
1206 if (a == &dev_attr_size.attr)
1207 return S_IWUSR | S_IRUGO;
1208
1209 if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr)
1210 return 0;
1211
1212 return a->mode;
1213 }
1214
1215 if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
1216 || a == &dev_attr_holder.attr
1217 || a == &dev_attr_force_raw.attr)
1218 return a->mode;
1219
1220 return 0;
1221}
1222
/* attribute group with per-namespace-type visibility filtering */
static struct attribute_group nd_namespace_attribute_group = {
	.attrs = nd_namespace_attributes,
	.is_visible = namespace_visible,
};
1227
/* groups attached to every namespace device registered by this file */
static const struct attribute_group *nd_namespace_attribute_groups[] = {
	&nd_device_attribute_group,
	&nd_namespace_attribute_group,
	&nd_numa_attribute_group,
	NULL,
};
1234
/*
 * nvdimm_namespace_common_probe() - resolve the namespace behind @dev
 * @dev: either a namespace device itself, or a btt device claiming one
 *
 * Returns the namespace on success, or an ERR_PTR: -ENODEV for a
 * missing, unconfigured, or too-small namespace; -EBUSY when the raw
 * namespace is still bound to a driver; -ENXIO for claim conflicts.
 */
struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
{
	struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
	struct nd_namespace_common *ndns;
	resource_size_t size;

	if (nd_btt) {
		/* btt path: sanity-check the namespace it claims */
		ndns = nd_btt->ndns;
		if (!ndns)
			return ERR_PTR(-ENODEV);

		/*
		 * Flush any in-progess probes / removals in the driver
		 * for the raw personality of this namespace.
		 */
		device_lock(&ndns->dev);
		device_unlock(&ndns->dev);
		if (ndns->dev.driver) {
			dev_dbg(&ndns->dev, "is active, can't bind %s\n",
					dev_name(&nd_btt->dev));
			return ERR_PTR(-EBUSY);
		}
		if (dev_WARN_ONCE(&ndns->dev, ndns->claim != &nd_btt->dev,
					"host (%s) vs claim (%s) mismatch\n",
					dev_name(&nd_btt->dev),
					dev_name(ndns->claim)))
			return ERR_PTR(-ENXIO);
	} else {
		ndns = to_ndns(dev);
		if (ndns->claim) {
			dev_dbg(dev, "claimed by %s, failing probe\n",
					dev_name(ndns->claim));

			return ERR_PTR(-ENXIO);
		}
	}

	size = nvdimm_namespace_capacity(ndns);
	if (size < ND_MIN_NAMESPACE_SIZE) {
		dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n",
				&size, ND_MIN_NAMESPACE_SIZE);
		return ERR_PTR(-ENODEV);
	}

	/* per-personality configuration checks */
	if (is_namespace_pmem(&ndns->dev)) {
		struct nd_namespace_pmem *nspm;

		nspm = to_nd_namespace_pmem(&ndns->dev);
		if (!nspm->uuid) {
			dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__);
			return ERR_PTR(-ENODEV);
		}
	} else if (is_namespace_blk(&ndns->dev)) {
		struct nd_namespace_blk *nsblk;

		nsblk = to_nd_namespace_blk(&ndns->dev);
		if (!nd_namespace_blk_validate(nsblk))
			return ERR_PTR(-ENODEV);
	}

	return ndns;
}
1297EXPORT_SYMBOL(nvdimm_namespace_common_probe);
1298
1299static struct device **create_namespace_io(struct nd_region *nd_region)
1300{
1301 struct nd_namespace_io *nsio;
1302 struct device *dev, **devs;
1303 struct resource *res;
1304
1305 nsio = kzalloc(sizeof(*nsio), GFP_KERNEL);
1306 if (!nsio)
1307 return NULL;
1308
1309 devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
1310 if (!devs) {
1311 kfree(nsio);
1312 return NULL;
1313 }
1314
1315 dev = &nsio->common.dev;
1316 dev->type = &namespace_io_device_type;
1317 dev->parent = &nd_region->dev;
1318 res = &nsio->res;
1319 res->name = dev_name(&nd_region->dev);
1320 res->flags = IORESOURCE_MEM;
1321 res->start = nd_region->ndr_start;
1322 res->end = res->start + nd_region->ndr_size - 1;
1323
1324 devs[0] = dev;
1325 return devs;
1326}
1327
/*
 * has_uuid_at_pos() - does @uuid appear at interleave position @pos?
 *
 * Scan every mapping in the region for a label matching @uuid and the
 * interleave-set @cookie whose label-set size (nlabel) equals the
 * region's mapping count and whose position field equals @pos.  Two
 * labels with the same uuid on one dimm invalidate the whole set.
 */
static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid,
		u64 cookie, u16 pos)
{
	struct nd_namespace_label *found = NULL;
	int i;

	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
		struct nd_namespace_label *nd_label;
		bool found_uuid = false;
		int l;

		for_each_label(l, nd_label, nd_mapping->labels) {
			u64 isetcookie = __le64_to_cpu(nd_label->isetcookie);
			u16 position = __le16_to_cpu(nd_label->position);
			u16 nlabel = __le16_to_cpu(nd_label->nlabel);

			if (isetcookie != cookie)
				continue;

			if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0)
				continue;

			/* duplicate uuid on one dimm invalidates the set */
			if (found_uuid) {
				dev_dbg(to_ndd(nd_mapping)->dev,
						"%s duplicate entry for uuid\n",
						__func__);
				return false;
			}
			found_uuid = true;
			if (nlabel != nd_region->ndr_mappings)
				continue;
			if (position != pos)
				continue;
			found = nd_label;
			break;
		}
		if (found)
			break;
	}
	return found != NULL;
}
1370
/*
 * select_pmem_id() - install the label matching @pmem_id in each mapping
 *
 * For every mapping, find the label with uuid @pmem_id, verify its dpa
 * range starts at the mapping base and fits within the mapping, then
 * trim the mapping's label list to that single label (labels[0]) with
 * a NULL terminator at labels[1].
 */
static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
{
	struct nd_namespace_label *select = NULL;
	int i;

	if (!pmem_id)
		return -ENODEV;

	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
		struct nd_namespace_label *nd_label;
		u64 hw_start, hw_end, pmem_start, pmem_end;
		int l;

		/* nd_label is NULL when the list is exhausted without a hit */
		for_each_label(l, nd_label, nd_mapping->labels)
			if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0)
				break;

		if (!nd_label) {
			WARN_ON(1);
			return -EINVAL;
		}

		select = nd_label;
		/*
		 * Check that this label is compliant with the dpa
		 * range published in NFIT
		 */
		hw_start = nd_mapping->start;
		hw_end = hw_start + nd_mapping->size;
		pmem_start = __le64_to_cpu(select->dpa);
		pmem_end = pmem_start + __le64_to_cpu(select->rawsize);
		if (pmem_start == hw_start && pmem_end <= hw_end)
			/* pass */;
		else
			return -EINVAL;

		nd_mapping->labels[0] = select;
		nd_mapping->labels[1] = NULL;
	}
	return 0;
}
1413
1414/**
1415 * find_pmem_label_set - validate interleave set labelling, retrieve label0
1416 * @nd_region: region with mappings to validate
1417 */
1418static int find_pmem_label_set(struct nd_region *nd_region,
1419 struct nd_namespace_pmem *nspm)
1420{
1421 u64 cookie = nd_region_interleave_set_cookie(nd_region);
1422 struct nd_namespace_label *nd_label;
1423 u8 select_id[NSLABEL_UUID_LEN];
1424 resource_size_t size = 0;
1425 u8 *pmem_id = NULL;
1426 int rc = -ENODEV, l;
1427 u16 i;
1428
1429 if (cookie == 0)
1430 return -ENXIO;
1431
1432 /*
1433 * Find a complete set of labels by uuid. By definition we can start
1434 * with any mapping as the reference label
1435 */
1436 for_each_label(l, nd_label, nd_region->mapping[0].labels) {
1437 u64 isetcookie = __le64_to_cpu(nd_label->isetcookie);
1438
1439 if (isetcookie != cookie)
1440 continue;
1441
1442 for (i = 0; nd_region->ndr_mappings; i++)
1443 if (!has_uuid_at_pos(nd_region, nd_label->uuid,
1444 cookie, i))
1445 break;
1446 if (i < nd_region->ndr_mappings) {
1447 /*
1448 * Give up if we don't find an instance of a
1449 * uuid at each position (from 0 to
1450 * nd_region->ndr_mappings - 1), or if we find a
1451 * dimm with two instances of the same uuid.
1452 */
1453 rc = -EINVAL;
1454 goto err;
1455 } else if (pmem_id) {
1456 /*
1457 * If there is more than one valid uuid set, we
1458 * need userspace to clean this up.
1459 */
1460 rc = -EBUSY;
1461 goto err;
1462 }
1463 memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN);
1464 pmem_id = select_id;
1465 }
1466
1467 /*
1468 * Fix up each mapping's 'labels' to have the validated pmem label for
1469 * that position at labels[0], and NULL at labels[1]. In the process,
1470 * check that the namespace aligns with interleave-set. We know
1471 * that it does not overlap with any blk namespaces by virtue of
1472 * the dimm being enabled (i.e. nd_label_reserve_dpa()
1473 * succeeded).
1474 */
1475 rc = select_pmem_id(nd_region, pmem_id);
1476 if (rc)
1477 goto err;
1478
1479 /* Calculate total size and populate namespace properties from label0 */
1480 for (i = 0; i < nd_region->ndr_mappings; i++) {
1481 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
1482 struct nd_namespace_label *label0 = nd_mapping->labels[0];
1483
1484 size += __le64_to_cpu(label0->rawsize);
1485 if (__le16_to_cpu(label0->position) != 0)
1486 continue;
1487 WARN_ON(nspm->alt_name || nspm->uuid);
1488 nspm->alt_name = kmemdup((void __force *) label0->name,
1489 NSLABEL_NAME_LEN, GFP_KERNEL);
1490 nspm->uuid = kmemdup((void __force *) label0->uuid,
1491 NSLABEL_UUID_LEN, GFP_KERNEL);
1492 }
1493
1494 if (!nspm->alt_name || !nspm->uuid) {
1495 rc = -ENOMEM;
1496 goto err;
1497 }
1498
1499 nd_namespace_pmem_set_size(nd_region, nspm, size);
1500
1501 return 0;
1502 err:
1503 switch (rc) {
1504 case -EINVAL:
1505 dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__);
1506 break;
1507 case -ENODEV:
1508 dev_dbg(&nd_region->dev, "%s: label not found\n", __func__);
1509 break;
1510 default:
1511 dev_dbg(&nd_region->dev, "%s: unexpected err: %d\n",
1512 __func__, rc);
1513 break;
1514 }
1515 return rc;
1516}
1517
1518static struct device **create_namespace_pmem(struct nd_region *nd_region)
1519{
1520 struct nd_namespace_pmem *nspm;
1521 struct device *dev, **devs;
1522 struct resource *res;
1523 int rc;
1524
1525 nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
1526 if (!nspm)
1527 return NULL;
1528
1529 dev = &nspm->nsio.common.dev;
1530 dev->type = &namespace_pmem_device_type;
1531 dev->parent = &nd_region->dev;
1532 res = &nspm->nsio.res;
1533 res->name = dev_name(&nd_region->dev);
1534 res->flags = IORESOURCE_MEM;
1535 rc = find_pmem_label_set(nd_region, nspm);
1536 if (rc == -ENODEV) {
1537 int i;
1538
1539 /* Pass, try to permit namespace creation... */
1540 for (i = 0; i < nd_region->ndr_mappings; i++) {
1541 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
1542
1543 kfree(nd_mapping->labels);
1544 nd_mapping->labels = NULL;
1545 }
1546
1547 /* Publish a zero-sized namespace for userspace to configure. */
1548 nd_namespace_pmem_set_size(nd_region, nspm, 0);
1549
1550 rc = 0;
1551 } else if (rc)
1552 goto err;
1553
1554 devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
1555 if (!devs)
1556 goto err;
1557
1558 devs[0] = dev;
1559 return devs;
1560
1561 err:
1562 namespace_pmem_release(&nspm->nsio.common.dev);
1563 return NULL;
1564}
1565
1566struct resource *nsblk_add_resource(struct nd_region *nd_region,
1567 struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
1568 resource_size_t start)
1569{
1570 struct nd_label_id label_id;
1571 struct resource *res;
1572
1573 nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
1574 res = krealloc(nsblk->res,
1575 sizeof(void *) * (nsblk->num_resources + 1),
1576 GFP_KERNEL);
1577 if (!res)
1578 return NULL;
1579 nsblk->res = (struct resource **) res;
1580 for_each_dpa_resource(ndd, res)
1581 if (strcmp(res->name, label_id.id) == 0
1582 && res->start == start) {
1583 nsblk->res[nsblk->num_resources++] = res;
1584 return res;
1585 }
1586 return NULL;
1587}
1588
1589static struct device *nd_namespace_blk_create(struct nd_region *nd_region)
1590{
1591 struct nd_namespace_blk *nsblk;
1592 struct device *dev;
1593
1594 if (!is_nd_blk(&nd_region->dev))
1595 return NULL;
1596
1597 nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
1598 if (!nsblk)
1599 return NULL;
1600
1601 dev = &nsblk->common.dev;
1602 dev->type = &namespace_blk_device_type;
1603 nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL);
1604 if (nsblk->id < 0) {
1605 kfree(nsblk);
1606 return NULL;
1607 }
1608 dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id);
1609 dev->parent = &nd_region->dev;
1610 dev->groups = nd_namespace_attribute_groups;
1611
1612 return &nsblk->common.dev;
1613}
1614
1615void nd_region_create_blk_seed(struct nd_region *nd_region)
1616{
1617 WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
1618 nd_region->ns_seed = nd_namespace_blk_create(nd_region);
1619 /*
1620 * Seed creation failures are not fatal, provisioning is simply
1621 * disabled until memory becomes available
1622 */
1623 if (!nd_region->ns_seed)
1624 dev_err(&nd_region->dev, "failed to create blk namespace\n");
1625 else
1626 nd_device_register(nd_region->ns_seed);
1627}
1628
1629void nd_region_create_btt_seed(struct nd_region *nd_region)
1630{
1631 WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
1632 nd_region->btt_seed = nd_btt_create(nd_region);
1633 /*
1634 * Seed creation failures are not fatal, provisioning is simply
1635 * disabled until memory becomes available
1636 */
1637 if (!nd_region->btt_seed)
1638 dev_err(&nd_region->dev, "failed to create btt namespace\n");
1639}
1640
1641static struct device **create_namespace_blk(struct nd_region *nd_region)
1642{
1643 struct nd_mapping *nd_mapping = &nd_region->mapping[0];
1644 struct nd_namespace_label *nd_label;
1645 struct device *dev, **devs = NULL;
1646 struct nd_namespace_blk *nsblk;
1647 struct nvdimm_drvdata *ndd;
1648 int i, l, count = 0;
1649 struct resource *res;
1650
1651 if (nd_region->ndr_mappings == 0)
1652 return NULL;
1653
1654 ndd = to_ndd(nd_mapping);
1655 for_each_label(l, nd_label, nd_mapping->labels) {
1656 u32 flags = __le32_to_cpu(nd_label->flags);
1657 char *name[NSLABEL_NAME_LEN];
1658 struct device **__devs;
1659
1660 if (flags & NSLABEL_FLAG_LOCAL)
1661 /* pass */;
1662 else
1663 continue;
1664
1665 for (i = 0; i < count; i++) {
1666 nsblk = to_nd_namespace_blk(devs[i]);
1667 if (memcmp(nsblk->uuid, nd_label->uuid,
1668 NSLABEL_UUID_LEN) == 0) {
1669 res = nsblk_add_resource(nd_region, ndd, nsblk,
1670 __le64_to_cpu(nd_label->dpa));
1671 if (!res)
1672 goto err;
1673 nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
1674 dev_name(&nsblk->common.dev));
1675 break;
1676 }
1677 }
1678 if (i < count)
1679 continue;
1680 __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL);
1681 if (!__devs)
1682 goto err;
1683 memcpy(__devs, devs, sizeof(dev) * count);
1684 kfree(devs);
1685 devs = __devs;
1686
1687 nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
1688 if (!nsblk)
1689 goto err;
1690 dev = &nsblk->common.dev;
1691 dev->type = &namespace_blk_device_type;
1692 dev->parent = &nd_region->dev;
1693 dev_set_name(dev, "namespace%d.%d", nd_region->id, count);
1694 devs[count++] = dev;
1695 nsblk->id = -1;
1696 nsblk->lbasize = __le64_to_cpu(nd_label->lbasize);
1697 nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN,
1698 GFP_KERNEL);
1699 if (!nsblk->uuid)
1700 goto err;
1701 memcpy(name, nd_label->name, NSLABEL_NAME_LEN);
1702 if (name[0])
1703 nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN,
1704 GFP_KERNEL);
1705 res = nsblk_add_resource(nd_region, ndd, nsblk,
1706 __le64_to_cpu(nd_label->dpa));
1707 if (!res)
1708 goto err;
1709 nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
1710 dev_name(&nsblk->common.dev));
1711 }
1712
1713 dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n",
1714 __func__, count, count == 1 ? "" : "s");
1715
1716 if (count == 0) {
1717 /* Publish a zero-sized namespace for userspace to configure. */
1718 for (i = 0; i < nd_region->ndr_mappings; i++) {
1719 struct nd_mapping *nd_mapping = &nd_region->mapping[i];
1720
1721 kfree(nd_mapping->labels);
1722 nd_mapping->labels = NULL;
1723 }
1724
1725 devs = kcalloc(2, sizeof(dev), GFP_KERNEL);
1726 if (!devs)
1727 goto err;
1728 nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
1729 if (!nsblk)
1730 goto err;
1731 dev = &nsblk->common.dev;
1732 dev->type = &namespace_blk_device_type;
1733 dev->parent = &nd_region->dev;
1734 devs[count++] = dev;
1735 }
1736
1737 return devs;
1738
1739err:
1740 for (i = 0; i < count; i++) {
1741 nsblk = to_nd_namespace_blk(devs[i]);
1742 namespace_blk_release(&nsblk->common.dev);
1743 }
1744 kfree(devs);
1745 return NULL;
1746}
1747
/*
 * Cache the set of active labels for each mapping in @nd_region.
 *
 * For every enabled dimm mapping: pin the dimm (busy count) and its
 * driver-data, then copy pointers to each active label into a freshly
 * allocated, NULL-terminated nd_mapping->labels array.
 *
 * Returns 0 on success (including the "dimm disabled but not aliasing"
 * case), -ENXIO when a disabled dimm aliases DPA, or -ENOMEM.
 */
static int init_active_labels(struct nd_region *nd_region)
{
	int i;

	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
		struct nvdimm *nvdimm = nd_mapping->nvdimm;
		int count, j;

		/*
		 * If the dimm is disabled then prevent the region from
		 * being activated if it aliases DPA.
		 */
		if (!ndd) {
			if ((nvdimm->flags & NDD_ALIASING) == 0)
				return 0;
			dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n",
					dev_name(&nd_mapping->nvdimm->dev));
			return -ENXIO;
		}
		nd_mapping->ndd = ndd;
		/* take references on the dimm and its label data */
		atomic_inc(&nvdimm->busy);
		get_ndd(ndd);

		count = nd_label_active_count(ndd);
		dev_dbg(ndd->dev, "%s: %d\n", __func__, count);
		if (!count)
			continue;
		/* count + 1 keeps the array NULL-terminated */
		nd_mapping->labels = kcalloc(count + 1, sizeof(void *),
				GFP_KERNEL);
		if (!nd_mapping->labels)
			return -ENOMEM;
		for (j = 0; j < count; j++) {
			struct nd_namespace_label *label;

			label = nd_label_active(ndd, j);
			nd_mapping->labels[j] = label;
		}
	}

	return 0;
}
1791
/*
 * Discover and register the namespace devices for @nd_region.
 *
 * On exit *err holds the count of namespaces that could not be
 * registered; the return value is the number successfully registered,
 * or a negative errno when nothing could be activated.
 */
int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
{
	struct device **devs = NULL;
	int i, rc = 0, type;

	*err = 0;
	nvdimm_bus_lock(&nd_region->dev);
	rc = init_active_labels(nd_region);
	if (rc) {
		nvdimm_bus_unlock(&nd_region->dev);
		return rc;
	}

	/* dispatch on the region's namespace personality */
	type = nd_region_to_nstype(nd_region);
	switch (type) {
	case ND_DEVICE_NAMESPACE_IO:
		devs = create_namespace_io(nd_region);
		break;
	case ND_DEVICE_NAMESPACE_PMEM:
		devs = create_namespace_pmem(nd_region);
		break;
	case ND_DEVICE_NAMESPACE_BLK:
		devs = create_namespace_blk(nd_region);
		break;
	default:
		break;
	}
	nvdimm_bus_unlock(&nd_region->dev);

	if (!devs)
		return -ENODEV;

	for (i = 0; devs[i]; i++) {
		struct device *dev = devs[i];
		int id;

		if (type == ND_DEVICE_NAMESPACE_BLK) {
			struct nd_namespace_blk *nsblk;

			/* blk namespaces draw ids from the region's ida */
			nsblk = to_nd_namespace_blk(dev);
			id = ida_simple_get(&nd_region->ns_ida, 0, 0,
					GFP_KERNEL);
			nsblk->id = id;
		} else
			id = i;

		if (id < 0)
			break;
		dev_set_name(dev, "namespace%d.%d", nd_region->id, id);
		dev->groups = nd_namespace_attribute_groups;
		nd_device_register(dev);
	}
	if (i)
		nd_region->ns_seed = devs[0];

	if (devs[i]) {
		int j;

		/*
		 * Registration stopped early (the loop above broke on
		 * id < 0); give each remaining device a release path
		 * via device_initialize() + put_device().
		 */
		for (j = i; devs[j]; j++) {
			struct device *dev = devs[j];

			device_initialize(dev);
			put_device(dev);
		}
		*err = j - i;
		/*
		 * All of the namespaces we tried to register failed, so
		 * fail region activation.
		 */
		if (*err == 0)
			rc = -ENODEV;
	}
	kfree(devs);

	if (rc == -ENODEV)
		return rc;

	return i;
}
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
new file mode 100644
index 000000000000..e1970c71ad1c
--- /dev/null
+++ b/drivers/nvdimm/nd-core.h
@@ -0,0 +1,83 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#ifndef __ND_CORE_H__
14#define __ND_CORE_H__
15#include <linux/libnvdimm.h>
16#include <linux/device.h>
17#include <linux/libnvdimm.h>
18#include <linux/sizes.h>
19#include <linux/mutex.h>
20#include <linux/nd.h>
21
22extern struct list_head nvdimm_bus_list;
23extern struct mutex nvdimm_bus_list_mutex;
24extern int nvdimm_major;
25
26struct nvdimm_bus {
27 struct nvdimm_bus_descriptor *nd_desc;
28 wait_queue_head_t probe_wait;
29 struct module *module;
30 struct list_head list;
31 struct device dev;
32 int id, probe_active;
33 struct mutex reconfig_mutex;
34};
35
36struct nvdimm {
37 unsigned long flags;
38 void *provider_data;
39 unsigned long *dsm_mask;
40 struct device dev;
41 atomic_t busy;
42 int id;
43};
44
45bool is_nvdimm(struct device *dev);
46bool is_nd_pmem(struct device *dev);
47bool is_nd_blk(struct device *dev);
48struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
49int __init nvdimm_bus_init(void);
50void nvdimm_bus_exit(void);
51void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
52struct nd_region;
53void nd_region_create_blk_seed(struct nd_region *nd_region);
54void nd_region_create_btt_seed(struct nd_region *nd_region);
55void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
56int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
57void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
58void nd_synchronize(void);
59int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
60int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
61int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus);
62void __nd_device_register(struct device *dev);
63int nd_match_dimm(struct device *dev, void *data);
64struct nd_label_id;
65char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags);
66bool nd_is_uuid_unique(struct device *dev, u8 *uuid);
67struct nd_region;
68struct nvdimm_drvdata;
69struct nd_mapping;
70resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
71 struct nd_mapping *nd_mapping, resource_size_t *overlap);
72resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping);
73resource_size_t nd_region_available_dpa(struct nd_region *nd_region);
74resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
75 struct nd_label_id *label_id);
76struct nd_mapping;
77struct resource *nsblk_add_resource(struct nd_region *nd_region,
78 struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
79 resource_size_t start);
80int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
81void get_ndd(struct nvdimm_drvdata *ndd);
82resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
83#endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
new file mode 100644
index 000000000000..c41f53e74277
--- /dev/null
+++ b/drivers/nvdimm/nd.h
@@ -0,0 +1,220 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#ifndef __ND_H__
14#define __ND_H__
15#include <linux/libnvdimm.h>
16#include <linux/blkdev.h>
17#include <linux/device.h>
18#include <linux/mutex.h>
19#include <linux/ndctl.h>
20#include <linux/types.h>
21#include "label.h"
22
23enum {
24 /*
25 * Limits the maximum number of block apertures a dimm can
26 * support and is an input to the geometry/on-disk-format of a
27 * BTT instance
28 */
29 ND_MAX_LANES = 256,
30 SECTOR_SHIFT = 9,
31 INT_LBASIZE_ALIGNMENT = 64,
32};
33
34struct nvdimm_drvdata {
35 struct device *dev;
36 int nsindex_size;
37 struct nd_cmd_get_config_size nsarea;
38 void *data;
39 int ns_current, ns_next;
40 struct resource dpa;
41 struct kref kref;
42};
43
44struct nd_region_namespaces {
45 int count;
46 int active;
47};
48
/*
 * Return the i'th namespace index block within the dimm's cached label
 * data (ndd->data), or NULL for a negative index.
 */
static inline struct nd_namespace_index *to_namespace_index(
		struct nvdimm_drvdata *ndd, int i)
{
	if (i < 0)
		return NULL;

	return ndd->data + sizeof_namespace_index(ndd) * i;
}
57
/* Convenience wrapper: the index block selected by ndd->ns_current */
static inline struct nd_namespace_index *to_current_namespace_index(
		struct nvdimm_drvdata *ndd)
{
	return to_namespace_index(ndd, ndd->ns_current);
}
63
/* Convenience wrapper: the index block selected by ndd->ns_next */
static inline struct nd_namespace_index *to_next_namespace_index(
		struct nvdimm_drvdata *ndd)
{
	return to_namespace_index(ndd, ndd->ns_next);
}
69
70#define nd_dbg_dpa(r, d, res, fmt, arg...) \
71 dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \
72 (r) ? dev_name((d)->dev) : "", res ? res->name : "null", \
73 (unsigned long long) (res ? resource_size(res) : 0), \
74 (unsigned long long) (res ? res->start : 0), ##arg)
75
76#define for_each_label(l, label, labels) \
77 for (l = 0; (label = labels ? labels[l] : NULL); l++)
78
79#define for_each_dpa_resource(ndd, res) \
80 for (res = (ndd)->dpa.child; res; res = res->sibling)
81
82#define for_each_dpa_resource_safe(ndd, res, next) \
83 for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \
84 res; res = next, next = next ? next->sibling : NULL)
85
86struct nd_percpu_lane {
87 int count;
88 spinlock_t lock;
89};
90
91struct nd_region {
92 struct device dev;
93 struct ida ns_ida;
94 struct ida btt_ida;
95 struct device *ns_seed;
96 struct device *btt_seed;
97 u16 ndr_mappings;
98 u64 ndr_size;
99 u64 ndr_start;
100 int id, num_lanes, ro, numa_node;
101 void *provider_data;
102 struct nd_interleave_set *nd_set;
103 struct nd_percpu_lane __percpu *lane;
104 struct nd_mapping mapping[0];
105};
106
107struct nd_blk_region {
108 int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
109 void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
110 int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
111 void *iobuf, u64 len, int rw);
112 void *blk_provider_data;
113 struct nd_region nd_region;
114};
115
/*
 * Lookup next in the repeating sequence of 01, 10, and 11.
 */
static inline unsigned nd_inc_seq(unsigned seq)
{
	/* only the low two bits participate; 0 maps to 0 */
	switch (seq & 3) {
	case 1:
		return 2;
	case 2:
		return 3;
	case 3:
		return 1;
	default:
		return 0;
	}
}
125
126struct btt;
127struct nd_btt {
128 struct device dev;
129 struct nd_namespace_common *ndns;
130 struct btt *btt;
131 unsigned long lbasize;
132 u8 *uuid;
133 int id;
134};
135
136enum nd_async_mode {
137 ND_SYNC,
138 ND_ASYNC,
139};
140
141int nd_integrity_init(struct gendisk *disk, unsigned long meta_size);
142void wait_nvdimm_bus_probe_idle(struct device *dev);
143void nd_device_register(struct device *dev);
144void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
145int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
146 size_t len);
147ssize_t nd_sector_size_show(unsigned long current_lbasize,
148 const unsigned long *supported, char *buf);
149ssize_t nd_sector_size_store(struct device *dev, const char *buf,
150 unsigned long *current_lbasize, const unsigned long *supported);
151int __init nvdimm_init(void);
152int __init nd_region_init(void);
153void nvdimm_exit(void);
154void nd_region_exit(void);
155struct nvdimm;
156struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping);
157int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
158int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
159int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
160 void *buf, size_t len);
161struct nd_btt *to_nd_btt(struct device *dev);
162struct btt_sb;
163u64 nd_btt_sb_checksum(struct btt_sb *btt_sb);
164#if IS_ENABLED(CONFIG_BTT)
165int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata);
166bool is_nd_btt(struct device *dev);
167struct device *nd_btt_create(struct nd_region *nd_region);
168#else
/*
 * CONFIG_BTT=n stub: report that no BTT is available.
 * Fix: the original omitted the 'int' return type (implicit int,
 * invalid since C99 and rejected by modern compilers).
 */
static inline int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
{
	return -ENODEV;
}
173
174static inline bool is_nd_btt(struct device *dev)
175{
176 return false;
177}
178
179static inline struct device *nd_btt_create(struct nd_region *nd_region)
180{
181 return NULL;
182}
183
184#endif
185struct nd_region *to_nd_region(struct device *dev);
186int nd_region_to_nstype(struct nd_region *nd_region);
187int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
188u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
189void nvdimm_bus_lock(struct device *dev);
190void nvdimm_bus_unlock(struct device *dev);
191bool is_nvdimm_bus_locked(struct device *dev);
192int nvdimm_revalidate_disk(struct gendisk *disk);
193void nvdimm_drvdata_release(struct kref *kref);
194void put_ndd(struct nvdimm_drvdata *ndd);
195int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd);
196void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res);
197struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
198 struct nd_label_id *label_id, resource_size_t start,
199 resource_size_t n);
200resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
201struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev);
202int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
203int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
204const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
205 char *name);
206int nd_blk_region_init(struct nd_region *nd_region);
207void __nd_iostat_start(struct bio *bio, unsigned long *start);
/*
 * Begin i/o accounting for @bio when the backing queue has io-stats
 * enabled.  Returns true (with the start time recorded via
 * __nd_iostat_start) when the caller must later call nd_iostat_end().
 */
static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	if (!blk_queue_io_stat(disk->queue))
		return false;

	__nd_iostat_start(bio, start);
	return true;
}
218void nd_iostat_end(struct bio *bio, unsigned long start);
219resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
220#endif /* __ND_H__ */
diff --git a/drivers/block/pmem.c b/drivers/nvdimm/pmem.c
index 095dfaadcaa5..ade9eb917a4d 100644
--- a/drivers/block/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Persistent Memory Driver 2 * Persistent Memory Driver
3 * 3 *
4 * Copyright (c) 2014, Intel Corporation. 4 * Copyright (c) 2014-2015, Intel Corporation.
5 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>. 5 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
6 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>. 6 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
7 * 7 *
@@ -23,8 +23,9 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/moduleparam.h> 24#include <linux/moduleparam.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26#include <linux/pmem.h>
27#define PMEM_MINORS 16 27#include <linux/nd.h>
28#include "nd.h"
28 29
29struct pmem_device { 30struct pmem_device {
30 struct request_queue *pmem_queue; 31 struct request_queue *pmem_queue;
@@ -32,12 +33,11 @@ struct pmem_device {
32 33
33 /* One contiguous memory region per device */ 34 /* One contiguous memory region per device */
34 phys_addr_t phys_addr; 35 phys_addr_t phys_addr;
35 void *virt_addr; 36 void __pmem *virt_addr;
36 size_t size; 37 size_t size;
37}; 38};
38 39
39static int pmem_major; 40static int pmem_major;
40static atomic_t pmem_index;
41 41
42static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, 42static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
43 unsigned int len, unsigned int off, int rw, 43 unsigned int len, unsigned int off, int rw,
@@ -45,13 +45,14 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
45{ 45{
46 void *mem = kmap_atomic(page); 46 void *mem = kmap_atomic(page);
47 size_t pmem_off = sector << 9; 47 size_t pmem_off = sector << 9;
48 void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
48 49
49 if (rw == READ) { 50 if (rw == READ) {
50 memcpy(mem + off, pmem->virt_addr + pmem_off, len); 51 memcpy_from_pmem(mem + off, pmem_addr, len);
51 flush_dcache_page(page); 52 flush_dcache_page(page);
52 } else { 53 } else {
53 flush_dcache_page(page); 54 flush_dcache_page(page);
54 memcpy(pmem->virt_addr + pmem_off, mem + off, len); 55 memcpy_to_pmem(pmem_addr, mem + off, len);
55 } 56 }
56 57
57 kunmap_atomic(mem); 58 kunmap_atomic(mem);
@@ -59,31 +60,24 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
59 60
60static void pmem_make_request(struct request_queue *q, struct bio *bio) 61static void pmem_make_request(struct request_queue *q, struct bio *bio)
61{ 62{
62 struct block_device *bdev = bio->bi_bdev; 63 bool do_acct;
63 struct pmem_device *pmem = bdev->bd_disk->private_data; 64 unsigned long start;
64 int rw;
65 struct bio_vec bvec; 65 struct bio_vec bvec;
66 sector_t sector;
67 struct bvec_iter iter; 66 struct bvec_iter iter;
68 int err = 0; 67 struct block_device *bdev = bio->bi_bdev;
69 68 struct pmem_device *pmem = bdev->bd_disk->private_data;
70 if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
71 err = -EIO;
72 goto out;
73 }
74
75 BUG_ON(bio->bi_rw & REQ_DISCARD);
76 69
77 rw = bio_data_dir(bio); 70 do_acct = nd_iostat_start(bio, &start);
78 sector = bio->bi_iter.bi_sector; 71 bio_for_each_segment(bvec, bio, iter)
79 bio_for_each_segment(bvec, bio, iter) {
80 pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, 72 pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
81 rw, sector); 73 bio_data_dir(bio), iter.bi_sector);
82 sector += bvec.bv_len >> 9; 74 if (do_acct)
83 } 75 nd_iostat_end(bio, start);
84 76
85out: 77 if (bio_data_dir(bio))
86 bio_endio(bio, err); 78 wmb_pmem();
79
80 bio_endio(bio, 0);
87} 81}
88 82
89static int pmem_rw_page(struct block_device *bdev, sector_t sector, 83static int pmem_rw_page(struct block_device *bdev, sector_t sector,
@@ -106,7 +100,8 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector,
106 if (!pmem) 100 if (!pmem)
107 return -ENODEV; 101 return -ENODEV;
108 102
109 *kaddr = pmem->virt_addr + offset; 103 /* FIXME convert DAX to comprehend that this mapping has a lifetime */
104 *kaddr = (void __force *) pmem->virt_addr + offset;
110 *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; 105 *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
111 106
112 return pmem->size - offset; 107 return pmem->size - offset;
@@ -116,124 +111,165 @@ static const struct block_device_operations pmem_fops = {
116 .owner = THIS_MODULE, 111 .owner = THIS_MODULE,
117 .rw_page = pmem_rw_page, 112 .rw_page = pmem_rw_page,
118 .direct_access = pmem_direct_access, 113 .direct_access = pmem_direct_access,
114 .revalidate_disk = nvdimm_revalidate_disk,
119}; 115};
120 116
121static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) 117static struct pmem_device *pmem_alloc(struct device *dev,
118 struct resource *res, int id)
122{ 119{
123 struct pmem_device *pmem; 120 struct pmem_device *pmem;
124 struct gendisk *disk;
125 int idx, err;
126 121
127 err = -ENOMEM;
128 pmem = kzalloc(sizeof(*pmem), GFP_KERNEL); 122 pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
129 if (!pmem) 123 if (!pmem)
130 goto out; 124 return ERR_PTR(-ENOMEM);
131 125
132 pmem->phys_addr = res->start; 126 pmem->phys_addr = res->start;
133 pmem->size = resource_size(res); 127 pmem->size = resource_size(res);
128 if (!arch_has_pmem_api())
129 dev_warn(dev, "unable to guarantee persistence of writes\n");
130
131 if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) {
132 dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
133 &pmem->phys_addr, pmem->size);
134 kfree(pmem);
135 return ERR_PTR(-EBUSY);
136 }
134 137
135 err = -EINVAL; 138 pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size);
136 if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) { 139 if (!pmem->virt_addr) {
137 dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size); 140 release_mem_region(pmem->phys_addr, pmem->size);
138 goto out_free_dev; 141 kfree(pmem);
142 return ERR_PTR(-ENXIO);
139 } 143 }
140 144
141 /* 145 return pmem;
142 * Map the memory as write-through, as we can't write back the contents 146}
143 * of the CPU caches in case of a crash. 147
144 */ 148static void pmem_detach_disk(struct pmem_device *pmem)
145 err = -ENOMEM; 149{
146 pmem->virt_addr = ioremap_wt(pmem->phys_addr, pmem->size); 150 del_gendisk(pmem->pmem_disk);
147 if (!pmem->virt_addr) 151 put_disk(pmem->pmem_disk);
148 goto out_release_region; 152 blk_cleanup_queue(pmem->pmem_queue);
153}
154
155static int pmem_attach_disk(struct nd_namespace_common *ndns,
156 struct pmem_device *pmem)
157{
158 struct gendisk *disk;
149 159
150 pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); 160 pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
151 if (!pmem->pmem_queue) 161 if (!pmem->pmem_queue)
152 goto out_unmap; 162 return -ENOMEM;
153 163
154 blk_queue_make_request(pmem->pmem_queue, pmem_make_request); 164 blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
155 blk_queue_max_hw_sectors(pmem->pmem_queue, 1024); 165 blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
156 blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); 166 blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
167 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
157 168
158 disk = alloc_disk(PMEM_MINORS); 169 disk = alloc_disk(0);
159 if (!disk) 170 if (!disk) {
160 goto out_free_queue; 171 blk_cleanup_queue(pmem->pmem_queue);
161 172 return -ENOMEM;
162 idx = atomic_inc_return(&pmem_index) - 1; 173 }
163 174
164 disk->major = pmem_major; 175 disk->major = pmem_major;
165 disk->first_minor = PMEM_MINORS * idx; 176 disk->first_minor = 0;
166 disk->fops = &pmem_fops; 177 disk->fops = &pmem_fops;
167 disk->private_data = pmem; 178 disk->private_data = pmem;
168 disk->queue = pmem->pmem_queue; 179 disk->queue = pmem->pmem_queue;
169 disk->flags = GENHD_FL_EXT_DEVT; 180 disk->flags = GENHD_FL_EXT_DEVT;
170 sprintf(disk->disk_name, "pmem%d", idx); 181 nvdimm_namespace_disk_name(ndns, disk->disk_name);
171 disk->driverfs_dev = dev; 182 disk->driverfs_dev = &ndns->dev;
172 set_capacity(disk, pmem->size >> 9); 183 set_capacity(disk, pmem->size >> 9);
173 pmem->pmem_disk = disk; 184 pmem->pmem_disk = disk;
174 185
175 add_disk(disk); 186 add_disk(disk);
187 revalidate_disk(disk);
176 188
177 return pmem; 189 return 0;
190}
178 191
179out_free_queue: 192static int pmem_rw_bytes(struct nd_namespace_common *ndns,
180 blk_cleanup_queue(pmem->pmem_queue); 193 resource_size_t offset, void *buf, size_t size, int rw)
181out_unmap: 194{
182 iounmap(pmem->virt_addr); 195 struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
183out_release_region: 196
184 release_mem_region(pmem->phys_addr, pmem->size); 197 if (unlikely(offset + size > pmem->size)) {
185out_free_dev: 198 dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
186 kfree(pmem); 199 return -EFAULT;
187out: 200 }
188 return ERR_PTR(err); 201
202 if (rw == READ)
203 memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
204 else {
205 memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
206 wmb_pmem();
207 }
208
209 return 0;
189} 210}
190 211
191static void pmem_free(struct pmem_device *pmem) 212static void pmem_free(struct pmem_device *pmem)
192{ 213{
193 del_gendisk(pmem->pmem_disk); 214 memunmap_pmem(pmem->virt_addr);
194 put_disk(pmem->pmem_disk);
195 blk_cleanup_queue(pmem->pmem_queue);
196 iounmap(pmem->virt_addr);
197 release_mem_region(pmem->phys_addr, pmem->size); 215 release_mem_region(pmem->phys_addr, pmem->size);
198 kfree(pmem); 216 kfree(pmem);
199} 217}
200 218
201static int pmem_probe(struct platform_device *pdev) 219static int nd_pmem_probe(struct device *dev)
202{ 220{
221 struct nd_region *nd_region = to_nd_region(dev->parent);
222 struct nd_namespace_common *ndns;
223 struct nd_namespace_io *nsio;
203 struct pmem_device *pmem; 224 struct pmem_device *pmem;
204 struct resource *res; 225 int rc;
205
206 if (WARN_ON(pdev->num_resources > 1))
207 return -ENXIO;
208 226
209 res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 227 ndns = nvdimm_namespace_common_probe(dev);
210 if (!res) 228 if (IS_ERR(ndns))
211 return -ENXIO; 229 return PTR_ERR(ndns);
212 230
213 pmem = pmem_alloc(&pdev->dev, res); 231 nsio = to_nd_namespace_io(&ndns->dev);
232 pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
214 if (IS_ERR(pmem)) 233 if (IS_ERR(pmem))
215 return PTR_ERR(pmem); 234 return PTR_ERR(pmem);
216 235
217 platform_set_drvdata(pdev, pmem); 236 dev_set_drvdata(dev, pmem);
218 237 ndns->rw_bytes = pmem_rw_bytes;
219 return 0; 238 if (is_nd_btt(dev))
239 rc = nvdimm_namespace_attach_btt(ndns);
240 else if (nd_btt_probe(ndns, pmem) == 0) {
241 /* we'll come back as btt-pmem */
242 rc = -ENXIO;
243 } else
244 rc = pmem_attach_disk(ndns, pmem);
245 if (rc)
246 pmem_free(pmem);
247 return rc;
220} 248}
221 249
222static int pmem_remove(struct platform_device *pdev) 250static int nd_pmem_remove(struct device *dev)
223{ 251{
224 struct pmem_device *pmem = platform_get_drvdata(pdev); 252 struct pmem_device *pmem = dev_get_drvdata(dev);
225 253
254 if (is_nd_btt(dev))
255 nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
256 else
257 pmem_detach_disk(pmem);
226 pmem_free(pmem); 258 pmem_free(pmem);
259
227 return 0; 260 return 0;
228} 261}
229 262
230static struct platform_driver pmem_driver = { 263MODULE_ALIAS("pmem");
231 .probe = pmem_probe, 264MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
232 .remove = pmem_remove, 265MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
233 .driver = { 266static struct nd_device_driver nd_pmem_driver = {
234 .owner = THIS_MODULE, 267 .probe = nd_pmem_probe,
235 .name = "pmem", 268 .remove = nd_pmem_remove,
269 .drv = {
270 .name = "nd_pmem",
236 }, 271 },
272 .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
237}; 273};
238 274
239static int __init pmem_init(void) 275static int __init pmem_init(void)
@@ -244,16 +280,19 @@ static int __init pmem_init(void)
244 if (pmem_major < 0) 280 if (pmem_major < 0)
245 return pmem_major; 281 return pmem_major;
246 282
247 error = platform_driver_register(&pmem_driver); 283 error = nd_driver_register(&nd_pmem_driver);
248 if (error) 284 if (error) {
249 unregister_blkdev(pmem_major, "pmem"); 285 unregister_blkdev(pmem_major, "pmem");
250 return error; 286 return error;
287 }
288
289 return 0;
251} 290}
252module_init(pmem_init); 291module_init(pmem_init);
253 292
254static void pmem_exit(void) 293static void pmem_exit(void)
255{ 294{
256 platform_driver_unregister(&pmem_driver); 295 driver_unregister(&nd_pmem_driver.drv);
257 unregister_blkdev(pmem_major, "pmem"); 296 unregister_blkdev(pmem_major, "pmem");
258} 297}
259module_exit(pmem_exit); 298module_exit(pmem_exit);
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
new file mode 100644
index 000000000000..f28f78ccff19
--- /dev/null
+++ b/drivers/nvdimm/region.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/device.h>
16#include <linux/nd.h>
17#include "nd.h"
18
/*
 * Enable a region device: initialize BLK-mode state (if any), register
 * the region's child namespace devices, and plant a btt seed device.
 * Deliberately returns 0 even when some namespaces fail to register so
 * the successfully registered peers stay usable (see comment below).
 */
static int nd_region_probe(struct device *dev)
{
	int err, rc;
	static unsigned long once;
	struct nd_region_namespaces *num_ns;
	struct nd_region *nd_region = to_nd_region(dev);

	/*
	 * One-time hint: when the configured i/o lane count is above the
	 * online cpu count but below the possible cpu count, lanes end up
	 * shared (and spinlocked) when bringing more cpus online would
	 * avoid that.
	 */
	if (nd_region->num_lanes > num_online_cpus()
			&& nd_region->num_lanes < num_possible_cpus()
			&& !test_and_set_bit(0, &once)) {
		dev_info(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n",
				num_online_cpus(), nd_region->num_lanes,
				num_possible_cpus());
		dev_info(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n",
				nd_region->num_lanes);
	}

	rc = nd_blk_region_init(nd_region);
	if (rc)
		return rc;

	/* on success rc == namespaces registered, err == number failed */
	rc = nd_region_register_namespaces(nd_region, &err);
	num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL);
	if (!num_ns)
		return -ENOMEM;

	if (rc < 0)
		return rc;

	/* published through the init_namespaces sysfs attribute */
	num_ns->active = rc;
	num_ns->count = rc + err;
	dev_set_drvdata(dev, num_ns);

	/*
	 * NOTE(review): 'rc && err && rc == err' reads as "active count
	 * equals failed count", not "everything failed" -- confirm the
	 * intended condition for declaring the region dead.
	 */
	if (rc && err && rc == err)
		return -ENODEV;

	nd_region->btt_seed = nd_btt_create(nd_region);
	if (err == 0)
		return 0;

	/*
	 * Given multiple namespaces per region, we do not want to
	 * disable all the successfully registered peer namespaces upon
	 * a single registration failure.  If userspace is missing a
	 * namespace that it expects it can disable/re-enable the region
	 * to retry discovery after correcting the failure.
	 * <regionX>/namespaces returns the current
	 * "<async-registered>/<total>" namespace count.
	 */
	dev_err(dev, "failed to register %d namespace%s, continuing...\n",
			err, err == 1 ? "" : "s");
	return 0;
}
72
/* device_for_each_child() callback: synchronously unregister one child */
static int child_unregister(struct device *dev, void *data)
{
	nd_device_unregister(dev, ND_SYNC);
	return 0;
}

/*
 * Disable a region: clear the seed pointers and drvdata under the bus
 * lock (so sysfs attribute readers see a consistent "disabled" state),
 * then tear down all child devices outside the lock.
 */
static int nd_region_remove(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev);

	/* flush attribute readers and disable */
	nvdimm_bus_lock(dev);
	nd_region->ns_seed = NULL;
	nd_region->btt_seed = NULL;
	dev_set_drvdata(dev, NULL);
	nvdimm_bus_unlock(dev);

	device_for_each_child(dev, NULL, child_unregister);
	return 0;
}
93
/* One driver binds both BLK- and PMEM-mode region devices */
static struct nd_device_driver nd_region_driver = {
	.probe = nd_region_probe,
	.remove = nd_region_remove,
	.drv = {
		.name = "nd_region",
	},
	.type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM,
};

/* Invoked from the libnvdimm core init, not a standalone module_init */
int __init nd_region_init(void)
{
	return nd_driver_register(&nd_region_driver);
}

void nd_region_exit(void)
{
	driver_unregister(&nd_region_driver.drv);
}
112
113MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM);
114MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
new file mode 100644
index 000000000000..a5233422f9dc
--- /dev/null
+++ b/drivers/nvdimm/region_devs.c
@@ -0,0 +1,787 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/scatterlist.h>
14#include <linux/highmem.h>
15#include <linux/sched.h>
16#include <linux/slab.h>
17#include <linux/sort.h>
18#include <linux/io.h>
19#include <linux/nd.h>
20#include "nd-core.h"
21#include "nd.h"
22
23static DEFINE_IDA(region_ida);
24
/*
 * Device-model release for a region: drop the nvdimm references taken
 * per-mapping in nd_region_create(), free the per-cpu lane state,
 * return the region id to the ida, and free the containing object --
 * which is the larger nd_blk_region for BLK-mode regions.
 */
static void nd_region_release(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev);
	u16 i;

	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
		struct nvdimm *nvdimm = nd_mapping->nvdimm;

		/* pairs with get_device() in nd_region_create() */
		put_device(&nvdimm->dev);
	}
	free_percpu(nd_region->lane);
	ida_simple_remove(&region_ida, nd_region->id);
	if (is_nd_blk(dev))
		kfree(to_nd_blk_region(dev));
	else
		kfree(nd_region);
}
43
/*
 * The three region flavors share one release routine; the device_type
 * pointer doubles as the discriminator for is_nd_blk()/is_nd_pmem().
 */
static struct device_type nd_blk_device_type = {
	.name = "nd_blk",
	.release = nd_region_release,
};

static struct device_type nd_pmem_device_type = {
	.name = "nd_pmem",
	.release = nd_region_release,
};

static struct device_type nd_volatile_device_type = {
	.name = "nd_volatile",
	.release = nd_region_release,
};
58
59bool is_nd_pmem(struct device *dev)
60{
61 return dev ? dev->type == &nd_pmem_device_type : false;
62}
63
64bool is_nd_blk(struct device *dev)
65{
66 return dev ? dev->type == &nd_blk_device_type : false;
67}
68
/*
 * Convert a device to its region, sanity checking that the device is
 * really a region flavor (all three flavors share the release routine).
 */
struct nd_region *to_nd_region(struct device *dev)
{
	struct nd_region *nd_region = container_of(dev, struct nd_region, dev);

	WARN_ON(dev->type->release != nd_region_release);
	return nd_region;
}
EXPORT_SYMBOL_GPL(to_nd_region);

/* BLK regions embed the nd_region inside the larger nd_blk_region */
struct nd_blk_region *to_nd_blk_region(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev);

	WARN_ON(!is_nd_blk(dev));
	return container_of(nd_region, struct nd_blk_region, nd_region);
}
EXPORT_SYMBOL_GPL(to_nd_blk_region);
86
/* Opaque bus-provider cookie recorded at region creation time */
void *nd_region_provider_data(struct nd_region *nd_region)
{
	return nd_region->provider_data;
}
EXPORT_SYMBOL_GPL(nd_region_provider_data);

/* BLK-provider private data, presumably managed via the enable/disable
 * hooks -- confirm against the providers that call the setter below. */
void *nd_blk_region_provider_data(struct nd_blk_region *ndbr)
{
	return ndbr->blk_provider_data;
}
EXPORT_SYMBOL_GPL(nd_blk_region_provider_data);

void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data)
{
	ndbr->blk_provider_data = data;
}
EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
104
/**
 * nd_region_to_nstype() - region to an integer namespace type
 * @nd_region: region-device to interrogate
 *
 * This is the 'nstype' attribute of a region as well, an input to the
 * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match
 * namespace devices with namespace drivers.
 */
int nd_region_to_nstype(struct nd_region *nd_region)
{
	if (is_nd_pmem(&nd_region->dev)) {
		u16 i, alias;

		/*
		 * A PMEM region backed by any aliasing (PMEM+BLK) dimm
		 * hosts label-backed PMEM namespaces; with no aliasing
		 * anywhere, raw IO namespaces suffice.
		 */
		for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
			struct nvdimm *nvdimm = nd_mapping->nvdimm;

			if (nvdimm->flags & NDD_ALIASING)
				alias++;
		}
		if (alias)
			return ND_DEVICE_NAMESPACE_PMEM;
		else
			return ND_DEVICE_NAMESPACE_IO;
	} else if (is_nd_blk(&nd_region->dev)) {
		return ND_DEVICE_NAMESPACE_BLK;
	}

	/* e.g. volatile regions have no namespace type */
	return 0;
}
EXPORT_SYMBOL(nd_region_to_nstype);
136
/*
 * device_for_each_child() callback over a region's namespace devices:
 * returns -EBUSY when the candidate uuid (passed via @data) is already
 * claimed by namespace @dev, 0 otherwise.
 */
static int is_uuid_busy(struct device *dev, void *data)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	u8 *uuid = data;

	switch (nd_region_to_nstype(nd_region)) {
	case ND_DEVICE_NAMESPACE_PMEM: {
		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);

		if (!nspm->uuid)
			break;
		if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0)
			return -EBUSY;
		break;
	}
	case ND_DEVICE_NAMESPACE_BLK: {
		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);

		if (!nsblk->uuid)
			break;
		if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0)
			return -EBUSY;
		break;
	}
	default:
		/* IO-type namespaces carry no uuid to collide with */
		break;
	}

	return 0;
}

/* Descend only into region devices; other bus children host no namespaces */
static int is_namespace_uuid_busy(struct device *dev, void *data)
{
	if (is_nd_pmem(dev) || is_nd_blk(dev))
		return device_for_each_child(dev, data, is_uuid_busy);
	return 0;
}
174
/**
 * nd_is_uuid_unique - verify that no other namespace has @uuid
 * @dev: any device on a nvdimm_bus
 * @uuid: uuid to check
 *
 * Walks every region on the bus and every namespace under each region.
 * Caller must hold the nvdimm bus lock so the answer stays valid.
 */
bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
{
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);

	if (!nvdimm_bus)
		return false;
	WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
	/* non-zero means some child returned -EBUSY for this uuid */
	if (device_for_each_child(&nvdimm_bus->dev, uuid,
				is_namespace_uuid_busy) != 0)
		return false;
	return true;
}
192
/*
 * sysfs 'size': total bytes for a PMEM region; for other (BLK) regions
 * the size of the single mapping, or 0 when there are multiple mappings.
 */
static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev);
	unsigned long long size = 0;

	if (is_nd_pmem(dev)) {
		size = nd_region->ndr_size;
	} else if (nd_region->ndr_mappings == 1) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[0];

		size = nd_mapping->size;
	}

	return sprintf(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(size);

/* sysfs 'mappings': number of dimm mappings backing the region */
static ssize_t mappings_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev);

	return sprintf(buf, "%d\n", nd_region->ndr_mappings);
}
static DEVICE_ATTR_RO(mappings);

/* sysfs 'nstype': integer namespace type, see nd_region_to_nstype() */
static ssize_t nstype_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev);

	return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
}
static DEVICE_ATTR_RO(nstype);
228
229static ssize_t set_cookie_show(struct device *dev,
230 struct device_attribute *attr, char *buf)
231{
232 struct nd_region *nd_region = to_nd_region(dev);
233 struct nd_interleave_set *nd_set = nd_region->nd_set;
234
235 if (is_nd_pmem(dev) && nd_set)
236 /* pass, should be precluded by region_visible */;
237 else
238 return -ENXIO;
239
240 return sprintf(buf, "%#llx\n", nd_set->cookie);
241}
242static DEVICE_ATTR_RO(set_cookie);
243
/*
 * Sum the dimm-physical-address capacity still allocatable across the
 * region's mappings.  For PMEM, nd_pmem_available_dpa() reports the
 * largest BLK overlap seen so far via @overlap; whenever a mapping
 * reveals a larger overlap, the whole walk restarts so that every
 * mapping is judged against the same maximum overlap.
 */
resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
{
	resource_size_t blk_max_overlap = 0, available, overlap;
	int i;

	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));

 retry:
	available = 0;
	overlap = blk_max_overlap;
	for (i = 0; i < nd_region->ndr_mappings; i++) {
		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);

		/* if a dimm is disabled the available capacity is zero */
		if (!ndd)
			return 0;

		if (is_nd_pmem(&nd_region->dev)) {
			available += nd_pmem_available_dpa(nd_region,
					nd_mapping, &overlap);
			if (overlap > blk_max_overlap) {
				blk_max_overlap = overlap;
				goto retry;
			}
		} else if (is_nd_blk(&nd_region->dev)) {
			available += nd_blk_available_dpa(nd_mapping);
		}
	}

	return available;
}
276
/* sysfs 'available_size': snapshot of unallocated region capacity */
static ssize_t available_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev);
	unsigned long long available = 0;

	/*
	 * Flush in-flight updates and grab a snapshot of the available
	 * size.  Of course, this value is potentially invalidated the
	 * moment nvdimm_bus_lock() is dropped, but that's userspace's
	 * problem to not race itself.
	 */
	nvdimm_bus_lock(dev);
	wait_nvdimm_bus_probe_idle(dev);
	available = nd_region_available_dpa(nd_region);
	nvdimm_bus_unlock(dev);

	return sprintf(buf, "%llu\n", available);
}
static DEVICE_ATTR_RO(available_size);
297
/*
 * sysfs 'init_namespaces': "<active>/<total>" counts recorded by
 * nd_region_probe(); -ENXIO while the region driver is not attached.
 */
static ssize_t init_namespaces_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region_namespaces *num_ns = dev_get_drvdata(dev);
	ssize_t rc;

	/* bus lock serializes with nd_region_remove() clearing drvdata */
	nvdimm_bus_lock(dev);
	if (num_ns)
		rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count);
	else
		rc = -ENXIO;
	nvdimm_bus_unlock(dev);

	return rc;
}
static DEVICE_ATTR_RO(init_namespaces);

/* sysfs 'namespace_seed': name of the next allocatable namespace device */
static ssize_t namespace_seed_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev);
	ssize_t rc;

	nvdimm_bus_lock(dev);
	if (nd_region->ns_seed)
		rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed));
	else
		rc = sprintf(buf, "\n");
	nvdimm_bus_unlock(dev);
	return rc;
}
static DEVICE_ATTR_RO(namespace_seed);

/* sysfs 'btt_seed': name of the next allocatable btt device */
static ssize_t btt_seed_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev);
	ssize_t rc;

	nvdimm_bus_lock(dev);
	if (nd_region->btt_seed)
		rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed));
	else
		rc = sprintf(buf, "\n");
	nvdimm_bus_unlock(dev);

	return rc;
}
static DEVICE_ATTR_RO(btt_seed);
347
/* sysfs 'read_only': region-wide write-protect flag */
static ssize_t read_only_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nd_region *nd_region = to_nd_region(dev);

	return sprintf(buf, "%d\n", nd_region->ro);
}

/*
 * NOTE(review): only nd_region->ro is updated here; whether already
 * attached block devices pick up the new value is not visible in this
 * file -- confirm against the disk attach paths.
 */
static ssize_t read_only_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	bool ro;
	int rc = strtobool(buf, &ro);
	struct nd_region *nd_region = to_nd_region(dev);

	if (rc)
		return rc;

	nd_region->ro = ro;
	return len;
}
static DEVICE_ATTR_RW(read_only);
370
static struct attribute *nd_region_attributes[] = {
	&dev_attr_size.attr,
	&dev_attr_nstype.attr,
	&dev_attr_mappings.attr,
	&dev_attr_btt_seed.attr,
	&dev_attr_read_only.attr,
	&dev_attr_set_cookie.attr,
	&dev_attr_available_size.attr,
	&dev_attr_namespace_seed.attr,
	&dev_attr_init_namespaces.attr,
	NULL,
};

/*
 * Hide 'set_cookie' unless this is a PMEM region with an interleave
 * set, and hide 'available_size' unless the region hosts allocatable
 * (label-backed PMEM or BLK) namespaces; all other attributes are
 * always visible.
 */
static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, typeof(*dev), kobj);
	struct nd_region *nd_region = to_nd_region(dev);
	struct nd_interleave_set *nd_set = nd_region->nd_set;
	int type = nd_region_to_nstype(nd_region);

	if (a != &dev_attr_set_cookie.attr
			&& a != &dev_attr_available_size.attr)
		return a->mode;

	if ((type == ND_DEVICE_NAMESPACE_PMEM
				|| type == ND_DEVICE_NAMESPACE_BLK)
			&& a == &dev_attr_available_size.attr)
		return a->mode;
	else if (is_nd_pmem(dev) && nd_set)
		return a->mode;

	return 0;
}

struct attribute_group nd_region_attribute_group = {
	.attrs = nd_region_attributes,
	.is_visible = region_visible,
};
EXPORT_SYMBOL_GPL(nd_region_attribute_group);
410
411u64 nd_region_interleave_set_cookie(struct nd_region *nd_region)
412{
413 struct nd_interleave_set *nd_set = nd_region->nd_set;
414
415 if (nd_set)
416 return nd_set->cookie;
417 return 0;
418}
419
/*
 * Upon successful probe/remove, take/release a reference on the
 * associated interleave set (if present), and plant new btt + namespace
 * seeds.  Also, on the removal of a BLK region, notify the provider to
 * disable the region.
 */
static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
		struct device *dev, bool probe)
{
	struct nd_region *nd_region;

	if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) {
		int i;

		/* region was disabled: drop per-mapping label/ndd state */
		nd_region = to_nd_region(dev);
		for (i = 0; i < nd_region->ndr_mappings; i++) {
			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
			struct nvdimm_drvdata *ndd = nd_mapping->ndd;
			struct nvdimm *nvdimm = nd_mapping->nvdimm;

			kfree(nd_mapping->labels);
			nd_mapping->labels = NULL;
			put_ndd(ndd);
			nd_mapping->ndd = NULL;
			/* busy was only elevated while ndd was held */
			if (ndd)
				atomic_dec(&nvdimm->busy);
		}

		if (is_nd_pmem(dev))
			return;

		/* BLK regions additionally notify their provider */
		to_nd_blk_region(dev)->disable(nvdimm_bus, dev);
	}
	if (dev->parent && is_nd_blk(dev->parent) && probe) {
		/* a BLK namespace came up: replace the consumed seed */
		nd_region = to_nd_region(dev->parent);
		nvdimm_bus_lock(dev);
		if (nd_region->ns_seed == dev)
			nd_region_create_blk_seed(nd_region);
		nvdimm_bus_unlock(dev);
	}
	if (is_nd_btt(dev) && probe) {
		/* a btt came up: replace the consumed btt seed */
		nd_region = to_nd_region(dev->parent);
		nvdimm_bus_lock(dev);
		if (nd_region->btt_seed == dev)
			nd_region_create_btt_seed(nd_region);
		nvdimm_bus_unlock(dev);
	}
}

/* Bus-core entry points into the notification above */
void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
{
	nd_region_notify_driver_action(nvdimm_bus, dev, true);
}

void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev)
{
	nd_region_notify_driver_action(nvdimm_bus, dev, false);
}
478
/* Format mapping @n as "<dimm-name>,<start>,<size>" for the sysfs files */
static ssize_t mappingN(struct device *dev, char *buf, int n)
{
	struct nd_region *nd_region = to_nd_region(dev);
	struct nd_mapping *nd_mapping;
	struct nvdimm *nvdimm;

	if (n >= nd_region->ndr_mappings)
		return -ENXIO;
	nd_mapping = &nd_region->mapping[n];
	nvdimm = nd_mapping->nvdimm;

	return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev),
			nd_mapping->start, nd_mapping->size);
}
493
/* Stamp out one sysfs show routine per possible mapping index */
#define REGION_MAPPING(idx) \
static ssize_t mapping##idx##_show(struct device *dev,		\
		struct device_attribute *attr, char *buf)	\
{								\
	return mappingN(dev, buf, idx);				\
}								\
static DEVICE_ATTR_RO(mapping##idx)

/*
 * 32 should be enough for a while, even in the presence of socket
 * interleave a 32-way interleave set is a degenerate case.
 */
REGION_MAPPING(0);
REGION_MAPPING(1);
REGION_MAPPING(2);
REGION_MAPPING(3);
REGION_MAPPING(4);
REGION_MAPPING(5);
REGION_MAPPING(6);
REGION_MAPPING(7);
REGION_MAPPING(8);
REGION_MAPPING(9);
REGION_MAPPING(10);
REGION_MAPPING(11);
REGION_MAPPING(12);
REGION_MAPPING(13);
REGION_MAPPING(14);
REGION_MAPPING(15);
REGION_MAPPING(16);
REGION_MAPPING(17);
REGION_MAPPING(18);
REGION_MAPPING(19);
REGION_MAPPING(20);
REGION_MAPPING(21);
REGION_MAPPING(22);
REGION_MAPPING(23);
REGION_MAPPING(24);
REGION_MAPPING(25);
REGION_MAPPING(26);
REGION_MAPPING(27);
REGION_MAPPING(28);
REGION_MAPPING(29);
REGION_MAPPING(30);
REGION_MAPPING(31);
538
/* Expose only as many mappingN attributes as the region has mappings */
static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct nd_region *nd_region = to_nd_region(dev);

	/* relies on mapping_attributes[] being in ascending index order */
	if (n < nd_region->ndr_mappings)
		return a->mode;
	return 0;
}

static struct attribute *mapping_attributes[] = {
	&dev_attr_mapping0.attr,
	&dev_attr_mapping1.attr,
	&dev_attr_mapping2.attr,
	&dev_attr_mapping3.attr,
	&dev_attr_mapping4.attr,
	&dev_attr_mapping5.attr,
	&dev_attr_mapping6.attr,
	&dev_attr_mapping7.attr,
	&dev_attr_mapping8.attr,
	&dev_attr_mapping9.attr,
	&dev_attr_mapping10.attr,
	&dev_attr_mapping11.attr,
	&dev_attr_mapping12.attr,
	&dev_attr_mapping13.attr,
	&dev_attr_mapping14.attr,
	&dev_attr_mapping15.attr,
	&dev_attr_mapping16.attr,
	&dev_attr_mapping17.attr,
	&dev_attr_mapping18.attr,
	&dev_attr_mapping19.attr,
	&dev_attr_mapping20.attr,
	&dev_attr_mapping21.attr,
	&dev_attr_mapping22.attr,
	&dev_attr_mapping23.attr,
	&dev_attr_mapping24.attr,
	&dev_attr_mapping25.attr,
	&dev_attr_mapping26.attr,
	&dev_attr_mapping27.attr,
	&dev_attr_mapping28.attr,
	&dev_attr_mapping29.attr,
	&dev_attr_mapping30.attr,
	&dev_attr_mapping31.attr,
	NULL,
};

struct attribute_group nd_mapping_attribute_group = {
	.is_visible = mapping_visible,
	.attrs = mapping_attributes,
};
EXPORT_SYMBOL_GPL(nd_mapping_attribute_group);
590
/*
 * Called at region probe time: a no-op for non-BLK regions; for BLK,
 * validate that at least one mapping exists then invoke the provider's
 * enable hook.
 */
int nd_blk_region_init(struct nd_region *nd_region)
{
	struct device *dev = &nd_region->dev;
	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);

	if (!is_nd_blk(dev))
		return 0;

	if (nd_region->ndr_mappings < 1) {
		dev_err(dev, "invalid BLK region\n");
		return -ENXIO;
	}

	return to_nd_blk_region(dev)->enable(nvdimm_bus, dev);
}
606
/**
 * nd_region_acquire_lane - allocate and lock a lane
 * @nd_region: region id and number of lanes possible
 *
 * A lane correlates to a BLK-data-window and/or a log slot in the BTT.
 * We optimize for the common case where there are 256 lanes, one
 * per-cpu.  For larger systems we need to lock to share lanes.  For now
 * this implementation assumes the cost of maintaining an allocator for
 * free lanes is on the order of the lock hold time, so it implements a
 * static lane = cpu % num_lanes mapping.
 *
 * In the case of a BTT instance on top of a BLK namespace a lane may be
 * acquired recursively.  We lock on the first instance.
 *
 * In the case of a BTT instance on top of PMEM, we only acquire a lane
 * for the BTT metadata updates.
 */
unsigned int nd_region_acquire_lane(struct nd_region *nd_region)
{
	unsigned int cpu, lane;

	/* preemption stays disabled until nd_region_release_lane() */
	cpu = get_cpu();
	if (nd_region->num_lanes < nr_cpu_ids) {
		struct nd_percpu_lane *ndl_lock, *ndl_count;

		/*
		 * Lanes are shared: track recursion depth per-cpu and
		 * take the shared lane lock only on first entry.
		 */
		lane = cpu % nd_region->num_lanes;
		ndl_count = per_cpu_ptr(nd_region->lane, cpu);
		ndl_lock = per_cpu_ptr(nd_region->lane, lane);
		if (ndl_count->count++ == 0)
			spin_lock(&ndl_lock->lock);
	} else
		lane = cpu;

	return lane;
}
EXPORT_SYMBOL(nd_region_acquire_lane);

void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
{
	if (nd_region->num_lanes < nr_cpu_ids) {
		/* local get_cpu() balanced by the put_cpu() in this block */
		unsigned int cpu = get_cpu();
		struct nd_percpu_lane *ndl_lock, *ndl_count;

		ndl_count = per_cpu_ptr(nd_region->lane, cpu);
		ndl_lock = per_cpu_ptr(nd_region->lane, lane);
		if (--ndl_count->count == 0)
			spin_unlock(&ndl_lock->lock);
		put_cpu();
	}
	/* pairs with the get_cpu() in nd_region_acquire_lane() */
	put_cpu();
}
EXPORT_SYMBOL(nd_region_release_lane);
659
/*
 * Allocate, initialize, and register a region device of the given
 * @dev_type (pmem, blk, or volatile).  Validates 4K alignment of every
 * mapping, pins the backing nvdimms (dropped in nd_region_release()),
 * and marks the region read-only when any backing dimm is unarmed.
 * Returns NULL on any failure.
 */
static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
		struct nd_region_desc *ndr_desc, struct device_type *dev_type,
		const char *caller)
{
	struct nd_region *nd_region;
	struct device *dev;
	void *region_buf;
	unsigned int i;
	int ro = 0;

	for (i = 0; i < ndr_desc->num_mappings; i++) {
		struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
		struct nvdimm *nvdimm = nd_mapping->nvdimm;

		if ((nd_mapping->start | nd_mapping->size) % SZ_4K) {
			dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
					caller, dev_name(&nvdimm->dev), i);

			return NULL;
		}

		/* an unarmed dimm may not persist writes */
		if (nvdimm->flags & NDD_UNARMED)
			ro = 1;
	}

	/*
	 * BLK regions embed nd_region in the larger nd_blk_region so the
	 * provider hooks travel with the region; both shapes append the
	 * variable-length mapping array.
	 */
	if (dev_type == &nd_blk_device_type) {
		struct nd_blk_region_desc *ndbr_desc;
		struct nd_blk_region *ndbr;

		ndbr_desc = to_blk_region_desc(ndr_desc);
		ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping)
				* ndr_desc->num_mappings,
				GFP_KERNEL);
		if (ndbr) {
			nd_region = &ndbr->nd_region;
			ndbr->enable = ndbr_desc->enable;
			ndbr->disable = ndbr_desc->disable;
			ndbr->do_io = ndbr_desc->do_io;
		}
		region_buf = ndbr;
	} else {
		nd_region = kzalloc(sizeof(struct nd_region)
				+ sizeof(struct nd_mapping)
				* ndr_desc->num_mappings,
				GFP_KERNEL);
		region_buf = nd_region;
	}

	if (!region_buf)
		return NULL;
	nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL);
	if (nd_region->id < 0)
		goto err_id;

	nd_region->lane = alloc_percpu(struct nd_percpu_lane);
	if (!nd_region->lane)
		goto err_percpu;

	for (i = 0; i < nr_cpu_ids; i++) {
		struct nd_percpu_lane *ndl;

		ndl = per_cpu_ptr(nd_region->lane, i);
		spin_lock_init(&ndl->lock);
		ndl->count = 0;
	}

	memcpy(nd_region->mapping, ndr_desc->nd_mapping,
			sizeof(struct nd_mapping) * ndr_desc->num_mappings);
	for (i = 0; i < ndr_desc->num_mappings; i++) {
		struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
		struct nvdimm *nvdimm = nd_mapping->nvdimm;

		/* released in nd_region_release() */
		get_device(&nvdimm->dev);
	}
	nd_region->ndr_mappings = ndr_desc->num_mappings;
	nd_region->provider_data = ndr_desc->provider_data;
	nd_region->nd_set = ndr_desc->nd_set;
	nd_region->num_lanes = ndr_desc->num_lanes;
	nd_region->ro = ro;
	nd_region->numa_node = ndr_desc->numa_node;
	ida_init(&nd_region->ns_ida);
	ida_init(&nd_region->btt_ida);
	dev = &nd_region->dev;
	dev_set_name(dev, "region%d", nd_region->id);
	dev->parent = &nvdimm_bus->dev;
	dev->type = dev_type;
	dev->groups = ndr_desc->attr_groups;
	nd_region->ndr_size = resource_size(ndr_desc->res);
	nd_region->ndr_start = ndr_desc->res->start;
	/* from here the device core owns the region; freed via release */
	nd_device_register(dev);

	return nd_region;

 err_percpu:
	ida_simple_remove(&region_ida, nd_region->id);
 err_id:
	kfree(region_buf);
	return NULL;
}
759
/* Public constructor for a PMEM-mode region */
struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
		struct nd_region_desc *ndr_desc)
{
	/* PMEM regions always use the maximum lane count */
	ndr_desc->num_lanes = ND_MAX_LANES;
	return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type,
			__func__);
}
EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create);

/* Public constructor for a BLK-mode region (single mapping only) */
struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
		struct nd_region_desc *ndr_desc)
{
	if (ndr_desc->num_mappings > 1)
		return NULL;
	/* honor the provider's lane count, capped at ND_MAX_LANES */
	ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES);
	return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type,
			__func__);
}
EXPORT_SYMBOL_GPL(nvdimm_blk_region_create);

/* Public constructor for a volatile region */
struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
		struct nd_region_desc *ndr_desc)
{
	ndr_desc->num_lanes = ND_MAX_LANES;
	return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type,
			__func__);
}
EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f04c873a7365..b155d32db766 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -377,7 +377,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
377 struct page *page) 377 struct page *page)
378{ 378{
379 const struct block_device_operations *ops = bdev->bd_disk->fops; 379 const struct block_device_operations *ops = bdev->bd_disk->fops;
380 if (!ops->rw_page) 380 if (!ops->rw_page || bdev_get_integrity(bdev))
381 return -EOPNOTSUPP; 381 return -EOPNOTSUPP;
382 return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); 382 return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
383} 383}
@@ -408,7 +408,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
408 int result; 408 int result;
409 int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; 409 int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
410 const struct block_device_operations *ops = bdev->bd_disk->fops; 410 const struct block_device_operations *ops = bdev->bd_disk->fops;
411 if (!ops->rw_page) 411 if (!ops->rw_page || bdev_get_integrity(bdev))
412 return -EOPNOTSUPP; 412 return -EOPNOTSUPP;
413 set_page_writeback(page); 413 set_page_writeback(page);
414 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); 414 result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c187817471fb..1618cdfb38c7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -261,8 +261,13 @@ extern void acpi_osi_setup(char *str);
261extern bool acpi_osi_is_win8(void); 261extern bool acpi_osi_is_win8(void);
262 262
263#ifdef CONFIG_ACPI_NUMA 263#ifdef CONFIG_ACPI_NUMA
264int acpi_map_pxm_to_online_node(int pxm);
264int acpi_get_node(acpi_handle handle); 265int acpi_get_node(acpi_handle handle);
265#else 266#else
267static inline int acpi_map_pxm_to_online_node(int pxm)
268{
269 return 0;
270}
266static inline int acpi_get_node(acpi_handle handle) 271static inline int acpi_get_node(acpi_handle handle)
267{ 272{
268 return 0; 273 return 0;
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 05be2352fef8..26fc8bc77f85 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -21,6 +21,7 @@
21# define __rcu __attribute__((noderef, address_space(4))) 21# define __rcu __attribute__((noderef, address_space(4)))
22#else 22#else
23# define __rcu 23# define __rcu
24# define __pmem __attribute__((noderef, address_space(5)))
24#endif 25#endif
25extern void __chk_user_ptr(const volatile void __user *); 26extern void __chk_user_ptr(const volatile void __user *);
26extern void __chk_io_ptr(const volatile void __iomem *); 27extern void __chk_io_ptr(const volatile void __iomem *);
@@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *);
42# define __cond_lock(x,c) (c) 43# define __cond_lock(x,c) (c)
43# define __percpu 44# define __percpu
44# define __rcu 45# define __rcu
46# define __pmem
45#endif 47#endif
46 48
47/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ 49/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 5f19efe4eb3f..85ef051ac6fb 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -85,7 +85,8 @@ typedef struct {
85#define EFI_MEMORY_MAPPED_IO 11 85#define EFI_MEMORY_MAPPED_IO 11
86#define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12 86#define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12
87#define EFI_PAL_CODE 13 87#define EFI_PAL_CODE 13
88#define EFI_MAX_MEMORY_TYPE 14 88#define EFI_PERSISTENT_MEMORY 14
89#define EFI_MAX_MEMORY_TYPE 15
89 90
90/* Attribute values: */ 91/* Attribute values: */
91#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ 92#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
new file mode 100644
index 000000000000..75e3af01ee32
--- /dev/null
+++ b/include/linux/libnvdimm.h
@@ -0,0 +1,151 @@
1/*
2 * libnvdimm - Non-volatile-memory Devices Subsystem
3 *
4 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of version 2 of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */
15#ifndef __LIBNVDIMM_H__
16#define __LIBNVDIMM_H__
17#include <linux/kernel.h>
18#include <linux/sizes.h>
19#include <linux/types.h>
20
21enum {
22 /* when a dimm supports both PMEM and BLK access a label is required */
23 NDD_ALIASING = 1 << 0,
24 /* unarmed memory devices may not persist writes */
25 NDD_UNARMED = 1 << 1,
26
27 /* need to set a limit somewhere, but yes, this is likely overkill */
28 ND_IOCTL_MAX_BUFLEN = SZ_4M,
29 ND_CMD_MAX_ELEM = 4,
30 ND_CMD_MAX_ENVELOPE = 16,
31 ND_CMD_ARS_STATUS_MAX = SZ_4K,
32 ND_MAX_MAPPINGS = 32,
33
34 /* mark newly adjusted resources as requiring a label update */
35 DPA_RESOURCE_ADJUSTED = 1 << 0,
36};
37
38extern struct attribute_group nvdimm_bus_attribute_group;
39extern struct attribute_group nvdimm_attribute_group;
40extern struct attribute_group nd_device_attribute_group;
41extern struct attribute_group nd_numa_attribute_group;
42extern struct attribute_group nd_region_attribute_group;
43extern struct attribute_group nd_mapping_attribute_group;
44
45struct nvdimm;
46struct nvdimm_bus_descriptor;
47typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
48 struct nvdimm *nvdimm, unsigned int cmd, void *buf,
49 unsigned int buf_len);
50
51struct nd_namespace_label;
52struct nvdimm_drvdata;
53struct nd_mapping {
54 struct nvdimm *nvdimm;
55 struct nd_namespace_label **labels;
56 u64 start;
57 u64 size;
58 /*
59 * @ndd is for private use at region enable / disable time for
60 * get_ndd() + put_ndd(), all other nd_mapping to ndd
61 * conversions use to_ndd() which respects enabled state of the
62 * nvdimm.
63 */
64 struct nvdimm_drvdata *ndd;
65};
66
67struct nvdimm_bus_descriptor {
68 const struct attribute_group **attr_groups;
69 unsigned long dsm_mask;
70 char *provider_name;
71 ndctl_fn ndctl;
72};
73
74struct nd_cmd_desc {
75 int in_num;
76 int out_num;
77 u32 in_sizes[ND_CMD_MAX_ELEM];
78 int out_sizes[ND_CMD_MAX_ELEM];
79};
80
81struct nd_interleave_set {
82 u64 cookie;
83};
84
85struct nd_region_desc {
86 struct resource *res;
87 struct nd_mapping *nd_mapping;
88 u16 num_mappings;
89 const struct attribute_group **attr_groups;
90 struct nd_interleave_set *nd_set;
91 void *provider_data;
92 int num_lanes;
93 int numa_node;
94};
95
96struct nvdimm_bus;
97struct module;
98struct device;
99struct nd_blk_region;
100struct nd_blk_region_desc {
101 int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
102 void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
103 int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
104 void *iobuf, u64 len, int rw);
105 struct nd_region_desc ndr_desc;
106};
107
108static inline struct nd_blk_region_desc *to_blk_region_desc(
109 struct nd_region_desc *ndr_desc)
110{
111 return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc);
112
113}
114
115struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
116 struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
117#define nvdimm_bus_register(parent, desc) \
118 __nvdimm_bus_register(parent, desc, THIS_MODULE)
119void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
120struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
121struct nvdimm *to_nvdimm(struct device *dev);
122struct nd_region *to_nd_region(struct device *dev);
123struct nd_blk_region *to_nd_blk_region(struct device *dev);
124struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
125const char *nvdimm_name(struct nvdimm *nvdimm);
126void *nvdimm_provider_data(struct nvdimm *nvdimm);
127struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
128 const struct attribute_group **groups, unsigned long flags,
129 unsigned long *dsm_mask);
130const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
131const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
132u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
133 const struct nd_cmd_desc *desc, int idx, void *buf);
134u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
135 const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
136 const u32 *out_field);
137int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count);
138struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
139 struct nd_region_desc *ndr_desc);
140struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
141 struct nd_region_desc *ndr_desc);
142struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
143 struct nd_region_desc *ndr_desc);
144void *nd_region_provider_data(struct nd_region *nd_region);
145void *nd_blk_region_provider_data(struct nd_blk_region *ndbr);
146void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data);
147struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr);
148unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
149void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
150u64 nd_fletcher64(void *addr, size_t len, bool le);
151#endif /* __LIBNVDIMM_H__ */
diff --git a/include/linux/nd.h b/include/linux/nd.h
new file mode 100644
index 000000000000..507e47c86737
--- /dev/null
+++ b/include/linux/nd.h
@@ -0,0 +1,151 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#ifndef __LINUX_ND_H__
14#define __LINUX_ND_H__
15#include <linux/fs.h>
16#include <linux/ndctl.h>
17#include <linux/device.h>
18
19struct nd_device_driver {
20 struct device_driver drv;
21 unsigned long type;
22 int (*probe)(struct device *dev);
23 int (*remove)(struct device *dev);
24};
25
26static inline struct nd_device_driver *to_nd_device_driver(
27 struct device_driver *drv)
28{
29 return container_of(drv, struct nd_device_driver, drv);
30};
31
32/**
33 * struct nd_namespace_common - core infrastructure of a namespace
34 * @force_raw: ignore other personalities for the namespace (e.g. btt)
35 * @dev: device model node
36 * @claim: when set, another personality has taken ownership of the namespace
37 * @rw_bytes: access the raw namespace capacity with byte-aligned transfers
38 */
39struct nd_namespace_common {
40 int force_raw;
41 struct device dev;
42 struct device *claim;
43 int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset,
44 void *buf, size_t size, int rw);
45};
46
47static inline struct nd_namespace_common *to_ndns(struct device *dev)
48{
49 return container_of(dev, struct nd_namespace_common, dev);
50}
51
52/**
53 * struct nd_namespace_io - infrastructure for loading an nd_pmem instance
54 * @dev: namespace device created by the nd region driver
55 * @res: struct resource conversion of a NFIT SPA table
56 */
57struct nd_namespace_io {
58 struct nd_namespace_common common;
59 struct resource res;
60};
61
62/**
63 * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory
64 * @nsio: device and system physical address range to drive
65 * @alt_name: namespace name supplied in the dimm label
66 * @uuid: namespace uuid supplied in the dimm label
67 */
68struct nd_namespace_pmem {
69 struct nd_namespace_io nsio;
70 char *alt_name;
71 u8 *uuid;
72};
73
74/**
75 * struct nd_namespace_blk - namespace for dimm-bounded persistent memory
76 * @alt_name: namespace name supplied in the dimm label
77 * @uuid: namespace uuid supplied in the dimm label
78 * @id: ida allocated id
79 * @lbasize: blk namespaces have a native sector size when btt not present
80 * @num_resources: number of dpa extents to claim
81 * @res: discontiguous dpa extents for given dimm
82 */
83struct nd_namespace_blk {
84 struct nd_namespace_common common;
85 char *alt_name;
86 u8 *uuid;
87 int id;
88 unsigned long lbasize;
89 int num_resources;
90 struct resource **res;
91};
92
93static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev)
94{
95 return container_of(dev, struct nd_namespace_io, common.dev);
96}
97
98static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev)
99{
100 struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
101
102 return container_of(nsio, struct nd_namespace_pmem, nsio);
103}
104
105static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev)
106{
107 return container_of(dev, struct nd_namespace_blk, common.dev);
108}
109
110/**
111 * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace
112 * @ndns: device to read
113 * @offset: namespace-relative starting offset
114 * @buf: buffer to fill
115 * @size: transfer length
116 *
117 * @buf is up-to-date upon return from this routine.
118 */
119static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns,
120 resource_size_t offset, void *buf, size_t size)
121{
122 return ndns->rw_bytes(ndns, offset, buf, size, READ);
123}
124
125/**
126 * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace
127 * @ndns: device to write
128 * @offset: namespace-relative starting offset
129 * @buf: buffer to drain
130 * @size: transfer length
131 *
132 * NVDIMM Namespace disks do not implement sectors internally. Depending on
133 * the @ndns, the contents of @buf may be in cpu cache, platform buffers,
134 * or on backing memory media upon return from this routine. Flushing
135 * to media is handled internal to the @ndns driver, if at all.
136 */
137static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns,
138 resource_size_t offset, void *buf, size_t size)
139{
140 return ndns->rw_bytes(ndns, offset, buf, size, WRITE);
141}
142
143#define MODULE_ALIAS_ND_DEVICE(type) \
144 MODULE_ALIAS("nd:t" __stringify(type) "*")
145#define ND_DEVICE_MODALIAS_FMT "nd:t%d"
146
147int __must_check __nd_driver_register(struct nd_device_driver *nd_drv,
148 struct module *module, const char *mod_name);
149#define nd_driver_register(driver) \
150 __nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
151#endif /* __LINUX_ND_H__ */
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
new file mode 100644
index 000000000000..d2114045a6c4
--- /dev/null
+++ b/include/linux/pmem.h
@@ -0,0 +1,152 @@
1/*
2 * Copyright(c) 2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#ifndef __PMEM_H__
14#define __PMEM_H__
15
16#include <linux/io.h>
17
18#ifdef CONFIG_ARCH_HAS_PMEM_API
19#include <asm/cacheflush.h>
20#else
21static inline void arch_wmb_pmem(void)
22{
23 BUG();
24}
25
26static inline bool __arch_has_wmb_pmem(void)
27{
28 return false;
29}
30
31static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
32 unsigned long size)
33{
34 return NULL;
35}
36
37static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
38 size_t n)
39{
40 BUG();
41}
42#endif
43
44/*
45 * Architectures that define ARCH_HAS_PMEM_API must provide
46 * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(),
47 * arch_wmb_pmem(), and __arch_has_wmb_pmem().
48 */
49
50static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
51{
52 memcpy(dst, (void __force const *) src, size);
53}
54
55static inline void memunmap_pmem(void __pmem *addr)
56{
57 iounmap((void __force __iomem *) addr);
58}
59
60/**
61 * arch_has_wmb_pmem - true if wmb_pmem() ensures durability
62 *
63 * For a given cpu implementation within an architecture it is possible
64 * that wmb_pmem() resolves to a nop. In the case this returns
65 * false, pmem api users are unable to ensure durability and may want to
66 * fall back to a different data consistency model, or otherwise notify
67 * the user.
68 */
69static inline bool arch_has_wmb_pmem(void)
70{
71 if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
72 return __arch_has_wmb_pmem();
73 return false;
74}
75
76static inline bool arch_has_pmem_api(void)
77{
78 return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem();
79}
80
81/*
82 * These defaults seek to offer decent performance and minimize the
83 * window between i/o completion and writes being durable on media.
84 * However, it is undefined / architecture specific whether
85 * default_memremap_pmem + default_memcpy_to_pmem is sufficient for
86 * making data durable relative to i/o completion.
87 */
88static void default_memcpy_to_pmem(void __pmem *dst, const void *src,
89 size_t size)
90{
91 memcpy((void __force *) dst, src, size);
92}
93
94static void __pmem *default_memremap_pmem(resource_size_t offset,
95 unsigned long size)
96{
97 return (void __pmem __force *)ioremap_wt(offset, size);
98}
99
100/**
101 * memremap_pmem - map physical persistent memory for pmem api
102 * @offset: physical address of persistent memory
103 * @size: size of the mapping
104 *
105 * Establish a mapping of the architecture specific memory type expected
106 * by memcpy_to_pmem() and wmb_pmem(). For example, it may be
107 * the case that an uncacheable or writethrough mapping is sufficient,
108 * or a writeback mapping provided memcpy_to_pmem() and
109 * wmb_pmem() arrange for the data to be written through the
110 * cache to persistent media.
111 */
112static inline void __pmem *memremap_pmem(resource_size_t offset,
113 unsigned long size)
114{
115 if (arch_has_pmem_api())
116 return arch_memremap_pmem(offset, size);
117 return default_memremap_pmem(offset, size);
118}
119
120/**
121 * memcpy_to_pmem - copy data to persistent memory
122 * @dst: destination buffer for the copy
123 * @src: source buffer for the copy
124 * @n: length of the copy in bytes
125 *
126 * Perform a memory copy that results in the destination of the copy
127 * being effectively evicted from, or never written to, the processor
128 * cache hierarchy after the copy completes. After memcpy_to_pmem()
129 * data may still reside in cpu or platform buffers, so this operation
130 * must be followed by a wmb_pmem().
131 */
132static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
133{
134 if (arch_has_pmem_api())
135 arch_memcpy_to_pmem(dst, src, n);
136 else
137 default_memcpy_to_pmem(dst, src, n);
138}
139
140/**
141 * wmb_pmem - synchronize writes to persistent memory
142 *
143 * After a series of memcpy_to_pmem() operations this drains data from
144 * cpu write buffers and any platform (memory controller) buffers to
145 * ensure that written data is durable on persistent memory media.
146 */
147static inline void wmb_pmem(void)
148{
149 if (arch_has_pmem_api())
150 arch_wmb_pmem();
151}
152#endif /* __PMEM_H__ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index c1c23f19d4a2..1ff9942718fe 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -272,6 +272,7 @@ header-y += ncp_fs.h
272header-y += ncp.h 272header-y += ncp.h
273header-y += ncp_mount.h 273header-y += ncp_mount.h
274header-y += ncp_no.h 274header-y += ncp_no.h
275header-y += ndctl.h
275header-y += neighbour.h 276header-y += neighbour.h
276header-y += netconf.h 277header-y += netconf.h
277header-y += netdevice.h 278header-y += netdevice.h
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
new file mode 100644
index 000000000000..2b94ea2287bb
--- /dev/null
+++ b/include/uapi/linux/ndctl.h
@@ -0,0 +1,197 @@
1/*
2 * Copyright (c) 2014-2015, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU Lesser General Public License,
6 * version 2.1, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT ANY
9 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
11 * more details.
12 */
13#ifndef __NDCTL_H__
14#define __NDCTL_H__
15
16#include <linux/types.h>
17
18struct nd_cmd_smart {
19 __u32 status;
20 __u8 data[128];
21} __packed;
22
23struct nd_cmd_smart_threshold {
24 __u32 status;
25 __u8 data[8];
26} __packed;
27
28struct nd_cmd_dimm_flags {
29 __u32 status;
30 __u32 flags;
31} __packed;
32
33struct nd_cmd_get_config_size {
34 __u32 status;
35 __u32 config_size;
36 __u32 max_xfer;
37} __packed;
38
39struct nd_cmd_get_config_data_hdr {
40 __u32 in_offset;
41 __u32 in_length;
42 __u32 status;
43 __u8 out_buf[0];
44} __packed;
45
46struct nd_cmd_set_config_hdr {
47 __u32 in_offset;
48 __u32 in_length;
49 __u8 in_buf[0];
50} __packed;
51
52struct nd_cmd_vendor_hdr {
53 __u32 opcode;
54 __u32 in_length;
55 __u8 in_buf[0];
56} __packed;
57
58struct nd_cmd_vendor_tail {
59 __u32 status;
60 __u32 out_length;
61 __u8 out_buf[0];
62} __packed;
63
64struct nd_cmd_ars_cap {
65 __u64 address;
66 __u64 length;
67 __u32 status;
68 __u32 max_ars_out;
69} __packed;
70
71struct nd_cmd_ars_start {
72 __u64 address;
73 __u64 length;
74 __u16 type;
75 __u8 reserved[6];
76 __u32 status;
77} __packed;
78
79struct nd_cmd_ars_status {
80 __u32 status;
81 __u32 out_length;
82 __u64 address;
83 __u64 length;
84 __u16 type;
85 __u32 num_records;
86 struct nd_ars_record {
87 __u32 handle;
88 __u32 flags;
89 __u64 err_address;
90 __u64 mask;
91 } __packed records[0];
92} __packed;
93
94enum {
95 ND_CMD_IMPLEMENTED = 0,
96
97 /* bus commands */
98 ND_CMD_ARS_CAP = 1,
99 ND_CMD_ARS_START = 2,
100 ND_CMD_ARS_STATUS = 3,
101
102 /* per-dimm commands */
103 ND_CMD_SMART = 1,
104 ND_CMD_SMART_THRESHOLD = 2,
105 ND_CMD_DIMM_FLAGS = 3,
106 ND_CMD_GET_CONFIG_SIZE = 4,
107 ND_CMD_GET_CONFIG_DATA = 5,
108 ND_CMD_SET_CONFIG_DATA = 6,
109 ND_CMD_VENDOR_EFFECT_LOG_SIZE = 7,
110 ND_CMD_VENDOR_EFFECT_LOG = 8,
111 ND_CMD_VENDOR = 9,
112};
113
114static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
115{
116 static const char * const names[] = {
117 [ND_CMD_ARS_CAP] = "ars_cap",
118 [ND_CMD_ARS_START] = "ars_start",
119 [ND_CMD_ARS_STATUS] = "ars_status",
120 };
121
122 if (cmd < ARRAY_SIZE(names) && names[cmd])
123 return names[cmd];
124 return "unknown";
125}
126
127static inline const char *nvdimm_cmd_name(unsigned cmd)
128{
129 static const char * const names[] = {
130 [ND_CMD_SMART] = "smart",
131 [ND_CMD_SMART_THRESHOLD] = "smart_thresh",
132 [ND_CMD_DIMM_FLAGS] = "flags",
133 [ND_CMD_GET_CONFIG_SIZE] = "get_size",
134 [ND_CMD_GET_CONFIG_DATA] = "get_data",
135 [ND_CMD_SET_CONFIG_DATA] = "set_data",
136 [ND_CMD_VENDOR_EFFECT_LOG_SIZE] = "effect_size",
137 [ND_CMD_VENDOR_EFFECT_LOG] = "effect_log",
138 [ND_CMD_VENDOR] = "vendor",
139 };
140
141 if (cmd < ARRAY_SIZE(names) && names[cmd])
142 return names[cmd];
143 return "unknown";
144}
145
146#define ND_IOCTL 'N'
147
148#define ND_IOCTL_SMART _IOWR(ND_IOCTL, ND_CMD_SMART,\
149 struct nd_cmd_smart)
150
151#define ND_IOCTL_SMART_THRESHOLD _IOWR(ND_IOCTL, ND_CMD_SMART_THRESHOLD,\
152 struct nd_cmd_smart_threshold)
153
154#define ND_IOCTL_DIMM_FLAGS _IOWR(ND_IOCTL, ND_CMD_DIMM_FLAGS,\
155 struct nd_cmd_dimm_flags)
156
157#define ND_IOCTL_GET_CONFIG_SIZE _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_SIZE,\
158 struct nd_cmd_get_config_size)
159
160#define ND_IOCTL_GET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_DATA,\
161 struct nd_cmd_get_config_data_hdr)
162
163#define ND_IOCTL_SET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_SET_CONFIG_DATA,\
164 struct nd_cmd_set_config_hdr)
165
166#define ND_IOCTL_VENDOR _IOWR(ND_IOCTL, ND_CMD_VENDOR,\
167 struct nd_cmd_vendor_hdr)
168
169#define ND_IOCTL_ARS_CAP _IOWR(ND_IOCTL, ND_CMD_ARS_CAP,\
170 struct nd_cmd_ars_cap)
171
172#define ND_IOCTL_ARS_START _IOWR(ND_IOCTL, ND_CMD_ARS_START,\
173 struct nd_cmd_ars_start)
174
175#define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\
176 struct nd_cmd_ars_status)
177
178#define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */
179#define ND_DEVICE_REGION_PMEM 2 /* nd_region: (parent of PMEM namespaces) */
180#define ND_DEVICE_REGION_BLK 3 /* nd_region: (parent of BLK namespaces) */
181#define ND_DEVICE_NAMESPACE_IO 4 /* legacy persistent memory */
182#define ND_DEVICE_NAMESPACE_PMEM 5 /* PMEM namespace (may alias with BLK) */
183#define ND_DEVICE_NAMESPACE_BLK 6 /* BLK namespace (may alias with PMEM) */
184
185enum nd_driver_flags {
186 ND_DRIVER_DIMM = 1 << ND_DEVICE_DIMM,
187 ND_DRIVER_REGION_PMEM = 1 << ND_DEVICE_REGION_PMEM,
188 ND_DRIVER_REGION_BLK = 1 << ND_DEVICE_REGION_BLK,
189 ND_DRIVER_NAMESPACE_IO = 1 << ND_DEVICE_NAMESPACE_IO,
190 ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM,
191 ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK,
192};
193
194enum {
195 ND_MIN_NAMESPACE_SIZE = 0x00400000,
196};
197#endif /* __NDCTL_H__ */
diff --git a/lib/Kconfig b/lib/Kconfig
index 34e332b8d326..3a2ef67db6c7 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -528,4 +528,7 @@ source "lib/fonts/Kconfig"
528config ARCH_HAS_SG_CHAIN 528config ARCH_HAS_SG_CHAIN
529 def_bool n 529 def_bool n
530 530
531config ARCH_HAS_PMEM_API
532 bool
533
531endmenu 534endmenu
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
new file mode 100644
index 000000000000..8e9b64520ec1
--- /dev/null
+++ b/tools/testing/nvdimm/Kbuild
@@ -0,0 +1,40 @@
1ldflags-y += --wrap=ioremap_cache
2ldflags-y += --wrap=ioremap_nocache
3ldflags-y += --wrap=iounmap
4ldflags-y += --wrap=__request_region
5ldflags-y += --wrap=__release_region
6
7DRIVERS := ../../../drivers
8NVDIMM_SRC := $(DRIVERS)/nvdimm
9ACPI_SRC := $(DRIVERS)/acpi
10
11obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
12obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
13obj-$(CONFIG_ND_BTT) += nd_btt.o
14obj-$(CONFIG_ND_BLK) += nd_blk.o
15obj-$(CONFIG_ACPI_NFIT) += nfit.o
16
17nfit-y := $(ACPI_SRC)/nfit.o
18nfit-y += config_check.o
19
20nd_pmem-y := $(NVDIMM_SRC)/pmem.o
21nd_pmem-y += config_check.o
22
23nd_btt-y := $(NVDIMM_SRC)/btt.o
24nd_btt-y += config_check.o
25
26nd_blk-y := $(NVDIMM_SRC)/blk.o
27nd_blk-y += config_check.o
28
29libnvdimm-y := $(NVDIMM_SRC)/core.o
30libnvdimm-y += $(NVDIMM_SRC)/bus.o
31libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o
32libnvdimm-y += $(NVDIMM_SRC)/dimm.o
33libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
34libnvdimm-y += $(NVDIMM_SRC)/region.o
35libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
36libnvdimm-y += $(NVDIMM_SRC)/label.o
37libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
38libnvdimm-y += config_check.o
39
40obj-m += test/
diff --git a/tools/testing/nvdimm/Makefile b/tools/testing/nvdimm/Makefile
new file mode 100644
index 000000000000..3dfe024b4e7e
--- /dev/null
+++ b/tools/testing/nvdimm/Makefile
@@ -0,0 +1,7 @@
1KDIR ?= ../../../
2
3default:
4 $(MAKE) -C $(KDIR) M=$$PWD
5
6install: default
7 $(MAKE) -C $(KDIR) M=$$PWD modules_install
diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c
new file mode 100644
index 000000000000..f2c7615554eb
--- /dev/null
+++ b/tools/testing/nvdimm/config_check.c
@@ -0,0 +1,15 @@
1#include <linux/kconfig.h>
2#include <linux/bug.h>
3
4void check(void)
5{
6 /*
7 * These kconfig symbols must be set to "m" for nfit_test to
8 * load and operate.
9 */
10 BUILD_BUG_ON(!IS_MODULE(CONFIG_LIBNVDIMM));
11 BUILD_BUG_ON(!IS_MODULE(CONFIG_BLK_DEV_PMEM));
12 BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT));
13 BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK));
14 BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT));
15}
diff --git a/tools/testing/nvdimm/test/Kbuild b/tools/testing/nvdimm/test/Kbuild
new file mode 100644
index 000000000000..9241064970fe
--- /dev/null
+++ b/tools/testing/nvdimm/test/Kbuild
@@ -0,0 +1,8 @@
1ccflags-y := -I$(src)/../../../../drivers/nvdimm/
2ccflags-y += -I$(src)/../../../../drivers/acpi/
3
4obj-m += nfit_test.o
5obj-m += nfit_test_iomap.o
6
7nfit_test-y := nfit.o
8nfit_test_iomap-y := iomap.o
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
new file mode 100644
index 000000000000..c85a6f6ba559
--- /dev/null
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -0,0 +1,151 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/rculist.h>
14#include <linux/export.h>
15#include <linux/ioport.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/io.h>
19#include "nfit_test.h"
20
21static LIST_HEAD(iomap_head);
22
23static struct iomap_ops {
24 nfit_test_lookup_fn nfit_test_lookup;
25 struct list_head list;
26} iomap_ops = {
27 .list = LIST_HEAD_INIT(iomap_ops.list),
28};
29
30void nfit_test_setup(nfit_test_lookup_fn lookup)
31{
32 iomap_ops.nfit_test_lookup = lookup;
33 list_add_rcu(&iomap_ops.list, &iomap_head);
34}
35EXPORT_SYMBOL(nfit_test_setup);
36
37void nfit_test_teardown(void)
38{
39 list_del_rcu(&iomap_ops.list);
40 synchronize_rcu();
41}
42EXPORT_SYMBOL(nfit_test_teardown);
43
44static struct nfit_test_resource *get_nfit_res(resource_size_t resource)
45{
46 struct iomap_ops *ops;
47
48 ops = list_first_or_null_rcu(&iomap_head, typeof(*ops), list);
49 if (ops)
50 return ops->nfit_test_lookup(resource);
51 return NULL;
52}
53
54void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size,
55 void __iomem *(*fallback_fn)(resource_size_t, unsigned long))
56{
57 struct nfit_test_resource *nfit_res;
58
59 rcu_read_lock();
60 nfit_res = get_nfit_res(offset);
61 rcu_read_unlock();
62 if (nfit_res)
63 return (void __iomem *) nfit_res->buf + offset
64 - nfit_res->res->start;
65 return fallback_fn(offset, size);
66}
67
68void __iomem *__wrap_ioremap_cache(resource_size_t offset, unsigned long size)
69{
70 return __nfit_test_ioremap(offset, size, ioremap_cache);
71}
72EXPORT_SYMBOL(__wrap_ioremap_cache);
73
74void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
75{
76 return __nfit_test_ioremap(offset, size, ioremap_nocache);
77}
78EXPORT_SYMBOL(__wrap_ioremap_nocache);
79
80void __wrap_iounmap(volatile void __iomem *addr)
81{
82 struct nfit_test_resource *nfit_res;
83
84 rcu_read_lock();
85 nfit_res = get_nfit_res((unsigned long) addr);
86 rcu_read_unlock();
87 if (nfit_res)
88 return;
89 return iounmap(addr);
90}
91EXPORT_SYMBOL(__wrap_iounmap);
92
93struct resource *__wrap___request_region(struct resource *parent,
94 resource_size_t start, resource_size_t n, const char *name,
95 int flags)
96{
97 struct nfit_test_resource *nfit_res;
98
99 if (parent == &iomem_resource) {
100 rcu_read_lock();
101 nfit_res = get_nfit_res(start);
102 rcu_read_unlock();
103 if (nfit_res) {
104 struct resource *res = nfit_res->res + 1;
105
106 if (start + n > nfit_res->res->start
107 + resource_size(nfit_res->res)) {
108 pr_debug("%s: start: %llx n: %llx overflow: %pr\n",
109 __func__, start, n,
110 nfit_res->res);
111 return NULL;
112 }
113
114 res->start = start;
115 res->end = start + n - 1;
116 res->name = name;
117 res->flags = resource_type(parent);
118 res->flags |= IORESOURCE_BUSY | flags;
119 pr_debug("%s: %pr\n", __func__, res);
120 return res;
121 }
122 }
123 return __request_region(parent, start, n, name, flags);
124}
125EXPORT_SYMBOL(__wrap___request_region);
126
127void __wrap___release_region(struct resource *parent, resource_size_t start,
128 resource_size_t n)
129{
130 struct nfit_test_resource *nfit_res;
131
132 if (parent == &iomem_resource) {
133 rcu_read_lock();
134 nfit_res = get_nfit_res(start);
135 rcu_read_unlock();
136 if (nfit_res) {
137 struct resource *res = nfit_res->res + 1;
138
139 if (start != res->start || resource_size(res) != n)
140 pr_info("%s: start: %llx n: %llx mismatch: %pr\n",
141 __func__, start, n, res);
142 else
143 memset(res, 0, sizeof(*res));
144 return;
145 }
146 }
147 __release_region(parent, start, n);
148}
149EXPORT_SYMBOL(__wrap___release_region);
150
151MODULE_LICENSE("GPL v2");
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
new file mode 100644
index 000000000000..4b69b8368de0
--- /dev/null
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -0,0 +1,1116 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14#include <linux/platform_device.h>
15#include <linux/dma-mapping.h>
16#include <linux/libnvdimm.h>
17#include <linux/vmalloc.h>
18#include <linux/device.h>
19#include <linux/module.h>
20#include <linux/ndctl.h>
21#include <linux/sizes.h>
22#include <linux/slab.h>
23#include <nfit.h>
24#include <nd.h>
25#include "nfit_test.h"
26
27/*
28 * Generate an NFIT table to describe the following topology:
29 *
30 * BUS0: Interleaved PMEM regions, and aliasing with BLK regions
31 *
32 * (a) (b) DIMM BLK-REGION
33 * +----------+--------------+----------+---------+
34 * +------+ | blk2.0 | pm0.0 | blk2.1 | pm1.0 | 0 region2
35 * | imc0 +--+- - - - - region0 - - - -+----------+ +
36 * +--+---+ | blk3.0 | pm0.0 | blk3.1 | pm1.0 | 1 region3
37 * | +----------+--------------v----------v v
38 * +--+---+ | |
39 * | cpu0 | region1
40 * +--+---+ | |
41 * | +-------------------------^----------^ ^
42 * +--+---+ | blk4.0 | pm1.0 | 2 region4
43 * | imc1 +--+-------------------------+----------+ +
44 * +------+ | blk5.0 | pm1.0 | 3 region5
45 * +-------------------------+----------+-+-------+
46 *
47 * *) In this layout we have four dimms and two memory controllers in one
48 * socket. Each unique interface (BLK or PMEM) to DPA space
49 * is identified by a region device with a dynamically assigned id.
50 *
51 * *) The first portion of dimm0 and dimm1 are interleaved as REGION0.
52 * A single PMEM namespace "pm0.0" is created using half of the
53 * REGION0 SPA-range. REGION0 spans dimm0 and dimm1. PMEM namespace
 * allocations are from the bottom of a region. The unallocated
 * portion of REGION0 aliases with REGION2 and REGION3. That
 * unallocated capacity is reclaimed as BLK namespaces ("blk2.0" and
57 * "blk3.0") starting at the base of each DIMM to offset (a) in those
58 * DIMMs. "pm0.0", "blk2.0" and "blk3.0" are free-form readable
59 * names that can be assigned to a namespace.
60 *
61 * *) In the last portion of dimm0 and dimm1 we have an interleaved
62 * SPA range, REGION1, that spans those two dimms as well as dimm2
63 * and dimm3. Some of REGION1 allocated to a PMEM namespace named
64 * "pm1.0" the rest is reclaimed in 4 BLK namespaces (for each
65 * dimm in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
66 * "blk5.0".
67 *
68 * *) The portion of dimm2 and dimm3 that do not participate in the
69 * REGION1 interleaved SPA range (i.e. the DPA address below offset
70 * (b) are also included in the "blk4.0" and "blk5.0" namespaces.
71 * Note, that BLK namespaces need not be contiguous in DPA-space, and
72 * can consume aliased capacity from multiple interleave sets.
73 *
74 * BUS1: Legacy NVDIMM (single contiguous range)
75 *
76 * region2
77 * +---------------------+
78 * |---------------------|
79 * || pm2.0 ||
80 * |---------------------|
81 * +---------------------+
82 *
83 * *) A NFIT-table may describe a simple system-physical-address range
84 * with no BLK aliasing. This type of region may optionally
85 * reference an NVDIMM.
86 */
/* fixed topology parameters for the emulated NFIT buses */
enum {
	NUM_PM = 2,		/* PMEM SPA ranges on bus0 */
	NUM_DCR = 4,		/* dimms == dimm-control-regions on bus0 */
	NUM_BDW = NUM_DCR,	/* one block-data-window range per dimm */
	NUM_SPA = NUM_PM + NUM_DCR + NUM_BDW,
	NUM_MEM = NUM_DCR + NUM_BDW + 2 /* spa0 iset */ + 4 /* spa1 iset */,
	DIMM_SIZE = SZ_32M,
	LABEL_SIZE = SZ_128K,	/* per-dimm namespace-label storage */
	SPA0_SIZE = DIMM_SIZE,
	SPA1_SIZE = DIMM_SIZE*2,
	SPA2_SIZE = DIMM_SIZE,	/* bus1's single flat range */
	BDW_SIZE = 64 << 8,	/* 16K block aperture */
	DCR_SIZE = 12,		/* 8-byte command + 4-byte status */
	NUM_NFITS = 2, /* permit testing multiple NFITs per system */
};
102
/*
 * In-memory stand-in for a dimm-control-region: command/status
 * registers followed by the block-data-window aperture.
 */
struct nfit_test_dcr {
	__le64 bdw_addr;	/* block window target dimm address */
	__le32 bdw_status;	/* block window status register */
	__u8 aperature[BDW_SIZE];	/* data window (field name is a historical misspelling of "aperture") */
};
108
/*
 * Build an ACPI NFIT device handle from its topology coordinates.
 * Bit layout (LSB first): dimm[3:0], chan[7:4], imc[11:8],
 * socket[15:12], node[27:16].  All arguments are fully parenthesized
 * so expression arguments (e.g. "a | b") cannot mis-associate with the
 * field masks ('|' binds looser than '&').
 */
#define NFIT_DIMM_HANDLE(node, socket, imc, chan, dimm) \
	((((node) & 0xfff) << 16) | (((socket) & 0xf) << 12) \
	 | (((imc) & 0xf) << 8) | (((chan) & 0xf) << 4) | ((dimm) & 0xf))
112
/* pre-computed device handles for the four emulated bus0 dimms */
static u32 handle[NUM_DCR] = {
	[0] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 0),	/* imc0, dimm0 */
	[1] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 1),	/* imc0, dimm1 */
	[2] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 0),	/* imc1, dimm0 */
	[3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1),	/* imc1, dimm1 */
};
119
/*
 * One emulated NFIT bus instance (platform device embedded).  The
 * pointer-array fields are sized by num_dcr / num_pm and allocated in
 * nfit_test_probe(); the backing buffers by the per-topology ->alloc().
 */
struct nfit_test {
	struct acpi_nfit_desc acpi_desc;
	struct platform_device pdev;
	struct list_head resources;	/* nfit_test_resource entries */
	void *nfit_buf;			/* the generated NFIT table */
	dma_addr_t nfit_dma;
	size_t nfit_size;
	int num_dcr;			/* dimms / control regions */
	int num_pm;			/* PMEM SPA ranges */
	void **dimm;			/* per-dimm media backing */
	dma_addr_t *dimm_dma;
	void **label;			/* per-dimm label storage */
	dma_addr_t *label_dma;
	void **spa_set;			/* per-interleave-set PMEM backing */
	dma_addr_t *spa_set_dma;
	struct nfit_test_dcr **dcr;	/* per-dimm control regions */
	dma_addr_t *dcr_dma;
	int (*alloc)(struct nfit_test *t);	/* topology-specific alloc */
	void (*setup)(struct nfit_test *t);	/* topology-specific NFIT fill */
};
140
141static struct nfit_test *to_nfit_test(struct device *dev)
142{
143 struct platform_device *pdev = to_platform_device(dev);
144
145 return container_of(pdev, struct nfit_test, pdev);
146}
147
/*
 * nvdimm_bus 'ndctl' backend for the test bus: implements the label
 * (config-data) commands against the in-memory label buffer allocated
 * for each dimm.  Returns a non-negative "bytes not consumed" residue
 * on success, negative errno on failure.
 */
static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
		unsigned int buf_len)
{
	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
	struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
	int i, rc;

	/* only commands the dimm advertised via its dsm_mask are allowed */
	if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
		return -ENXIO;

	/* lookup label space for the given dimm */
	for (i = 0; i < ARRAY_SIZE(handle); i++)
		if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i])
			break;
	if (i >= ARRAY_SIZE(handle))
		return -ENXIO;

	switch (cmd) {
	case ND_CMD_GET_CONFIG_SIZE: {
		struct nd_cmd_get_config_size *nd_cmd = buf;

		if (buf_len < sizeof(*nd_cmd))
			return -EINVAL;
		nd_cmd->status = 0;
		nd_cmd->config_size = LABEL_SIZE;
		nd_cmd->max_xfer = SZ_4K;
		rc = 0;
		break;
	}
	case ND_CMD_GET_CONFIG_DATA: {
		struct nd_cmd_get_config_data_hdr *nd_cmd = buf;
		unsigned int len, offset = nd_cmd->in_offset;

		if (buf_len < sizeof(*nd_cmd))
			return -EINVAL;
		if (offset >= LABEL_SIZE)
			return -EINVAL;
		if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len)
			return -EINVAL;

		nd_cmd->status = 0;
		/* clamp the transfer to the end of the label area */
		len = min(nd_cmd->in_length, LABEL_SIZE - offset);
		memcpy(nd_cmd->out_buf, t->label[i] + offset, len);
		rc = buf_len - sizeof(*nd_cmd) - len;
		break;
	}
	case ND_CMD_SET_CONFIG_DATA: {
		struct nd_cmd_set_config_hdr *nd_cmd = buf;
		unsigned int len, offset = nd_cmd->in_offset;
		u32 *status;

		if (buf_len < sizeof(*nd_cmd))
			return -EINVAL;
		if (offset >= LABEL_SIZE)
			return -EINVAL;
		/* the input payload is followed by a 4-byte status word */
		if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len)
			return -EINVAL;

		status = buf + nd_cmd->in_length + sizeof(*nd_cmd);
		*status = 0;
		len = min(nd_cmd->in_length, LABEL_SIZE - offset);
		memcpy(t->label[i] + offset, nd_cmd->in_buf, len);
		rc = buf_len - sizeof(*nd_cmd) - (len + 4);
		break;
	}
	default:
		return -ENOTTY;
	}

	return rc;
}
221
/* protects the per-instance resource lists against concurrent lookup */
static DEFINE_SPINLOCK(nfit_test_lock);
/* live test instances, indexed by platform-device id */
static struct nfit_test *instances[NUM_NFITS];
224
/*
 * devm release action: unlink a test resource from its instance list
 * and free the backing buffer plus bookkeeping.  The buffer's origin
 * (vmalloc vs dma-coherent) selects the matching free routine.
 */
static void release_nfit_res(void *data)
{
	struct nfit_test_resource *nfit_res = data;
	struct resource *res = nfit_res->res;

	spin_lock(&nfit_test_lock);
	list_del(&nfit_res->list);
	spin_unlock(&nfit_test_lock);

	if (is_vmalloc_addr(nfit_res->buf))
		vfree(nfit_res->buf);
	else
		dma_free_coherent(nfit_res->dev, resource_size(res),
				nfit_res->buf, res->start);
	/* res is the base of the 2-entry array allocated in __test_alloc() */
	kfree(res);
	kfree(nfit_res);
}
242
/*
 * Common tail for test-buffer allocation: wrap @buf (already allocated
 * by the caller, fake physical address in *@dma) in an
 * nfit_test_resource, register a devm release action, and link it on
 * the instance's resource list.  Note the resource is allocated as a
 * pair: res[0] describes the buffer, res[1] is scratch space used by
 * __wrap___request_region().  Returns the zeroed buffer on success;
 * on failure the buffer is freed and NULL returned.
 */
static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma,
		void *buf)
{
	struct device *dev = &t->pdev.dev;
	struct resource *res = kzalloc(sizeof(*res) * 2, GFP_KERNEL);
	struct nfit_test_resource *nfit_res = kzalloc(sizeof(*nfit_res),
			GFP_KERNEL);
	int rc;

	if (!res || !buf || !nfit_res)
		goto err;
	rc = devm_add_action(dev, release_nfit_res, nfit_res);
	if (rc)
		goto err;
	INIT_LIST_HEAD(&nfit_res->list);
	memset(buf, 0, size);
	nfit_res->dev = dev;
	nfit_res->buf = buf;
	nfit_res->res = res;
	res->start = *dma;
	res->end = *dma + size - 1;
	res->name = "NFIT";
	spin_lock(&nfit_test_lock);
	list_add(&nfit_res->list, &t->resources);
	spin_unlock(&nfit_test_lock);

	return nfit_res->buf;
 err:
	/* mirror the caller's allocator when unwinding */
	if (buf && !is_vmalloc_addr(buf))
		dma_free_coherent(dev, size, buf, *dma);
	else if (buf)
		vfree(buf);
	kfree(res);
	kfree(nfit_res);
	return NULL;
}
279
280static void *test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma)
281{
282 void *buf = vmalloc(size);
283
284 *dma = (unsigned long) buf;
285 return __test_alloc(t, size, dma, buf);
286}
287
288static void *test_alloc_coherent(struct nfit_test *t, size_t size,
289 dma_addr_t *dma)
290{
291 struct device *dev = &t->pdev.dev;
292 void *buf = dma_alloc_coherent(dev, size, dma, GFP_KERNEL);
293
294 return __test_alloc(t, size, dma, buf);
295}
296
/*
 * Resolve @addr to a registered test resource.  A hit matches either
 * the fake physical range (res->start) or the kernel virtual address
 * of the backing buffer, searched across all live instances.  Returns
 * NULL when @addr does not belong to the test infrastructure.
 */
static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(instances); i++) {
		struct nfit_test_resource *n, *nfit_res = NULL;
		struct nfit_test *t = instances[i];

		if (!t)
			continue;
		spin_lock(&nfit_test_lock);
		list_for_each_entry(n, &t->resources, list) {
			if (addr >= n->res->start && (addr < n->res->start
						+ resource_size(n->res))) {
				nfit_res = n;
				break;
			} else if (addr >= (unsigned long) n->buf
					&& (addr < (unsigned long) n->buf
						+ resource_size(n->res))) {
				nfit_res = n;
				break;
			}
		}
		spin_unlock(&nfit_test_lock);
		if (nfit_res)
			return nfit_res;
	}

	return NULL;
}
327
/*
 * Allocate the NFIT table buffer and all backing stores for the bus0
 * topology.  Cleanup on partial failure is handled by the devm actions
 * registered in __test_alloc(), so each failure just returns -ENOMEM.
 */
static int nfit_test0_alloc(struct nfit_test *t)
{
	size_t nfit_size = sizeof(struct acpi_table_nfit)
			+ sizeof(struct acpi_nfit_system_address) * NUM_SPA
			+ sizeof(struct acpi_nfit_memory_map) * NUM_MEM
			+ sizeof(struct acpi_nfit_control_region) * NUM_DCR
			+ sizeof(struct acpi_nfit_data_region) * NUM_BDW;
	int i;

	t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
	if (!t->nfit_buf)
		return -ENOMEM;
	t->nfit_size = nfit_size;

	/* interleave-set backing for the two PMEM SPA ranges */
	t->spa_set[0] = test_alloc_coherent(t, SPA0_SIZE, &t->spa_set_dma[0]);
	if (!t->spa_set[0])
		return -ENOMEM;

	t->spa_set[1] = test_alloc_coherent(t, SPA1_SIZE, &t->spa_set_dma[1]);
	if (!t->spa_set[1])
		return -ENOMEM;

	/* per-dimm media plus label storage, seeded with a marker string */
	for (i = 0; i < NUM_DCR; i++) {
		t->dimm[i] = test_alloc(t, DIMM_SIZE, &t->dimm_dma[i]);
		if (!t->dimm[i])
			return -ENOMEM;

		t->label[i] = test_alloc(t, LABEL_SIZE, &t->label_dma[i]);
		if (!t->label[i])
			return -ENOMEM;
		sprintf(t->label[i], "label%d", i);
	}

	/*
	 * control-region emulation; sized LABEL_SIZE, which is larger
	 * than sizeof(struct nfit_test_dcr) actually requires
	 */
	for (i = 0; i < NUM_DCR; i++) {
		t->dcr[i] = test_alloc(t, LABEL_SIZE, &t->dcr_dma[i]);
		if (!t->dcr[i])
			return -ENOMEM;
	}

	return 0;
}
369
/*
 * Allocate the (much smaller) table and single PMEM range for the
 * legacy single-dimm bus1 topology.
 */
static int nfit_test1_alloc(struct nfit_test *t)
{
	size_t nfit_size = sizeof(struct acpi_table_nfit)
			+ sizeof(struct acpi_nfit_system_address)
			+ sizeof(struct acpi_nfit_memory_map)
			+ sizeof(struct acpi_nfit_control_region);

	t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
	if (!t->nfit_buf)
		return -ENOMEM;
	t->nfit_size = nfit_size;

	t->spa_set[0] = test_alloc_coherent(t, SPA2_SIZE, &t->spa_set_dma[0]);
	if (!t->spa_set[0])
		return -ENOMEM;

	return 0;
}
388
/*
 * Fill the common ACPI table header of the generated NFIT.
 * NOTE(review): the header checksum is left zero — presumably the
 * in-kernel consumer does not validate it; confirm if that changes.
 */
static void nfit_test_init_header(struct acpi_table_nfit *nfit, size_t size)
{
	memcpy(nfit->header.signature, ACPI_SIG_NFIT, 4);
	nfit->header.length = size;
	nfit->header.revision = 1;
	memcpy(nfit->header.oem_id, "LIBND", 6);
	memcpy(nfit->header.oem_table_id, "TEST", 5);
	nfit->header.oem_revision = 1;
	memcpy(nfit->header.asl_compiler_id, "TST", 4);
	nfit->header.asl_compiler_revision = 1;
}
400
/*
 * Emit the bus0 NFIT sub-tables into t->nfit_buf, in order: 10 SPA
 * ranges (2 PMEM, 4 DCR, 4 BDW), 14 memory-device mappings, 4 control
 * regions, 4 block-data-window sets — matching the topology diagram at
 * the top of this file.  Offsets are accumulated by hand, so the
 * ordering of these sections must match the sizing in
 * nfit_test0_alloc().  Finally the label DSMs are force-enabled and the
 * test ioctl backend is installed.
 */
static void nfit_test0_setup(struct nfit_test *t)
{
	struct nvdimm_bus_descriptor *nd_desc;
	struct acpi_nfit_desc *acpi_desc;
	struct acpi_nfit_memory_map *memdev;
	void *nfit_buf = t->nfit_buf;
	size_t size = t->nfit_size;
	struct acpi_nfit_system_address *spa;
	struct acpi_nfit_control_region *dcr;
	struct acpi_nfit_data_region *bdw;
	unsigned int offset;

	nfit_test_init_header(nfit_buf, size);

	/*
	 * spa0 (interleave first half of dimm0 and dimm1, note storage
	 * does not actually alias the related block-data-window
	 * regions)
	 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit);
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16);
	spa->range_index = 0+1;
	spa->address = t->spa_set_dma[0];
	spa->length = SPA0_SIZE;

	/*
	 * spa1 (interleave last half of the 4 DIMMS, note storage
	 * does not actually alias the related block-data-window
	 * regions)
	 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa);
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16);
	spa->range_index = 1+1;
	spa->address = t->spa_set_dma[1];
	spa->length = SPA1_SIZE;

	/* spa2 (dcr0) dimm0 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 2;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
	spa->range_index = 2+1;
	spa->address = t->dcr_dma[0];
	spa->length = DCR_SIZE;

	/* spa3 (dcr1) dimm1 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 3;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
	spa->range_index = 3+1;
	spa->address = t->dcr_dma[1];
	spa->length = DCR_SIZE;

	/* spa4 (dcr2) dimm2 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 4;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
	spa->range_index = 4+1;
	spa->address = t->dcr_dma[2];
	spa->length = DCR_SIZE;

	/* spa5 (dcr3) dimm3 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 5;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16);
	spa->range_index = 5+1;
	spa->address = t->dcr_dma[3];
	spa->length = DCR_SIZE;

	/* spa6 (bdw for dcr0) dimm0 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 6;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
	spa->range_index = 6+1;
	spa->address = t->dimm_dma[0];
	spa->length = DIMM_SIZE;

	/* spa7 (bdw for dcr1) dimm1 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 7;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
	spa->range_index = 7+1;
	spa->address = t->dimm_dma[1];
	spa->length = DIMM_SIZE;

	/* spa8 (bdw for dcr2) dimm2 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 8;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
	spa->range_index = 8+1;
	spa->address = t->dimm_dma[2];
	spa->length = DIMM_SIZE;

	/* spa9 (bdw for dcr3) dimm3 */
	spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 9;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16);
	spa->range_index = 9+1;
	spa->address = t->dimm_dma[3];
	spa->length = DIMM_SIZE;

	offset = sizeof(struct acpi_table_nfit) + sizeof(*spa) * 10;
	/* mem-region0 (spa0, dimm0) */
	memdev = nfit_buf + offset;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[0];
	memdev->physical_id = 0;
	memdev->region_id = 0;
	memdev->range_index = 0+1;
	memdev->region_index = 0+1;
	memdev->region_size = SPA0_SIZE/2;
	memdev->region_offset = t->spa_set_dma[0];
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 2;

	/* mem-region1 (spa0, dimm1) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map);
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[1];
	memdev->physical_id = 1;
	memdev->region_id = 0;
	memdev->range_index = 0+1;
	memdev->region_index = 1+1;
	memdev->region_size = SPA0_SIZE/2;
	memdev->region_offset = t->spa_set_dma[0] + SPA0_SIZE/2;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 2;

	/* mem-region2 (spa1, dimm0) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[0];
	memdev->physical_id = 0;
	memdev->region_id = 1;
	memdev->range_index = 1+1;
	memdev->region_index = 0+1;
	memdev->region_size = SPA1_SIZE/4;
	memdev->region_offset = t->spa_set_dma[1];
	memdev->address = SPA0_SIZE/2;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 4;

	/* mem-region3 (spa1, dimm1) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[1];
	memdev->physical_id = 1;
	memdev->region_id = 1;
	memdev->range_index = 1+1;
	memdev->region_index = 1+1;
	memdev->region_size = SPA1_SIZE/4;
	memdev->region_offset = t->spa_set_dma[1] + SPA1_SIZE/4;
	memdev->address = SPA0_SIZE/2;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 4;

	/* mem-region4 (spa1, dimm2) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 4;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[2];
	memdev->physical_id = 2;
	memdev->region_id = 0;
	memdev->range_index = 1+1;
	memdev->region_index = 2+1;
	memdev->region_size = SPA1_SIZE/4;
	memdev->region_offset = t->spa_set_dma[1] + 2*SPA1_SIZE/4;
	memdev->address = SPA0_SIZE/2;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 4;

	/* mem-region5 (spa1, dimm3) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[3];
	memdev->physical_id = 3;
	memdev->region_id = 0;
	memdev->range_index = 1+1;
	memdev->region_index = 3+1;
	memdev->region_size = SPA1_SIZE/4;
	memdev->region_offset = t->spa_set_dma[1] + 3*SPA1_SIZE/4;
	memdev->address = SPA0_SIZE/2;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 4;

	/* mem-region6 (spa/dcr0, dimm0) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 6;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[0];
	memdev->physical_id = 0;
	memdev->region_id = 0;
	memdev->range_index = 2+1;
	memdev->region_index = 0+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	/* mem-region7 (spa/dcr1, dimm1) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 7;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[1];
	memdev->physical_id = 1;
	memdev->region_id = 0;
	memdev->range_index = 3+1;
	memdev->region_index = 1+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	/* mem-region8 (spa/dcr2, dimm2) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 8;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[2];
	memdev->physical_id = 2;
	memdev->region_id = 0;
	memdev->range_index = 4+1;
	memdev->region_index = 2+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	/* mem-region9 (spa/dcr3, dimm3) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 9;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[3];
	memdev->physical_id = 3;
	memdev->region_id = 0;
	memdev->range_index = 5+1;
	memdev->region_index = 3+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	/* mem-region10 (spa/bdw0, dimm0) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 10;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[0];
	memdev->physical_id = 0;
	memdev->region_id = 0;
	memdev->range_index = 6+1;
	memdev->region_index = 0+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	/* mem-region11 (spa/bdw1, dimm1) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 11;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[1];
	memdev->physical_id = 1;
	memdev->region_id = 0;
	memdev->range_index = 7+1;
	memdev->region_index = 1+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	/* mem-region12 (spa/bdw2, dimm2) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 12;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[2];
	memdev->physical_id = 2;
	memdev->region_id = 0;
	memdev->range_index = 8+1;
	memdev->region_index = 2+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	/* mem-region13 (spa/bdw3, dimm3) */
	memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 13;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = handle[3];
	memdev->physical_id = 3;
	memdev->region_id = 0;
	memdev->range_index = 9+1;
	memdev->region_index = 3+1;
	memdev->region_size = 0;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;

	offset = offset + sizeof(struct acpi_nfit_memory_map) * 14;
	/* dcr-descriptor0 */
	dcr = nfit_buf + offset;
	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
	dcr->header.length = sizeof(struct acpi_nfit_control_region);
	dcr->region_index = 0+1;
	dcr->vendor_id = 0xabcd;
	dcr->device_id = 0;
	dcr->revision_id = 1;
	dcr->serial_number = ~handle[0];
	dcr->windows = 1;
	dcr->window_size = DCR_SIZE;
	dcr->command_offset = 0;
	dcr->command_size = 8;
	dcr->status_offset = 8;
	dcr->status_size = 4;

	/* dcr-descriptor1 */
	dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region);
	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
	dcr->header.length = sizeof(struct acpi_nfit_control_region);
	dcr->region_index = 1+1;
	dcr->vendor_id = 0xabcd;
	dcr->device_id = 0;
	dcr->revision_id = 1;
	dcr->serial_number = ~handle[1];
	dcr->windows = 1;
	dcr->window_size = DCR_SIZE;
	dcr->command_offset = 0;
	dcr->command_size = 8;
	dcr->status_offset = 8;
	dcr->status_size = 4;

	/* dcr-descriptor2 */
	dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 2;
	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
	dcr->header.length = sizeof(struct acpi_nfit_control_region);
	dcr->region_index = 2+1;
	dcr->vendor_id = 0xabcd;
	dcr->device_id = 0;
	dcr->revision_id = 1;
	dcr->serial_number = ~handle[2];
	dcr->windows = 1;
	dcr->window_size = DCR_SIZE;
	dcr->command_offset = 0;
	dcr->command_size = 8;
	dcr->status_offset = 8;
	dcr->status_size = 4;

	/* dcr-descriptor3 */
	dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 3;
	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
	dcr->header.length = sizeof(struct acpi_nfit_control_region);
	dcr->region_index = 3+1;
	dcr->vendor_id = 0xabcd;
	dcr->device_id = 0;
	dcr->revision_id = 1;
	dcr->serial_number = ~handle[3];
	dcr->windows = 1;
	dcr->window_size = DCR_SIZE;
	dcr->command_offset = 0;
	dcr->command_size = 8;
	dcr->status_offset = 8;
	dcr->status_size = 4;

	offset = offset + sizeof(struct acpi_nfit_control_region) * 4;
	/* bdw0 (spa/dcr0, dimm0) */
	bdw = nfit_buf + offset;
	bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
	bdw->header.length = sizeof(struct acpi_nfit_data_region);
	bdw->region_index = 0+1;
	bdw->windows = 1;
	bdw->offset = 0;
	bdw->size = BDW_SIZE;
	bdw->capacity = DIMM_SIZE;
	bdw->start_address = 0;

	/* bdw1 (spa/dcr1, dimm1) */
	bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region);
	bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
	bdw->header.length = sizeof(struct acpi_nfit_data_region);
	bdw->region_index = 1+1;
	bdw->windows = 1;
	bdw->offset = 0;
	bdw->size = BDW_SIZE;
	bdw->capacity = DIMM_SIZE;
	bdw->start_address = 0;

	/* bdw2 (spa/dcr2, dimm2) */
	bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 2;
	bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
	bdw->header.length = sizeof(struct acpi_nfit_data_region);
	bdw->region_index = 2+1;
	bdw->windows = 1;
	bdw->offset = 0;
	bdw->size = BDW_SIZE;
	bdw->capacity = DIMM_SIZE;
	bdw->start_address = 0;

	/* bdw3 (spa/dcr3, dimm3) */
	bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 3;
	bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
	bdw->header.length = sizeof(struct acpi_nfit_data_region);
	bdw->region_index = 3+1;
	bdw->windows = 1;
	bdw->offset = 0;
	bdw->size = BDW_SIZE;
	bdw->capacity = DIMM_SIZE;
	bdw->start_address = 0;

	/* force-enable the label commands and install the test backend */
	acpi_desc = &t->acpi_desc;
	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
	nd_desc = &acpi_desc->nd_desc;
	nd_desc->ndctl = nfit_test_ctl;
}
841
/*
 * Emit the minimal bus1 NFIT: one flat PMEM range, one memory-device
 * mapping, one window-less control region.  The mapping's flags mark
 * the dimm as save/restore/flush-failed, unhealthy and unarmed to
 * exercise the health/status reporting paths.
 */
static void nfit_test1_setup(struct nfit_test *t)
{
	size_t size = t->nfit_size, offset;
	void *nfit_buf = t->nfit_buf;
	struct acpi_nfit_memory_map *memdev;
	struct acpi_nfit_control_region *dcr;
	struct acpi_nfit_system_address *spa;

	nfit_test_init_header(nfit_buf, size);

	offset = sizeof(struct acpi_table_nfit);
	/* spa0 (flat range with no bdw aliasing) */
	spa = nfit_buf + offset;
	spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS;
	spa->header.length = sizeof(*spa);
	memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16);
	spa->range_index = 0+1;
	spa->address = t->spa_set_dma[0];
	spa->length = SPA2_SIZE;

	offset += sizeof(*spa);
	/* mem-region0 (spa0, dimm0) */
	memdev = nfit_buf + offset;
	memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
	memdev->header.length = sizeof(*memdev);
	memdev->device_handle = 0;
	memdev->physical_id = 0;
	memdev->region_id = 0;
	memdev->range_index = 0+1;
	memdev->region_index = 0+1;
	memdev->region_size = SPA2_SIZE;
	memdev->region_offset = 0;
	memdev->address = 0;
	memdev->interleave_index = 0;
	memdev->interleave_ways = 1;
	memdev->flags = ACPI_NFIT_MEM_SAVE_FAILED | ACPI_NFIT_MEM_RESTORE_FAILED
		| ACPI_NFIT_MEM_FLUSH_FAILED | ACPI_NFIT_MEM_HEALTH_OBSERVED
		| ACPI_NFIT_MEM_ARMED;

	offset += sizeof(*memdev);
	/* dcr-descriptor0: no block windows, label commands unsupported */
	dcr = nfit_buf + offset;
	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
	dcr->header.length = sizeof(struct acpi_nfit_control_region);
	dcr->region_index = 0+1;
	dcr->vendor_id = 0xabcd;
	dcr->device_id = 0;
	dcr->revision_id = 1;
	dcr->serial_number = ~0;
	dcr->code = 0x201;
	dcr->windows = 0;
	dcr->window_size = 0;
	dcr->command_offset = 0;
	dcr->command_size = 0;
	dcr->status_offset = 0;
	dcr->status_size = 0;
}
899
/*
 * BLK-mode I/O backend for the test bus.  The emulated aperture is
 * plain memory, so there is no block window to program — just memcpy
 * to/from the mapped base at the dimm-physical-address.  @rw non-zero
 * means write.  Lane acquisition mirrors the real driver's locking.
 */
static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
		void *iobuf, u64 len, int rw)
{
	struct nfit_blk *nfit_blk = ndbr->blk_provider_data;
	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
	struct nd_region *nd_region = &ndbr->nd_region;
	unsigned int lane;

	lane = nd_region_acquire_lane(nd_region);
	if (rw)
		memcpy(mmio->base + dpa, iobuf, len);
	else
		memcpy(iobuf, mmio->base + dpa, len);
	nd_region_release_lane(nd_region, lane);

	return 0;
}
917
/*
 * Platform probe: size the per-instance pointer arrays, run the
 * topology-specific alloc/setup hooks, then register an nvdimm bus and
 * parse the generated NFIT.  All array allocations are devm-managed so
 * error returns need no explicit unwinding.
 */
static int nfit_test_probe(struct platform_device *pdev)
{
	struct nvdimm_bus_descriptor *nd_desc;
	struct acpi_nfit_desc *acpi_desc;
	struct device *dev = &pdev->dev;
	struct nfit_test *nfit_test;
	int rc;

	nfit_test = to_nfit_test(&pdev->dev);

	/* common alloc */
	if (nfit_test->num_dcr) {
		int num = nfit_test->num_dcr;

		nfit_test->dimm = devm_kcalloc(dev, num, sizeof(void *),
				GFP_KERNEL);
		nfit_test->dimm_dma = devm_kcalloc(dev, num, sizeof(dma_addr_t),
				GFP_KERNEL);
		nfit_test->label = devm_kcalloc(dev, num, sizeof(void *),
				GFP_KERNEL);
		nfit_test->label_dma = devm_kcalloc(dev, num,
				sizeof(dma_addr_t), GFP_KERNEL);
		nfit_test->dcr = devm_kcalloc(dev, num,
				sizeof(struct nfit_test_dcr *), GFP_KERNEL);
		nfit_test->dcr_dma = devm_kcalloc(dev, num,
				sizeof(dma_addr_t), GFP_KERNEL);
		if (nfit_test->dimm && nfit_test->dimm_dma && nfit_test->label
				&& nfit_test->label_dma && nfit_test->dcr
				&& nfit_test->dcr_dma)
			/* pass */;
		else
			return -ENOMEM;
	}

	if (nfit_test->num_pm) {
		int num = nfit_test->num_pm;

		nfit_test->spa_set = devm_kcalloc(dev, num, sizeof(void *),
				GFP_KERNEL);
		nfit_test->spa_set_dma = devm_kcalloc(dev, num,
				sizeof(dma_addr_t), GFP_KERNEL);
		if (nfit_test->spa_set && nfit_test->spa_set_dma)
			/* pass */;
		else
			return -ENOMEM;
	}

	/* per-nfit specific alloc */
	if (nfit_test->alloc(nfit_test))
		return -ENOMEM;

	nfit_test->setup(nfit_test);
	acpi_desc = &nfit_test->acpi_desc;
	acpi_desc->dev = &pdev->dev;
	acpi_desc->nfit = nfit_test->nfit_buf;
	acpi_desc->blk_do_io = nfit_test_blk_do_io;
	nd_desc = &acpi_desc->nd_desc;
	nd_desc->attr_groups = acpi_nfit_attribute_groups;
	acpi_desc->nvdimm_bus = nvdimm_bus_register(&pdev->dev, nd_desc);
	if (!acpi_desc->nvdimm_bus)
		return -ENXIO;

	rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_size);
	if (rc) {
		/* tear the bus back down if NFIT parsing fails */
		nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
		return rc;
	}

	return 0;
}
988
989static int nfit_test_remove(struct platform_device *pdev)
990{
991 struct nfit_test *nfit_test = to_nfit_test(&pdev->dev);
992 struct acpi_nfit_desc *acpi_desc = &nfit_test->acpi_desc;
993
994 nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
995
996 return 0;
997}
998
/* device ->release(): frees the instance allocated in nfit_test_init() */
static void nfit_test_release(struct device *dev)
{
	kfree(to_nfit_test(dev));
}
1005
/* matches the platform devices registered by nfit_test_init() */
static const struct platform_device_id nfit_test_id[] = {
	{ KBUILD_MODNAME },
	{ },
};
1010
/* binds nfit_test_probe()/remove() to the test platform devices */
static struct platform_driver nfit_test_driver = {
	.probe = nfit_test_probe,
	.remove = nfit_test_remove,
	.driver = {
		.name = KBUILD_MODNAME,
	},
	.id_table = nfit_test_id,
};
1019
/*
 * NOTE(review): CMA_SIZE_MBYTES is defined but not referenced in this
 * file — the CMA availability probe in nfit_test_init() uses SZ_128M
 * directly.  Confirm whether this was meant to size that probe.
 */
#ifdef CONFIG_CMA_SIZE_MBYTES
#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
#else
#define CMA_SIZE_MBYTES 0
#endif
1025
1026static __init int nfit_test_init(void)
1027{
1028 int rc, i;
1029
1030 nfit_test_setup(nfit_test_lookup);
1031
1032 for (i = 0; i < NUM_NFITS; i++) {
1033 struct nfit_test *nfit_test;
1034 struct platform_device *pdev;
1035 static int once;
1036
1037 nfit_test = kzalloc(sizeof(*nfit_test), GFP_KERNEL);
1038 if (!nfit_test) {
1039 rc = -ENOMEM;
1040 goto err_register;
1041 }
1042 INIT_LIST_HEAD(&nfit_test->resources);
1043 switch (i) {
1044 case 0:
1045 nfit_test->num_pm = NUM_PM;
1046 nfit_test->num_dcr = NUM_DCR;
1047 nfit_test->alloc = nfit_test0_alloc;
1048 nfit_test->setup = nfit_test0_setup;
1049 break;
1050 case 1:
1051 nfit_test->num_pm = 1;
1052 nfit_test->alloc = nfit_test1_alloc;
1053 nfit_test->setup = nfit_test1_setup;
1054 break;
1055 default:
1056 rc = -EINVAL;
1057 goto err_register;
1058 }
1059 pdev = &nfit_test->pdev;
1060 pdev->name = KBUILD_MODNAME;
1061 pdev->id = i;
1062 pdev->dev.release = nfit_test_release;
1063 rc = platform_device_register(pdev);
1064 if (rc) {
1065 put_device(&pdev->dev);
1066 goto err_register;
1067 }
1068
1069 rc = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
1070 if (rc)
1071 goto err_register;
1072
1073 instances[i] = nfit_test;
1074
1075 if (!once++) {
1076 dma_addr_t dma;
1077 void *buf;
1078
1079 buf = dma_alloc_coherent(&pdev->dev, SZ_128M, &dma,
1080 GFP_KERNEL);
1081 if (!buf) {
1082 rc = -ENOMEM;
1083 dev_warn(&pdev->dev, "need 128M of free cma\n");
1084 goto err_register;
1085 }
1086 dma_free_coherent(&pdev->dev, SZ_128M, buf, dma);
1087 }
1088 }
1089
1090 rc = platform_driver_register(&nfit_test_driver);
1091 if (rc)
1092 goto err_register;
1093 return 0;
1094
1095 err_register:
1096 for (i = 0; i < NUM_NFITS; i++)
1097 if (instances[i])
1098 platform_device_unregister(&instances[i]->pdev);
1099 nfit_test_teardown();
1100 return rc;
1101}
1102
/*
 * Module exit: unwind init in reverse — driver, then each platform
 * device (release frees the instance), then the lookup hook.
 */
static __exit void nfit_test_exit(void)
{
	int i;

	platform_driver_unregister(&nfit_test_driver);
	for (i = 0; i < NUM_NFITS; i++)
		platform_device_unregister(&instances[i]->pdev);
	nfit_test_teardown();
}
1112
1113module_init(nfit_test_init);
1114module_exit(nfit_test_exit);
1115MODULE_LICENSE("GPL v2");
1116MODULE_AUTHOR("Intel Corporation");
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
new file mode 100644
index 000000000000..96c5e16d7db9
--- /dev/null
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#ifndef __NFIT_TEST_H__
14#define __NFIT_TEST_H__
15
/*
 * One interposed test allocation: @res describes the fake physical
 * range (a second scratch entry follows it for region-request
 * emulation), @buf is the kernel virtual backing store, and @dev owns
 * the allocation for devm cleanup.
 */
struct nfit_test_resource {
	struct list_head list;
	struct resource *res;
	struct device *dev;
	void *buf;
};
22
/* lookup callback: resolve a physical address to a test resource */
typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t);
/* ioremap/iounmap interposers provided by the wrapper module */
void __iomem *__wrap_ioremap_nocache(resource_size_t offset,
		unsigned long size);
void __wrap_iounmap(volatile void __iomem *addr);
/* install/remove the lookup used by the interposed io routines */
void nfit_test_setup(nfit_test_lookup_fn lookup);
void nfit_test_teardown(void);
29#endif