author     Linus Torvalds <torvalds@linux-foundation.org>    2017-11-17 12:51:57 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2017-11-17 12:51:57 -0500
commit     a3841f94c7ecb3ede0f888d3fcfe8fb6368ddd7a (patch)
tree       6625eedf10d0672068ee218bb893a5a0e1803df2
parent     adeba81ac2a6451f44545874da3d181081f0ab04 (diff)
parent     4247f24c23589bcc3bc3490515ef8c9497e9ae55 (diff)
Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm and dax updates from Dan Williams:
 "Save for a few late fixes, all of these commits have shipped in -next
  releases since before the merge window opened, and 0day has given a
  build success notification.

  The ext4 touches came from Jan, and the xfs touches have Darrick's
  reviewed-by. An xfstest for the MAP_SYNC feature has been through a
  few rounds of reviews and is on track to be merged.

   - Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
     'userspace flush' of persistent memory updates via filesystem-dax
     mappings. It arranges for any filesystem metadata updates that may
     be required to satisfy a write fault to also be flushed ("on disk")
     before the kernel returns to userspace from the fault handler.
     Effectively every write-fault that dirties metadata completes an
     fsync() before returning from the fault handler. The new
     MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag
     is validated as supported by the filesystem's ->mmap() file
     operation.

   - Add support for the standard ACPI 6.2 label access methods that
     replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods.
     This enables interoperability with environments that only implement
     the standardized methods.

   - Add support for the ACPI 6.2 NVDIMM media error injection methods.

   - Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for
     latch last shutdown status, firmware update, SMART error injection,
     and SMART alarm threshold control.

   - Cleanup physical address information disclosures to be root-only.

   - Fix revalidation of the DIMM "locked label area" status to support
     dynamic unlock of the label area.

   - Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
     (system-physical-address) command and error injection commands.

  Acknowledgements that came after the commits were pushed to -next:

   - 957ac8c421ad ("dax: fix PMD faults on zero-length files"):
     Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

   - a39e596baa07 ("xfs: support for synchronous DAX faults") and
     7b565c9f965b ("xfs: Implement xfs_filemap_pfn_mkwrite() using
     __xfs_filemap_fault()")
     Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>"

* tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits)
  acpi, nfit: add 'Enable Latch System Shutdown Status' command support
  dax: fix general protection fault in dax_alloc_inode
  dax: fix PMD faults on zero-length files
  dax: stop requiring a live device for dax_flush()
  brd: remove dax support
  dax: quiet bdev_dax_supported()
  fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core
  tools/testing/nvdimm: unit test clear-error commands
  acpi, nfit: validate commands against the device type
  tools/testing/nvdimm: stricter bounds checking for error injection commands
  xfs: support for synchronous DAX faults
  xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
  ext4: Support for synchronous DAX faults
  ext4: Simplify error handling in ext4_dax_huge_fault()
  dax: Implement dax_finish_sync_fault()
  dax, iomap: Add support for synchronous faults
  mm: Define MAP_SYNC and VM_SYNC flags
  dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
  dax: Allow dax_iomap_fault() to return pfn
  dax: Fix comment describing dax_iomap_fault()
  ...
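The MAP_SYNC / MAP_SHARED_VALIDATE behaviour described above is consumed from userspace through mmap(). The sketch below is illustrative only and is not part of this pull: the pmem mount point is hypothetical, and the fallback flag values are the ones this series adds to the asm-generic uapi headers.

/*
 * Illustrative sketch (not part of this series): map a file on a DAX
 * filesystem with MAP_SYNC so that a write fault returns only after any
 * required filesystem metadata is durable.  Flag values below are the
 * ones added by this series to include/uapi/asm-generic/mman*.h; older
 * libc headers may not define them yet.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif

int main(void)
{
	/* assumed path: a file on an ext4/xfs filesystem mounted with -o dax */
	int fd = open("/mnt/pmem/data", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	void *addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (addr == MAP_FAILED) {
		/* EOPNOTSUPP: the filesystem (or kernel) cannot honor MAP_SYNC */
		perror("mmap");
		return 1;
	}

	/*
	 * Stores become durable once the CPU cache is flushed (e.g. via
	 * msync() or a userspace flush library); MAP_SYNC covers the
	 * metadata side, so no fsync() is needed afterwards.
	 */
	strcpy((char *)addr, "hello, persistent world");
	msync(addr, 4096, MS_SYNC);

	munmap(addr, 4096);
	close(fd);
	return 0;
}

On filesystems whose ->mmap() does not support the flag, the MAP_SHARED_VALIDATE mapping type makes the call fail instead of silently ignoring MAP_SYNC.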
Diffstat:
-rw-r--r--	MAINTAINERS	8
-rw-r--r--	arch/alpha/include/uapi/asm/mman.h	1
-rw-r--r--	arch/mips/include/uapi/asm/mman.h	1
-rw-r--r--	arch/parisc/include/uapi/asm/mman.h	1
-rw-r--r--	arch/xtensa/include/uapi/asm/mman.h	1
-rw-r--r--	drivers/acpi/nfit/core.c	274
-rw-r--r--	drivers/acpi/nfit/mce.c	2
-rw-r--r--	drivers/acpi/nfit/nfit.h	37
-rw-r--r--	drivers/block/Kconfig	12
-rw-r--r--	drivers/block/brd.c	65
-rw-r--r--	drivers/dax/device.c	3
-rw-r--r--	drivers/dax/super.c	14
-rw-r--r--	drivers/nvdimm/Makefile	1
-rw-r--r--	drivers/nvdimm/badrange.c	293
-rw-r--r--	drivers/nvdimm/bus.c	24
-rw-r--r--	drivers/nvdimm/core.c	260
-rw-r--r--	drivers/nvdimm/dimm.c	3
-rw-r--r--	drivers/nvdimm/dimm_devs.c	19
-rw-r--r--	drivers/nvdimm/label.c	2
-rw-r--r--	drivers/nvdimm/namespace_devs.c	6
-rw-r--r--	drivers/nvdimm/nd-core.h	3
-rw-r--r--	drivers/nvdimm/nd.h	7
-rw-r--r--	drivers/nvdimm/pfn_devs.c	8
-rw-r--r--	drivers/nvdimm/region_devs.c	8
-rw-r--r--	fs/dax.c	319
-rw-r--r--	fs/ext2/file.c	2
-rw-r--r--	fs/ext4/file.c	26
-rw-r--r--	fs/ext4/inode.c	15
-rw-r--r--	fs/jbd2/journal.c	17
-rw-r--r--	fs/proc/task_mmu.c	1
-rw-r--r--	fs/xfs/xfs_file.c	44
-rw-r--r--	fs/xfs/xfs_iomap.c	5
-rw-r--r--	fs/xfs/xfs_trace.h	2
-rw-r--r--	include/linux/dax.h	4
-rw-r--r--	include/linux/fs.h	1
-rw-r--r--	include/linux/iomap.h	4
-rw-r--r--	include/linux/jbd2.h	1
-rw-r--r--	include/linux/libnvdimm.h	21
-rw-r--r--	include/linux/mm.h	9
-rw-r--r--	include/linux/mman.h	48
-rw-r--r--	include/trace/events/fs_dax.h	3
-rw-r--r--	include/uapi/asm-generic/mman-common.h	1
-rw-r--r--	include/uapi/asm-generic/mman.h	1
-rw-r--r--	mm/mmap.c	15
-rw-r--r--	tools/include/uapi/asm-generic/mman-common.h	1
-rw-r--r--	tools/testing/nvdimm/Kbuild	1
-rw-r--r--	tools/testing/nvdimm/test/nfit.c	319
-rw-r--r--	tools/testing/nvdimm/test/nfit_test.h	52
48 files changed, 1405 insertions(+), 560 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 540762a62906..e04d108055f0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4208,7 +4208,7 @@ L: linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/busses/i2c-diolan-u2c.c
 
-DIRECT ACCESS (DAX)
+FILESYSTEM DIRECT ACCESS (DAX)
 M:	Matthew Wilcox <mawilcox@microsoft.com>
 M:	Ross Zwisler <ross.zwisler@linux.intel.com>
 L:	linux-fsdevel@vger.kernel.org
@@ -4217,6 +4217,12 @@ F: fs/dax.c
 F:	include/linux/dax.h
 F:	include/trace/events/fs_dax.h
 
+DEVICE DIRECT ACCESS (DAX)
+M:	Dan Williams <dan.j.williams@intel.com>
+L:	linux-nvdimm@lists.01.org
+S:	Supported
+F:	drivers/dax/
+
 DIRECTORY NOTIFICATION (DNOTIFY)
 M:	Jan Kara <jack@suse.cz>
 R:	Amir Goldstein <amir73il@gmail.com>
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 6bf730063e3f..2dbdf59258d9 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -12,6 +12,7 @@
 
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE	0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping (OSF/1 is _wrong_) */
 #define MAP_FIXED	0x100		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 20c3df7a8fdd..606e02ca4b6c 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -29,6 +29,7 @@
  */
 #define MAP_SHARED	0x001		/* Share changes */
 #define MAP_PRIVATE	0x002		/* Changes are private */
+#define MAP_SHARED_VALIDATE	0x003	/* share + validate extension flags */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */
 
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index d1af0d74a188..80510ba44c08 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -12,6 +12,7 @@
 
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE	0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x03		/* Mask for type of mapping */
 #define MAP_FIXED	0x04		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 2bfe590694fc..3e9d01ada81f 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -36,6 +36,7 @@
  */
 #define MAP_SHARED	0x001		/* Share changes */
 #define MAP_PRIVATE	0x002		/* Changes are private */
+#define MAP_SHARED_VALIDATE	0x003	/* share + validate extension flags */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */
 
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 9c2c49b6a240..ff2580e7611d 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -183,13 +183,33 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status)
 	return 0;
 }
 
-static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status)
+#define ACPI_LABELS_LOCKED 3
+
+static int xlat_nvdimm_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
+		u32 status)
 {
+	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
 	switch (cmd) {
 	case ND_CMD_GET_CONFIG_SIZE:
+		/*
+		 * In the _LSI, _LSR, _LSW case the locked status is
+		 * communicated via the read/write commands
+		 */
+		if (nfit_mem->has_lsi)
+			break;
+
 		if (status >> 16 & ND_CONFIG_LOCKED)
 			return -EACCES;
 		break;
+	case ND_CMD_GET_CONFIG_DATA:
+		if (nfit_mem->has_lsr && status == ACPI_LABELS_LOCKED)
+			return -EACCES;
+		break;
+	case ND_CMD_SET_CONFIG_DATA:
+		if (nfit_mem->has_lsw && status == ACPI_LABELS_LOCKED)
+			return -EACCES;
+		break;
 	default:
 		break;
 	}
@@ -205,13 +225,182 @@ static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
205{ 225{
206 if (!nvdimm) 226 if (!nvdimm)
207 return xlat_bus_status(buf, cmd, status); 227 return xlat_bus_status(buf, cmd, status);
208 return xlat_nvdimm_status(buf, cmd, status); 228 return xlat_nvdimm_status(nvdimm, buf, cmd, status);
229}
230
231/* convert _LS{I,R} packages to the buffer object acpi_nfit_ctl expects */
232static union acpi_object *pkg_to_buf(union acpi_object *pkg)
233{
234 int i;
235 void *dst;
236 size_t size = 0;
237 union acpi_object *buf = NULL;
238
239 if (pkg->type != ACPI_TYPE_PACKAGE) {
240 WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
241 pkg->type);
242 goto err;
243 }
244
245 for (i = 0; i < pkg->package.count; i++) {
246 union acpi_object *obj = &pkg->package.elements[i];
247
248 if (obj->type == ACPI_TYPE_INTEGER)
249 size += 4;
250 else if (obj->type == ACPI_TYPE_BUFFER)
251 size += obj->buffer.length;
252 else {
253 WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
254 obj->type);
255 goto err;
256 }
257 }
258
259 buf = ACPI_ALLOCATE(sizeof(*buf) + size);
260 if (!buf)
261 goto err;
262
263 dst = buf + 1;
264 buf->type = ACPI_TYPE_BUFFER;
265 buf->buffer.length = size;
266 buf->buffer.pointer = dst;
267 for (i = 0; i < pkg->package.count; i++) {
268 union acpi_object *obj = &pkg->package.elements[i];
269
270 if (obj->type == ACPI_TYPE_INTEGER) {
271 memcpy(dst, &obj->integer.value, 4);
272 dst += 4;
273 } else if (obj->type == ACPI_TYPE_BUFFER) {
274 memcpy(dst, obj->buffer.pointer, obj->buffer.length);
275 dst += obj->buffer.length;
276 }
277 }
278err:
279 ACPI_FREE(pkg);
280 return buf;
281}
282
283static union acpi_object *int_to_buf(union acpi_object *integer)
284{
285 union acpi_object *buf = ACPI_ALLOCATE(sizeof(*buf) + 4);
286 void *dst = NULL;
287
288 if (!buf)
289 goto err;
290
291 if (integer->type != ACPI_TYPE_INTEGER) {
292 WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
293 integer->type);
294 goto err;
295 }
296
297 dst = buf + 1;
298 buf->type = ACPI_TYPE_BUFFER;
299 buf->buffer.length = 4;
300 buf->buffer.pointer = dst;
301 memcpy(dst, &integer->integer.value, 4);
302err:
303 ACPI_FREE(integer);
304 return buf;
305}
306
307static union acpi_object *acpi_label_write(acpi_handle handle, u32 offset,
308 u32 len, void *data)
309{
310 acpi_status rc;
311 struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
312 struct acpi_object_list input = {
313 .count = 3,
314 .pointer = (union acpi_object []) {
315 [0] = {
316 .integer.type = ACPI_TYPE_INTEGER,
317 .integer.value = offset,
318 },
319 [1] = {
320 .integer.type = ACPI_TYPE_INTEGER,
321 .integer.value = len,
322 },
323 [2] = {
324 .buffer.type = ACPI_TYPE_BUFFER,
325 .buffer.pointer = data,
326 .buffer.length = len,
327 },
328 },
329 };
330
331 rc = acpi_evaluate_object(handle, "_LSW", &input, &buf);
332 if (ACPI_FAILURE(rc))
333 return NULL;
334 return int_to_buf(buf.pointer);
335}
336
337static union acpi_object *acpi_label_read(acpi_handle handle, u32 offset,
338 u32 len)
339{
340 acpi_status rc;
341 struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
342 struct acpi_object_list input = {
343 .count = 2,
344 .pointer = (union acpi_object []) {
345 [0] = {
346 .integer.type = ACPI_TYPE_INTEGER,
347 .integer.value = offset,
348 },
349 [1] = {
350 .integer.type = ACPI_TYPE_INTEGER,
351 .integer.value = len,
352 },
353 },
354 };
355
356 rc = acpi_evaluate_object(handle, "_LSR", &input, &buf);
357 if (ACPI_FAILURE(rc))
358 return NULL;
359 return pkg_to_buf(buf.pointer);
360}
361
362static union acpi_object *acpi_label_info(acpi_handle handle)
363{
364 acpi_status rc;
365 struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
366
367 rc = acpi_evaluate_object(handle, "_LSI", NULL, &buf);
368 if (ACPI_FAILURE(rc))
369 return NULL;
370 return pkg_to_buf(buf.pointer);
371}
372
373static u8 nfit_dsm_revid(unsigned family, unsigned func)
374{
375 static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = {
376 [NVDIMM_FAMILY_INTEL] = {
377 [NVDIMM_INTEL_GET_MODES] = 2,
378 [NVDIMM_INTEL_GET_FWINFO] = 2,
379 [NVDIMM_INTEL_START_FWUPDATE] = 2,
380 [NVDIMM_INTEL_SEND_FWUPDATE] = 2,
381 [NVDIMM_INTEL_FINISH_FWUPDATE] = 2,
382 [NVDIMM_INTEL_QUERY_FWUPDATE] = 2,
383 [NVDIMM_INTEL_SET_THRESHOLD] = 2,
384 [NVDIMM_INTEL_INJECT_ERROR] = 2,
385 },
386 };
387 u8 id;
388
389 if (family > NVDIMM_FAMILY_MAX)
390 return 0;
391 if (func > 31)
392 return 0;
393 id = revid_table[family][func];
394 if (id == 0)
395 return 1; /* default */
396 return id;
209} 397}
210 398
211int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, 399int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
212 unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) 400 unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc)
213{ 401{
214 struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc); 402 struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
403 struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
215 union acpi_object in_obj, in_buf, *out_obj; 404 union acpi_object in_obj, in_buf, *out_obj;
216 const struct nd_cmd_desc *desc = NULL; 405 const struct nd_cmd_desc *desc = NULL;
217 struct device *dev = acpi_desc->dev; 406 struct device *dev = acpi_desc->dev;
@@ -235,7 +424,6 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 	}
 
 	if (nvdimm) {
-		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 		struct acpi_device *adev = nfit_mem->adev;
 
 		if (!adev)
@@ -294,7 +482,29 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 				in_buf.buffer.pointer,
 				min_t(u32, 256, in_buf.buffer.length), true);
 
-	out_obj = acpi_evaluate_dsm(handle, guid, 1, func, &in_obj);
+	/* call the BIOS, prefer the named methods over _DSM if available */
+	if (nvdimm && cmd == ND_CMD_GET_CONFIG_SIZE && nfit_mem->has_lsi)
+		out_obj = acpi_label_info(handle);
+	else if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && nfit_mem->has_lsr) {
+		struct nd_cmd_get_config_data_hdr *p = buf;
+
+		out_obj = acpi_label_read(handle, p->in_offset, p->in_length);
+	} else if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA
+			&& nfit_mem->has_lsw) {
+		struct nd_cmd_set_config_hdr *p = buf;
+
+		out_obj = acpi_label_write(handle, p->in_offset, p->in_length,
+				p->in_buf);
+	} else {
+		u8 revid;
+
+		if (nvdimm)
+			revid = nfit_dsm_revid(nfit_mem->family, func);
+		else
+			revid = 1;
+		out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj);
+	}
+
 	if (!out_obj) {
 		dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
 				cmd_name);
@@ -356,8 +566,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
 	 * Set fw_status for all the commands with a known format to be
 	 * later interpreted by xlat_status().
 	 */
-	if (i >= 1 && ((cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR)
-			|| (cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR)))
+	if (i >= 1 && ((!nvdimm && cmd >= ND_CMD_ARS_CAP
+					&& cmd <= ND_CMD_CLEAR_ERROR)
+				|| (nvdimm && cmd >= ND_CMD_SMART
+					&& cmd <= ND_CMD_VENDOR)))
 		fw_status = *(u32 *) out_obj->buffer.pointer;
 
 	if (offset + in_buf.buffer.length < buf_len) {
@@ -1431,6 +1643,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 {
 	struct acpi_device *adev, *adev_dimm;
 	struct device *dev = acpi_desc->dev;
+	union acpi_object *obj;
 	unsigned long dsm_mask;
 	const guid_t *guid;
 	int i;
@@ -1463,7 +1676,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	 * different command sets. Note, that checking for function0 (bit0)
 	 * tells us if any commands are reachable through this GUID.
 	 */
-	for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++)
+	for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
 		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
 			if (family < 0 || i == default_dsm_family)
 				family = i;
@@ -1473,7 +1686,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	if (override_dsm_mask && !disable_vendor_specific)
 		dsm_mask = override_dsm_mask;
 	else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
-		dsm_mask = 0x3fe;
+		dsm_mask = NVDIMM_INTEL_CMDMASK;
 		if (disable_vendor_specific)
 			dsm_mask &= ~(1 << ND_CMD_VENDOR);
 	} else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) {
@@ -1493,9 +1706,32 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 
 	guid = to_nfit_uuid(nfit_mem->family);
 	for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
-		if (acpi_check_dsm(adev_dimm->handle, guid, 1, 1ULL << i))
+		if (acpi_check_dsm(adev_dimm->handle, guid,
+					nfit_dsm_revid(nfit_mem->family, i),
+					1ULL << i))
 			set_bit(i, &nfit_mem->dsm_mask);
 
+	obj = acpi_label_info(adev_dimm->handle);
+	if (obj) {
+		ACPI_FREE(obj);
+		nfit_mem->has_lsi = 1;
+		dev_dbg(dev, "%s: has _LSI\n", dev_name(&adev_dimm->dev));
+	}
+
+	obj = acpi_label_read(adev_dimm->handle, 0, 0);
+	if (obj) {
+		ACPI_FREE(obj);
+		nfit_mem->has_lsr = 1;
+		dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev));
+	}
+
+	obj = acpi_label_write(adev_dimm->handle, 0, 0, NULL);
+	if (obj) {
+		ACPI_FREE(obj);
+		nfit_mem->has_lsw = 1;
+		dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev));
+	}
+
 	return 0;
 }
 
@@ -1571,8 +1807,21 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		 * userspace interface.
 		 */
 		cmd_mask = 1UL << ND_CMD_CALL;
-		if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
-			cmd_mask |= nfit_mem->dsm_mask;
+		if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
+			/*
+			 * These commands have a 1:1 correspondence
+			 * between DSM payload and libnvdimm ioctl
+			 * payload format.
+			 */
+			cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
+		}
+
+		if (nfit_mem->has_lsi)
+			set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
+		if (nfit_mem->has_lsr)
+			set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
+		if (nfit_mem->has_lsw)
+			set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
 
 		flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
 				: NULL;
@@ -1645,6 +1894,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 	int i;
 
 	nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
+	nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
 	adev = to_acpi_dev(acpi_desc);
 	if (!adev)
 		return;
@@ -2239,7 +2489,7 @@ static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc,
 		if (ars_status->out_length
 				< 44 + sizeof(struct nd_ars_record) * (i + 1))
 			break;
-		rc = nvdimm_bus_add_poison(nvdimm_bus,
+		rc = nvdimm_bus_add_badrange(nvdimm_bus,
 				ars_status->records[i].err_address,
 				ars_status->records[i].length);
 		if (rc)
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index feeb95d574fa..b92921439657 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
 			continue;
 
 		/* If this fails due to an -ENOMEM, there is little we can do */
-		nvdimm_bus_add_poison(acpi_desc->nvdimm_bus,
+		nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
 				ALIGN(mce->addr, L1_CACHE_BYTES),
 				L1_CACHE_BYTES);
 		nvdimm_region_notify(nfit_spa->nd_region,
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index 54292db61262..f0cf18b2da8b 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -24,7 +24,7 @@
 /* ACPI 6.1 */
 #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
 
-/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */
+/* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */
 #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
 
 /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */
@@ -38,6 +38,37 @@
 	| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 	| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
+
+#define NVDIMM_STANDARD_CMDMASK \
+(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
+ | 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \
+ | 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \
+ | 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR)
+
+/*
+ * Command numbers that the kernel needs to know about to handle
+ * non-default DSM revision ids
+ */
+enum nvdimm_family_cmds {
+	NVDIMM_INTEL_LATCH_SHUTDOWN = 10,
+	NVDIMM_INTEL_GET_MODES = 11,
+	NVDIMM_INTEL_GET_FWINFO = 12,
+	NVDIMM_INTEL_START_FWUPDATE = 13,
+	NVDIMM_INTEL_SEND_FWUPDATE = 14,
+	NVDIMM_INTEL_FINISH_FWUPDATE = 15,
+	NVDIMM_INTEL_QUERY_FWUPDATE = 16,
+	NVDIMM_INTEL_SET_THRESHOLD = 17,
+	NVDIMM_INTEL_INJECT_ERROR = 18,
+};
+
+#define NVDIMM_INTEL_CMDMASK \
+(NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \
+ | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \
+ | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \
+ | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \
+ | 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN)
+
 enum nfit_uuids {
 	/* for simplicity alias the uuid index with the family id */
 	NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
@@ -140,6 +171,9 @@ struct nfit_mem {
 	struct resource *flush_wpq;
 	unsigned long dsm_mask;
 	int family;
+	u32 has_lsi:1;
+	u32 has_lsr:1;
+	u32 has_lsw:1;
 };
 
 struct acpi_nfit_desc {
@@ -167,6 +201,7 @@ struct acpi_nfit_desc {
 	unsigned int init_complete:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
+	unsigned long bus_nfit_cmd_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 };
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 923b417eaf4c..40579d0cb3d1 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -302,7 +302,6 @@ config BLK_DEV_SX8
 
 config BLK_DEV_RAM
 	tristate "RAM block device support"
-	select DAX if BLK_DEV_RAM_DAX
 	---help---
 	  Saying Y here will allow you to use a portion of your RAM memory as
 	  a block device, so that you can make file systems on it, read and
@@ -338,17 +337,6 @@ config BLK_DEV_RAM_SIZE
 	  The default value is 4096 kilobytes. Only change this if you know
 	  what you are doing.
 
-config BLK_DEV_RAM_DAX
-	bool "Support Direct Access (DAX) to RAM block devices"
-	depends on BLK_DEV_RAM && FS_DAX
-	default n
-	help
-	  Support filesystems using DAX to access RAM block devices. This
-	  avoids double-buffering data in the page cache before copying it
-	  to the block device. Answering Y will slightly enlarge the kernel,
-	  and will prevent RAM block device backing store memory from being
-	  allocated from highmem (only a problem for highmem systems).
-
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media (DEPRECATED)"
 	depends on !UML
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 588360d79fca..8028a3a7e7fd 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -21,11 +21,6 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-#include <linux/pfn_t.h>
-#include <linux/dax.h>
-#include <linux/uio.h>
-#endif
 
 #include <linux/uaccess.h>
 
@@ -45,9 +40,6 @@ struct brd_device {
 
 	struct request_queue *brd_queue;
 	struct gendisk *brd_disk;
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	struct dax_device *dax_dev;
-#endif
 	struct list_head brd_list;
 
 	/*
@@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 	 * restriction might be able to be lifted.
 	 */
 	gfp_flags = GFP_NOIO | __GFP_ZERO;
-#ifndef CONFIG_BLK_DEV_RAM_DAX
-	gfp_flags |= __GFP_HIGHMEM;
-#endif
 	page = alloc_page(gfp_flags);
 	if (!page)
 		return NULL;
@@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 	return err;
 }
 
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
-{
-	struct page *page;
-
-	if (!brd)
-		return -ENODEV;
-	page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT);
-	if (!page)
-		return -ENOSPC;
-	*kaddr = page_address(page);
-	*pfn = page_to_pfn_t(page);
-
-	return 1;
-}
-
-static long brd_dax_direct_access(struct dax_device *dax_dev,
-		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
-{
-	struct brd_device *brd = dax_get_private(dax_dev);
-
-	return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
-}
-
-static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
-		void *addr, size_t bytes, struct iov_iter *i)
-{
-	return copy_from_iter(addr, bytes, i);
-}
-
-static const struct dax_operations brd_dax_ops = {
-	.direct_access = brd_dax_direct_access,
-	.copy_from_iter = brd_dax_copy_from_iter,
-};
-#endif
-
 static const struct block_device_operations brd_fops = {
 	.owner = THIS_MODULE,
 	.rw_page = brd_rw_page,
@@ -451,21 +403,8 @@ static struct brd_device *brd_alloc(int i)
 	set_capacity(disk, rd_size * 2);
 	disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
 
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
-	brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
-	if (!brd->dax_dev)
-		goto out_free_inode;
-#endif
-
-
 	return brd;
 
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-out_free_inode:
-	kill_dax(brd->dax_dev);
-	put_dax(brd->dax_dev);
-#endif
 out_free_queue:
 	blk_cleanup_queue(brd->brd_queue);
 out_free_dev:
@@ -505,10 +444,6 @@ out:
 static void brd_del_one(struct brd_device *brd)
 {
 	list_del(&brd->brd_list);
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	kill_dax(brd->dax_dev);
-	put_dax(brd->dax_dev);
-#endif
 	del_gendisk(brd->brd_disk);
 	brd_free(brd);
 }
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index e9f3b3e4bbf4..6833ada237ab 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 		unsigned long size)
 {
 	struct resource *res;
-	phys_addr_t phys;
+	/* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
+	phys_addr_t uninitialized_var(phys);
 	int i;
 
 	for (i = 0; i < dev_dax->num_resources; i++) {
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 557b93703532..3ec804672601 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	long len;
 
 	if (blocksize != PAGE_SIZE) {
-		pr_err("VFS (%s): error: unsupported blocksize for dax\n",
+		pr_debug("VFS (%s): error: unsupported blocksize for dax\n",
 				sb->s_id);
 		return -EINVAL;
 	}
 
 	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
 	if (err) {
-		pr_err("VFS (%s): error: unaligned partition for dax\n",
+		pr_debug("VFS (%s): error: unaligned partition for dax\n",
 				sb->s_id);
 		return err;
 	}
 
 	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	if (!dax_dev) {
-		pr_err("VFS (%s): error: device does not support dax\n",
+		pr_debug("VFS (%s): error: device does not support dax\n",
 				sb->s_id);
 		return -EOPNOTSUPP;
 	}
@@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	put_dax(dax_dev);
 
 	if (len < 1) {
-		pr_err("VFS (%s): error: dax access failed (%ld)",
+		pr_debug("VFS (%s): error: dax access failed (%ld)\n",
 				sb->s_id, len);
 		return len < 0 ? len : -EIO;
 	}
@@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
 {
-	if (unlikely(!dax_alive(dax_dev)))
-		return;
-
 	if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
 		return;
 
@@ -344,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb)
 	struct inode *inode;
 
 	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
+	if (!dax_dev)
+		return NULL;
+
 	inode = &dax_dev->inode;
 	inode->i_rdev = 0;
 	return inode;
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 447e0e14f3b6..70d5f3ad9909 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -21,6 +21,7 @@ libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
 libnvdimm-y += namespace_devs.o
 libnvdimm-y += label.o
+libnvdimm-y += badrange.o
 libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
 libnvdimm-$(CONFIG_BTT) += btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c
new file mode 100644
index 000000000000..e068d72b4357
--- /dev/null
+++ b/drivers/nvdimm/badrange.c
@@ -0,0 +1,293 @@
1/*
2 * Copyright(c) 2017 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/libnvdimm.h>
14#include <linux/badblocks.h>
15#include <linux/export.h>
16#include <linux/module.h>
17#include <linux/blkdev.h>
18#include <linux/device.h>
19#include <linux/ctype.h>
20#include <linux/ndctl.h>
21#include <linux/mutex.h>
22#include <linux/slab.h>
23#include <linux/io.h>
24#include "nd-core.h"
25#include "nd.h"
26
27void badrange_init(struct badrange *badrange)
28{
29 INIT_LIST_HEAD(&badrange->list);
30 spin_lock_init(&badrange->lock);
31}
32EXPORT_SYMBOL_GPL(badrange_init);
33
34static void append_badrange_entry(struct badrange *badrange,
35 struct badrange_entry *bre, u64 addr, u64 length)
36{
37 lockdep_assert_held(&badrange->lock);
38 bre->start = addr;
39 bre->length = length;
40 list_add_tail(&bre->list, &badrange->list);
41}
42
43static int alloc_and_append_badrange_entry(struct badrange *badrange,
44 u64 addr, u64 length, gfp_t flags)
45{
46 struct badrange_entry *bre;
47
48 bre = kzalloc(sizeof(*bre), flags);
49 if (!bre)
50 return -ENOMEM;
51
52 append_badrange_entry(badrange, bre, addr, length);
53 return 0;
54}
55
56static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
57{
58 struct badrange_entry *bre, *bre_new;
59
60 spin_unlock(&badrange->lock);
61 bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL);
62 spin_lock(&badrange->lock);
63
64 if (list_empty(&badrange->list)) {
65 if (!bre_new)
66 return -ENOMEM;
67 append_badrange_entry(badrange, bre_new, addr, length);
68 return 0;
69 }
70
71 /*
72 * There is a chance this is a duplicate, check for those first.
73 * This will be the common case as ARS_STATUS returns all known
74 * errors in the SPA space, and we can't query it per region
75 */
76 list_for_each_entry(bre, &badrange->list, list)
77 if (bre->start == addr) {
78 /* If length has changed, update this list entry */
79 if (bre->length != length)
80 bre->length = length;
81 kfree(bre_new);
82 return 0;
83 }
84
85 /*
86 * If not a duplicate or a simple length update, add the entry as is,
87 * as any overlapping ranges will get resolved when the list is consumed
88 * and converted to badblocks
89 */
90 if (!bre_new)
91 return -ENOMEM;
92 append_badrange_entry(badrange, bre_new, addr, length);
93
94 return 0;
95}
96
97int badrange_add(struct badrange *badrange, u64 addr, u64 length)
98{
99 int rc;
100
101 spin_lock(&badrange->lock);
102 rc = add_badrange(badrange, addr, length);
103 spin_unlock(&badrange->lock);
104
105 return rc;
106}
107EXPORT_SYMBOL_GPL(badrange_add);
108
109void badrange_forget(struct badrange *badrange, phys_addr_t start,
110 unsigned int len)
111{
112 struct list_head *badrange_list = &badrange->list;
113 u64 clr_end = start + len - 1;
114 struct badrange_entry *bre, *next;
115
116 spin_lock(&badrange->lock);
117
118 /*
119 * [start, clr_end] is the badrange interval being cleared.
120 * [bre->start, bre_end] is the badrange_list entry we're comparing
121 * the above interval against. The badrange list entry may need
122 * to be modified (update either start or length), deleted, or
123 * split into two based on the overlap characteristics
124 */
125
126 list_for_each_entry_safe(bre, next, badrange_list, list) {
127 u64 bre_end = bre->start + bre->length - 1;
128
129 /* Skip intervals with no intersection */
130 if (bre_end < start)
131 continue;
132 if (bre->start > clr_end)
133 continue;
134 /* Delete completely overlapped badrange entries */
135 if ((bre->start >= start) && (bre_end <= clr_end)) {
136 list_del(&bre->list);
137 kfree(bre);
138 continue;
139 }
140 /* Adjust start point of partially cleared entries */
141 if ((start <= bre->start) && (clr_end > bre->start)) {
142 bre->length -= clr_end - bre->start + 1;
143 bre->start = clr_end + 1;
144 continue;
145 }
146 /* Adjust bre->length for partial clearing at the tail end */
147 if ((bre->start < start) && (bre_end <= clr_end)) {
148 /* bre->start remains the same */
149 bre->length = start - bre->start;
150 continue;
151 }
152 /*
153 * If clearing in the middle of an entry, we split it into
154 * two by modifying the current entry to represent one half of
155 * the split, and adding a new entry for the second half.
156 */
157 if ((bre->start < start) && (bre_end > clr_end)) {
158 u64 new_start = clr_end + 1;
159 u64 new_len = bre_end - new_start + 1;
160
161 /* Add new entry covering the right half */
162 alloc_and_append_badrange_entry(badrange, new_start,
163 new_len, GFP_NOWAIT);
164 /* Adjust this entry to cover the left half */
165 bre->length = start - bre->start;
166 continue;
167 }
168 }
169 spin_unlock(&badrange->lock);
170}
171EXPORT_SYMBOL_GPL(badrange_forget);
172
173static void set_badblock(struct badblocks *bb, sector_t s, int num)
174{
175 dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
176 (u64) s * 512, (u64) num * 512);
177 /* this isn't an error as the hardware will still throw an exception */
178 if (badblocks_set(bb, s, num, 1))
179 dev_info_once(bb->dev, "%s: failed for sector %llx\n",
180 __func__, (u64) s);
181}
182
183/**
184 * __add_badblock_range() - Convert a physical address range to bad sectors
185 * @bb: badblocks instance to populate
186 * @ns_offset: namespace offset where the error range begins (in bytes)
187 * @len: number of bytes of badrange to be added
188 *
189 * This assumes that the range provided with (ns_offset, len) is within
190 * the bounds of physical addresses for this namespace, i.e. lies in the
191 * interval [ns_start, ns_start + ns_size)
192 */
193static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
194{
195 const unsigned int sector_size = 512;
196 sector_t start_sector, end_sector;
197 u64 num_sectors;
198 u32 rem;
199
200 start_sector = div_u64(ns_offset, sector_size);
201 end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
202 if (rem)
203 end_sector++;
204 num_sectors = end_sector - start_sector;
205
206 if (unlikely(num_sectors > (u64)INT_MAX)) {
207 u64 remaining = num_sectors;
208 sector_t s = start_sector;
209
210 while (remaining) {
211 int done = min_t(u64, remaining, INT_MAX);
212
213 set_badblock(bb, s, done);
214 remaining -= done;
215 s += done;
216 }
217 } else
218 set_badblock(bb, start_sector, num_sectors);
219}
220
221static void badblocks_populate(struct badrange *badrange,
222 struct badblocks *bb, const struct resource *res)
223{
224 struct badrange_entry *bre;
225
226 if (list_empty(&badrange->list))
227 return;
228
229 list_for_each_entry(bre, &badrange->list, list) {
230 u64 bre_end = bre->start + bre->length - 1;
231
232 /* Discard intervals with no intersection */
233 if (bre_end < res->start)
234 continue;
235 if (bre->start > res->end)
236 continue;
237 /* Deal with any overlap after start of the namespace */
238 if (bre->start >= res->start) {
239 u64 start = bre->start;
240 u64 len;
241
242 if (bre_end <= res->end)
243 len = bre->length;
244 else
245 len = res->start + resource_size(res)
246 - bre->start;
247 __add_badblock_range(bb, start - res->start, len);
248 continue;
249 }
250 /*
251 * Deal with overlap for badrange starting before
252 * the namespace.
253 */
254 if (bre->start < res->start) {
255 u64 len;
256
257 if (bre_end < res->end)
258 len = bre->start + bre->length - res->start;
259 else
260 len = resource_size(res);
261 __add_badblock_range(bb, 0, len);
262 }
263 }
264}
265
266/**
267 * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
268 * @region: parent region of the range to interrogate
269 * @bb: badblocks instance to populate
270 * @res: resource range to consider
271 *
272 * The badrange list generated during bus initialization may contain
273 * multiple, possibly overlapping physical address ranges. Compare each
274 * of these ranges to the resource range currently being initialized,
275 * and add badblocks entries for all matching sub-ranges
276 */
277void nvdimm_badblocks_populate(struct nd_region *nd_region,
278 struct badblocks *bb, const struct resource *res)
279{
280 struct nvdimm_bus *nvdimm_bus;
281
282 if (!is_memory(&nd_region->dev)) {
283 dev_WARN_ONCE(&nd_region->dev, 1,
284 "%s only valid for pmem regions\n", __func__);
285 return;
286 }
287 nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
288
289 nvdimm_bus_lock(&nvdimm_bus->dev);
290 badblocks_populate(&nvdimm_bus->badrange, bb, res);
291 nvdimm_bus_unlock(&nvdimm_bus->dev);
292}
293EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index baf283986a7e..0a5e6cd758fe 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,6 +11,7 @@
  * General Public License for more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/libnvdimm.h>
 #include <linux/sched/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
@@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
 		phys_addr_t phys, u64 cleared)
 {
 	if (cleared > 0)
-		nvdimm_forget_poison(nvdimm_bus, phys, cleared);
+		badrange_forget(&nvdimm_bus->badrange, phys, cleared);
 
 	if (cleared > 0 && cleared / 512)
 		nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
@@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		return NULL;
 	INIT_LIST_HEAD(&nvdimm_bus->list);
 	INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
-	INIT_LIST_HEAD(&nvdimm_bus->poison_list);
 	init_waitqueue_head(&nvdimm_bus->probe_wait);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	mutex_init(&nvdimm_bus->reconfig_mutex);
-	spin_lock_init(&nvdimm_bus->poison_lock);
+	badrange_init(&nvdimm_bus->badrange);
 	if (nvdimm_bus->id < 0) {
 		kfree(nvdimm_bus);
 		return NULL;
@@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data)
 	return 0;
 }
 
-static void free_poison_list(struct list_head *poison_list)
+static void free_badrange_list(struct list_head *badrange_list)
 {
-	struct nd_poison *pl, *next;
+	struct badrange_entry *bre, *next;
 
-	list_for_each_entry_safe(pl, next, poison_list, list) {
-		list_del(&pl->list);
-		kfree(pl);
+	list_for_each_entry_safe(bre, next, badrange_list, list) {
+		list_del(&bre->list);
+		kfree(bre);
 	}
-	list_del_init(poison_list);
+	list_del_init(badrange_list);
 }
 
 static int nd_bus_remove(struct device *dev)
@@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev)
 	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
 
-	spin_lock(&nvdimm_bus->poison_lock);
-	free_poison_list(&nvdimm_bus->poison_list);
-	spin_unlock(&nvdimm_bus->poison_lock);
+	spin_lock(&nvdimm_bus->badrange.lock);
+	free_badrange_list(&nvdimm_bus->badrange.list);
+	spin_unlock(&nvdimm_bus->badrange.lock);
 
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index bb71f0cf8f5d..1dc527660637 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = {
398}; 398};
399EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); 399EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
400 400
401static void set_badblock(struct badblocks *bb, sector_t s, int num) 401int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
402{ 402{
403 dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", 403 return badrange_add(&nvdimm_bus->badrange, addr, length);
404 (u64) s * 512, (u64) num * 512);
405 /* this isn't an error as the hardware will still throw an exception */
406 if (badblocks_set(bb, s, num, 1))
407 dev_info_once(bb->dev, "%s: failed for sector %llx\n",
408 __func__, (u64) s);
409} 404}
410 405EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange);
411/**
412 * __add_badblock_range() - Convert a physical address range to bad sectors
413 * @bb: badblocks instance to populate
414 * @ns_offset: namespace offset where the error range begins (in bytes)
415 * @len: number of bytes of poison to be added
416 *
417 * This assumes that the range provided with (ns_offset, len) is within
418 * the bounds of physical addresses for this namespace, i.e. lies in the
419 * interval [ns_start, ns_start + ns_size)
420 */
421static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
422{
423 const unsigned int sector_size = 512;
424 sector_t start_sector, end_sector;
425 u64 num_sectors;
426 u32 rem;
427
428 start_sector = div_u64(ns_offset, sector_size);
429 end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
430 if (rem)
431 end_sector++;
432 num_sectors = end_sector - start_sector;
433
434 if (unlikely(num_sectors > (u64)INT_MAX)) {
435 u64 remaining = num_sectors;
436 sector_t s = start_sector;
437
438 while (remaining) {
439 int done = min_t(u64, remaining, INT_MAX);
440
441 set_badblock(bb, s, done);
442 remaining -= done;
443 s += done;
444 }
445 } else
446 set_badblock(bb, start_sector, num_sectors);
447}
448
449static void badblocks_populate(struct list_head *poison_list,
450 struct badblocks *bb, const struct resource *res)
451{
452 struct nd_poison *pl;
453
454 if (list_empty(poison_list))
455 return;
456
457 list_for_each_entry(pl, poison_list, list) {
458 u64 pl_end = pl->start + pl->length - 1;
459
460 /* Discard intervals with no intersection */
461 if (pl_end < res->start)
462 continue;
463 if (pl->start > res->end)
464 continue;
465 /* Deal with any overlap after start of the namespace */
466 if (pl->start >= res->start) {
467 u64 start = pl->start;
468 u64 len;
469
470 if (pl_end <= res->end)
471 len = pl->length;
472 else
473 len = res->start + resource_size(res)
474 - pl->start;
475 __add_badblock_range(bb, start - res->start, len);
476 continue;
477 }
478 /* Deal with overlap for poison starting before the namespace */
479 if (pl->start < res->start) {
480 u64 len;
481
482 if (pl_end < res->end)
483 len = pl->start + pl->length - res->start;
484 else
485 len = resource_size(res);
486 __add_badblock_range(bb, 0, len);
487 }
488 }
489}
490
491/**
492 * nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks
493 * @region: parent region of the range to interrogate
494 * @bb: badblocks instance to populate
495 * @res: resource range to consider
496 *
497 * The poison list generated during bus initialization may contain
498 * multiple, possibly overlapping physical address ranges. Compare each
499 * of these ranges to the resource range currently being initialized,
500 * and add badblocks entries for all matching sub-ranges
501 */
502void nvdimm_badblocks_populate(struct nd_region *nd_region,
503 struct badblocks *bb, const struct resource *res)
504{
505 struct nvdimm_bus *nvdimm_bus;
506 struct list_head *poison_list;
507
508 if (!is_memory(&nd_region->dev)) {
509 dev_WARN_ONCE(&nd_region->dev, 1,
510 "%s only valid for pmem regions\n", __func__);
511 return;
512 }
513 nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
514 poison_list = &nvdimm_bus->poison_list;
515
516 nvdimm_bus_lock(&nvdimm_bus->dev);
517 badblocks_populate(poison_list, bb, res);
518 nvdimm_bus_unlock(&nvdimm_bus->dev);
519}
520EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
521
522static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
523 struct nd_poison *pl, u64 addr, u64 length)
524{
525 lockdep_assert_held(&nvdimm_bus->poison_lock);
526 pl->start = addr;
527 pl->length = length;
528 list_add_tail(&pl->list, &nvdimm_bus->poison_list);
529}
530
531static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
532 gfp_t flags)
533{
534 struct nd_poison *pl;
535
536 pl = kzalloc(sizeof(*pl), flags);
537 if (!pl)
538 return -ENOMEM;
539
540 append_poison_entry(nvdimm_bus, pl, addr, length);
541 return 0;
542}
543
544static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
545{
546 struct nd_poison *pl, *pl_new;
547
548 spin_unlock(&nvdimm_bus->poison_lock);
549 pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
550 spin_lock(&nvdimm_bus->poison_lock);
551
552 if (list_empty(&nvdimm_bus->poison_list)) {
553 if (!pl_new)
554 return -ENOMEM;
555 append_poison_entry(nvdimm_bus, pl_new, addr, length);
556 return 0;
557 }
558
559 /*
560 * There is a chance this is a duplicate, check for those first.
561 * This will be the common case as ARS_STATUS returns all known
562 * errors in the SPA space, and we can't query it per region
563 */
564 list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
565 if (pl->start == addr) {
566 /* If length has changed, update this list entry */
567 if (pl->length != length)
568 pl->length = length;
569 kfree(pl_new);
570 return 0;
571 }
572
573 /*
574 * If not a duplicate or a simple length update, add the entry as is,
575 * as any overlapping ranges will get resolved when the list is consumed
576 * and converted to badblocks
577 */
578 if (!pl_new)
579 return -ENOMEM;
580 append_poison_entry(nvdimm_bus, pl_new, addr, length);
581
582 return 0;
583}
584
585int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
586{
587 int rc;
588
589 spin_lock(&nvdimm_bus->poison_lock);
590 rc = bus_add_poison(nvdimm_bus, addr, length);
591 spin_unlock(&nvdimm_bus->poison_lock);
592
593 return rc;
594}
595EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
596
597void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
598 unsigned int len)
599{
600 struct list_head *poison_list = &nvdimm_bus->poison_list;
601 u64 clr_end = start + len - 1;
602 struct nd_poison *pl, *next;
603
604 spin_lock(&nvdimm_bus->poison_lock);
605 WARN_ON_ONCE(list_empty(poison_list));
606
607 /*
608 * [start, clr_end] is the poison interval being cleared.
609 * [pl->start, pl_end] is the poison_list entry we're comparing
610 * the above interval against. The poison list entry may need
611 * to be modified (update either start or length), deleted, or
612 * split into two based on the overlap characteristics
613 */
614
615 list_for_each_entry_safe(pl, next, poison_list, list) {
616 u64 pl_end = pl->start + pl->length - 1;
617
618 /* Skip intervals with no intersection */
619 if (pl_end < start)
620 continue;
621 if (pl->start > clr_end)
622 continue;
623 /* Delete completely overlapped poison entries */
624 if ((pl->start >= start) && (pl_end <= clr_end)) {
625 list_del(&pl->list);
626 kfree(pl);
627 continue;
628 }
629 /* Adjust start point of partially cleared entries */
630 if ((start <= pl->start) && (clr_end > pl->start)) {
631 pl->length -= clr_end - pl->start + 1;
632 pl->start = clr_end + 1;
633 continue;
634 }
635 /* Adjust pl->length for partial clearing at the tail end */
636 if ((pl->start < start) && (pl_end <= clr_end)) {
637 /* pl->start remains the same */
638 pl->length = start - pl->start;
639 continue;
640 }
641 /*
642 * If clearing in the middle of an entry, we split it into
643 * two by modifying the current entry to represent one half of
644 * the split, and adding a new entry for the second half.
645 */
646 if ((pl->start < start) && (pl_end > clr_end)) {
647 u64 new_start = clr_end + 1;
648 u64 new_len = pl_end - new_start + 1;
649
650 /* Add new entry covering the right half */
651 add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
652 /* Adjust this entry to cover the left half */
653 pl->length = start - pl->start;
654 continue;
655 }
656 }
657 spin_unlock(&nvdimm_bus->poison_lock);
658}
659EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
660 406
661#ifdef CONFIG_BLK_DEV_INTEGRITY 407#ifdef CONFIG_BLK_DEV_INTEGRITY
662int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) 408int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
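
The nvdimm_forget_poison() path above handles four overlap cases when a range is cleared: entries fully covered by the cleared interval are deleted, partially covered entries get their head or tail trimmed, and an entry is split in two when the clear lands in its middle. Below is a standalone, userspace-runnable sketch of that same interval arithmetic on a plain array; the range/forget_range names and the fixed-size table are illustrative only, not the kernel's locked-list implementation.

```c
/*
 * Standalone sketch of the interval arithmetic in nvdimm_forget_poison()
 * (being moved to badrange.c in this series): the four overlap cases
 * (delete, trim head, trim tail, split) on a plain array.  No bounds
 * checking; sketch only.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct range { uint64_t start, length; };

static struct range ranges[16];
static int nr_ranges;

static void add_range(uint64_t start, uint64_t length)
{
	ranges[nr_ranges].start = start;
	ranges[nr_ranges].length = length;
	nr_ranges++;
}

static void del_range(int i)
{
	memmove(&ranges[i], &ranges[i + 1],
		(nr_ranges - i - 1) * sizeof(ranges[0]));
	nr_ranges--;
}

/* Clear [start, start + len - 1] out of every recorded range */
static void forget_range(uint64_t start, uint64_t len)
{
	uint64_t clr_end = start + len - 1;
	int i;

	for (i = 0; i < nr_ranges; i++) {
		uint64_t r_start = ranges[i].start;
		uint64_t r_end = r_start + ranges[i].length - 1;

		if (r_end < start || r_start > clr_end)
			continue;			/* no overlap */
		if (r_start >= start && r_end <= clr_end) {
			del_range(i--);			/* fully covered */
			continue;
		}
		if (start <= r_start) {			/* trim the head */
			ranges[i].length -= clr_end - r_start + 1;
			ranges[i].start = clr_end + 1;
			continue;
		}
		if (r_end <= clr_end) {			/* trim the tail */
			ranges[i].length = start - r_start;
			continue;
		}
		/* clearing in the middle: keep the left half, add the right */
		ranges[i].length = start - r_start;
		add_range(clr_end + 1, r_end - clr_end);
	}
}

int main(void)
{
	add_range(0x1000, 0x1000);		/* one poisoned 4KiB range */
	forget_range(0x1400, 0x200);		/* punch a hole in the middle */
	for (int i = 0; i < nr_ranges; i++)
		printf("range %d: %#llx + %#llx\n", i,
		       (unsigned long long)ranges[i].start,
		       (unsigned long long)ranges[i].length);
	return 0;
}
```

Running the sketch prints the two pieces left after punching a hole in the middle of one range, which is the split case handled at the bottom of nvdimm_forget_poison().
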
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index e0f0e3ce1a32..f8913b8124b6 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev)
55 goto err; 55 goto err;
56 56
57 rc = nvdimm_init_config_data(ndd); 57 rc = nvdimm_init_config_data(ndd);
58 if (rc == -EACCES)
59 nvdimm_set_locked(dev);
58 if (rc) 60 if (rc)
59 goto err; 61 goto err;
60 62
@@ -68,6 +70,7 @@ static int nvdimm_probe(struct device *dev)
68 rc = nd_label_reserve_dpa(ndd); 70 rc = nd_label_reserve_dpa(ndd);
69 if (ndd->ns_current >= 0) 71 if (ndd->ns_current >= 0)
70 nvdimm_set_aliasing(dev); 72 nvdimm_set_aliasing(dev);
73 nvdimm_clear_locked(dev);
71 nvdimm_bus_unlock(dev); 74 nvdimm_bus_unlock(dev);
72 75
73 if (rc) 76 if (rc)
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index f0d1b7e5de01..097794d9f786 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev)
200 set_bit(NDD_LOCKED, &nvdimm->flags); 200 set_bit(NDD_LOCKED, &nvdimm->flags);
201} 201}
202 202
203void nvdimm_clear_locked(struct device *dev)
204{
205 struct nvdimm *nvdimm = to_nvdimm(dev);
206
207 clear_bit(NDD_LOCKED, &nvdimm->flags);
208}
209
203static void nvdimm_release(struct device *dev) 210static void nvdimm_release(struct device *dev)
204{ 211{
205 struct nvdimm *nvdimm = to_nvdimm(dev); 212 struct nvdimm *nvdimm = to_nvdimm(dev);
@@ -324,6 +331,17 @@ static ssize_t commands_show(struct device *dev,
324} 331}
325static DEVICE_ATTR_RO(commands); 332static DEVICE_ATTR_RO(commands);
326 333
334static ssize_t flags_show(struct device *dev,
335 struct device_attribute *attr, char *buf)
336{
337 struct nvdimm *nvdimm = to_nvdimm(dev);
338
339 return sprintf(buf, "%s%s\n",
340 test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
341 test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
342}
343static DEVICE_ATTR_RO(flags);
344
327static ssize_t state_show(struct device *dev, struct device_attribute *attr, 345static ssize_t state_show(struct device *dev, struct device_attribute *attr,
328 char *buf) 346 char *buf)
329{ 347{
@@ -365,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots);
365 383
366static struct attribute *nvdimm_attributes[] = { 384static struct attribute *nvdimm_attributes[] = {
367 &dev_attr_state.attr, 385 &dev_attr_state.attr,
386 &dev_attr_flags.attr,
368 &dev_attr_commands.attr, 387 &dev_attr_commands.attr,
369 &dev_attr_available_slots.attr, 388 &dev_attr_available_slots.attr,
370 NULL, 389 NULL,
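
The new flags_show() attribute above makes the per-DIMM "alias" and "lock" state visible to userspace. A minimal read-side sketch is below; the "nmem0" device name is an assumption for illustration (NVDIMMs enumerate as nmemX under /sys/bus/nd/devices).

```c
/*
 * Minimal userspace sketch: read the new per-DIMM 'flags' attribute.
 * The "nmem0" path is an illustrative assumption.
 */
#include <stdio.h>

int main(void)
{
	char buf[64] = "";
	FILE *f = fopen("/sys/bus/nd/devices/nmem0/flags", "r");

	if (!f) {
		perror("open flags");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		/* prints e.g. "alias lock " when both bits are set */
		printf("nmem0 flags: %s", buf);
	fclose(f);
	return 0;
}
```
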
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 9c5f108910e3..de66c02f6140 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
1050 nsindex = to_namespace_index(ndd, 0); 1050 nsindex = to_namespace_index(ndd, 0);
1051 memset(nsindex, 0, ndd->nsarea.config_size); 1051 memset(nsindex, 0, ndd->nsarea.config_size);
1052 for (i = 0; i < 2; i++) { 1052 for (i = 0; i < 2; i++) {
1053 int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); 1053 int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);
1054 1054
1055 if (rc) 1055 if (rc)
1056 return rc; 1056 return rc;
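
The init_labels() change above seeds the two namespace index blocks with sequence numbers 3 and 2 (via `3 - i`) instead of 0 and 2; as I read the label format, 0 is not a valid sequence number and index-block selection compares the 2-bit values cyclically (1 < 2 < 3 < 1). The sketch below is a rough illustration of that cyclic comparison, not the kernel's helper; newer_index() and its tie handling are assumptions for demonstration.

```c
/*
 * Rough illustrative sketch (not the kernel helper) of 2-bit cyclic
 * sequence numbers for namespace index blocks: valid values are 1, 2
 * and 3, 0 marks an uninitialized block, and ordering wraps so that
 * 1 < 2 < 3 < 1.
 */
#include <stdio.h>

/* return 0 or 1: which of the two index blocks is newer */
static int newer_index(unsigned int seq0, unsigned int seq1)
{
	if (seq0 == 0)
		return 1;		/* slot 0 invalid */
	if (seq1 == 0)
		return 0;		/* slot 1 invalid */
	if (seq0 == 3 && seq1 == 1)
		return 1;		/* wrapped: 1 follows 3 */
	if (seq1 == 3 && seq0 == 1)
		return 0;
	return seq1 >= seq0 ? 1 : 0;	/* plain compare otherwise */
}

int main(void)
{
	printf("%d\n", newer_index(3, 2));	/* 0: seq 3 beats seq 2 */
	printf("%d\n", newer_index(0, 2));	/* 1: a 0 seq never wins */
	return 0;
}
```
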
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 3e4d1e7998da..bb3ba8cf24d4 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj,
1620 if (a == &dev_attr_resource.attr) { 1620 if (a == &dev_attr_resource.attr) {
1621 if (is_namespace_blk(dev)) 1621 if (is_namespace_blk(dev))
1622 return 0; 1622 return 0;
1623 return a->mode; 1623 return 0400;
1624 } 1624 }
1625 1625
1626 if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { 1626 if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
@@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
1875 * @nspm: target namespace to create 1875 * @nspm: target namespace to create
1876 * @nd_label: target pmem namespace label to evaluate 1876 * @nd_label: target pmem namespace label to evaluate
1877 */ 1877 */
1878struct device *create_namespace_pmem(struct nd_region *nd_region, 1878static struct device *create_namespace_pmem(struct nd_region *nd_region,
1879 struct nd_namespace_index *nsindex, 1879 struct nd_namespace_index *nsindex,
1880 struct nd_namespace_label *nd_label) 1880 struct nd_namespace_label *nd_label)
1881{ 1881{
@@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region,
2186 return i; 2186 return i;
2187} 2187}
2188 2188
2189struct device *create_namespace_blk(struct nd_region *nd_region, 2189static struct device *create_namespace_blk(struct nd_region *nd_region,
2190 struct nd_namespace_label *nd_label, int count) 2190 struct nd_namespace_label *nd_label, int count)
2191{ 2191{
2192 2192
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 86bc19ae30da..79274ead54fb 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -29,10 +29,9 @@ struct nvdimm_bus {
29 struct list_head list; 29 struct list_head list;
30 struct device dev; 30 struct device dev;
31 int id, probe_active; 31 int id, probe_active;
32 struct list_head poison_list;
33 struct list_head mapping_list; 32 struct list_head mapping_list;
34 struct mutex reconfig_mutex; 33 struct mutex reconfig_mutex;
35 spinlock_t poison_lock; 34 struct badrange badrange;
36}; 35};
37 36
38struct nvdimm { 37struct nvdimm {
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 9c758a91372b..e958f3724c41 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -34,12 +34,6 @@ enum {
34 NVDIMM_IO_ATOMIC = 1, 34 NVDIMM_IO_ATOMIC = 1,
35}; 35};
36 36
37struct nd_poison {
38 u64 start;
39 u64 length;
40 struct list_head list;
41};
42
43struct nvdimm_drvdata { 37struct nvdimm_drvdata {
44 struct device *dev; 38 struct device *dev;
45 int nslabel_size; 39 int nslabel_size;
@@ -254,6 +248,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
254 unsigned int len); 248 unsigned int len);
255void nvdimm_set_aliasing(struct device *dev); 249void nvdimm_set_aliasing(struct device *dev);
256void nvdimm_set_locked(struct device *dev); 250void nvdimm_set_locked(struct device *dev);
251void nvdimm_clear_locked(struct device *dev);
257struct nd_btt *to_nd_btt(struct device *dev); 252struct nd_btt *to_nd_btt(struct device *dev);
258 253
259struct nd_gen_sb { 254struct nd_gen_sb {
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 9576c444f0ab..65cc171c721d 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = {
282 NULL, 282 NULL,
283}; 283};
284 284
285static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n)
286{
287 if (a == &dev_attr_resource.attr)
288 return 0400;
289 return a->mode;
290}
291
285struct attribute_group nd_pfn_attribute_group = { 292struct attribute_group nd_pfn_attribute_group = {
286 .attrs = nd_pfn_attributes, 293 .attrs = nd_pfn_attributes,
294 .is_visible = pfn_visible,
287}; 295};
288 296
289static const struct attribute_group *nd_pfn_attribute_groups[] = { 297static const struct attribute_group *nd_pfn_attribute_groups[] = {
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 829d760f651c..abaf38c61220 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
562 if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) 562 if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
563 return 0; 563 return 0;
564 564
565 if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr) 565 if (a == &dev_attr_resource.attr) {
566 return 0; 566 if (is_nd_pmem(dev))
567 return 0400;
568 else
569 return 0;
570 }
567 571
568 if (a == &dev_attr_deep_flush.attr) { 572 if (a == &dev_attr_deep_flush.attr) {
569 int has_flush = nvdimm_has_flush(nd_region); 573 int has_flush = nvdimm_has_flush(nd_region);
diff --git a/fs/dax.c b/fs/dax.c
index 3652b26a0048..95981591977a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
526static void *dax_insert_mapping_entry(struct address_space *mapping, 526static void *dax_insert_mapping_entry(struct address_space *mapping,
527 struct vm_fault *vmf, 527 struct vm_fault *vmf,
528 void *entry, sector_t sector, 528 void *entry, sector_t sector,
529 unsigned long flags) 529 unsigned long flags, bool dirty)
530{ 530{
531 struct radix_tree_root *page_tree = &mapping->page_tree; 531 struct radix_tree_root *page_tree = &mapping->page_tree;
532 void *new_entry; 532 void *new_entry;
533 pgoff_t index = vmf->pgoff; 533 pgoff_t index = vmf->pgoff;
534 534
535 if (vmf->flags & FAULT_FLAG_WRITE) 535 if (dirty)
536 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 536 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
537 537
538 if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { 538 if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
569 entry = new_entry; 569 entry = new_entry;
570 } 570 }
571 571
572 if (vmf->flags & FAULT_FLAG_WRITE) 572 if (dirty)
573 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 573 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
574 574
575 spin_unlock_irq(&mapping->tree_lock); 575 spin_unlock_irq(&mapping->tree_lock);
@@ -825,38 +825,42 @@ out:
825} 825}
826EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 826EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
827 827
828static int dax_insert_mapping(struct address_space *mapping, 828static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
829 struct block_device *bdev, struct dax_device *dax_dev,
830 sector_t sector, size_t size, void *entry,
831 struct vm_area_struct *vma, struct vm_fault *vmf)
832{ 829{
833 unsigned long vaddr = vmf->address; 830 return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
834 void *ret, *kaddr; 831}
832
833static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
834 pfn_t *pfnp)
835{
836 const sector_t sector = dax_iomap_sector(iomap, pos);
835 pgoff_t pgoff; 837 pgoff_t pgoff;
838 void *kaddr;
836 int id, rc; 839 int id, rc;
837 pfn_t pfn; 840 long length;
838 841
839 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 842 rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
840 if (rc) 843 if (rc)
841 return rc; 844 return rc;
842
843 id = dax_read_lock(); 845 id = dax_read_lock();
844 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 846 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
845 if (rc < 0) { 847 &kaddr, pfnp);
846 dax_read_unlock(id); 848 if (length < 0) {
847 return rc; 849 rc = length;
850 goto out;
848 } 851 }
852 rc = -EINVAL;
853 if (PFN_PHYS(length) < size)
854 goto out;
855 if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
856 goto out;
857 /* For larger pages we need devmap */
858 if (length > 1 && !pfn_t_devmap(*pfnp))
859 goto out;
860 rc = 0;
861out:
849 dax_read_unlock(id); 862 dax_read_unlock(id);
850 863 return rc;
851 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
852 if (IS_ERR(ret))
853 return PTR_ERR(ret);
854
855 trace_dax_insert_mapping(mapping->host, vmf, ret);
856 if (vmf->flags & FAULT_FLAG_WRITE)
857 return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
858 else
859 return vm_insert_mixed(vma, vaddr, pfn);
860} 864}
861 865
862/* 866/*
@@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
882 } 886 }
883 887
884 entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, 888 entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
885 RADIX_DAX_ZERO_PAGE); 889 RADIX_DAX_ZERO_PAGE, false);
886 if (IS_ERR(entry2)) { 890 if (IS_ERR(entry2)) {
887 ret = VM_FAULT_SIGBUS; 891 ret = VM_FAULT_SIGBUS;
888 goto out; 892 goto out;
@@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
941} 945}
942EXPORT_SYMBOL_GPL(__dax_zero_page_range); 946EXPORT_SYMBOL_GPL(__dax_zero_page_range);
943 947
944static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
945{
946 return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
947}
948
949static loff_t 948static loff_t
950dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 949dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
951 struct iomap *iomap) 950 struct iomap *iomap)
@@ -1085,19 +1084,33 @@ static int dax_fault_return(int error)
1085 return VM_FAULT_SIGBUS; 1084 return VM_FAULT_SIGBUS;
1086} 1085}
1087 1086
1088static int dax_iomap_pte_fault(struct vm_fault *vmf, 1087/*
1088 * MAP_SYNC on a dax mapping guarantees dirty metadata is
1089 * flushed on write-faults (non-cow), but not read-faults.
1090 */
1091static bool dax_fault_is_synchronous(unsigned long flags,
1092 struct vm_area_struct *vma, struct iomap *iomap)
1093{
1094 return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
1095 && (iomap->flags & IOMAP_F_DIRTY);
1096}
1097
1098static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
1089 const struct iomap_ops *ops) 1099 const struct iomap_ops *ops)
1090{ 1100{
1091 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1101 struct vm_area_struct *vma = vmf->vma;
1102 struct address_space *mapping = vma->vm_file->f_mapping;
1092 struct inode *inode = mapping->host; 1103 struct inode *inode = mapping->host;
1093 unsigned long vaddr = vmf->address; 1104 unsigned long vaddr = vmf->address;
1094 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1105 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
1095 sector_t sector;
1096 struct iomap iomap = { 0 }; 1106 struct iomap iomap = { 0 };
1097 unsigned flags = IOMAP_FAULT; 1107 unsigned flags = IOMAP_FAULT;
1098 int error, major = 0; 1108 int error, major = 0;
1109 bool write = vmf->flags & FAULT_FLAG_WRITE;
1110 bool sync;
1099 int vmf_ret = 0; 1111 int vmf_ret = 0;
1100 void *entry; 1112 void *entry;
1113 pfn_t pfn;
1101 1114
1102 trace_dax_pte_fault(inode, vmf, vmf_ret); 1115 trace_dax_pte_fault(inode, vmf, vmf_ret);
1103 /* 1116 /*
@@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1110 goto out; 1123 goto out;
1111 } 1124 }
1112 1125
1113 if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) 1126 if (write && !vmf->cow_page)
1114 flags |= IOMAP_WRITE; 1127 flags |= IOMAP_WRITE;
1115 1128
1116 entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 1129 entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1145 goto error_finish_iomap; 1158 goto error_finish_iomap;
1146 } 1159 }
1147 1160
1148 sector = dax_iomap_sector(&iomap, pos);
1149
1150 if (vmf->cow_page) { 1161 if (vmf->cow_page) {
1162 sector_t sector = dax_iomap_sector(&iomap, pos);
1163
1151 switch (iomap.type) { 1164 switch (iomap.type) {
1152 case IOMAP_HOLE: 1165 case IOMAP_HOLE:
1153 case IOMAP_UNWRITTEN: 1166 case IOMAP_UNWRITTEN:
@@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1173 goto finish_iomap; 1186 goto finish_iomap;
1174 } 1187 }
1175 1188
1189 sync = dax_fault_is_synchronous(flags, vma, &iomap);
1190
1176 switch (iomap.type) { 1191 switch (iomap.type) {
1177 case IOMAP_MAPPED: 1192 case IOMAP_MAPPED:
1178 if (iomap.flags & IOMAP_F_NEW) { 1193 if (iomap.flags & IOMAP_F_NEW) {
1179 count_vm_event(PGMAJFAULT); 1194 count_vm_event(PGMAJFAULT);
1180 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 1195 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
1181 major = VM_FAULT_MAJOR; 1196 major = VM_FAULT_MAJOR;
1182 } 1197 }
1183 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, 1198 error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
1184 sector, PAGE_SIZE, entry, vmf->vma, vmf); 1199 if (error < 0)
1200 goto error_finish_iomap;
1201
1202 entry = dax_insert_mapping_entry(mapping, vmf, entry,
1203 dax_iomap_sector(&iomap, pos),
1204 0, write && !sync);
1205 if (IS_ERR(entry)) {
1206 error = PTR_ERR(entry);
1207 goto error_finish_iomap;
1208 }
1209
1210 /*
1211 * If we are doing synchronous page fault and inode needs fsync,
1212 * we can insert PTE into page tables only after that happens.
1213 * Skip insertion for now and return the pfn so that caller can
1214 * insert it after fsync is done.
1215 */
1216 if (sync) {
1217 if (WARN_ON_ONCE(!pfnp)) {
1218 error = -EIO;
1219 goto error_finish_iomap;
1220 }
1221 *pfnp = pfn;
1222 vmf_ret = VM_FAULT_NEEDDSYNC | major;
1223 goto finish_iomap;
1224 }
1225 trace_dax_insert_mapping(inode, vmf, entry);
1226 if (write)
1227 error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
1228 else
1229 error = vm_insert_mixed(vma, vaddr, pfn);
1230
1185 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1231 /* -EBUSY is fine, somebody else faulted on the same PTE */
1186 if (error == -EBUSY) 1232 if (error == -EBUSY)
1187 error = 0; 1233 error = 0;
1188 break; 1234 break;
1189 case IOMAP_UNWRITTEN: 1235 case IOMAP_UNWRITTEN:
1190 case IOMAP_HOLE: 1236 case IOMAP_HOLE:
1191 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1237 if (!write) {
1192 vmf_ret = dax_load_hole(mapping, entry, vmf); 1238 vmf_ret = dax_load_hole(mapping, entry, vmf);
1193 goto finish_iomap; 1239 goto finish_iomap;
1194 } 1240 }
@@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
1223} 1269}
1224 1270
1225#ifdef CONFIG_FS_DAX_PMD 1271#ifdef CONFIG_FS_DAX_PMD
1226static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, 1272/*
1227 loff_t pos, void *entry) 1273 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
1228{ 1274 * more often than one might expect in the below functions.
1229 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1275 */
1230 const sector_t sector = dax_iomap_sector(iomap, pos); 1276#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
1231 struct dax_device *dax_dev = iomap->dax_dev;
1232 struct block_device *bdev = iomap->bdev;
1233 struct inode *inode = mapping->host;
1234 const size_t size = PMD_SIZE;
1235 void *ret = NULL, *kaddr;
1236 long length = 0;
1237 pgoff_t pgoff;
1238 pfn_t pfn = {};
1239 int id;
1240
1241 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
1242 goto fallback;
1243
1244 id = dax_read_lock();
1245 length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
1246 if (length < 0)
1247 goto unlock_fallback;
1248 length = PFN_PHYS(length);
1249
1250 if (length < size)
1251 goto unlock_fallback;
1252 if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
1253 goto unlock_fallback;
1254 if (!pfn_t_devmap(pfn))
1255 goto unlock_fallback;
1256 dax_read_unlock(id);
1257
1258 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
1259 RADIX_DAX_PMD);
1260 if (IS_ERR(ret))
1261 goto fallback;
1262
1263 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
1264 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
1265 pfn, vmf->flags & FAULT_FLAG_WRITE);
1266
1267unlock_fallback:
1268 dax_read_unlock(id);
1269fallback:
1270 trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
1271 return VM_FAULT_FALLBACK;
1272}
1273 1277
1274static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, 1278static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1275 void *entry) 1279 void *entry)
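
With dax_pmd_insert_mapping() folded into dax_iomap_pfn(), the PMD path keeps the same alignment checks: the returned pfn must cover PMD_SIZE and must have none of the low "colour" bits set. The standalone sketch below shows the arithmetic, assuming 4KiB pages and 2MiB PMDs purely for illustration (the kernel derives both from PAGE_SHIFT and PMD_SIZE).

```c
/*
 * Standalone sketch of the PMD "colour" check carried into
 * dax_iomap_pfn().  4KiB pages / 2MiB PMDs are assumed for the demo.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SIZE	(2UL << 20)
/* low bits of a page offset within one PMD */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static int pmd_mappable(uint64_t pfn, uint64_t nr_pages)
{
	if (nr_pages < (PMD_SIZE >> PAGE_SHIFT))
		return 0;			/* mapping too short */
	if (pfn & PG_PMD_COLOUR)
		return 0;			/* not PMD aligned */
	return 1;
}

int main(void)
{
	printf("colour mask: %#lx\n", PG_PMD_COLOUR);	/* 0x1ff */
	printf("%d\n", pmd_mappable(0x200, 512));	/* 1: aligned, big enough */
	printf("%d\n", pmd_mappable(0x201, 512));	/* 0: misaligned pfn */
	return 0;
}
```

Running it prints the 0x1ff colour mask and shows that a pfn with any of those low bits set cannot back a PMD-sized DAX mapping, which is why such faults fall back to PTEs.
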
@@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
1288 goto fallback; 1292 goto fallback;
1289 1293
1290 ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, 1294 ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
1291 RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); 1295 RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
1292 if (IS_ERR(ret)) 1296 if (IS_ERR(ret))
1293 goto fallback; 1297 goto fallback;
1294 1298
@@ -1310,13 +1314,14 @@ fallback:
1310 return VM_FAULT_FALLBACK; 1314 return VM_FAULT_FALLBACK;
1311} 1315}
1312 1316
1313static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1317static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1314 const struct iomap_ops *ops) 1318 const struct iomap_ops *ops)
1315{ 1319{
1316 struct vm_area_struct *vma = vmf->vma; 1320 struct vm_area_struct *vma = vmf->vma;
1317 struct address_space *mapping = vma->vm_file->f_mapping; 1321 struct address_space *mapping = vma->vm_file->f_mapping;
1318 unsigned long pmd_addr = vmf->address & PMD_MASK; 1322 unsigned long pmd_addr = vmf->address & PMD_MASK;
1319 bool write = vmf->flags & FAULT_FLAG_WRITE; 1323 bool write = vmf->flags & FAULT_FLAG_WRITE;
1324 bool sync;
1320 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; 1325 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
1321 struct inode *inode = mapping->host; 1326 struct inode *inode = mapping->host;
1322 int result = VM_FAULT_FALLBACK; 1327 int result = VM_FAULT_FALLBACK;
@@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1325 void *entry; 1330 void *entry;
1326 loff_t pos; 1331 loff_t pos;
1327 int error; 1332 int error;
1333 pfn_t pfn;
1328 1334
1329 /* 1335 /*
1330 * Check whether offset isn't beyond end of file now. Caller is 1336 * Check whether offset isn't beyond end of file now. Caller is
@@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1332 * this is a reliable test. 1338 * this is a reliable test.
1333 */ 1339 */
1334 pgoff = linear_page_index(vma, pmd_addr); 1340 pgoff = linear_page_index(vma, pmd_addr);
1335 max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; 1341 max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
1336 1342
1337 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); 1343 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
1338 1344
@@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1356 if ((pmd_addr + PMD_SIZE) > vma->vm_end) 1362 if ((pmd_addr + PMD_SIZE) > vma->vm_end)
1357 goto fallback; 1363 goto fallback;
1358 1364
1359 if (pgoff > max_pgoff) { 1365 if (pgoff >= max_pgoff) {
1360 result = VM_FAULT_SIGBUS; 1366 result = VM_FAULT_SIGBUS;
1361 goto out; 1367 goto out;
1362 } 1368 }
1363 1369
1364 /* If the PMD would extend beyond the file size */ 1370 /* If the PMD would extend beyond the file size */
1365 if ((pgoff | PG_PMD_COLOUR) > max_pgoff) 1371 if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
1366 goto fallback; 1372 goto fallback;
1367 1373
1368 /* 1374 /*
@@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1400 if (iomap.offset + iomap.length < pos + PMD_SIZE) 1406 if (iomap.offset + iomap.length < pos + PMD_SIZE)
1401 goto finish_iomap; 1407 goto finish_iomap;
1402 1408
1409 sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
1410
1403 switch (iomap.type) { 1411 switch (iomap.type) {
1404 case IOMAP_MAPPED: 1412 case IOMAP_MAPPED:
1405 result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); 1413 error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
1414 if (error < 0)
1415 goto finish_iomap;
1416
1417 entry = dax_insert_mapping_entry(mapping, vmf, entry,
1418 dax_iomap_sector(&iomap, pos),
1419 RADIX_DAX_PMD, write && !sync);
1420 if (IS_ERR(entry))
1421 goto finish_iomap;
1422
1423 /*
1424 * If we are doing synchronous page fault and inode needs fsync,
1425 * we can insert PMD into page tables only after that happens.
1426 * Skip insertion for now and return the pfn so that caller can
1427 * insert it after fsync is done.
1428 */
1429 if (sync) {
1430 if (WARN_ON_ONCE(!pfnp))
1431 goto finish_iomap;
1432 *pfnp = pfn;
1433 result = VM_FAULT_NEEDDSYNC;
1434 goto finish_iomap;
1435 }
1436
1437 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
1438 result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
1439 write);
1406 break; 1440 break;
1407 case IOMAP_UNWRITTEN: 1441 case IOMAP_UNWRITTEN:
1408 case IOMAP_HOLE: 1442 case IOMAP_HOLE:
@@ -1442,7 +1476,7 @@ out:
1442 return result; 1476 return result;
1443} 1477}
1444#else 1478#else
1445static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1479static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
1446 const struct iomap_ops *ops) 1480 const struct iomap_ops *ops)
1447{ 1481{
1448 return VM_FAULT_FALLBACK; 1482 return VM_FAULT_FALLBACK;
@@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1452/** 1486/**
1453 * dax_iomap_fault - handle a page fault on a DAX file 1487 * dax_iomap_fault - handle a page fault on a DAX file
1454 * @vmf: The description of the fault 1488 * @vmf: The description of the fault
1455 * @ops: iomap ops passed from the file system 1489 * @pe_size: Size of the page to fault in
1490 * @pfnp: PFN to insert for synchronous faults if fsync is required
1491 * @ops: Iomap ops passed from the file system
1456 * 1492 *
1457 * When a page fault occurs, filesystems may call this helper in 1493 * When a page fault occurs, filesystems may call this helper in
1458 * their fault handler for DAX files. dax_iomap_fault() assumes the caller 1494 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
1460 * successfully. 1496 * successfully.
1461 */ 1497 */
1462int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 1498int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
1463 const struct iomap_ops *ops) 1499 pfn_t *pfnp, const struct iomap_ops *ops)
1464{ 1500{
1465 switch (pe_size) { 1501 switch (pe_size) {
1466 case PE_SIZE_PTE: 1502 case PE_SIZE_PTE:
1467 return dax_iomap_pte_fault(vmf, ops); 1503 return dax_iomap_pte_fault(vmf, pfnp, ops);
1468 case PE_SIZE_PMD: 1504 case PE_SIZE_PMD:
1469 return dax_iomap_pmd_fault(vmf, ops); 1505 return dax_iomap_pmd_fault(vmf, pfnp, ops);
1470 default: 1506 default:
1471 return VM_FAULT_FALLBACK; 1507 return VM_FAULT_FALLBACK;
1472 } 1508 }
1473} 1509}
1474EXPORT_SYMBOL_GPL(dax_iomap_fault); 1510EXPORT_SYMBOL_GPL(dax_iomap_fault);
1511
1512/**
1513 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
1514 * @vmf: The description of the fault
1515 * @pe_size: Size of entry to be inserted
1516 * @pfn: PFN to insert
1517 *
1518 * This function inserts writeable PTE or PMD entry into page tables for mmaped
1519 * DAX file. It takes care of marking corresponding radix tree entry as dirty
1520 * as well.
1521 */
1522static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
1523 enum page_entry_size pe_size,
1524 pfn_t pfn)
1525{
1526 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1527 void *entry, **slot;
1528 pgoff_t index = vmf->pgoff;
1529 int vmf_ret, error;
1530
1531 spin_lock_irq(&mapping->tree_lock);
1532 entry = get_unlocked_mapping_entry(mapping, index, &slot);
1533 /* Did we race with someone splitting entry or so? */
1534 if (!entry ||
1535 (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
1536 (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
1537 put_unlocked_mapping_entry(mapping, index, entry);
1538 spin_unlock_irq(&mapping->tree_lock);
1539 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
1540 VM_FAULT_NOPAGE);
1541 return VM_FAULT_NOPAGE;
1542 }
1543 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
1544 entry = lock_slot(mapping, slot);
1545 spin_unlock_irq(&mapping->tree_lock);
1546 switch (pe_size) {
1547 case PE_SIZE_PTE:
1548 error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
1549 vmf_ret = dax_fault_return(error);
1550 break;
1551#ifdef CONFIG_FS_DAX_PMD
1552 case PE_SIZE_PMD:
1553 vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
1554 pfn, true);
1555 break;
1556#endif
1557 default:
1558 vmf_ret = VM_FAULT_FALLBACK;
1559 }
1560 put_locked_mapping_entry(mapping, index);
1561 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
1562 return vmf_ret;
1563}
1564
1565/**
1566 * dax_finish_sync_fault - finish synchronous page fault
1567 * @vmf: The description of the fault
1568 * @pe_size: Size of entry to be inserted
1569 * @pfn: PFN to insert
1570 *
1571 * This function ensures that the file range touched by the page fault is
1572 * stored persistently on the media and handles inserting of appropriate page
1573 * table entry.
1574 */
1575int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
1576 pfn_t pfn)
1577{
1578 int err;
1579 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1580 size_t len = 0;
1581
1582 if (pe_size == PE_SIZE_PTE)
1583 len = PAGE_SIZE;
1584 else if (pe_size == PE_SIZE_PMD)
1585 len = PMD_SIZE;
1586 else
1587 WARN_ON_ONCE(1);
1588 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1589 if (err)
1590 return VM_FAULT_SIGBUS;
1591 return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
1592}
1593EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
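
dax_finish_sync_fault() completes a two-step protocol: dax_iomap_fault() returns VM_FAULT_NEEDDSYNC together with the pfn when the mapping is MAP_SYNC and metadata is dirty, and the filesystem then flushes the range and inserts the PTE/PMD. The condensed sketch below follows the same pattern as the ext4 and xfs hunks later in this diff; my_fault_lock()/my_fault_unlock() and my_iomap_ops are placeholders, not a real filesystem's API.

```c
/*
 * Condensed kernel-context sketch of a fault handler driving the new
 * two-step synchronous-fault protocol.  Lock helpers and iomap ops are
 * placeholders; only the dax_* calls are real.
 */
static int my_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
{
	pfn_t pfn;
	int ret;

	my_fault_lock(vmf);			/* serialise against truncate */
	ret = dax_iomap_fault(vmf, pe_size, &pfn, &my_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC)
		/* dirty metadata: fsync the range, then map the pfn writeably */
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	my_fault_unlock(vmf);
	return ret;
}
```

Filesystems that never set IOMAP_F_DIRTY (ext2 below) simply pass a NULL pfnp and never see VM_FAULT_NEEDDSYNC.
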
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c67b486488fd..2da67699dc33 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
100 } 100 }
101 down_read(&ei->dax_sem); 101 down_read(&ei->dax_sem);
102 102
103 ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); 103 ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);
104 104
105 up_read(&ei->dax_sem); 105 up_read(&ei->dax_sem);
106 if (vmf->flags & FAULT_FLAG_WRITE) 106 if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ad204d2724ac..a0ae27b1bc66 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -28,6 +28,7 @@
28#include <linux/quotaops.h> 28#include <linux/quotaops.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/uio.h> 30#include <linux/uio.h>
31#include <linux/mman.h>
31#include "ext4.h" 32#include "ext4.h"
32#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
33#include "xattr.h" 34#include "xattr.h"
@@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
297 */ 298 */
298 bool write = (vmf->flags & FAULT_FLAG_WRITE) && 299 bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
299 (vmf->vma->vm_flags & VM_SHARED); 300 (vmf->vma->vm_flags & VM_SHARED);
301 pfn_t pfn;
300 302
301 if (write) { 303 if (write) {
302 sb_start_pagefault(sb); 304 sb_start_pagefault(sb);
@@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
304 down_read(&EXT4_I(inode)->i_mmap_sem); 306 down_read(&EXT4_I(inode)->i_mmap_sem);
305 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, 307 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
306 EXT4_DATA_TRANS_BLOCKS(sb)); 308 EXT4_DATA_TRANS_BLOCKS(sb));
309 if (IS_ERR(handle)) {
310 up_read(&EXT4_I(inode)->i_mmap_sem);
311 sb_end_pagefault(sb);
312 return VM_FAULT_SIGBUS;
313 }
307 } else { 314 } else {
308 down_read(&EXT4_I(inode)->i_mmap_sem); 315 down_read(&EXT4_I(inode)->i_mmap_sem);
309 } 316 }
310 if (!IS_ERR(handle)) 317 result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
311 result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
312 else
313 result = VM_FAULT_SIGBUS;
314 if (write) { 318 if (write) {
315 if (!IS_ERR(handle)) 319 ext4_journal_stop(handle);
316 ext4_journal_stop(handle); 320 /* Handling synchronous page fault? */
321 if (result & VM_FAULT_NEEDDSYNC)
322 result = dax_finish_sync_fault(vmf, pe_size, pfn);
317 up_read(&EXT4_I(inode)->i_mmap_sem); 323 up_read(&EXT4_I(inode)->i_mmap_sem);
318 sb_end_pagefault(sb); 324 sb_end_pagefault(sb);
319 } else { 325 } else {
@@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
351 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 357 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
352 return -EIO; 358 return -EIO;
353 359
360 /*
361 * We don't support synchronous mappings for non-DAX files. At least
362 * until someone comes with a sensible use case.
363 */
364 if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
365 return -EOPNOTSUPP;
366
354 file_accessed(file); 367 file_accessed(file);
355 if (IS_DAX(file_inode(file))) { 368 if (IS_DAX(file_inode(file))) {
356 vma->vm_ops = &ext4_dax_vm_ops; 369 vma->vm_ops = &ext4_dax_vm_ops;
@@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = {
469 .compat_ioctl = ext4_compat_ioctl, 482 .compat_ioctl = ext4_compat_ioctl,
470#endif 483#endif
471 .mmap = ext4_file_mmap, 484 .mmap = ext4_file_mmap,
485 .mmap_supported_flags = MAP_SYNC,
472 .open = ext4_file_open, 486 .open = ext4_file_open,
473 .release = ext4_release_file, 487 .release = ext4_release_file,
474 .fsync = ext4_sync_file, 488 .fsync = ext4_sync_file,
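
Taken together, the two ext4 hunks above show the opt-in pattern for MAP_SYNC: advertise the flag via file_operations.mmap_supported_flags and refuse VM_SYNC mappings the filesystem cannot honour. A condensed sketch of that pattern follows; my_file_mmap, my_dax_vm_ops and my_file_operations are placeholder names.

```c
/*
 * Kernel-context sketch of the two-part MAP_SYNC opt-in, with
 * placeholder names; mirrors the ext4/xfs changes in this diff.
 */
static int my_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* only DAX inodes can make the synchronous-fault guarantee */
	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
		return -EOPNOTSUPP;

	file_accessed(file);
	vma->vm_ops = &my_dax_vm_ops;
	return 0;
}

const struct file_operations my_file_operations = {
	.mmap			= my_file_mmap,
	.mmap_supported_flags	= MAP_SYNC,
	/* ... remaining ops ... */
};
```
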
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d2b582fb141..0992d76f7ab1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3384 return try_to_free_buffers(page); 3384 return try_to_free_buffers(page);
3385} 3385}
3386 3386
3387static bool ext4_inode_datasync_dirty(struct inode *inode)
3388{
3389 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
3390
3391 if (journal)
3392 return !jbd2_transaction_committed(journal,
3393 EXT4_I(inode)->i_datasync_tid);
3394 /* Any metadata buffers to write? */
3395 if (!list_empty(&inode->i_mapping->private_list))
3396 return true;
3397 return inode->i_state & I_DIRTY_DATASYNC;
3398}
3399
3387static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 3400static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3388 unsigned flags, struct iomap *iomap) 3401 unsigned flags, struct iomap *iomap)
3389{ 3402{
@@ -3497,6 +3510,8 @@ retry:
3497 } 3510 }
3498 3511
3499 iomap->flags = 0; 3512 iomap->flags = 0;
3513 if (ext4_inode_datasync_dirty(inode))
3514 iomap->flags |= IOMAP_F_DIRTY;
3500 iomap->bdev = inode->i_sb->s_bdev; 3515 iomap->bdev = inode->i_sb->s_bdev;
3501 iomap->dax_dev = sbi->s_daxdev; 3516 iomap->dax_dev = sbi->s_daxdev;
3502 iomap->offset = first_block << blkbits; 3517 iomap->offset = first_block << blkbits;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index d2a85c9720e9..67546c7ad473 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
737 return err; 737 return err;
738} 738}
739 739
740/* Return 1 when transaction with given tid has already committed. */
741int jbd2_transaction_committed(journal_t *journal, tid_t tid)
742{
743 int ret = 1;
744
745 read_lock(&journal->j_state_lock);
746 if (journal->j_running_transaction &&
747 journal->j_running_transaction->t_tid == tid)
748 ret = 0;
749 if (journal->j_committing_transaction &&
750 journal->j_committing_transaction->t_tid == tid)
751 ret = 0;
752 read_unlock(&journal->j_state_lock);
753 return ret;
754}
755EXPORT_SYMBOL(jbd2_transaction_committed);
756
740/* 757/*
741 * When this function returns the transaction corresponding to tid 758 * When this function returns the transaction corresponding to tid
742 * will be completed. If the transaction has currently running, start 759 * will be completed. If the transaction has currently running, start
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 875231c36cb3..339e4c1c044d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
661 [ilog2(VM_ACCOUNT)] = "ac", 661 [ilog2(VM_ACCOUNT)] = "ac",
662 [ilog2(VM_NORESERVE)] = "nr", 662 [ilog2(VM_NORESERVE)] = "nr",
663 [ilog2(VM_HUGETLB)] = "ht", 663 [ilog2(VM_HUGETLB)] = "ht",
664 [ilog2(VM_SYNC)] = "sf",
664 [ilog2(VM_ARCH_1)] = "ar", 665 [ilog2(VM_ARCH_1)] = "ar",
665 [ilog2(VM_WIPEONFORK)] = "wf", 666 [ilog2(VM_WIPEONFORK)] = "wf",
666 [ilog2(VM_DONTDUMP)] = "dd", 667 [ilog2(VM_DONTDUMP)] = "dd",
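
The new "sf" mnemonic lets userspace verify that a mapping really ended up with VM_SYNC. Below is a naive sketch that scans /proc/self/smaps for it; a real checker would limit the search to the VMA of interest rather than grepping the whole file.

```c
/*
 * Naive userspace sketch: look for the new "sf" VmFlags mnemonic in
 * /proc/self/smaps.  Scans the entire file; smoke-test quality only.
 */
#include <stdio.h>
#include <string.h>

static int any_mapping_has_sync_flag(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/smaps", "r");
	int found = 0;

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "VmFlags:", 8) == 0 && strstr(line, " sf"))
			found = 1;
	}
	fclose(f);
	return found;
}

int main(void)
{
	printf("VM_SYNC mapping present: %d\n", any_mapping_has_sync_flag());
	return 0;
}
```
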
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 18146873a8b3..8601275cc5e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -44,6 +44,7 @@
44#include <linux/falloc.h> 44#include <linux/falloc.h>
45#include <linux/pagevec.h> 45#include <linux/pagevec.h>
46#include <linux/backing-dev.h> 46#include <linux/backing-dev.h>
47#include <linux/mman.h>
47 48
48static const struct vm_operations_struct xfs_file_vm_ops; 49static const struct vm_operations_struct xfs_file_vm_ops;
49 50
@@ -1045,7 +1046,11 @@ __xfs_filemap_fault(
1045 1046
1046 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1047 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1047 if (IS_DAX(inode)) { 1048 if (IS_DAX(inode)) {
1048 ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); 1049 pfn_t pfn;
1050
1051 ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
1052 if (ret & VM_FAULT_NEEDDSYNC)
1053 ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1049 } else { 1054 } else {
1050 if (write_fault) 1055 if (write_fault)
1051 ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); 1056 ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite(
1090} 1095}
1091 1096
1092/* 1097/*
1093 * pfn_mkwrite was originally inteneded to ensure we capture time stamp 1098 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1094 * updates on write faults. In reality, it's need to serialise against 1099 * on write faults. In reality, it needs to serialise against truncate and
1095 * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED 1100 * prepare memory for writing so handle is as standard write fault.
1096 * to ensure we serialise the fault barrier in place.
1097 */ 1101 */
1098static int 1102static int
1099xfs_filemap_pfn_mkwrite( 1103xfs_filemap_pfn_mkwrite(
1100 struct vm_fault *vmf) 1104 struct vm_fault *vmf)
1101{ 1105{
1102 1106
1103 struct inode *inode = file_inode(vmf->vma->vm_file); 1107 return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1104 struct xfs_inode *ip = XFS_I(inode);
1105 int ret = VM_FAULT_NOPAGE;
1106 loff_t size;
1107
1108 trace_xfs_filemap_pfn_mkwrite(ip);
1109
1110 sb_start_pagefault(inode->i_sb);
1111 file_update_time(vmf->vma->vm_file);
1112
1113 /* check if the faulting page hasn't raced with truncate */
1114 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1115 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1116 if (vmf->pgoff >= size)
1117 ret = VM_FAULT_SIGBUS;
1118 else if (IS_DAX(inode))
1119 ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
1120 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1121 sb_end_pagefault(inode->i_sb);
1122 return ret;
1123
1124} 1108}
1125 1109
1126static const struct vm_operations_struct xfs_file_vm_ops = { 1110static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1136,6 +1120,13 @@ xfs_file_mmap(
1136 struct file *filp, 1120 struct file *filp,
1137 struct vm_area_struct *vma) 1121 struct vm_area_struct *vma)
1138{ 1122{
1123 /*
1124 * We don't support synchronous mappings for non-DAX files. At least
1125 * until someone comes with a sensible use case.
1126 */
1127 if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
1128 return -EOPNOTSUPP;
1129
1139 file_accessed(filp); 1130 file_accessed(filp);
1140 vma->vm_ops = &xfs_file_vm_ops; 1131 vma->vm_ops = &xfs_file_vm_ops;
1141 if (IS_DAX(file_inode(filp))) 1132 if (IS_DAX(file_inode(filp)))
@@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
1154 .compat_ioctl = xfs_file_compat_ioctl, 1145 .compat_ioctl = xfs_file_compat_ioctl,
1155#endif 1146#endif
1156 .mmap = xfs_file_mmap, 1147 .mmap = xfs_file_mmap,
1148 .mmap_supported_flags = MAP_SYNC,
1157 .open = xfs_file_open, 1149 .open = xfs_file_open,
1158 .release = xfs_file_release, 1150 .release = xfs_file_release,
1159 .fsync = xfs_file_fsync, 1151 .fsync = xfs_file_fsync,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18077e2189a9..33eb4fb2e3fd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -34,6 +34,7 @@
34#include "xfs_error.h" 34#include "xfs_error.h"
35#include "xfs_trans.h" 35#include "xfs_trans.h"
36#include "xfs_trans_space.h" 36#include "xfs_trans_space.h"
37#include "xfs_inode_item.h"
37#include "xfs_iomap.h" 38#include "xfs_iomap.h"
38#include "xfs_trace.h" 39#include "xfs_trace.h"
39#include "xfs_icache.h" 40#include "xfs_icache.h"
@@ -1089,6 +1090,10 @@ xfs_file_iomap_begin(
1089 trace_xfs_iomap_found(ip, offset, length, 0, &imap); 1090 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
1090 } 1091 }
1091 1092
1093 if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
1094 & ~XFS_ILOG_TIMESTAMP))
1095 iomap->flags |= IOMAP_F_DIRTY;
1096
1092 xfs_bmbt_to_iomap(ip, iomap, &imap); 1097 xfs_bmbt_to_iomap(ip, iomap, &imap);
1093 1098
1094 if (shared) 1099 if (shared)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 515ba042d75c..d718a10c2271 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
654DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); 654DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
655DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); 655DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
656 656
657DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
658
659TRACE_EVENT(xfs_filemap_fault, 657TRACE_EVENT(xfs_filemap_fault,
660 TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, 658 TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
661 bool write_fault), 659 bool write_fault),
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 895e16fcc62d..5258346c558c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -96,7 +96,9 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
96ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 96ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
97 const struct iomap_ops *ops); 97 const struct iomap_ops *ops);
98int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 98int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
99 const struct iomap_ops *ops); 99 pfn_t *pfnp, const struct iomap_ops *ops);
100int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
101 pfn_t pfn);
100int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 102int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
101int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 103int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
102 pgoff_t index); 104 pgoff_t index);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 269086440071..a2b5d64ea503 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1702,6 +1702,7 @@ struct file_operations {
1702 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 1702 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1703 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 1703 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1704 int (*mmap) (struct file *, struct vm_area_struct *); 1704 int (*mmap) (struct file *, struct vm_area_struct *);
1705 unsigned long mmap_supported_flags;
1705 int (*open) (struct inode *, struct file *); 1706 int (*open) (struct inode *, struct file *);
1706 int (*flush) (struct file *, fl_owner_t id); 1707 int (*flush) (struct file *, fl_owner_t id);
1707 int (*release) (struct inode *, struct file *); 1708 int (*release) (struct inode *, struct file *);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index ca10767ab73d..19a07de28212 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -21,9 +21,13 @@ struct vm_fault;
21 21
22/* 22/*
23 * Flags for all iomap mappings: 23 * Flags for all iomap mappings:
24 *
25 * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
26 * written data and requires fdatasync to commit them to persistent storage.
24 */ 27 */
25#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ 28#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */
26#define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */ 29#define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */
30#define IOMAP_F_DIRTY 0x04 /* uncommitted metadata */
27 31
28/* 32/*
29 * Flags that only need to be reported for IOMAP_REPORT requests: 33 * Flags that only need to be reported for IOMAP_REPORT requests:
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 606b6bce3a5b..296d1e0ea87b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
1367int __jbd2_log_start_commit(journal_t *journal, tid_t tid); 1367int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
1368int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); 1368int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
1369int jbd2_log_wait_commit(journal_t *journal, tid_t tid); 1369int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
1370int jbd2_transaction_committed(journal_t *journal, tid_t tid);
1370int jbd2_complete_transaction(journal_t *journal, tid_t tid); 1371int jbd2_complete_transaction(journal_t *journal, tid_t tid);
1371int jbd2_log_do_checkpoint(journal_t *journal); 1372int jbd2_log_do_checkpoint(journal_t *journal);
1372int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); 1373int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 3eaad2fbf284..f8109ddb5ef1 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -18,6 +18,18 @@
18#include <linux/sizes.h> 18#include <linux/sizes.h>
19#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/uuid.h> 20#include <linux/uuid.h>
21#include <linux/spinlock.h>
22
23struct badrange_entry {
24 u64 start;
25 u64 length;
26 struct list_head list;
27};
28
29struct badrange {
30 struct list_head list;
31 spinlock_t lock;
32};
21 33
22enum { 34enum {
23 /* when a dimm supports both PMEM and BLK access a label is required */ 35 /* when a dimm supports both PMEM and BLK access a label is required */
@@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
129 141
130} 142}
131 143
132int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); 144void badrange_init(struct badrange *badrange);
133void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, 145int badrange_add(struct badrange *badrange, u64 addr, u64 length);
134 phys_addr_t start, unsigned int len); 146void badrange_forget(struct badrange *badrange, phys_addr_t start,
147 unsigned int len);
148int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr,
149 u64 length);
135struct nvdimm_bus *nvdimm_bus_register(struct device *parent, 150struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
136 struct nvdimm_bus_descriptor *nfit_desc); 151 struct nvdimm_bus_descriptor *nfit_desc);
137void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); 152void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
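
The header hunk above replaces the bus-private poison-list API with the generic badrange one. A short kernel-context sketch of the expected call pattern is below, using only the prototypes declared above; the my_bus structure and the error-source comments are illustrative assumptions.

```c
/*
 * Kernel-context sketch of the badrange API call pattern: init once,
 * add ranges as media errors are reported, forget them when cleared.
 * The my_bus wrapper is a placeholder.
 */
struct my_bus {
	struct badrange badrange;
};

static void my_bus_init(struct my_bus *bus)
{
	badrange_init(&bus->badrange);
}

static int my_bus_record_media_error(struct my_bus *bus, u64 spa, u64 len)
{
	/* e.g. from an ARS status payload */
	return badrange_add(&bus->badrange, spa, len);
}

static void my_bus_clear_media_error(struct my_bus *bus, phys_addr_t spa,
		unsigned int len)
{
	/* after a successful clear-error command */
	badrange_forget(&bus->badrange, spa, len);
}
```
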
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c7b1d617dff6..ee073146aaa7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -199,6 +199,7 @@ extern unsigned int kobjsize(const void *objp);
199#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ 199#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
200#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 200#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
201#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 201#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
202#define VM_SYNC 0x00800000 /* Synchronous page faults */
202#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ 203#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
203#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ 204#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
204#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ 205#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
@@ -1191,8 +1192,9 @@ static inline void clear_page_pfmemalloc(struct page *page)
1191#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ 1192#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
1192#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ 1193#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
1193#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ 1194#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
1194 1195#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables
1195#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ 1196 * and needs fsync() to complete (for
1197 * synchronous page faults in DAX) */
1196 1198
1197#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ 1199#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
1198 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ 1200 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
@@ -1210,7 +1212,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
1210 { VM_FAULT_LOCKED, "LOCKED" }, \ 1212 { VM_FAULT_LOCKED, "LOCKED" }, \
1211 { VM_FAULT_RETRY, "RETRY" }, \ 1213 { VM_FAULT_RETRY, "RETRY" }, \
1212 { VM_FAULT_FALLBACK, "FALLBACK" }, \ 1214 { VM_FAULT_FALLBACK, "FALLBACK" }, \
1213 { VM_FAULT_DONE_COW, "DONE_COW" } 1215 { VM_FAULT_DONE_COW, "DONE_COW" }, \
1216 { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
1214 1217
1215/* Encode hstate index for a hwpoisoned large page */ 1218/* Encode hstate index for a hwpoisoned large page */
1216#define VM_FAULT_SET_HINDEX(x) ((x) << 12) 1219#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 7c87b6652244..6a4d1caaff5c 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -8,6 +8,48 @@
8#include <linux/atomic.h> 8#include <linux/atomic.h>
9#include <uapi/linux/mman.h> 9#include <uapi/linux/mman.h>
10 10
11/*
12 * Arrange for legacy / undefined architecture specific flags to be
13 * ignored by mmap handling code.
14 */
15#ifndef MAP_32BIT
16#define MAP_32BIT 0
17#endif
18#ifndef MAP_HUGE_2MB
19#define MAP_HUGE_2MB 0
20#endif
21#ifndef MAP_HUGE_1GB
22#define MAP_HUGE_1GB 0
23#endif
24#ifndef MAP_UNINITIALIZED
25#define MAP_UNINITIALIZED 0
26#endif
27#ifndef MAP_SYNC
28#define MAP_SYNC 0
29#endif
30
31/*
32 * The historical set of flags that all mmap implementations implicitly
33 * support when a ->mmap_validate() op is not provided in file_operations.
34 */
35#define LEGACY_MAP_MASK (MAP_SHARED \
36 | MAP_PRIVATE \
37 | MAP_FIXED \
38 | MAP_ANONYMOUS \
39 | MAP_DENYWRITE \
40 | MAP_EXECUTABLE \
41 | MAP_UNINITIALIZED \
42 | MAP_GROWSDOWN \
43 | MAP_LOCKED \
44 | MAP_NORESERVE \
45 | MAP_POPULATE \
46 | MAP_NONBLOCK \
47 | MAP_STACK \
48 | MAP_HUGETLB \
49 | MAP_32BIT \
50 | MAP_HUGE_2MB \
51 | MAP_HUGE_1GB)
52
11extern int sysctl_overcommit_memory; 53extern int sysctl_overcommit_memory;
12extern int sysctl_overcommit_ratio; 54extern int sysctl_overcommit_ratio;
13extern unsigned long sysctl_overcommit_kbytes; 55extern unsigned long sysctl_overcommit_kbytes;
@@ -64,8 +106,9 @@ static inline bool arch_validate_prot(unsigned long prot)
64 * ("bit1" and "bit2" must be single bits) 106 * ("bit1" and "bit2" must be single bits)
65 */ 107 */
66#define _calc_vm_trans(x, bit1, bit2) \ 108#define _calc_vm_trans(x, bit1, bit2) \
109 ((!(bit1) || !(bit2)) ? 0 : \
67 ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ 110 ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
68 : ((x) & (bit1)) / ((bit1) / (bit2))) 111 : ((x) & (bit1)) / ((bit1) / (bit2))))
69 112
70/* 113/*
71 * Combine the mmap "prot" argument into "vm_flags" used internally. 114 * Combine the mmap "prot" argument into "vm_flags" used internally.
@@ -87,7 +130,8 @@ calc_vm_flag_bits(unsigned long flags)
87{ 130{
88 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | 131 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
89 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | 132 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
90 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); 133 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
134 _calc_vm_trans(flags, MAP_SYNC, VM_SYNC );
91} 135}
92 136
93unsigned long vm_commit_limit(void); 137unsigned long vm_commit_limit(void);
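
_calc_vm_trans() scales a single MAP_* bit into the corresponding VM_* bit by multiplying or dividing by the ratio of the two masks; the new leading `(!(bit1) || !(bit2)) ? 0 :` term collapses the translation to zero on architectures that define MAP_SYNC as 0. The standalone demo below uses the values added in this series (MAP_SYNC 0x80000, VM_SYNC 0x00800000).

```c
/*
 * Standalone demo of _calc_vm_trans(): MAP_SYNC (0x80000) scales up by
 * a factor of 16 to VM_SYNC (0x00800000).  Values copied from the
 * uapi/mm.h hunks in this diff.
 */
#include <stdio.h>

#define MAP_SYNC	0x80000
#define VM_SYNC		0x00800000

#define _calc_vm_trans(x, bit1, bit2) \
	((!(bit1) || !(bit2)) ? 0 : \
	 ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
			   : ((x) & (bit1)) / ((bit1) / (bit2))))

int main(void)
{
	unsigned long flags = MAP_SYNC;
	unsigned long vm = _calc_vm_trans(flags, MAP_SYNC, VM_SYNC);

	/* 0x80000 * (0x800000 / 0x80000) == 0x800000 == VM_SYNC */
	printf("%#lx\n", vm);
	return 0;
}
```

The guard matters because an architecture that leaves MAP_SYNC at 0 would otherwise divide by zero inside the macro; with it, calc_vm_flag_bits() simply contributes nothing for that flag.
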
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 8a8df5423dca..97b09fcf7e52 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -149,7 +149,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
149 TP_ARGS(inode, vmf, length, pfn, radix_entry)) 149 TP_ARGS(inode, vmf, length, pfn, radix_entry))
150 150
151DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); 151DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
152DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
153 152
154DECLARE_EVENT_CLASS(dax_pte_fault_class, 153DECLARE_EVENT_CLASS(dax_pte_fault_class,
155 TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), 154 TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
@@ -192,6 +191,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
192DEFINE_PTE_FAULT_EVENT(dax_pte_fault); 191DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
193DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); 192DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
194DEFINE_PTE_FAULT_EVENT(dax_load_hole); 193DEFINE_PTE_FAULT_EVENT(dax_load_hole);
194DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry);
195DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite);
195 196
196TRACE_EVENT(dax_insert_mapping, 197TRACE_EVENT(dax_insert_mapping,
197 TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), 198 TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6d319c46fd90..f8b134f5608f 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -17,6 +17,7 @@
17 17
18#define MAP_SHARED 0x01 /* Share changes */ 18#define MAP_SHARED 0x01 /* Share changes */
19#define MAP_PRIVATE 0x02 /* Changes are private */ 19#define MAP_PRIVATE 0x02 /* Changes are private */
20#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
20#define MAP_TYPE 0x0f /* Mask for type of mapping */ 21#define MAP_TYPE 0x0f /* Mask for type of mapping */
21#define MAP_FIXED 0x10 /* Interpret addr exactly */ 22#define MAP_FIXED 0x10 /* Interpret addr exactly */
22#define MAP_ANONYMOUS 0x20 /* don't use a file */ 23#define MAP_ANONYMOUS 0x20 /* don't use a file */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 2dffcbf705b3..653687d9771b 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -13,6 +13,7 @@
 #define MAP_NONBLOCK 0x10000 /* do not block on IO */
 #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB 0x40000 /* create a huge page mapping */
+#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */
 
 /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 680506faceae..924839fac0e6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 
 	if (file) {
 		struct inode *inode = file_inode(file);
+		unsigned long flags_mask;
+
+		flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
 
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
+			/*
+			 * Force use of MAP_SHARED_VALIDATE with non-legacy
+			 * flags. E.g. MAP_SYNC is dangerous to use with
+			 * MAP_SHARED as you don't know which consistency model
+			 * you will get. We silently ignore unsupported flags
+			 * with MAP_SHARED to preserve backward compatibility.
+			 */
+			flags &= LEGACY_MAP_MASK;
+			/* fall through */
+		case MAP_SHARED_VALIDATE:
+			if (flags & ~flags_mask)
+				return -EOPNOTSUPP;
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
 				return -EACCES;
 
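
Illustrative userspace usage (not part of the diff): because unsupported flags are silently dropped for plain MAP_SHARED but rejected with EOPNOTSUPP under MAP_SHARED_VALIDATE, an application that actually depends on synchronous faults should request MAP_SHARED_VALIDATE | MAP_SYNC and fall back to explicit fsync()/msync() when the mapping is refused. A hedged sketch, assuming a file of at least one page on a DAX-capable filesystem at a placeholder path:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* fallback definitions for older libc headers; the values follow the
	 * generic UAPI headers added in this series */
	#ifndef MAP_SHARED_VALIDATE
	#define MAP_SHARED_VALIDATE	0x03
	#endif
	#ifndef MAP_SYNC
	#define MAP_SYNC		0x80000
	#endif

	int main(void)
	{
		/* placeholder path; assumes the file exists and is >= 4 KiB */
		int fd = open("/mnt/pmem/data", O_RDWR);
		if (fd < 0)
			return 1;

		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
		if (p == MAP_FAILED) {
			if (errno == EOPNOTSUPP)
				fprintf(stderr, "MAP_SYNC unsupported, fall back to fsync()\n");
			close(fd);
			return 1;
		}

		/* after the write fault, the metadata needed to reach this block
		 * is already durable; the application is still responsible for
		 * flushing its own stores (CPU cache flush instructions or msync) */
		memcpy(p, "hello", 5);

		munmap(p, 4096);
		close(fd);
		return 0;
	}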
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h
index 6d319c46fd90..f8b134f5608f 100644
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -17,6 +17,7 @@
 
 #define MAP_SHARED 0x01 /* Share changes */
 #define MAP_PRIVATE 0x02 /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
 #define MAP_TYPE 0x0f /* Mask for type of mapping */
 #define MAP_FIXED 0x10 /* Interpret addr exactly */
 #define MAP_ANONYMOUS 0x20 /* don't use a file */
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 65368d9027f5..db33b28c5ef3 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -70,6 +70,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/region.o
 libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/label.o
+libnvdimm-y += $(NVDIMM_SRC)/badrange.o
 libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
 libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index bef419d4266d..7217b2b953b5 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -168,8 +168,12 @@ struct nfit_test {
 		spinlock_t lock;
 	} ars_state;
 	struct device *dimm_dev[NUM_DCR];
+	struct badrange badrange;
+	struct work_struct work;
 };
 
+static struct workqueue_struct *nfit_wq;
+
 static struct nfit_test *to_nfit_test(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -234,48 +238,68 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
 	return rc;
 }
 
-#define NFIT_TEST_ARS_RECORDS 4
 #define NFIT_TEST_CLEAR_ERR_UNIT 256
 
 static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
 		unsigned int buf_len)
 {
+	int ars_recs;
+
 	if (buf_len < sizeof(*nd_cmd))
 		return -EINVAL;
 
+	/* for testing, only store up to n records that fit within 4k */
+	ars_recs = SZ_4K / sizeof(struct nd_ars_record);
+
 	nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status)
-		+ NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record);
+		+ ars_recs * sizeof(struct nd_ars_record);
 	nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
 	nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT;
 
 	return 0;
 }
 
-/*
- * Initialize the ars_state to return an ars_result 1 second in the future with
- * a 4K error range in the middle of the requested address range.
- */
-static void post_ars_status(struct ars_state *ars_state, u64 addr, u64 len)
+static void post_ars_status(struct ars_state *ars_state,
+		struct badrange *badrange, u64 addr, u64 len)
 {
 	struct nd_cmd_ars_status *ars_status;
 	struct nd_ars_record *ars_record;
+	struct badrange_entry *be;
+	u64 end = addr + len - 1;
+	int i = 0;
 
 	ars_state->deadline = jiffies + 1*HZ;
 	ars_status = ars_state->ars_status;
 	ars_status->status = 0;
-	ars_status->out_length = sizeof(struct nd_cmd_ars_status)
-		+ sizeof(struct nd_ars_record);
 	ars_status->address = addr;
 	ars_status->length = len;
 	ars_status->type = ND_ARS_PERSISTENT;
-	ars_status->num_records = 1;
-	ars_record = &ars_status->records[0];
-	ars_record->handle = 0;
-	ars_record->err_address = addr + len / 2;
-	ars_record->length = SZ_4K;
+
+	spin_lock(&badrange->lock);
+	list_for_each_entry(be, &badrange->list, list) {
+		u64 be_end = be->start + be->length - 1;
+		u64 rstart, rend;
+
+		/* skip entries outside the range */
+		if (be_end < addr || be->start > end)
+			continue;
+
+		rstart = (be->start < addr) ? addr : be->start;
+		rend = (be_end < end) ? be_end : end;
+		ars_record = &ars_status->records[i];
+		ars_record->handle = 0;
+		ars_record->err_address = rstart;
+		ars_record->length = rend - rstart + 1;
+		i++;
+	}
+	spin_unlock(&badrange->lock);
+	ars_status->num_records = i;
+	ars_status->out_length = sizeof(struct nd_cmd_ars_status)
+		+ i * sizeof(struct nd_ars_record);
 }
 
-static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
+static int nfit_test_cmd_ars_start(struct nfit_test *t,
+		struct ars_state *ars_state,
 		struct nd_cmd_ars_start *ars_start, unsigned int buf_len,
 		int *cmd_rc)
 {
@@ -289,7 +313,7 @@ static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
 	} else {
 		ars_start->status = 0;
 		ars_start->scrub_time = 1;
-		post_ars_status(ars_state, ars_start->address,
+		post_ars_status(ars_state, &t->badrange, ars_start->address,
 				ars_start->length);
 		*cmd_rc = 0;
 	}
@@ -320,7 +344,8 @@ static int nfit_test_cmd_ars_status(struct ars_state *ars_state,
 	return 0;
 }
 
-static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
+static int nfit_test_cmd_clear_error(struct nfit_test *t,
+		struct nd_cmd_clear_error *clear_err,
 		unsigned int buf_len, int *cmd_rc)
 {
 	const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1;
@@ -330,18 +355,91 @@ static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
 	if ((clear_err->address & mask) || (clear_err->length & mask))
 		return -EINVAL;
 
-	/*
-	 * Report 'all clear' success for all commands even though a new
-	 * scrub will find errors again. This is enough to have the
-	 * error removed from the 'badblocks' tracking in the pmem
-	 * driver.
-	 */
+	badrange_forget(&t->badrange, clear_err->address, clear_err->length);
 	clear_err->status = 0;
 	clear_err->cleared = clear_err->length;
 	*cmd_rc = 0;
 	return 0;
 }
 
+struct region_search_spa {
+	u64 addr;
+	struct nd_region *region;
+};
+
+static int is_region_device(struct device *dev)
+{
+	return !strncmp(dev->kobj.name, "region", 6);
+}
+
+static int nfit_test_search_region_spa(struct device *dev, void *data)
+{
+	struct region_search_spa *ctx = data;
+	struct nd_region *nd_region;
+	resource_size_t ndr_end;
+
+	if (!is_region_device(dev))
+		return 0;
+
+	nd_region = to_nd_region(dev);
+	ndr_end = nd_region->ndr_start + nd_region->ndr_size;
+
+	if (ctx->addr >= nd_region->ndr_start && ctx->addr < ndr_end) {
+		ctx->region = nd_region;
+		return 1;
+	}
+
+	return 0;
+}
+
+static int nfit_test_search_spa(struct nvdimm_bus *bus,
+		struct nd_cmd_translate_spa *spa)
+{
+	int ret;
+	struct nd_region *nd_region = NULL;
+	struct nvdimm *nvdimm = NULL;
+	struct nd_mapping *nd_mapping = NULL;
+	struct region_search_spa ctx = {
+		.addr = spa->spa,
+		.region = NULL,
+	};
+	u64 dpa;
+
+	ret = device_for_each_child(&bus->dev, &ctx,
+			nfit_test_search_region_spa);
+
+	if (!ret)
+		return -ENODEV;
+
+	nd_region = ctx.region;
+
+	dpa = ctx.addr - nd_region->ndr_start;
+
+	/*
+	 * last dimm is selected for test
+	 */
+	nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1];
+	nvdimm = nd_mapping->nvdimm;
+
+	spa->devices[0].nfit_device_handle = handle[nvdimm->id];
+	spa->num_nvdimms = 1;
+	spa->devices[0].dpa = dpa;
+
+	return 0;
+}
+
+static int nfit_test_cmd_translate_spa(struct nvdimm_bus *bus,
+		struct nd_cmd_translate_spa *spa, unsigned int buf_len)
+{
+	if (buf_len < spa->translate_length)
+		return -EINVAL;
+
+	if (nfit_test_search_spa(bus, spa) < 0 || !spa->num_nvdimms)
+		spa->status = 2;
+
+	return 0;
+}
+
 static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len)
 {
 	static const struct nd_smart_payload smart_data = {
@@ -378,6 +476,93 @@ static int nfit_test_cmd_smart_threshold(struct nd_cmd_smart_threshold *smart_t,
 	return 0;
 }
 
+static void uc_error_notify(struct work_struct *work)
+{
+	struct nfit_test *t = container_of(work, typeof(*t), work);
+
+	__acpi_nfit_notify(&t->pdev.dev, t, NFIT_NOTIFY_UC_MEMORY_ERROR);
+}
+
+static int nfit_test_cmd_ars_error_inject(struct nfit_test *t,
+		struct nd_cmd_ars_err_inj *err_inj, unsigned int buf_len)
+{
+	int rc;
+
+	if (buf_len != sizeof(*err_inj)) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	if (err_inj->err_inj_spa_range_length <= 0) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	rc = badrange_add(&t->badrange, err_inj->err_inj_spa_range_base,
+			err_inj->err_inj_spa_range_length);
+	if (rc < 0)
+		goto err;
+
+	if (err_inj->err_inj_options & (1 << ND_ARS_ERR_INJ_OPT_NOTIFY))
+		queue_work(nfit_wq, &t->work);
+
+	err_inj->status = 0;
+	return 0;
+
+err:
+	err_inj->status = NFIT_ARS_INJECT_INVALID;
+	return rc;
+}
+
+static int nfit_test_cmd_ars_inject_clear(struct nfit_test *t,
+		struct nd_cmd_ars_err_inj_clr *err_clr, unsigned int buf_len)
+{
+	int rc;
+
+	if (buf_len != sizeof(*err_clr)) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	if (err_clr->err_inj_clr_spa_range_length <= 0) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	badrange_forget(&t->badrange, err_clr->err_inj_clr_spa_range_base,
+			err_clr->err_inj_clr_spa_range_length);
+
+	err_clr->status = 0;
+	return 0;
+
+err:
+	err_clr->status = NFIT_ARS_INJECT_INVALID;
+	return rc;
+}
+
+static int nfit_test_cmd_ars_inject_status(struct nfit_test *t,
+		struct nd_cmd_ars_err_inj_stat *err_stat,
+		unsigned int buf_len)
+{
+	struct badrange_entry *be;
+	int max = SZ_4K / sizeof(struct nd_error_stat_query_record);
+	int i = 0;
+
+	err_stat->status = 0;
+	spin_lock(&t->badrange.lock);
+	list_for_each_entry(be, &t->badrange.list, list) {
+		err_stat->record[i].err_inj_stat_spa_range_base = be->start;
+		err_stat->record[i].err_inj_stat_spa_range_length = be->length;
+		i++;
+		if (i > max)
+			break;
+	}
+	spin_unlock(&t->badrange.lock);
+	err_stat->inj_err_rec_count = i;
+
+	return 0;
+}
+
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc)
@@ -449,6 +634,38 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		}
 	} else {
 		struct ars_state *ars_state = &t->ars_state;
+		struct nd_cmd_pkg *call_pkg = buf;
+
+		if (!nd_desc)
+			return -ENOTTY;
+
+		if (cmd == ND_CMD_CALL) {
+			func = call_pkg->nd_command;
+
+			buf_len = call_pkg->nd_size_in + call_pkg->nd_size_out;
+			buf = (void *) call_pkg->nd_payload;
+
+			switch (func) {
+			case NFIT_CMD_TRANSLATE_SPA:
+				rc = nfit_test_cmd_translate_spa(
+					acpi_desc->nvdimm_bus, buf, buf_len);
+				return rc;
+			case NFIT_CMD_ARS_INJECT_SET:
+				rc = nfit_test_cmd_ars_error_inject(t, buf,
+					buf_len);
+				return rc;
+			case NFIT_CMD_ARS_INJECT_CLEAR:
+				rc = nfit_test_cmd_ars_inject_clear(t, buf,
+					buf_len);
+				return rc;
+			case NFIT_CMD_ARS_INJECT_GET:
+				rc = nfit_test_cmd_ars_inject_status(t, buf,
+					buf_len);
+				return rc;
+			default:
+				return -ENOTTY;
+			}
+		}
 
 		if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask))
 			return -ENOTTY;
@@ -458,15 +675,15 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			rc = nfit_test_cmd_ars_cap(buf, buf_len);
 			break;
 		case ND_CMD_ARS_START:
-			rc = nfit_test_cmd_ars_start(ars_state, buf, buf_len,
-					cmd_rc);
+			rc = nfit_test_cmd_ars_start(t, ars_state, buf,
+					buf_len, cmd_rc);
 			break;
 		case ND_CMD_ARS_STATUS:
 			rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len,
 					cmd_rc);
 			break;
 		case ND_CMD_CLEAR_ERROR:
-			rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc);
+			rc = nfit_test_cmd_clear_error(t, buf, buf_len, cmd_rc);
 			break;
 		default:
 			return -ENOTTY;
@@ -566,10 +783,9 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
 
 static int ars_state_init(struct device *dev, struct ars_state *ars_state)
 {
+	/* for testing, only store up to n records that fit within 4k */
 	ars_state->ars_status = devm_kzalloc(dev,
-			sizeof(struct nd_cmd_ars_status)
-			+ sizeof(struct nd_ars_record) * NFIT_TEST_ARS_RECORDS,
-			GFP_KERNEL);
+			sizeof(struct nd_cmd_ars_status) + SZ_4K, GFP_KERNEL);
 	if (!ars_state->ars_status)
 		return -ENOMEM;
 	spin_lock_init(&ars_state->lock);
@@ -1419,7 +1635,8 @@ static void nfit_test0_setup(struct nfit_test *t)
 			+ i * sizeof(u64);
 	}
 
-	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE);
+	post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
+			SPA0_SIZE);
 
 	acpi_desc = &t->acpi_desc;
 	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en);
@@ -1430,7 +1647,12 @@ static void nfit_test0_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_CALL, &acpi_desc->bus_cmd_force_en);
 	set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_cmd_force_en);
+	set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_nfit_cmd_force_en);
+	set_bit(NFIT_CMD_ARS_INJECT_SET, &acpi_desc->bus_nfit_cmd_force_en);
+	set_bit(NFIT_CMD_ARS_INJECT_CLEAR, &acpi_desc->bus_nfit_cmd_force_en);
+	set_bit(NFIT_CMD_ARS_INJECT_GET, &acpi_desc->bus_nfit_cmd_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1520,7 +1742,8 @@ static void nfit_test1_setup(struct nfit_test *t)
 	dcr->code = NFIT_FIC_BYTE;
 	dcr->windows = 0;
 
-	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
+	post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
+			SPA2_SIZE);
 
 	acpi_desc = &t->acpi_desc;
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
@@ -1589,6 +1812,7 @@ static int nfit_ctl_test(struct device *dev)
 	unsigned long mask, cmd_size, offset;
 	union {
 		struct nd_cmd_get_config_size cfg_size;
+		struct nd_cmd_clear_error clear_err;
 		struct nd_cmd_ars_status ars_stat;
 		struct nd_cmd_ars_cap ars_cap;
 		char buf[sizeof(struct nd_cmd_ars_status)
@@ -1613,10 +1837,15 @@ static int nfit_ctl_test(struct device *dev)
 		.cmd_mask = 1UL << ND_CMD_ARS_CAP
 			| 1UL << ND_CMD_ARS_START
 			| 1UL << ND_CMD_ARS_STATUS
-			| 1UL << ND_CMD_CLEAR_ERROR,
+			| 1UL << ND_CMD_CLEAR_ERROR
+			| 1UL << ND_CMD_CALL,
 		.module = THIS_MODULE,
 		.provider_name = "ACPI.NFIT",
 		.ndctl = acpi_nfit_ctl,
+		.bus_dsm_mask = 1UL << NFIT_CMD_TRANSLATE_SPA
+			| 1UL << NFIT_CMD_ARS_INJECT_SET
+			| 1UL << NFIT_CMD_ARS_INJECT_CLEAR
+			| 1UL << NFIT_CMD_ARS_INJECT_GET,
 	},
 	.dev = &adev->dev,
 };
@@ -1767,6 +1996,23 @@ static int nfit_ctl_test(struct device *dev)
 		return -EIO;
 	}
 
+	/* test clear error */
+	cmd_size = sizeof(cmds.clear_err);
+	cmds.clear_err = (struct nd_cmd_clear_error) {
+		.length = 512,
+		.cleared = 512,
+	};
+	rc = setup_result(cmds.buf, cmd_size);
+	if (rc)
+		return rc;
+	rc = acpi_nfit_ctl(&acpi_desc->nd_desc, NULL, ND_CMD_CLEAR_ERROR,
+			cmds.buf, cmd_size, &cmd_rc);
+	if (rc < 0 || cmd_rc) {
+		dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n",
+				__func__, __LINE__, rc, cmd_rc);
+		return -EIO;
+	}
+
 	return 0;
 }
 
@@ -1915,6 +2161,10 @@ static __init int nfit_test_init(void)
 
 	nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
 
+	nfit_wq = create_singlethread_workqueue("nfit");
+	if (!nfit_wq)
+		return -ENOMEM;
+
 	nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm");
 	if (IS_ERR(nfit_test_dimm)) {
 		rc = PTR_ERR(nfit_test_dimm);
@@ -1931,6 +2181,7 @@ static __init int nfit_test_init(void)
 			goto err_register;
 		}
 		INIT_LIST_HEAD(&nfit_test->resources);
+		badrange_init(&nfit_test->badrange);
 		switch (i) {
 		case 0:
 			nfit_test->num_pm = NUM_PM;
@@ -1966,6 +2217,7 @@ static __init int nfit_test_init(void)
 			goto err_register;
 
 		instances[i] = nfit_test;
+		INIT_WORK(&nfit_test->work, uc_error_notify);
 	}
 
 	rc = platform_driver_register(&nfit_test_driver);
@@ -1974,6 +2226,7 @@ static __init int nfit_test_init(void)
 	return 0;
 
 err_register:
+	destroy_workqueue(nfit_wq);
 	for (i = 0; i < NUM_NFITS; i++)
 		if (instances[i])
 			platform_device_unregister(&instances[i]->pdev);
@@ -1989,6 +2242,8 @@ static __exit void nfit_test_exit(void)
 {
 	int i;
 
+	flush_workqueue(nfit_wq);
+	destroy_workqueue(nfit_wq);
 	for (i = 0; i < NUM_NFITS; i++)
 		platform_device_unregister(&instances[i]->pdev);
 	platform_driver_unregister(&nfit_test_driver);
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index d3d63dd5ed38..113b44675a71 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -32,6 +32,58 @@ struct nfit_test_resource {
 	void *buf;
 };
 
+#define ND_TRANSLATE_SPA_STATUS_INVALID_SPA 2
+#define NFIT_ARS_INJECT_INVALID 2
+
+enum err_inj_options {
+	ND_ARS_ERR_INJ_OPT_NOTIFY = 0,
+};
+
+/* nfit commands */
+enum nfit_cmd_num {
+	NFIT_CMD_TRANSLATE_SPA = 5,
+	NFIT_CMD_ARS_INJECT_SET = 7,
+	NFIT_CMD_ARS_INJECT_CLEAR = 8,
+	NFIT_CMD_ARS_INJECT_GET = 9,
+};
+
+struct nd_cmd_translate_spa {
+	__u64 spa;
+	__u32 status;
+	__u8 flags;
+	__u8 _reserved[3];
+	__u64 translate_length;
+	__u32 num_nvdimms;
+	struct nd_nvdimm_device {
+		__u32 nfit_device_handle;
+		__u32 _reserved;
+		__u64 dpa;
+	} __packed devices[0];
+
+} __packed;
+
+struct nd_cmd_ars_err_inj {
+	__u64 err_inj_spa_range_base;
+	__u64 err_inj_spa_range_length;
+	__u8 err_inj_options;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_clr {
+	__u64 err_inj_clr_spa_range_base;
+	__u64 err_inj_clr_spa_range_length;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_stat {
+	__u32 status;
+	__u32 inj_err_rec_count;
+	struct nd_error_stat_query_record {
+		__u64 err_inj_stat_spa_range_base;
+		__u64 err_inj_stat_spa_range_length;
+	} __packed record[0];
+} __packed;
+
 union acpi_object;
 typedef void *acpi_handle;
 