aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ntfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ntfs')
-rw-r--r--fs/ntfs/ChangeLog38
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/file.c2247
3 files changed, 2242 insertions, 45 deletions
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 3b8ff2318085..03015c7b236c 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -1,16 +1,15 @@
1ToDo/Notes: 1ToDo/Notes:
2 - Find and fix bugs. 2 - Find and fix bugs.
3 - In between ntfs_prepare/commit_write, need exclusion between 3 - The only places in the kernel where a file is resized are
4 simultaneous file extensions. This is given to us by holding i_sem 4 ntfs_file_write*() and ntfs_truncate() for both of which i_sem is
5 on the inode. The only places in the kernel when a file is resized 5 held. Just have to be careful in read-/writepage and other helpers
6 are prepare/commit write and ntfs_truncate() for both of which i_sem
7 is held. Just have to be careful in read-/writepage and other helpers
8 not running under i_sem that we play nice... Also need to be careful 6 not running under i_sem that we play nice... Also need to be careful
9 with initialized_size extention in ntfs_prepare_write and writepage. 7 with initialized_size extension in ntfs_file_write*() and writepage.
10 UPDATE: The only things that need to be checked are 8 UPDATE: The only things that need to be checked are the compressed
11 prepare/commit_write as well as the compressed write and the other 9 write and the other attribute resize/write cases like index
12 attribute resize/write cases like index attributes, etc. For now 10 attributes, etc. For now none of these are implemented so are safe.
13 none of these are implemented so are safe. 11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 helpers.
14 - Implement mft.c::sync_mft_mirror_umount(). We currently will just 13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
15 leave the volume dirty on umount if the final iput(vol->mft_ino) 14 leave the volume dirty on umount if the final iput(vol->mft_ino)
16 causes a write of any mirrored mft records due to the mft mirror 15 causes a write of any mirrored mft records due to the mft mirror
@@ -20,7 +19,7 @@ ToDo/Notes:
20 - Enable the code for setting the NT4 compatibility flag when we start 19 - Enable the code for setting the NT4 compatibility flag when we start
21 making NTFS 1.2 specific modifications. 20 making NTFS 1.2 specific modifications.
22 21
232.1.25-WIP 222.1.25 - (Almost) fully implement write(2) and truncate(2).
24 23
25 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and 24 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
26 {__,}ntfs_cluster_free() to also take an optional attribute search 25 {__,}ntfs_cluster_free() to also take an optional attribute search
@@ -49,7 +48,12 @@ ToDo/Notes:
49 extend the allocation of an attributes. Optionally, the data size, 48 extend the allocation of an attributes. Optionally, the data size,
50 but not the initialized size can be extended, too. 49 but not the initialized size can be extended, too.
51 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports 50 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
52 uncompressed and unencrypted files. 51 uncompressed and unencrypted files and it never creates sparse files
52 at least for the moment (making a file sparse requires us to modify
53 its directory entries and we do not support directory operations at
54 the moment). Also, support for highly fragmented files, i.e. ones
55 whose data attribute is split across multiple extents, is severely
56 limited. When such a case is encountered, EOPNOTSUPP is returned.
53 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes 57 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
54 the initial implementation of file truncation. Now both open(2)ing 58 the initial implementation of file truncation. Now both open(2)ing
55 a file with the O_TRUNC flag and the {,f}truncate(2) system calls 59 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
@@ -61,6 +65,16 @@ ToDo/Notes:
61 and cond_resched() in the main loop as we could be dirtying a lot of 65 and cond_resched() in the main loop as we could be dirtying a lot of
62 pages and this ensures we play nice with the VM and the system as a 66 pages and this ensures we play nice with the VM and the system as a
63 whole. 67 whole.
68 - Implement file operations ->write, ->aio_write, ->writev for regular
69 files. This replaces the old use of generic_file_write(), et al and
70 the address space operations ->prepare_write and ->commit_write.
71 This means that both sparse and non-sparse (unencrypted and
72 uncompressed) files can now be extended using the normal write(2)
73 code path. There are two limitations at present and these are that
74 we never create sparse files and that we only have limited support
75 for highly fragmented files, i.e. ones whose data attribute is split
76 across multiple extents. When such a case is encountered,
77 EOPNOTSUPP is returned.
64 78
652.1.24 - Lots of bug fixes and support more clean journal states. 792.1.24 - Lots of bug fixes and support more clean journal states.
66 80
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index a3ce2c0e7dd9..d0d45d1c853a 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25-WIP\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index be9fd1dd423d..cf2a0e2330df 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,11 +19,24 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/pagemap.h>
23#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h>
24#include <linux/pagevec.h>
25#include <linux/sched.h>
26#include <linux/swap.h>
27#include <linux/uio.h>
28#include <linux/writeback.h>
24 29
30#include <asm/page.h>
31#include <asm/uaccess.h>
32
33#include "attrib.h"
34#include "bitmap.h"
25#include "inode.h" 35#include "inode.h"
26#include "debug.h" 36#include "debug.h"
37#include "lcnalloc.h"
38#include "malloc.h"
39#include "mft.h"
27#include "ntfs.h" 40#include "ntfs.h"
28 41
29/** 42/**
@@ -56,6 +69,2176 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
56#ifdef NTFS_RW 69#ifdef NTFS_RW
57 70
58/** 71/**
72 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
73 * @ni: ntfs inode of the attribute to extend
74 * @new_init_size: requested new initialized size in bytes
75 * @cached_page: store any allocated but unused page here
76 * @lru_pvec: lru-buffering pagevec of the caller
77 *
78 * Extend the initialized size of an attribute described by the ntfs inode @ni
79 * to @new_init_size bytes. This involves zeroing any non-sparse space between
80 * the old initialized size and @new_init_size both in the page cache and on
81 * disk (if relevant complete pages are zeroed in the page cache then these may
82 * simply be marked dirty for later writeout). There is one caveat and that is
83 * that if any uptodate page cache pages between the old initialized size and
84 * the smaller of @new_init_size and the file size (vfs inode->i_size) are in
85 * memory, these need to be marked dirty without being zeroed since they could
86 * be non-zero due to mmap() based writes.
87 *
88 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
89 * in the resident attribute case, it is tied to the initialized size and, in
90 * the non-resident attribute case, it may not fall below the initialized size.
91 *
92 * Note that if the attribute is resident, we do not need to touch the page
93 * cache at all. This is because if the page cache page is not uptodate we
94 * bring it uptodate later, when doing the write to the mft record since we
95 * then already have the page mapped. And if the page is uptodate, the
96 * non-initialized region will already have been zeroed when the page was
97 * brought uptodate and the region may in fact already have been overwritten
98 * with new data via mmap() based writes, so we cannot just zero it. And since
99 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
100 * is unspecified, we choose not to do zeroing and thus we do not need to touch
101 * the page at all. For a more detailed explanation see ntfs_truncate() which
102 * is in fs/ntfs/inode.c.
103 *
104 * @cached_page and @lru_pvec are just optimisations for dealing with multiple
105 * pages.
106 *
107 * Return 0 on success and -errno on error. In the case that an error is
108 * encountered it is possible that the initialized size will already have been
109 * incremented some way towards @new_init_size but it is guaranteed that if
110 * this is the case, the necessary zeroing will also have happened and that all
111 * metadata is self-consistent.
112 *
113 * Locking: This function locks the mft record of the base ntfs inode and
114 * maintains the lock throughout execution of the function. This is required
115 * so that the initialized size of the attribute can be modified safely.
116 */
117static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
118 struct page **cached_page, struct pagevec *lru_pvec)
119{
120 s64 old_init_size;
121 loff_t old_i_size;
122 pgoff_t index, end_index;
123 unsigned long flags;
124 struct inode *vi = VFS_I(ni);
125 ntfs_inode *base_ni;
126 MFT_RECORD *m = NULL;
127 ATTR_RECORD *a;
128 ntfs_attr_search_ctx *ctx = NULL;
129 struct address_space *mapping;
130 struct page *page = NULL;
131 u8 *kattr;
132 int err;
133 u32 attr_len;
134
135 read_lock_irqsave(&ni->size_lock, flags);
136 old_init_size = ni->initialized_size;
137 old_i_size = i_size_read(vi);
138 BUG_ON(new_init_size > ni->allocated_size);
139 read_unlock_irqrestore(&ni->size_lock, flags);
140 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
141 "old_initialized_size 0x%llx, "
142 "new_initialized_size 0x%llx, i_size 0x%llx.",
143 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
144 (unsigned long long)old_init_size,
145 (unsigned long long)new_init_size, old_i_size);
146 if (!NInoAttr(ni))
147 base_ni = ni;
148 else
149 base_ni = ni->ext.base_ntfs_ino;
150 /* Use goto to reduce indentation and we need the label below anyway. */
151 if (NInoNonResident(ni))
152 goto do_non_resident_extend;
153 BUG_ON(old_init_size != old_i_size);
154 m = map_mft_record(base_ni);
155 if (IS_ERR(m)) {
156 err = PTR_ERR(m);
157 m = NULL;
158 goto err_out;
159 }
160 ctx = ntfs_attr_get_search_ctx(base_ni, m);
161 if (unlikely(!ctx)) {
162 err = -ENOMEM;
163 goto err_out;
164 }
165 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
166 CASE_SENSITIVE, 0, NULL, 0, ctx);
167 if (unlikely(err)) {
168 if (err == -ENOENT)
169 err = -EIO;
170 goto err_out;
171 }
172 m = ctx->mrec;
173 a = ctx->attr;
174 BUG_ON(a->non_resident);
175 /* The total length of the attribute value. */
176 attr_len = le32_to_cpu(a->data.resident.value_length);
177 BUG_ON(old_i_size != (loff_t)attr_len);
178 /*
179 * Do the zeroing in the mft record and update the attribute size in
180 * the mft record.
181 */
182 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
183 memset(kattr + attr_len, 0, new_init_size - attr_len);
184 a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
185 /* Finally, update the sizes in the vfs and ntfs inodes. */
186 write_lock_irqsave(&ni->size_lock, flags);
187 i_size_write(vi, new_init_size);
188 ni->initialized_size = new_init_size;
189 write_unlock_irqrestore(&ni->size_lock, flags);
190 goto done;
191do_non_resident_extend:
192 /*
193 * If the new initialized size @new_init_size exceeds the current file
194 * size (vfs inode->i_size), we need to extend the file size to the
195 * new initialized size.
196 */
197 if (new_init_size > old_i_size) {
198 m = map_mft_record(base_ni);
199 if (IS_ERR(m)) {
200 err = PTR_ERR(m);
201 m = NULL;
202 goto err_out;
203 }
204 ctx = ntfs_attr_get_search_ctx(base_ni, m);
205 if (unlikely(!ctx)) {
206 err = -ENOMEM;
207 goto err_out;
208 }
209 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
210 CASE_SENSITIVE, 0, NULL, 0, ctx);
211 if (unlikely(err)) {
212 if (err == -ENOENT)
213 err = -EIO;
214 goto err_out;
215 }
216 m = ctx->mrec;
217 a = ctx->attr;
218 BUG_ON(!a->non_resident);
219 BUG_ON(old_i_size != (loff_t)
220 sle64_to_cpu(a->data.non_resident.data_size));
221 a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
222 flush_dcache_mft_record_page(ctx->ntfs_ino);
223 mark_mft_record_dirty(ctx->ntfs_ino);
224 /* Update the file size in the vfs inode. */
225 i_size_write(vi, new_init_size);
226 ntfs_attr_put_search_ctx(ctx);
227 ctx = NULL;
228 unmap_mft_record(base_ni);
229 m = NULL;
230 }
231 mapping = vi->i_mapping;
232 index = old_init_size >> PAGE_CACHE_SHIFT;
233 end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
234 do {
235 /*
236 * Read the page. If the page is not present, this will zero
237 * the uninitialized regions for us.
238 */
239 page = read_cache_page(mapping, index,
240 (filler_t*)mapping->a_ops->readpage, NULL);
241 if (IS_ERR(page)) {
242 err = PTR_ERR(page);
243 goto init_err_out;
244 }
245 wait_on_page_locked(page);
246 if (unlikely(!PageUptodate(page) || PageError(page))) {
247 page_cache_release(page);
248 err = -EIO;
249 goto init_err_out;
250 }
251 /*
252 * Update the initialized size in the ntfs inode. This is
253 * enough to make ntfs_writepage() work.
254 */
255 write_lock_irqsave(&ni->size_lock, flags);
256 ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT;
257 if (ni->initialized_size > new_init_size)
258 ni->initialized_size = new_init_size;
259 write_unlock_irqrestore(&ni->size_lock, flags);
260 /* Set the page dirty so it gets written out. */
261 set_page_dirty(page);
262 page_cache_release(page);
263 /*
264 * Play nice with the vm and the rest of the system. This is
265 * very much needed as we can potentially be modifying the
266 * initialised size from a very small value to a really huge
267 * value, e.g.
268 * f = open(somefile, O_TRUNC);
269 * truncate(f, 10GiB);
270 * seek(f, 10GiB);
271 * write(f, 1);
272 * And this would mean we would be marking dirty hundreds of
273 * thousands of pages or as in the above example more than
274 * two and a half million pages!
275 *
276 * TODO: For sparse pages could optimize this workload by using
277 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
278 * would be set in readpage for sparse pages and here we would
279 * not need to mark dirty any pages which have this bit set.
280 * The only caveat is that we have to clear the bit everywhere
281 * where we allocate any clusters that lie in the page or that
282 * contain the page.
283 *
284 * TODO: An even greater optimization would be for us to only
285 * call readpage() on pages which are not in sparse regions as
286 * determined from the runlist. This would greatly reduce the
287 * number of pages we read and make dirty in the case of sparse
288 * files.
289 */
290 balance_dirty_pages_ratelimited(mapping);
291 cond_resched();
292 } while (++index < end_index);
293 read_lock_irqsave(&ni->size_lock, flags);
294 BUG_ON(ni->initialized_size != new_init_size);
295 read_unlock_irqrestore(&ni->size_lock, flags);
296 /* Now bring in sync the initialized_size in the mft record. */
297 m = map_mft_record(base_ni);
298 if (IS_ERR(m)) {
299 err = PTR_ERR(m);
300 m = NULL;
301 goto init_err_out;
302 }
303 ctx = ntfs_attr_get_search_ctx(base_ni, m);
304 if (unlikely(!ctx)) {
305 err = -ENOMEM;
306 goto init_err_out;
307 }
308 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
309 CASE_SENSITIVE, 0, NULL, 0, ctx);
310 if (unlikely(err)) {
311 if (err == -ENOENT)
312 err = -EIO;
313 goto init_err_out;
314 }
315 m = ctx->mrec;
316 a = ctx->attr;
317 BUG_ON(!a->non_resident);
318 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
319done:
320 flush_dcache_mft_record_page(ctx->ntfs_ino);
321 mark_mft_record_dirty(ctx->ntfs_ino);
322 if (ctx)
323 ntfs_attr_put_search_ctx(ctx);
324 if (m)
325 unmap_mft_record(base_ni);
326 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
327 (unsigned long long)new_init_size, i_size_read(vi));
328 return 0;
329init_err_out:
330 write_lock_irqsave(&ni->size_lock, flags);
331 ni->initialized_size = old_init_size;
332 write_unlock_irqrestore(&ni->size_lock, flags);
333err_out:
334 if (ctx)
335 ntfs_attr_put_search_ctx(ctx);
336 if (m)
337 unmap_mft_record(base_ni);
338 ntfs_debug("Failed. Returning error code %i.", err);
339 return err;
340}
341
342/**
343 * ntfs_fault_in_pages_readable -
344 *
345 * Fault a number of userspace pages into pagetables.
346 *
347 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
348 * with more than two userspace pages as well as handling the single page case
349 * elegantly.
350 *
351 * If you find this difficult to understand, then think of the while loop being
352 * the following code, except that we do without the integer variable ret:
353 *
354 * do {
355 * ret = __get_user(c, uaddr);
356 * uaddr += PAGE_SIZE;
357 * } while (!ret && uaddr < end);
358 *
359 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
360 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
361 * this is only a read and not a write, and since it is still in the same page,
362 * it should not matter and this makes the code much simpler.
363 */
364static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
365 int bytes)
366{
367 const char __user *end;
368 volatile char c;
369
370 /* Set @end to the first byte outside the last page we care about. */
371 end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes);
372
373 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
374 ;
375}
376
377/**
378 * ntfs_fault_in_pages_readable_iovec -
379 *
380 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
381 */
382static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
383 size_t iov_ofs, int bytes)
384{
385 do {
386 const char __user *buf;
387 unsigned len;
388
389 buf = iov->iov_base + iov_ofs;
390 len = iov->iov_len - iov_ofs;
391 if (len > bytes)
392 len = bytes;
393 ntfs_fault_in_pages_readable(buf, len);
394 bytes -= len;
395 iov++;
396 iov_ofs = 0;
397 } while (bytes);
398}
399
400/**
401 * __ntfs_grab_cache_pages - obtain a number of locked pages
402 * @mapping: address space mapping from which to obtain page cache pages
403 * @index: starting index in @mapping at which to begin obtaining pages
404 * @nr_pages: number of page cache pages to obtain
405 * @pages: array of pages in which to return the obtained page cache pages
406 * @cached_page: allocated but as yet unused page
407 * @lru_pvec: lru-buffering pagevec of caller
408 *
409 * Obtain @nr_pages locked page cache pages from the mapping @maping and
410 * starting at index @index.
411 *
412 * If a page is newly created, increment its refcount and add it to the
413 * caller's lru-buffering pagevec @lru_pvec.
414 *
415 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
416 * are obtained at once instead of just one page and that 0 is returned on
417 * success and -errno on error.
418 *
419 * Note, the page locks are obtained in ascending page index order.
420 */
421static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
422 pgoff_t index, const unsigned nr_pages, struct page **pages,
423 struct page **cached_page, struct pagevec *lru_pvec)
424{
425 int err, nr;
426
427 BUG_ON(!nr_pages);
428 err = nr = 0;
429 do {
430 pages[nr] = find_lock_page(mapping, index);
431 if (!pages[nr]) {
432 if (!*cached_page) {
433 *cached_page = page_cache_alloc(mapping);
434 if (unlikely(!*cached_page)) {
435 err = -ENOMEM;
436 goto err_out;
437 }
438 }
439 err = add_to_page_cache(*cached_page, mapping, index,
440 GFP_KERNEL);
441 if (unlikely(err)) {
442 if (err == -EEXIST)
443 continue;
444 goto err_out;
445 }
446 pages[nr] = *cached_page;
447 page_cache_get(*cached_page);
448 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
449 __pagevec_lru_add(lru_pvec);
450 *cached_page = NULL;
451 }
452 index++;
453 nr++;
454 } while (nr < nr_pages);
455out:
456 return err;
457err_out:
458 while (nr > 0) {
459 unlock_page(pages[--nr]);
460 page_cache_release(pages[nr]);
461 }
462 goto out;
463}
464
465static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
466{
467 lock_buffer(bh);
468 get_bh(bh);
469 bh->b_end_io = end_buffer_read_sync;
470 return submit_bh(READ, bh);
471}
472
473/**
474 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
475 * @pages: array of destination pages
476 * @nr_pages: number of pages in @pages
477 * @pos: byte position in file at which the write begins
478 * @bytes: number of bytes to be written
479 *
480 * This is called for non-resident attributes from ntfs_file_buffered_write()
481 * with i_sem held on the inode (@pages[0]->mapping->host). There are
482 * @nr_pages pages in @pages which are locked but not kmap()ped. The source
483 * data has not yet been copied into the @pages.
484 *
485 * Need to fill any holes with actual clusters, allocate buffers if necessary,
486 * ensure all the buffers are mapped, and bring uptodate any buffers that are
487 * only partially being written to.
488 *
489 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
490 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
491 * the same cluster and that they are the entirety of that cluster, and that
492 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
493 *
494 * i_size is not to be modified yet.
495 *
496 * Return 0 on success or -errno on error.
497 */
498static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
499 unsigned nr_pages, s64 pos, size_t bytes)
500{
501 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
502 LCN lcn;
503 s64 bh_pos, vcn_len, end, initialized_size;
504 sector_t lcn_block;
505 struct page *page;
506 struct inode *vi;
507 ntfs_inode *ni, *base_ni = NULL;
508 ntfs_volume *vol;
509 runlist_element *rl, *rl2;
510 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
511 ntfs_attr_search_ctx *ctx = NULL;
512 MFT_RECORD *m = NULL;
513 ATTR_RECORD *a = NULL;
514 unsigned long flags;
515 u32 attr_rec_len = 0;
516 unsigned blocksize, u;
517 int err, mp_size;
518 BOOL rl_write_locked, was_hole, is_retry;
519 unsigned char blocksize_bits;
520 struct {
521 u8 runlist_merged:1;
522 u8 mft_attr_mapped:1;
523 u8 mp_rebuilt:1;
524 u8 attr_switched:1;
525 } status = { 0, 0, 0, 0 };
526
527 BUG_ON(!nr_pages);
528 BUG_ON(!pages);
529 BUG_ON(!*pages);
530 vi = pages[0]->mapping->host;
531 ni = NTFS_I(vi);
532 vol = ni->vol;
533 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
534 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%x.",
535 vi->i_ino, ni->type, pages[0]->index, nr_pages,
536 (long long)pos, bytes);
537 blocksize_bits = vi->i_blkbits;
538 blocksize = 1 << blocksize_bits;
539 u = 0;
540 do {
541 struct page *page = pages[u];
542 /*
543 * create_empty_buffers() will create uptodate/dirty buffers if
544 * the page is uptodate/dirty.
545 */
546 if (!page_has_buffers(page)) {
547 create_empty_buffers(page, blocksize, 0);
548 if (unlikely(!page_has_buffers(page)))
549 return -ENOMEM;
550 }
551 } while (++u < nr_pages);
552 rl_write_locked = FALSE;
553 rl = NULL;
554 err = 0;
555 vcn = lcn = -1;
556 vcn_len = 0;
557 lcn_block = -1;
558 was_hole = FALSE;
559 cpos = pos >> vol->cluster_size_bits;
560 end = pos + bytes;
561 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
562 /*
563 * Loop over each page and for each page over each buffer. Use goto to
564 * reduce indentation.
565 */
566 u = 0;
567do_next_page:
568 page = pages[u];
569 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
570 bh = head = page_buffers(page);
571 do {
572 VCN cdelta;
573 s64 bh_end;
574 unsigned bh_cofs;
575
576 /* Clear buffer_new on all buffers to reinitialise state. */
577 if (buffer_new(bh))
578 clear_buffer_new(bh);
579 bh_end = bh_pos + blocksize;
580 bh_cpos = bh_pos >> vol->cluster_size_bits;
581 bh_cofs = bh_pos & vol->cluster_size_mask;
582 if (buffer_mapped(bh)) {
583 /*
584 * The buffer is already mapped. If it is uptodate,
585 * ignore it.
586 */
587 if (buffer_uptodate(bh))
588 continue;
589 /*
590 * The buffer is not uptodate. If the page is uptodate
591 * set the buffer uptodate and otherwise ignore it.
592 */
593 if (PageUptodate(page)) {
594 set_buffer_uptodate(bh);
595 continue;
596 }
597 /*
598 * Neither the page nor the buffer are uptodate. If
599 * the buffer is only partially being written to, we
600 * need to read it in before the write, i.e. now.
601 */
602 if ((bh_pos < pos && bh_end > pos) ||
603 (bh_pos < end && bh_end > end)) {
604 /*
605 * If the buffer is fully or partially within
606 * the initialized size, do an actual read.
607 * Otherwise, simply zero the buffer.
608 */
609 read_lock_irqsave(&ni->size_lock, flags);
610 initialized_size = ni->initialized_size;
611 read_unlock_irqrestore(&ni->size_lock, flags);
612 if (bh_pos < initialized_size) {
613 ntfs_submit_bh_for_read(bh);
614 *wait_bh++ = bh;
615 } else {
616 u8 *kaddr = kmap_atomic(page, KM_USER0);
617 memset(kaddr + bh_offset(bh), 0,
618 blocksize);
619 kunmap_atomic(kaddr, KM_USER0);
620 flush_dcache_page(page);
621 set_buffer_uptodate(bh);
622 }
623 }
624 continue;
625 }
626 /* Unmapped buffer. Need to map it. */
627 bh->b_bdev = vol->sb->s_bdev;
628 /*
629 * If the current buffer is in the same clusters as the map
630 * cache, there is no need to check the runlist again. The
631 * map cache is made up of @vcn, which is the first cached file
632 * cluster, @vcn_len which is the number of cached file
633 * clusters, @lcn is the device cluster corresponding to @vcn,
634 * and @lcn_block is the block number corresponding to @lcn.
635 */
636 cdelta = bh_cpos - vcn;
637 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
638map_buffer_cached:
639 BUG_ON(lcn < 0);
640 bh->b_blocknr = lcn_block +
641 (cdelta << (vol->cluster_size_bits -
642 blocksize_bits)) +
643 (bh_cofs >> blocksize_bits);
644 set_buffer_mapped(bh);
645 /*
646 * If the page is uptodate so is the buffer. If the
647 * buffer is fully outside the write, we ignore it if
648 * it was already allocated and we mark it dirty so it
649 * gets written out if we allocated it. On the other
650 * hand, if we allocated the buffer but we are not
651 * marking it dirty we set buffer_new so we can do
652 * error recovery.
653 */
654 if (PageUptodate(page)) {
655 if (!buffer_uptodate(bh))
656 set_buffer_uptodate(bh);
657 if (unlikely(was_hole)) {
658 /* We allocated the buffer. */
659 unmap_underlying_metadata(bh->b_bdev,
660 bh->b_blocknr);
661 if (bh_end <= pos || bh_pos >= end)
662 mark_buffer_dirty(bh);
663 else
664 set_buffer_new(bh);
665 }
666 continue;
667 }
668 /* Page is _not_ uptodate. */
669 if (likely(!was_hole)) {
670 /*
671 * Buffer was already allocated. If it is not
672 * uptodate and is only partially being written
673 * to, we need to read it in before the write,
674 * i.e. now.
675 */
676 if (!buffer_uptodate(bh) && ((bh_pos < pos &&
677 bh_end > pos) ||
678 (bh_end > end &&
679 bh_end > end))) {
680 /*
681 * If the buffer is fully or partially
682 * within the initialized size, do an
683 * actual read. Otherwise, simply zero
684 * the buffer.
685 */
686 read_lock_irqsave(&ni->size_lock,
687 flags);
688 initialized_size = ni->initialized_size;
689 read_unlock_irqrestore(&ni->size_lock,
690 flags);
691 if (bh_pos < initialized_size) {
692 ntfs_submit_bh_for_read(bh);
693 *wait_bh++ = bh;
694 } else {
695 u8 *kaddr = kmap_atomic(page,
696 KM_USER0);
697 memset(kaddr + bh_offset(bh),
698 0, blocksize);
699 kunmap_atomic(kaddr, KM_USER0);
700 flush_dcache_page(page);
701 set_buffer_uptodate(bh);
702 }
703 }
704 continue;
705 }
706 /* We allocated the buffer. */
707 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
708 /*
709 * If the buffer is fully outside the write, zero it,
710 * set it uptodate, and mark it dirty so it gets
711 * written out. If it is partially being written to,
712 * zero region surrounding the write but leave it to
713 * commit write to do anything else. Finally, if the
714 * buffer is fully being overwritten, do nothing.
715 */
716 if (bh_end <= pos || bh_pos >= end) {
717 if (!buffer_uptodate(bh)) {
718 u8 *kaddr = kmap_atomic(page, KM_USER0);
719 memset(kaddr + bh_offset(bh), 0,
720 blocksize);
721 kunmap_atomic(kaddr, KM_USER0);
722 flush_dcache_page(page);
723 set_buffer_uptodate(bh);
724 }
725 mark_buffer_dirty(bh);
726 continue;
727 }
728 set_buffer_new(bh);
729 if (!buffer_uptodate(bh) &&
730 (bh_pos < pos || bh_end > end)) {
731 u8 *kaddr;
732 unsigned pofs;
733
734 kaddr = kmap_atomic(page, KM_USER0);
735 if (bh_pos < pos) {
736 pofs = bh_pos & ~PAGE_CACHE_MASK;
737 memset(kaddr + pofs, 0, pos - bh_pos);
738 }
739 if (bh_end > end) {
740 pofs = end & ~PAGE_CACHE_MASK;
741 memset(kaddr + pofs, 0, bh_end - end);
742 }
743 kunmap_atomic(kaddr, KM_USER0);
744 flush_dcache_page(page);
745 }
746 continue;
747 }
748 /*
749 * Slow path: this is the first buffer in the cluster. If it
750 * is outside allocated size and is not uptodate, zero it and
751 * set it uptodate.
752 */
753 read_lock_irqsave(&ni->size_lock, flags);
754 initialized_size = ni->allocated_size;
755 read_unlock_irqrestore(&ni->size_lock, flags);
756 if (bh_pos > initialized_size) {
757 if (PageUptodate(page)) {
758 if (!buffer_uptodate(bh))
759 set_buffer_uptodate(bh);
760 } else if (!buffer_uptodate(bh)) {
761 u8 *kaddr = kmap_atomic(page, KM_USER0);
762 memset(kaddr + bh_offset(bh), 0, blocksize);
763 kunmap_atomic(kaddr, KM_USER0);
764 flush_dcache_page(page);
765 set_buffer_uptodate(bh);
766 }
767 continue;
768 }
769 is_retry = FALSE;
770 if (!rl) {
771 down_read(&ni->runlist.lock);
772retry_remap:
773 rl = ni->runlist.rl;
774 }
775 if (likely(rl != NULL)) {
776 /* Seek to element containing target cluster. */
777 while (rl->length && rl[1].vcn <= bh_cpos)
778 rl++;
779 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
780 if (likely(lcn >= 0)) {
781 /*
782 * Successful remap, setup the map cache and
783 * use that to deal with the buffer.
784 */
785 was_hole = FALSE;
786 vcn = bh_cpos;
787 vcn_len = rl[1].vcn - vcn;
788 lcn_block = lcn << (vol->cluster_size_bits -
789 blocksize_bits);
790 /*
791 * If the number of remaining clusters in the
792 * @pages is smaller or equal to the number of
793 * cached clusters, unlock the runlist as the
794 * map cache will be used from now on.
795 */
796 if (likely(vcn + vcn_len >= cend)) {
797 if (rl_write_locked) {
798 up_write(&ni->runlist.lock);
799 rl_write_locked = FALSE;
800 } else
801 up_read(&ni->runlist.lock);
802 rl = NULL;
803 }
804 goto map_buffer_cached;
805 }
806 } else
807 lcn = LCN_RL_NOT_MAPPED;
808 /*
809 * If it is not a hole and not out of bounds, the runlist is
810 * probably unmapped so try to map it now.
811 */
812 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
813 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
814 /* Attempt to map runlist. */
815 if (!rl_write_locked) {
816 /*
817 * We need the runlist locked for
818 * writing, so if it is locked for
819 * reading relock it now and retry in
820 * case it changed whilst we dropped
821 * the lock.
822 */
823 up_read(&ni->runlist.lock);
824 down_write(&ni->runlist.lock);
825 rl_write_locked = TRUE;
826 goto retry_remap;
827 }
828 err = ntfs_map_runlist_nolock(ni, bh_cpos,
829 NULL);
830 if (likely(!err)) {
831 is_retry = TRUE;
832 goto retry_remap;
833 }
834 /*
835 * If @vcn is out of bounds, pretend @lcn is
836 * LCN_ENOENT. As long as the buffer is out
837 * of bounds this will work fine.
838 */
839 if (err == -ENOENT) {
840 lcn = LCN_ENOENT;
841 err = 0;
842 goto rl_not_mapped_enoent;
843 }
844 } else
845 err = -EIO;
846 /* Failed to map the buffer, even after retrying. */
847 bh->b_blocknr = -1;
848 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
849 "attribute type 0x%x, vcn 0x%llx, "
850 "vcn offset 0x%x, because its "
851 "location on disk could not be "
852 "determined%s (error code %i).",
853 ni->mft_no, ni->type,
854 (unsigned long long)bh_cpos,
855 (unsigned)bh_pos &
856 vol->cluster_size_mask,
857 is_retry ? " even after retrying" : "",
858 err);
859 break;
860 }
861rl_not_mapped_enoent:
862 /*
863 * The buffer is in a hole or out of bounds. We need to fill
864 * the hole, unless the buffer is in a cluster which is not
865 * touched by the write, in which case we just leave the buffer
866 * unmapped. This can only happen when the cluster size is
867 * less than the page cache size.
868 */
869 if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
870 bh_cend = (bh_end + vol->cluster_size - 1) >>
871 vol->cluster_size_bits;
872 if ((bh_cend <= cpos || bh_cpos >= cend)) {
873 bh->b_blocknr = -1;
874 /*
875 * If the buffer is uptodate we skip it. If it
876 * is not but the page is uptodate, we can set
877 * the buffer uptodate. If the page is not
878 * uptodate, we can clear the buffer and set it
879 * uptodate. Whether this is worthwhile is
880 * debatable and this could be removed.
881 */
882 if (PageUptodate(page)) {
883 if (!buffer_uptodate(bh))
884 set_buffer_uptodate(bh);
885 } else if (!buffer_uptodate(bh)) {
886 u8 *kaddr = kmap_atomic(page, KM_USER0);
887 memset(kaddr + bh_offset(bh), 0,
888 blocksize);
889 kunmap_atomic(kaddr, KM_USER0);
890 flush_dcache_page(page);
891 set_buffer_uptodate(bh);
892 }
893 continue;
894 }
895 }
896 /*
897 * Out of bounds buffer is invalid if it was not really out of
898 * bounds.
899 */
900 BUG_ON(lcn != LCN_HOLE);
901 /*
902 * We need the runlist locked for writing, so if it is locked
903 * for reading relock it now and retry in case it changed
904 * whilst we dropped the lock.
905 */
906 BUG_ON(!rl);
907 if (!rl_write_locked) {
908 up_read(&ni->runlist.lock);
909 down_write(&ni->runlist.lock);
910 rl_write_locked = TRUE;
911 goto retry_remap;
912 }
913 /* Find the previous last allocated cluster. */
914 BUG_ON(rl->lcn != LCN_HOLE);
915 lcn = -1;
916 rl2 = rl;
917 while (--rl2 >= ni->runlist.rl) {
918 if (rl2->lcn >= 0) {
919 lcn = rl2->lcn + rl2->length;
920 break;
921 }
922 }
923 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
924 FALSE);
925 if (IS_ERR(rl2)) {
926 err = PTR_ERR(rl2);
927 ntfs_debug("Failed to allocate cluster, error code %i.",
928 err);
929 break;
930 }
931 lcn = rl2->lcn;
932 rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
933 if (IS_ERR(rl)) {
934 err = PTR_ERR(rl);
935 if (err != -ENOMEM)
936 err = -EIO;
937 if (ntfs_cluster_free_from_rl(vol, rl2)) {
938 ntfs_error(vol->sb, "Failed to release "
939 "allocated cluster in error "
940 "code path. Run chkdsk to "
941 "recover the lost cluster.");
942 NVolSetErrors(vol);
943 }
944 ntfs_free(rl2);
945 break;
946 }
947 ni->runlist.rl = rl;
948 status.runlist_merged = 1;
949 ntfs_debug("Allocated cluster, lcn 0x%llx.", lcn);
950 /* Map and lock the mft record and get the attribute record. */
951 if (!NInoAttr(ni))
952 base_ni = ni;
953 else
954 base_ni = ni->ext.base_ntfs_ino;
955 m = map_mft_record(base_ni);
956 if (IS_ERR(m)) {
957 err = PTR_ERR(m);
958 break;
959 }
960 ctx = ntfs_attr_get_search_ctx(base_ni, m);
961 if (unlikely(!ctx)) {
962 err = -ENOMEM;
963 unmap_mft_record(base_ni);
964 break;
965 }
966 status.mft_attr_mapped = 1;
967 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
968 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
969 if (unlikely(err)) {
970 if (err == -ENOENT)
971 err = -EIO;
972 break;
973 }
974 m = ctx->mrec;
975 a = ctx->attr;
976 /*
977 * Find the runlist element with which the attribute extent
978 * starts. Note, we cannot use the _attr_ version because we
979 * have mapped the mft record. That is ok because we know the
980 * runlist fragment must be mapped already to have ever gotten
981 * here, so we can just use the _rl_ version.
982 */
983 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
984 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
985 BUG_ON(!rl2);
986 BUG_ON(!rl2->length);
987 BUG_ON(rl2->lcn < LCN_HOLE);
988 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
989 /*
990 * If @highest_vcn is zero, calculate the real highest_vcn
991 * (which can really be zero).
992 */
993 if (!highest_vcn)
994 highest_vcn = (sle64_to_cpu(
995 a->data.non_resident.allocated_size) >>
996 vol->cluster_size_bits) - 1;
997 /*
998 * Determine the size of the mapping pairs array for the new
999 * extent, i.e. the old extent with the hole filled.
1000 */
1001 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
1002 highest_vcn);
1003 if (unlikely(mp_size <= 0)) {
1004 if (!(err = mp_size))
1005 err = -EIO;
1006 ntfs_debug("Failed to get size for mapping pairs "
1007 "array, error code %i.", err);
1008 break;
1009 }
1010 /*
1011 * Resize the attribute record to fit the new mapping pairs
1012 * array.
1013 */
1014 attr_rec_len = le32_to_cpu(a->length);
1015 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
1016 a->data.non_resident.mapping_pairs_offset));
1017 if (unlikely(err)) {
1018 BUG_ON(err != -ENOSPC);
1019 // TODO: Deal with this by using the current attribute
1020 // and fill it with as much of the mapping pairs
1021 // array as possible. Then loop over each attribute
1022 // extent rewriting the mapping pairs arrays as we go
1023 // along and if when we reach the end we have not
1024 // enough space, try to resize the last attribute
1025 // extent and if even that fails, add a new attribute
1026 // extent.
1027 // We could also try to resize at each step in the hope
1028 // that we will not need to rewrite every single extent.
1029 // Note, we may need to decompress some extents to fill
1030 // the runlist as we are walking the extents...
1031 ntfs_error(vol->sb, "Not enough space in the mft "
1032 "record for the extended attribute "
1033 "record. This case is not "
1034 "implemented yet.");
1035 err = -EOPNOTSUPP;
1036 break ;
1037 }
1038 status.mp_rebuilt = 1;
1039 /*
1040 * Generate the mapping pairs array directly into the attribute
1041 * record.
1042 */
1043 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1044 a->data.non_resident.mapping_pairs_offset),
1045 mp_size, rl2, vcn, highest_vcn, NULL);
1046 if (unlikely(err)) {
1047 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1048 "attribute type 0x%x, because building "
1049 "the mapping pairs failed with error "
1050 "code %i.", vi->i_ino,
1051 (unsigned)le32_to_cpu(ni->type), err);
1052 err = -EIO;
1053 break;
1054 }
1055 /* Update the highest_vcn but only if it was not set. */
1056 if (unlikely(!a->data.non_resident.highest_vcn))
1057 a->data.non_resident.highest_vcn =
1058 cpu_to_sle64(highest_vcn);
1059 /*
1060 * If the attribute is sparse/compressed, update the compressed
1061 * size in the ntfs_inode structure and the attribute record.
1062 */
1063 if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1064 /*
1065 * If we are not in the first attribute extent, switch
1066 * to it, but first ensure the changes will make it to
1067 * disk later.
1068 */
1069 if (a->data.non_resident.lowest_vcn) {
1070 flush_dcache_mft_record_page(ctx->ntfs_ino);
1071 mark_mft_record_dirty(ctx->ntfs_ino);
1072 ntfs_attr_reinit_search_ctx(ctx);
1073 err = ntfs_attr_lookup(ni->type, ni->name,
1074 ni->name_len, CASE_SENSITIVE,
1075 0, NULL, 0, ctx);
1076 if (unlikely(err)) {
1077 status.attr_switched = 1;
1078 break;
1079 }
1080 /* @m is not used any more so do not set it. */
1081 a = ctx->attr;
1082 }
1083 write_lock_irqsave(&ni->size_lock, flags);
1084 ni->itype.compressed.size += vol->cluster_size;
1085 a->data.non_resident.compressed_size =
1086 cpu_to_sle64(ni->itype.compressed.size);
1087 write_unlock_irqrestore(&ni->size_lock, flags);
1088 }
1089 /* Ensure the changes make it to disk. */
1090 flush_dcache_mft_record_page(ctx->ntfs_ino);
1091 mark_mft_record_dirty(ctx->ntfs_ino);
1092 ntfs_attr_put_search_ctx(ctx);
1093 unmap_mft_record(base_ni);
1094 /* Successfully filled the hole. */
1095 status.runlist_merged = 0;
1096 status.mft_attr_mapped = 0;
1097 status.mp_rebuilt = 0;
1098 /* Setup the map cache and use that to deal with the buffer. */
1099 was_hole = TRUE;
1100 vcn = bh_cpos;
1101 vcn_len = 1;
1102 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1103 cdelta = 0;
1104 /*
1105 * If the number of remaining clusters in the @pages is smaller
1106 * or equal to the number of cached clusters, unlock the
1107 * runlist as the map cache will be used from now on.
1108 */
1109 if (likely(vcn + vcn_len >= cend)) {
1110 up_write(&ni->runlist.lock);
1111 rl_write_locked = FALSE;
1112 rl = NULL;
1113 }
1114 goto map_buffer_cached;
1115 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1116 /* If there are no errors, do the next page. */
1117 if (likely(!err && ++u < nr_pages))
1118 goto do_next_page;
1119 /* If there are no errors, release the runlist lock if we took it. */
1120 if (likely(!err)) {
1121 if (unlikely(rl_write_locked)) {
1122 up_write(&ni->runlist.lock);
1123 rl_write_locked = FALSE;
1124 } else if (unlikely(rl))
1125 up_read(&ni->runlist.lock);
1126 rl = NULL;
1127 }
1128 /* If we issued read requests, let them complete. */
1129 read_lock_irqsave(&ni->size_lock, flags);
1130 initialized_size = ni->initialized_size;
1131 read_unlock_irqrestore(&ni->size_lock, flags);
1132 while (wait_bh > wait) {
1133 bh = *--wait_bh;
1134 wait_on_buffer(bh);
1135 if (likely(buffer_uptodate(bh))) {
1136 page = bh->b_page;
1137 bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
1138 bh_offset(bh);
1139 /*
1140 * If the buffer overflows the initialized size, need
1141 * to zero the overflowing region.
1142 */
1143 if (unlikely(bh_pos + blocksize > initialized_size)) {
1144 u8 *kaddr;
1145 int ofs = 0;
1146
1147 if (likely(bh_pos < initialized_size))
1148 ofs = initialized_size - bh_pos;
1149 kaddr = kmap_atomic(page, KM_USER0);
1150 memset(kaddr + bh_offset(bh) + ofs, 0,
1151 blocksize - ofs);
1152 kunmap_atomic(kaddr, KM_USER0);
1153 flush_dcache_page(page);
1154 }
1155 } else /* if (unlikely(!buffer_uptodate(bh))) */
1156 err = -EIO;
1157 }
1158 if (likely(!err)) {
1159 /* Clear buffer_new on all buffers. */
1160 u = 0;
1161 do {
1162 bh = head = page_buffers(pages[u]);
1163 do {
1164 if (buffer_new(bh))
1165 clear_buffer_new(bh);
1166 } while ((bh = bh->b_this_page) != head);
1167 } while (++u < nr_pages);
1168 ntfs_debug("Done.");
1169 return err;
1170 }
1171 if (status.attr_switched) {
1172 /* Get back to the attribute extent we modified. */
1173 ntfs_attr_reinit_search_ctx(ctx);
1174 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1175 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1176 ntfs_error(vol->sb, "Failed to find required "
1177 "attribute extent of attribute in "
1178 "error code path. Run chkdsk to "
1179 "recover.");
1180 write_lock_irqsave(&ni->size_lock, flags);
1181 ni->itype.compressed.size += vol->cluster_size;
1182 write_unlock_irqrestore(&ni->size_lock, flags);
1183 flush_dcache_mft_record_page(ctx->ntfs_ino);
1184 mark_mft_record_dirty(ctx->ntfs_ino);
1185 /*
1186 * The only thing that is now wrong is the compressed
1187 * size of the base attribute extent which chkdsk
1188 * should be able to fix.
1189 */
1190 NVolSetErrors(vol);
1191 } else {
1192 m = ctx->mrec;
1193 a = ctx->attr;
1194 status.attr_switched = 0;
1195 }
1196 }
1197 /*
1198 * If the runlist has been modified, need to restore it by punching a
1199 * hole into it and we then need to deallocate the on-disk cluster as
1200 * well. Note, we only modify the runlist if we are able to generate a
1201 * new mapping pairs array, i.e. only when the mapped attribute extent
1202 * is not switched.
1203 */
1204 if (status.runlist_merged && !status.attr_switched) {
1205 BUG_ON(!rl_write_locked);
1206 /* Make the file cluster we allocated sparse in the runlist. */
1207 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1208 ntfs_error(vol->sb, "Failed to punch hole into "
1209 "attribute runlist in error code "
1210 "path. Run chkdsk to recover the "
1211 "lost cluster.");
1212 make_bad_inode(vi);
1213 make_bad_inode(VFS_I(base_ni));
1214 NVolSetErrors(vol);
1215 } else /* if (success) */ {
1216 status.runlist_merged = 0;
1217 /*
1218 * Deallocate the on-disk cluster we allocated but only
1219 * if we succeeded in punching its vcn out of the
1220 * runlist.
1221 */
1222 down_write(&vol->lcnbmp_lock);
1223 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1224 ntfs_error(vol->sb, "Failed to release "
1225 "allocated cluster in error "
1226 "code path. Run chkdsk to "
1227 "recover the lost cluster.");
1228 NVolSetErrors(vol);
1229 }
1230 up_write(&vol->lcnbmp_lock);
1231 }
1232 }
1233 /*
1234 * Resize the attribute record to its old size and rebuild the mapping
1235 * pairs array. Note, we only can do this if the runlist has been
1236 * restored to its old state which also implies that the mapped
1237 * attribute extent is not switched.
1238 */
1239 if (status.mp_rebuilt && !status.runlist_merged) {
1240 if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1241 ntfs_error(vol->sb, "Failed to restore attribute "
1242 "record in error code path. Run "
1243 "chkdsk to recover.");
1244 make_bad_inode(vi);
1245 make_bad_inode(VFS_I(base_ni));
1246 NVolSetErrors(vol);
1247 } else /* if (success) */ {
1248 if (ntfs_mapping_pairs_build(vol, (u8*)a +
1249 le16_to_cpu(a->data.non_resident.
1250 mapping_pairs_offset), attr_rec_len -
1251 le16_to_cpu(a->data.non_resident.
1252 mapping_pairs_offset), ni->runlist.rl,
1253 vcn, highest_vcn, NULL)) {
1254 ntfs_error(vol->sb, "Failed to restore "
1255 "mapping pairs array in error "
1256 "code path. Run chkdsk to "
1257 "recover.");
1258 make_bad_inode(vi);
1259 make_bad_inode(VFS_I(base_ni));
1260 NVolSetErrors(vol);
1261 }
1262 flush_dcache_mft_record_page(ctx->ntfs_ino);
1263 mark_mft_record_dirty(ctx->ntfs_ino);
1264 }
1265 }
1266 /* Release the mft record and the attribute. */
1267 if (status.mft_attr_mapped) {
1268 ntfs_attr_put_search_ctx(ctx);
1269 unmap_mft_record(base_ni);
1270 }
1271 /* Release the runlist lock. */
1272 if (rl_write_locked)
1273 up_write(&ni->runlist.lock);
1274 else if (rl)
1275 up_read(&ni->runlist.lock);
1276 /*
1277 * Zero out any newly allocated blocks to avoid exposing stale data.
1278 * If BH_New is set, we know that the block was newly allocated above
1279 * and that it has not been fully zeroed and marked dirty yet.
1280 */
1281 nr_pages = u;
1282 u = 0;
1283 end = bh_cpos << vol->cluster_size_bits;
1284 do {
1285 page = pages[u];
1286 bh = head = page_buffers(page);
1287 do {
1288 if (u == nr_pages &&
1289 ((s64)page->index << PAGE_CACHE_SHIFT) +
1290 bh_offset(bh) >= end)
1291 break;
1292 if (!buffer_new(bh))
1293 continue;
1294 clear_buffer_new(bh);
1295 if (!buffer_uptodate(bh)) {
1296 if (PageUptodate(page))
1297 set_buffer_uptodate(bh);
1298 else {
1299 u8 *kaddr = kmap_atomic(page, KM_USER0);
1300 memset(kaddr + bh_offset(bh), 0,
1301 blocksize);
1302 kunmap_atomic(kaddr, KM_USER0);
1303 flush_dcache_page(page);
1304 set_buffer_uptodate(bh);
1305 }
1306 }
1307 mark_buffer_dirty(bh);
1308 } while ((bh = bh->b_this_page) != head);
1309 } while (++u <= nr_pages);
1310 ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
1311 return err;
1312}
1313
1314/*
1315 * Copy as much as we can into the pages and return the number of bytes which
1316 * were sucessfully copied. If a fault is encountered then clear the pages
1317 * out to (ofs + bytes) and return the number of bytes which were copied.
1318 */
1319static inline size_t ntfs_copy_from_user(struct page **pages,
1320 unsigned nr_pages, unsigned ofs, const char __user *buf,
1321 size_t bytes)
1322{
1323 struct page **last_page = pages + nr_pages;
1324 char *kaddr;
1325 size_t total = 0;
1326 unsigned len;
1327 int left;
1328
1329 do {
1330 len = PAGE_CACHE_SIZE - ofs;
1331 if (len > bytes)
1332 len = bytes;
1333 kaddr = kmap_atomic(*pages, KM_USER0);
1334 left = __copy_from_user_inatomic(kaddr + ofs, buf, len);
1335 kunmap_atomic(kaddr, KM_USER0);
1336 if (unlikely(left)) {
1337 /* Do it the slow way. */
1338 kaddr = kmap(*pages);
1339 left = __copy_from_user(kaddr + ofs, buf, len);
1340 kunmap(*pages);
1341 if (unlikely(left))
1342 goto err_out;
1343 }
1344 total += len;
1345 bytes -= len;
1346 if (!bytes)
1347 break;
1348 buf += len;
1349 ofs = 0;
1350 } while (++pages < last_page);
1351out:
1352 return total;
1353err_out:
1354 total += len - left;
1355 /* Zero the rest of the target like __copy_from_user(). */
1356 while (++pages < last_page) {
1357 bytes -= len;
1358 if (!bytes)
1359 break;
1360 len = PAGE_CACHE_SIZE;
1361 if (len > bytes)
1362 len = bytes;
1363 kaddr = kmap_atomic(*pages, KM_USER0);
1364 memset(kaddr, 0, len);
1365 kunmap_atomic(kaddr, KM_USER0);
1366 }
1367 goto out;
1368}
1369
1370static size_t __ntfs_copy_from_user_iovec(char *vaddr,
1371 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1372{
1373 size_t total = 0;
1374
1375 while (1) {
1376 const char __user *buf = iov->iov_base + iov_ofs;
1377 unsigned len;
1378 size_t left;
1379
1380 len = iov->iov_len - iov_ofs;
1381 if (len > bytes)
1382 len = bytes;
1383 left = __copy_from_user_inatomic(vaddr, buf, len);
1384 total += len;
1385 bytes -= len;
1386 vaddr += len;
1387 if (unlikely(left)) {
1388 /*
1389 * Zero the rest of the target like __copy_from_user().
1390 */
1391 memset(vaddr, 0, bytes);
1392 total -= left;
1393 break;
1394 }
1395 if (!bytes)
1396 break;
1397 iov++;
1398 iov_ofs = 0;
1399 }
1400 return total;
1401}
1402
/*
 * Advance the iovec cursor (*@iovp, *@iov_ofsp) by @bytes bytes.
 *
 * @iovp:	in/out pointer to the current segment
 * @iov_ofsp:	in/out byte offset inside the current segment
 * @bytes:	number of bytes that have been consumed
 *
 * Whenever the offset reaches the end of a segment the cursor moves to the
 * start of the next segment.  The caller must ensure the segments hold at
 * least @bytes more bytes; no bounds on the iovec array are checked here.
 */
static inline void ntfs_set_next_iovec(const struct iovec **iovp,
		size_t *iov_ofsp, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t iov_ofs = *iov_ofsp;

	while (bytes) {
		/*
		 * Must be size_t: with an unsigned the remaining segment
		 * length is truncated on 64-bit and, when it is a multiple of
		 * 2^32, becomes zero so that the loop never makes progress.
		 */
		size_t len;

		len = iov->iov_len - iov_ofs;
		if (len > bytes)
			len = bytes;
		bytes -= len;
		iov_ofs += len;
		if (iov->iov_len == iov_ofs) {
			/* Segment fully consumed, step to the next one. */
			iov++;
			iov_ofs = 0;
		}
	}
	*iovp = iov;
	*iov_ofsp = iov_ofs;
}
1425
1426/*
1427 * This has the same side-effects and return value as ntfs_copy_from_user().
1428 * The difference is that on a fault we need to memset the remainder of the
1429 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1430 * single-segment behaviour.
1431 *
1432 * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
1433 * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls
1434 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1435 * fact, the only difference between __copy_from_user_inatomic() and
1436 * __copy_from_user() is that the latter calls might_sleep(). And on many
1437 * architectures __copy_from_user_inatomic() is just defined to
1438 * __copy_from_user() so it makes no difference at all on those architectures.
1439 */
1440static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1441 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1442 size_t *iov_ofs, size_t bytes)
1443{
1444 struct page **last_page = pages + nr_pages;
1445 char *kaddr;
1446 size_t copied, len, total = 0;
1447
1448 do {
1449 len = PAGE_CACHE_SIZE - ofs;
1450 if (len > bytes)
1451 len = bytes;
1452 kaddr = kmap_atomic(*pages, KM_USER0);
1453 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1454 *iov, *iov_ofs, len);
1455 kunmap_atomic(kaddr, KM_USER0);
1456 if (unlikely(copied != len)) {
1457 /* Do it the slow way. */
1458 kaddr = kmap(*pages);
1459 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1460 *iov, *iov_ofs, len);
1461 kunmap(*pages);
1462 if (unlikely(copied != len))
1463 goto err_out;
1464 }
1465 total += len;
1466 bytes -= len;
1467 if (!bytes)
1468 break;
1469 ntfs_set_next_iovec(iov, iov_ofs, len);
1470 ofs = 0;
1471 } while (++pages < last_page);
1472out:
1473 return total;
1474err_out:
1475 total += copied;
1476 /* Zero the rest of the target like __copy_from_user(). */
1477 while (++pages < last_page) {
1478 bytes -= len;
1479 if (!bytes)
1480 break;
1481 len = PAGE_CACHE_SIZE;
1482 if (len > bytes)
1483 len = bytes;
1484 kaddr = kmap_atomic(*pages, KM_USER0);
1485 memset(kaddr, 0, len);
1486 kunmap_atomic(kaddr, KM_USER0);
1487 }
1488 goto out;
1489}
1490
/*
 * Call flush_dcache_page() on each of the @nr_pages pages in @pages, i.e. on
 * pages[0] through pages[nr_pages - 1].  @nr_pages must not be zero.
 */
static inline void ntfs_flush_dcache_pages(struct page **pages,
		unsigned nr_pages)
{
	BUG_ON(!nr_pages);
	/*
	 * Warning: Do not do the decrement at the same time as the call
	 * because flush_dcache_page() is a NULL macro on i386 and hence the
	 * decrement would never happen.
	 *
	 * The decrement is done before indexing so that the valid elements
	 * nr_pages - 1 down to 0 are flushed, rather than nr_pages down to 1
	 * which would touch one element past the end of the array and miss
	 * the first page.
	 */
	do {
		--nr_pages;
		flush_dcache_page(pages[nr_pages]);
	} while (nr_pages > 0);
}
1504
1505/**
1506 * ntfs_commit_pages_after_non_resident_write - commit the received data
1507 * @pages: array of destination pages
1508 * @nr_pages: number of pages in @pages
1509 * @pos: byte position in file at which the write begins
1510 * @bytes: number of bytes to be written
1511 *
1512 * See description of ntfs_commit_pages_after_write(), below.
1513 */
1514static inline int ntfs_commit_pages_after_non_resident_write(
1515 struct page **pages, const unsigned nr_pages,
1516 s64 pos, size_t bytes)
1517{
1518 s64 end, initialized_size;
1519 struct inode *vi;
1520 ntfs_inode *ni, *base_ni;
1521 struct buffer_head *bh, *head;
1522 ntfs_attr_search_ctx *ctx;
1523 MFT_RECORD *m;
1524 ATTR_RECORD *a;
1525 unsigned long flags;
1526 unsigned blocksize, u;
1527 int err;
1528
1529 vi = pages[0]->mapping->host;
1530 ni = NTFS_I(vi);
1531 blocksize = 1 << vi->i_blkbits;
1532 end = pos + bytes;
1533 u = 0;
1534 do {
1535 s64 bh_pos;
1536 struct page *page;
1537 BOOL partial;
1538
1539 page = pages[u];
1540 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1541 bh = head = page_buffers(page);
1542 partial = FALSE;
1543 do {
1544 s64 bh_end;
1545
1546 bh_end = bh_pos + blocksize;
1547 if (bh_end <= pos || bh_pos >= end) {
1548 if (!buffer_uptodate(bh))
1549 partial = TRUE;
1550 } else {
1551 set_buffer_uptodate(bh);
1552 mark_buffer_dirty(bh);
1553 }
1554 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1555 /*
1556 * If all buffers are now uptodate but the page is not, set the
1557 * page uptodate.
1558 */
1559 if (!partial && !PageUptodate(page))
1560 SetPageUptodate(page);
1561 } while (++u < nr_pages);
1562 /*
1563 * Finally, if we do not need to update initialized_size or i_size we
1564 * are finished.
1565 */
1566 read_lock_irqsave(&ni->size_lock, flags);
1567 initialized_size = ni->initialized_size;
1568 read_unlock_irqrestore(&ni->size_lock, flags);
1569 if (end <= initialized_size) {
1570 ntfs_debug("Done.");
1571 return 0;
1572 }
1573 /*
1574 * Update initialized_size/i_size as appropriate, both in the inode and
1575 * the mft record.
1576 */
1577 if (!NInoAttr(ni))
1578 base_ni = ni;
1579 else
1580 base_ni = ni->ext.base_ntfs_ino;
1581 /* Map, pin, and lock the mft record. */
1582 m = map_mft_record(base_ni);
1583 if (IS_ERR(m)) {
1584 err = PTR_ERR(m);
1585 m = NULL;
1586 ctx = NULL;
1587 goto err_out;
1588 }
1589 BUG_ON(!NInoNonResident(ni));
1590 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1591 if (unlikely(!ctx)) {
1592 err = -ENOMEM;
1593 goto err_out;
1594 }
1595 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1596 CASE_SENSITIVE, 0, NULL, 0, ctx);
1597 if (unlikely(err)) {
1598 if (err == -ENOENT)
1599 err = -EIO;
1600 goto err_out;
1601 }
1602 a = ctx->attr;
1603 BUG_ON(!a->non_resident);
1604 write_lock_irqsave(&ni->size_lock, flags);
1605 BUG_ON(end > ni->allocated_size);
1606 ni->initialized_size = end;
1607 a->data.non_resident.initialized_size = cpu_to_sle64(end);
1608 if (end > i_size_read(vi)) {
1609 i_size_write(vi, end);
1610 a->data.non_resident.data_size =
1611 a->data.non_resident.initialized_size;
1612 }
1613 write_unlock_irqrestore(&ni->size_lock, flags);
1614 /* Mark the mft record dirty, so it gets written back. */
1615 flush_dcache_mft_record_page(ctx->ntfs_ino);
1616 mark_mft_record_dirty(ctx->ntfs_ino);
1617 ntfs_attr_put_search_ctx(ctx);
1618 unmap_mft_record(base_ni);
1619 ntfs_debug("Done.");
1620 return 0;
1621err_out:
1622 if (ctx)
1623 ntfs_attr_put_search_ctx(ctx);
1624 if (m)
1625 unmap_mft_record(base_ni);
1626 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1627 "code %i).", err);
1628 if (err != -ENOMEM) {
1629 NVolSetErrors(ni->vol);
1630 make_bad_inode(VFS_I(base_ni));
1631 make_bad_inode(vi);
1632 }
1633 return err;
1634}
1635
1636/**
1637 * ntfs_commit_pages_after_write - commit the received data
1638 * @pages: array of destination pages
1639 * @nr_pages: number of pages in @pages
1640 * @pos: byte position in file at which the write begins
1641 * @bytes: number of bytes to be written
1642 *
1643 * This is called from ntfs_file_buffered_write() with i_sem held on the inode
1644 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
1645 * locked but not kmap()ped. The source data has already been copied into the
1646 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
1647 * the data was copied (for non-resident attributes only) and it returned
1648 * success.
1649 *
1650 * Need to set uptodate and mark dirty all buffers within the boundary of the
1651 * write. If all buffers in a page are uptodate we set the page uptodate, too.
1652 *
1653 * Setting the buffers dirty ensures that they get written out later when
1654 * ntfs_writepage() is invoked by the VM.
1655 *
1656 * Finally, we need to update i_size and initialized_size as appropriate both
1657 * in the inode and the mft record.
1658 *
1659 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1660 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1661 * page are uptodate, and updates i_size if the end of io is beyond i_size. In
1662 * that case, it also marks the inode dirty.
1663 *
1664 * If things have gone as outlined in
1665 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1666 * content modifications here for non-resident attributes. For resident
1667 * attributes we need to do the uptodate bringing here which we combine with
1668 * the copying into the mft record which means we save one atomic kmap.
1669 *
1670 * Return 0 on success or -errno on error.
1671 */
1672static int ntfs_commit_pages_after_write(struct page **pages,
1673 const unsigned nr_pages, s64 pos, size_t bytes)
1674{
1675 s64 end, initialized_size;
1676 loff_t i_size;
1677 struct inode *vi;
1678 ntfs_inode *ni, *base_ni;
1679 struct page *page;
1680 ntfs_attr_search_ctx *ctx;
1681 MFT_RECORD *m;
1682 ATTR_RECORD *a;
1683 char *kattr, *kaddr;
1684 unsigned long flags;
1685 u32 attr_len;
1686 int err;
1687
1688 BUG_ON(!nr_pages);
1689 BUG_ON(!pages);
1690 page = pages[0];
1691 BUG_ON(!page);
1692 vi = page->mapping->host;
1693 ni = NTFS_I(vi);
1694 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1695 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%x.",
1696 vi->i_ino, ni->type, page->index, nr_pages,
1697 (long long)pos, bytes);
1698 if (NInoNonResident(ni))
1699 return ntfs_commit_pages_after_non_resident_write(pages,
1700 nr_pages, pos, bytes);
1701 BUG_ON(nr_pages > 1);
1702 /*
1703 * Attribute is resident, implying it is not compressed, encrypted, or
1704 * sparse.
1705 */
1706 if (!NInoAttr(ni))
1707 base_ni = ni;
1708 else
1709 base_ni = ni->ext.base_ntfs_ino;
1710 BUG_ON(NInoNonResident(ni));
1711 /* Map, pin, and lock the mft record. */
1712 m = map_mft_record(base_ni);
1713 if (IS_ERR(m)) {
1714 err = PTR_ERR(m);
1715 m = NULL;
1716 ctx = NULL;
1717 goto err_out;
1718 }
1719 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1720 if (unlikely(!ctx)) {
1721 err = -ENOMEM;
1722 goto err_out;
1723 }
1724 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1725 CASE_SENSITIVE, 0, NULL, 0, ctx);
1726 if (unlikely(err)) {
1727 if (err == -ENOENT)
1728 err = -EIO;
1729 goto err_out;
1730 }
1731 a = ctx->attr;
1732 BUG_ON(a->non_resident);
1733 /* The total length of the attribute value. */
1734 attr_len = le32_to_cpu(a->data.resident.value_length);
1735 i_size = i_size_read(vi);
1736 BUG_ON(attr_len != i_size);
1737 BUG_ON(pos > attr_len);
1738 end = pos + bytes;
1739 BUG_ON(end > le32_to_cpu(a->length) -
1740 le16_to_cpu(a->data.resident.value_offset));
1741 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1742 kaddr = kmap_atomic(page, KM_USER0);
1743 /* Copy the received data from the page to the mft record. */
1744 memcpy(kattr + pos, kaddr + pos, bytes);
1745 /* Update the attribute length if necessary. */
1746 if (end > attr_len) {
1747 attr_len = end;
1748 a->data.resident.value_length = cpu_to_le32(attr_len);
1749 }
1750 /*
1751 * If the page is not uptodate, bring the out of bounds area(s)
1752 * uptodate by copying data from the mft record to the page.
1753 */
1754 if (!PageUptodate(page)) {
1755 if (pos > 0)
1756 memcpy(kaddr, kattr, pos);
1757 if (end < attr_len)
1758 memcpy(kaddr + end, kattr + end, attr_len - end);
1759 /* Zero the region outside the end of the attribute value. */
1760 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1761 flush_dcache_page(page);
1762 SetPageUptodate(page);
1763 }
1764 kunmap_atomic(kaddr, KM_USER0);
1765 /* Update initialized_size/i_size if necessary. */
1766 read_lock_irqsave(&ni->size_lock, flags);
1767 initialized_size = ni->initialized_size;
1768 BUG_ON(end > ni->allocated_size);
1769 read_unlock_irqrestore(&ni->size_lock, flags);
1770 BUG_ON(initialized_size != i_size);
1771 if (end > initialized_size) {
1772 unsigned long flags;
1773
1774 write_lock_irqsave(&ni->size_lock, flags);
1775 ni->initialized_size = end;
1776 i_size_write(vi, end);
1777 write_unlock_irqrestore(&ni->size_lock, flags);
1778 }
1779 /* Mark the mft record dirty, so it gets written back. */
1780 flush_dcache_mft_record_page(ctx->ntfs_ino);
1781 mark_mft_record_dirty(ctx->ntfs_ino);
1782 ntfs_attr_put_search_ctx(ctx);
1783 unmap_mft_record(base_ni);
1784 ntfs_debug("Done.");
1785 return 0;
1786err_out:
1787 if (err == -ENOMEM) {
1788 ntfs_warning(vi->i_sb, "Error allocating memory required to "
1789 "commit the write.");
1790 if (PageUptodate(page)) {
1791 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1792 "dirty so the write will be retried "
1793 "later on by the VM.");
1794 /*
1795 * Put the page on mapping->dirty_pages, but leave its
1796 * buffers' dirty state as-is.
1797 */
1798 __set_page_dirty_nobuffers(page);
1799 err = 0;
1800 } else
1801 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
1802 "data has been lost.");
1803 } else {
1804 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1805 "with error %i.", err);
1806 NVolSetErrors(ni->vol);
1807 make_bad_inode(VFS_I(base_ni));
1808 make_bad_inode(vi);
1809 }
1810 if (ctx)
1811 ntfs_attr_put_search_ctx(ctx);
1812 if (m)
1813 unmap_mft_record(base_ni);
1814 return err;
1815}
1816
1817/**
1818 * ntfs_file_buffered_write -
1819 *
1820 * Locking: The vfs is holding ->i_sem on the inode.
1821 */
1822static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1823 const struct iovec *iov, unsigned long nr_segs,
1824 loff_t pos, loff_t *ppos, size_t count)
1825{
1826 struct file *file = iocb->ki_filp;
1827 struct address_space *mapping = file->f_mapping;
1828 struct inode *vi = mapping->host;
1829 ntfs_inode *ni = NTFS_I(vi);
1830 ntfs_volume *vol = ni->vol;
1831 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1832 struct page *cached_page = NULL;
1833 char __user *buf = NULL;
1834 s64 end, ll;
1835 VCN last_vcn;
1836 LCN lcn;
1837 unsigned long flags;
1838 size_t bytes, iov_ofs;
1839 ssize_t status, written;
1840 unsigned nr_pages;
1841 int err;
1842 struct pagevec lru_pvec;
1843
1844 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1845 "pos 0x%llx, count 0x%lx.",
1846 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
1847 (unsigned long long)pos, (unsigned long)count);
1848 if (unlikely(!count))
1849 return 0;
1850 BUG_ON(NInoMstProtected(ni));
1851 /*
1852 * If the attribute is not an index root and it is encrypted or
1853 * compressed, we cannot write to it yet. Note we need to check for
1854 * AT_INDEX_ALLOCATION since this is the type of both directory and
1855 * index inodes.
1856 */
1857 if (ni->type != AT_INDEX_ALLOCATION) {
1858 /* If file is encrypted, deny access, just like NT4. */
1859 if (NInoEncrypted(ni)) {
1860 ntfs_debug("Denying write access to encrypted file.");
1861 return -EACCES;
1862 }
1863 if (NInoCompressed(ni)) {
1864 ntfs_error(vi->i_sb, "Writing to compressed files is "
1865 "not implemented yet. Sorry.");
1866 return -EOPNOTSUPP;
1867 }
1868 }
1869 /*
1870 * If a previous ntfs_truncate() failed, repeat it and abort if it
1871 * fails again.
1872 */
1873 if (unlikely(NInoTruncateFailed(ni))) {
1874 down_write(&vi->i_alloc_sem);
1875 err = ntfs_truncate(vi);
1876 up_write(&vi->i_alloc_sem);
1877 if (err || NInoTruncateFailed(ni)) {
1878 if (!err)
1879 err = -EIO;
1880 ntfs_error(vol->sb, "Cannot perform write to inode "
1881 "0x%lx, attribute type 0x%x, because "
1882 "ntfs_truncate() failed (error code "
1883 "%i).", vi->i_ino,
1884 (unsigned)le32_to_cpu(ni->type), err);
1885 return err;
1886 }
1887 }
1888 /* The first byte after the write. */
1889 end = pos + count;
1890 /*
1891 * If the write goes beyond the allocated size, extend the allocation
1892 * to cover the whole of the write, rounded up to the nearest cluster.
1893 */
1894 read_lock_irqsave(&ni->size_lock, flags);
1895 ll = ni->allocated_size;
1896 read_unlock_irqrestore(&ni->size_lock, flags);
1897 if (end > ll) {
1898 /* Extend the allocation without changing the data size. */
1899 ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1900 if (likely(ll >= 0)) {
1901 BUG_ON(pos >= ll);
1902 /* If the extension was partial truncate the write. */
1903 if (end > ll) {
1904 ntfs_debug("Truncating write to inode 0x%lx, "
1905 "attribute type 0x%x, because "
1906 "the allocation was only "
1907 "partially extended.",
1908 vi->i_ino, (unsigned)
1909 le32_to_cpu(ni->type));
1910 end = ll;
1911 count = ll - pos;
1912 }
1913 } else {
1914 err = ll;
1915 read_lock_irqsave(&ni->size_lock, flags);
1916 ll = ni->allocated_size;
1917 read_unlock_irqrestore(&ni->size_lock, flags);
1918 /* Perform a partial write if possible or fail. */
1919 if (pos < ll) {
1920 ntfs_debug("Truncating write to inode 0x%lx, "
1921 "attribute type 0x%x, because "
1922 "extending the allocation "
1923 "failed (error code %i).",
1924 vi->i_ino, (unsigned)
1925 le32_to_cpu(ni->type), err);
1926 end = ll;
1927 count = ll - pos;
1928 } else {
1929 ntfs_error(vol->sb, "Cannot perform write to "
1930 "inode 0x%lx, attribute type "
1931 "0x%x, because extending the "
1932 "allocation failed (error "
1933 "code %i).", vi->i_ino,
1934 (unsigned)
1935 le32_to_cpu(ni->type), err);
1936 return err;
1937 }
1938 }
1939 }
1940 pagevec_init(&lru_pvec, 0);
1941 written = 0;
1942 /*
1943 * If the write starts beyond the initialized size, extend it up to the
1944 * beginning of the write and initialize all non-sparse space between
1945 * the old initialized size and the new one. This automatically also
1946 * increments the vfs inode->i_size to keep it above or equal to the
1947 * initialized_size.
1948 */
1949 read_lock_irqsave(&ni->size_lock, flags);
1950 ll = ni->initialized_size;
1951 read_unlock_irqrestore(&ni->size_lock, flags);
1952 if (pos > ll) {
1953 err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
1954 &lru_pvec);
1955 if (err < 0) {
1956 ntfs_error(vol->sb, "Cannot perform write to inode "
1957 "0x%lx, attribute type 0x%x, because "
1958 "extending the initialized size "
1959 "failed (error code %i).", vi->i_ino,
1960 (unsigned)le32_to_cpu(ni->type), err);
1961 status = err;
1962 goto err_out;
1963 }
1964 }
1965 /*
1966 * Determine the number of pages per cluster for non-resident
1967 * attributes.
1968 */
1969 nr_pages = 1;
1970 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1971 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
1972 /* Finally, perform the actual write. */
1973 last_vcn = -1;
1974 if (likely(nr_segs == 1))
1975 buf = iov->iov_base;
1976 else
1977 iov_ofs = 0; /* Offset in the current iovec. */
1978 do {
1979 VCN vcn;
1980 pgoff_t idx, start_idx;
1981 unsigned ofs, do_pages, u;
1982 size_t copied;
1983
1984 start_idx = idx = pos >> PAGE_CACHE_SHIFT;
1985 ofs = pos & ~PAGE_CACHE_MASK;
1986 bytes = PAGE_CACHE_SIZE - ofs;
1987 do_pages = 1;
1988 if (nr_pages > 1) {
1989 vcn = pos >> vol->cluster_size_bits;
1990 if (vcn != last_vcn) {
1991 last_vcn = vcn;
1992 /*
1993 * Get the lcn of the vcn the write is in. If
1994 * it is a hole, need to lock down all pages in
1995 * the cluster.
1996 */
1997 down_read(&ni->runlist.lock);
1998 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
1999 vol->cluster_size_bits, FALSE);
2000 up_read(&ni->runlist.lock);
2001 if (unlikely(lcn < LCN_HOLE)) {
2002 status = -EIO;
2003 if (lcn == LCN_ENOMEM)
2004 status = -ENOMEM;
2005 else
2006 ntfs_error(vol->sb, "Cannot "
2007 "perform write to "
2008 "inode 0x%lx, "
2009 "attribute type 0x%x, "
2010 "because the attribute "
2011 "is corrupt.",
2012 vi->i_ino, (unsigned)
2013 le32_to_cpu(ni->type));
2014 break;
2015 }
2016 if (lcn == LCN_HOLE) {
2017 start_idx = (pos & ~(s64)
2018 vol->cluster_size_mask)
2019 >> PAGE_CACHE_SHIFT;
2020 bytes = vol->cluster_size - (pos &
2021 vol->cluster_size_mask);
2022 do_pages = nr_pages;
2023 }
2024 }
2025 }
2026 if (bytes > count)
2027 bytes = count;
2028 /*
2029 * Bring in the user page(s) that we will copy from _first_.
2030 * Otherwise there is a nasty deadlock on copying from the same
2031 * page(s) as we are writing to, without it/them being marked
2032 * up-to-date. Note, at present there is nothing to stop the
2033 * pages being swapped out between us bringing them into memory
2034 * and doing the actual copying.
2035 */
2036 if (likely(nr_segs == 1))
2037 ntfs_fault_in_pages_readable(buf, bytes);
2038 else
2039 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2040 /* Get and lock @do_pages starting at index @start_idx. */
2041 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2042 pages, &cached_page, &lru_pvec);
2043 if (unlikely(status))
2044 break;
2045 /*
2046 * For non-resident attributes, we need to fill any holes with
2047 * actual clusters and ensure all bufferes are mapped. We also
2048 * need to bring uptodate any buffers that are only partially
2049 * being written to.
2050 */
2051 if (NInoNonResident(ni)) {
2052 status = ntfs_prepare_pages_for_non_resident_write(
2053 pages, do_pages, pos, bytes);
2054 if (unlikely(status)) {
2055 loff_t i_size;
2056
2057 do {
2058 unlock_page(pages[--do_pages]);
2059 page_cache_release(pages[do_pages]);
2060 } while (do_pages);
2061 /*
2062 * The write preparation may have instantiated
2063 * allocated space outside i_size. Trim this
2064 * off again. We can ignore any errors in this
2065 * case as we will just be waisting a bit of
2066 * allocated space, which is not a disaster.
2067 */
2068 i_size = i_size_read(vi);
2069 if (pos + bytes > i_size)
2070 vmtruncate(vi, i_size);
2071 break;
2072 }
2073 }
2074 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2075 if (likely(nr_segs == 1)) {
2076 copied = ntfs_copy_from_user(pages + u, do_pages - u,
2077 ofs, buf, bytes);
2078 buf += copied;
2079 } else
2080 copied = ntfs_copy_from_user_iovec(pages + u,
2081 do_pages - u, ofs, &iov, &iov_ofs,
2082 bytes);
2083 ntfs_flush_dcache_pages(pages + u, do_pages - u);
2084 status = ntfs_commit_pages_after_write(pages, do_pages, pos,
2085 bytes);
2086 if (likely(!status)) {
2087 written += copied;
2088 count -= copied;
2089 pos += copied;
2090 if (unlikely(copied != bytes))
2091 status = -EFAULT;
2092 }
2093 do {
2094 unlock_page(pages[--do_pages]);
2095 mark_page_accessed(pages[do_pages]);
2096 page_cache_release(pages[do_pages]);
2097 } while (do_pages);
2098 if (unlikely(status))
2099 break;
2100 balance_dirty_pages_ratelimited(mapping);
2101 cond_resched();
2102 } while (count);
2103err_out:
2104 *ppos = pos;
2105 if (cached_page)
2106 page_cache_release(cached_page);
2107 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2108 if (likely(!status)) {
2109 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2110 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2111 status = generic_osync_inode(vi, mapping,
2112 OSYNC_METADATA|OSYNC_DATA);
2113 }
2114 }
2115 pagevec_lru_add(&lru_pvec);
2116 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2117 written ? "written" : "status", (unsigned long)written,
2118 (long)status);
2119 return written ? written : status;
2120}
2121
2122/**
2123 * ntfs_file_aio_write_nolock -
2124 */
2125static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2126 const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
2127{
2128 struct file *file = iocb->ki_filp;
2129 struct address_space *mapping = file->f_mapping;
2130 struct inode *inode = mapping->host;
2131 loff_t pos;
2132 unsigned long seg;
2133 size_t count; /* after file limit checks */
2134 ssize_t written, err;
2135
2136 count = 0;
2137 for (seg = 0; seg < nr_segs; seg++) {
2138 const struct iovec *iv = &iov[seg];
2139 /*
2140 * If any segment has a negative length, or the cumulative
2141 * length ever wraps negative then return -EINVAL.
2142 */
2143 count += iv->iov_len;
2144 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
2145 return -EINVAL;
2146 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2147 continue;
2148 if (!seg)
2149 return -EFAULT;
2150 nr_segs = seg;
2151 count -= iv->iov_len; /* This segment is no good */
2152 break;
2153 }
2154 pos = *ppos;
2155 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2156 /* We can write back this queue in page reclaim. */
2157 current->backing_dev_info = mapping->backing_dev_info;
2158 written = 0;
2159 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2160 if (err)
2161 goto out;
2162 if (!count)
2163 goto out;
2164 err = remove_suid(file->f_dentry);
2165 if (err)
2166 goto out;
2167 inode_update_time(inode, 1);
2168 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2169 count);
2170out:
2171 current->backing_dev_info = NULL;
2172 return written ? written : err;
2173}
2174
2175/**
2176 * ntfs_file_aio_write -
2177 */
2178static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
2179 size_t count, loff_t pos)
2180{
2181 struct file *file = iocb->ki_filp;
2182 struct address_space *mapping = file->f_mapping;
2183 struct inode *inode = mapping->host;
2184 ssize_t ret;
2185 struct iovec local_iov = { .iov_base = (void __user *)buf,
2186 .iov_len = count };
2187
2188 BUG_ON(iocb->ki_pos != pos);
2189
2190 down(&inode->i_sem);
2191 ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
2192 up(&inode->i_sem);
2193 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2194 int err = sync_page_range(inode, mapping, pos, ret);
2195 if (err < 0)
2196 ret = err;
2197 }
2198 return ret;
2199}
2200
2201/**
2202 * ntfs_file_writev -
2203 *
2204 * Basically the same as generic_file_writev() except that it ends up calling
2205 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2206 */
2207static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2208 unsigned long nr_segs, loff_t *ppos)
2209{
2210 struct address_space *mapping = file->f_mapping;
2211 struct inode *inode = mapping->host;
2212 struct kiocb kiocb;
2213 ssize_t ret;
2214
2215 down(&inode->i_sem);
2216 init_sync_kiocb(&kiocb, file);
2217 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2218 if (ret == -EIOCBQUEUED)
2219 ret = wait_on_sync_kiocb(&kiocb);
2220 up(&inode->i_sem);
2221 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2222 int err = sync_page_range(inode, mapping, *ppos - ret, ret);
2223 if (err < 0)
2224 ret = err;
2225 }
2226 return ret;
2227}
2228
2229/**
2230 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2231 */
2232static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2233 size_t count, loff_t *ppos)
2234{
2235 struct iovec local_iov = { .iov_base = (void __user *)buf,
2236 .iov_len = count };
2237
2238 return ntfs_file_writev(file, &local_iov, 1, ppos);
2239}
2240
2241/**
59 * ntfs_file_fsync - sync a file to disk 2242 * ntfs_file_fsync - sync a file to disk
60 * @filp: file to be synced 2243 * @filp: file to be synced
61 * @dentry: dentry describing the file to sync 2244 * @dentry: dentry describing the file to sync
@@ -113,39 +2296,39 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
113#endif /* NTFS_RW */ 2296#endif /* NTFS_RW */
114 2297
115struct file_operations ntfs_file_ops = { 2298struct file_operations ntfs_file_ops = {
116 .llseek = generic_file_llseek, /* Seek inside file. */ 2299 .llseek = generic_file_llseek, /* Seek inside file. */
117 .read = generic_file_read, /* Read from file. */ 2300 .read = generic_file_read, /* Read from file. */
118 .aio_read = generic_file_aio_read, /* Async read from file. */ 2301 .aio_read = generic_file_aio_read, /* Async read from file. */
119 .readv = generic_file_readv, /* Read from file. */ 2302 .readv = generic_file_readv, /* Read from file. */
120#ifdef NTFS_RW 2303#ifdef NTFS_RW
121 .write = generic_file_write, /* Write to file. */ 2304 .write = ntfs_file_write, /* Write to file. */
122 .aio_write = generic_file_aio_write, /* Async write to file. */ 2305 .aio_write = ntfs_file_aio_write, /* Async write to file. */
123 .writev = generic_file_writev, /* Write to file. */ 2306 .writev = ntfs_file_writev, /* Write to file. */
124 /*.release = ,*/ /* Last file is closed. See 2307 /*.release = ,*/ /* Last file is closed. See
125 fs/ext2/file.c:: 2308 fs/ext2/file.c::
126 ext2_release_file() for 2309 ext2_release_file() for
127 how to use this to discard 2310 how to use this to discard
128 preallocated space for 2311 preallocated space for
129 write opened files. */ 2312 write opened files. */
130 .fsync = ntfs_file_fsync, /* Sync a file to disk. */ 2313 .fsync = ntfs_file_fsync, /* Sync a file to disk. */
131 /*.aio_fsync = ,*/ /* Sync all outstanding async 2314 /*.aio_fsync = ,*/ /* Sync all outstanding async
132 i/o operations on a 2315 i/o operations on a
133 kiocb. */ 2316 kiocb. */
134#endif /* NTFS_RW */ 2317#endif /* NTFS_RW */
135 /*.ioctl = ,*/ /* Perform function on the 2318 /*.ioctl = ,*/ /* Perform function on the
136 mounted filesystem. */ 2319 mounted filesystem. */
137 .mmap = generic_file_mmap, /* Mmap file. */ 2320 .mmap = generic_file_mmap, /* Mmap file. */
138 .open = ntfs_file_open, /* Open file. */ 2321 .open = ntfs_file_open, /* Open file. */
139 .sendfile = generic_file_sendfile, /* Zero-copy data send with 2322 .sendfile = generic_file_sendfile, /* Zero-copy data send with
140 the data source being on 2323 the data source being on
141 the ntfs partition. We 2324 the ntfs partition. We do
142 do not need to care about 2325 not need to care about the
143 the data destination. */ 2326 data destination. */
144 /*.sendpage = ,*/ /* Zero-copy data send with 2327 /*.sendpage = ,*/ /* Zero-copy data send with
145 the data destination being 2328 the data destination being
146 on the ntfs partition. We 2329 on the ntfs partition. We
147 do not need to care about 2330 do not need to care about
148 the data source. */ 2331 the data source. */
149}; 2332};
150 2333
151struct inode_operations ntfs_file_inode_ops = { 2334struct inode_operations ntfs_file_inode_ops = {