Diffstat (limited to 'fs/ntfs/file.c')
-rw-r--r--  fs/ntfs/file.c | 2255
1 file changed, 2223 insertions(+), 32 deletions(-)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index be9fd1dd423d..cf3e6ced2d01 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,11 +19,24 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 21
22#include <linux/pagemap.h>
22#include <linux/buffer_head.h>
23#include <linux/pagemap.h>
24#include <linux/pagevec.h>
25#include <linux/sched.h>
26#include <linux/swap.h>
27#include <linux/uio.h>
28#include <linux/writeback.h>
24 29
30#include <asm/page.h>
31#include <asm/uaccess.h>
32
33#include "attrib.h"
34#include "bitmap.h"
35#include "inode.h"
36#include "debug.h"
37#include "lcnalloc.h"
38#include "malloc.h"
39#include "mft.h"
40#include "ntfs.h"
28 41
42/**
@@ -56,6 +69,2184 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
69#ifdef NTFS_RW
57 70
71/**
72 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
73 * @ni: ntfs inode of the attribute to extend
74 * @new_init_size: requested new initialized size in bytes
75 * @cached_page: store any allocated but unused page here
76 * @lru_pvec: lru-buffering pagevec of the caller
77 *
78 * Extend the initialized size of an attribute described by the ntfs inode @ni
79 * to @new_init_size bytes. This involves zeroing any non-sparse space between
80 * the old initialized size and @new_init_size both in the page cache and on
81 * disk (if relevant complete pages are already uptodate in the page cache then
82 * these are simply marked dirty).
83 *
84 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
85 * in the resident attribute case, it is tied to the initialized size and, in
86 * the non-resident attribute case, it may not fall below the initialized size.
87 *
88 * Note that if the attribute is resident, we do not need to touch the page
89 * cache at all. This is because if the page cache page is not uptodate we
90 * bring it uptodate later, when doing the write to the mft record since we
91 * then already have the page mapped. And if the page is uptodate, the
92 * non-initialized region will already have been zeroed when the page was
93 * brought uptodate and the region may in fact already have been overwritten
94 * with new data via mmap() based writes, so we cannot just zero it. And since
95 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
96 * is unspecified, we choose not to do zeroing and thus we do not need to touch
97 * the page at all. For a more detailed explanation see ntfs_truncate() in
98 * fs/ntfs/inode.c.
99 *
100 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
101 * pages.
102 *
103 * Return 0 on success and -errno on error. In the case that an error is
104 * encountered it is possible that the initialized size will already have been
105 * incremented some way towards @new_init_size but it is guaranteed that if
106 * this is the case, the necessary zeroing will also have happened and that all
107 * metadata is self-consistent.
108 *
109 * Locking: i_sem on the vfs inode corresponding to the ntfs inode @ni must be
110 * held by the caller.
111 */
112static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
113 struct page **cached_page, struct pagevec *lru_pvec)
114{
115 s64 old_init_size;
116 loff_t old_i_size;
117 pgoff_t index, end_index;
118 unsigned long flags;
119 struct inode *vi = VFS_I(ni);
120 ntfs_inode *base_ni;
121 MFT_RECORD *m = NULL;
122 ATTR_RECORD *a;
123 ntfs_attr_search_ctx *ctx = NULL;
124 struct address_space *mapping;
125 struct page *page = NULL;
126 u8 *kattr;
127 int err;
128 u32 attr_len;
129
130 read_lock_irqsave(&ni->size_lock, flags);
131 old_init_size = ni->initialized_size;
132 old_i_size = i_size_read(vi);
133 BUG_ON(new_init_size > ni->allocated_size);
134 read_unlock_irqrestore(&ni->size_lock, flags);
135 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
136 "old_initialized_size 0x%llx, "
137 "new_initialized_size 0x%llx, i_size 0x%llx.",
138 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
139 (unsigned long long)old_init_size,
140 (unsigned long long)new_init_size, old_i_size);
141 if (!NInoAttr(ni))
142 base_ni = ni;
143 else
144 base_ni = ni->ext.base_ntfs_ino;
145 /* Use goto to reduce indentation and we need the label below anyway. */
146 if (NInoNonResident(ni))
147 goto do_non_resident_extend;
148 BUG_ON(old_init_size != old_i_size);
149 m = map_mft_record(base_ni);
150 if (IS_ERR(m)) {
151 err = PTR_ERR(m);
152 m = NULL;
153 goto err_out;
154 }
155 ctx = ntfs_attr_get_search_ctx(base_ni, m);
156 if (unlikely(!ctx)) {
157 err = -ENOMEM;
158 goto err_out;
159 }
160 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
161 CASE_SENSITIVE, 0, NULL, 0, ctx);
162 if (unlikely(err)) {
163 if (err == -ENOENT)
164 err = -EIO;
165 goto err_out;
166 }
167 m = ctx->mrec;
168 a = ctx->attr;
169 BUG_ON(a->non_resident);
170 /* The total length of the attribute value. */
171 attr_len = le32_to_cpu(a->data.resident.value_length);
172 BUG_ON(old_i_size != (loff_t)attr_len);
173 /*
174 * Do the zeroing in the mft record and update the attribute size in
175 * the mft record.
176 */
177 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
178 memset(kattr + attr_len, 0, new_init_size - attr_len);
179 a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
180 /* Finally, update the sizes in the vfs and ntfs inodes. */
181 write_lock_irqsave(&ni->size_lock, flags);
182 i_size_write(vi, new_init_size);
183 ni->initialized_size = new_init_size;
184 write_unlock_irqrestore(&ni->size_lock, flags);
185 goto done;
186do_non_resident_extend:
187 /*
188 * If the new initialized size @new_init_size exceeds the current file
189 * size (vfs inode->i_size), we need to extend the file size to the
190 * new initialized size.
191 */
192 if (new_init_size > old_i_size) {
193 m = map_mft_record(base_ni);
194 if (IS_ERR(m)) {
195 err = PTR_ERR(m);
196 m = NULL;
197 goto err_out;
198 }
199 ctx = ntfs_attr_get_search_ctx(base_ni, m);
200 if (unlikely(!ctx)) {
201 err = -ENOMEM;
202 goto err_out;
203 }
204 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
205 CASE_SENSITIVE, 0, NULL, 0, ctx);
206 if (unlikely(err)) {
207 if (err == -ENOENT)
208 err = -EIO;
209 goto err_out;
210 }
211 m = ctx->mrec;
212 a = ctx->attr;
213 BUG_ON(!a->non_resident);
214 BUG_ON(old_i_size != (loff_t)
215 sle64_to_cpu(a->data.non_resident.data_size));
216 a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
217 flush_dcache_mft_record_page(ctx->ntfs_ino);
218 mark_mft_record_dirty(ctx->ntfs_ino);
219 /* Update the file size in the vfs inode. */
220 i_size_write(vi, new_init_size);
221 ntfs_attr_put_search_ctx(ctx);
222 ctx = NULL;
223 unmap_mft_record(base_ni);
224 m = NULL;
225 }
226 mapping = vi->i_mapping;
227 index = old_init_size >> PAGE_CACHE_SHIFT;
228 end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
229 do {
230 /*
231 * Read the page. If the page is not present, this will zero
232 * the uninitialized regions for us.
233 */
234 page = read_cache_page(mapping, index,
235 (filler_t*)mapping->a_ops->readpage, NULL);
236 if (IS_ERR(page)) {
237 err = PTR_ERR(page);
238 goto init_err_out;
239 }
240 wait_on_page_locked(page);
241 if (unlikely(!PageUptodate(page) || PageError(page))) {
242 page_cache_release(page);
243 err = -EIO;
244 goto init_err_out;
245 }
246 /*
247 * Update the initialized size in the ntfs inode. This is
248 * enough to make ntfs_writepage() work.
249 */
250 write_lock_irqsave(&ni->size_lock, flags);
251 ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT;
252 if (ni->initialized_size > new_init_size)
253 ni->initialized_size = new_init_size;
254 write_unlock_irqrestore(&ni->size_lock, flags);
255 /* Set the page dirty so it gets written out. */
256 set_page_dirty(page);
257 page_cache_release(page);
258 /*
259 * Play nice with the vm and the rest of the system. This is
260 * very much needed as we can potentially be modifying the
261 * initialised size from a very small value to a really huge
262 * value, e.g.
263 * f = open(somefile, O_TRUNC);
264 * truncate(f, 10GiB);
265 * seek(f, 10GiB);
266 * write(f, 1);
267 * And this would mean we would be marking dirty hundreds of
268 * thousands of pages or as in the above example more than
269 * two and a half million pages!
270 *
271 * TODO: For sparse pages could optimize this workload by using
272 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
273 * would be set in readpage for sparse pages and here we would
274 * not need to mark dirty any pages which have this bit set.
275 * The only caveat is that we have to clear the bit everywhere
276 * where we allocate any clusters that lie in the page or that
277 * contain the page.
278 *
279 * TODO: An even greater optimization would be for us to only
280 * call readpage() on pages which are not in sparse regions as
281 * determined from the runlist. This would greatly reduce the
282 * number of pages we read and make dirty in the case of sparse
283 * files.
284 */
285 balance_dirty_pages_ratelimited(mapping);
286 cond_resched();
287 } while (++index < end_index);
288 read_lock_irqsave(&ni->size_lock, flags);
289 BUG_ON(ni->initialized_size != new_init_size);
290 read_unlock_irqrestore(&ni->size_lock, flags);
291 /* Now bring in sync the initialized_size in the mft record. */
292 m = map_mft_record(base_ni);
293 if (IS_ERR(m)) {
294 err = PTR_ERR(m);
295 m = NULL;
296 goto init_err_out;
297 }
298 ctx = ntfs_attr_get_search_ctx(base_ni, m);
299 if (unlikely(!ctx)) {
300 err = -ENOMEM;
301 goto init_err_out;
302 }
303 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
304 CASE_SENSITIVE, 0, NULL, 0, ctx);
305 if (unlikely(err)) {
306 if (err == -ENOENT)
307 err = -EIO;
308 goto init_err_out;
309 }
310 m = ctx->mrec;
311 a = ctx->attr;
312 BUG_ON(!a->non_resident);
313 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
314done:
315 flush_dcache_mft_record_page(ctx->ntfs_ino);
316 mark_mft_record_dirty(ctx->ntfs_ino);
317 if (ctx)
318 ntfs_attr_put_search_ctx(ctx);
319 if (m)
320 unmap_mft_record(base_ni);
321 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
322 (unsigned long long)new_init_size, i_size_read(vi));
323 return 0;
324init_err_out:
325 write_lock_irqsave(&ni->size_lock, flags);
326 ni->initialized_size = old_init_size;
327 write_unlock_irqrestore(&ni->size_lock, flags);
328err_out:
329 if (ctx)
330 ntfs_attr_put_search_ctx(ctx);
331 if (m)
332 unmap_mft_record(base_ni);
333 ntfs_debug("Failed. Returning error code %i.", err);
334 return err;
335}
336
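A minimal userspace sketch of the kind of access pattern discussed in the comments above (the mount point and file name are assumed; the 10GiB figure is taken from the example in the function's comment): a tiny write far beyond the old initialized size forces ntfs_attr_extend_initialized() to zero and dirty every intervening page, which is why the loop above throttles itself with balance_dirty_pages_ratelimited().

#define _FILE_OFFSET_BITS 64	/* 64-bit off_t on 32-bit userspace */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path, purely for illustration. */
	int fd = open("/mnt/ntfs/somefile", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 10LL << 30))		/* i_size becomes 10GiB */
		return 1;
	if (lseek(fd, 10LL << 30, SEEK_SET) < 0)
		return 1;
	write(fd, "x", 1);	/* initialized size must be extended to 10GiB */
	close(fd);
	return 0;
}
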
337/**
338 * ntfs_fault_in_pages_readable -
339 *
340 * Fault a number of userspace pages into pagetables.
341 *
342 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
343 * with more than two userspace pages as well as handling the single page case
344 * elegantly.
345 *
346 * If you find this difficult to understand, then think of the while loop being
347 * the following code, except that we do without the integer variable ret:
348 *
349 * do {
350 * ret = __get_user(c, uaddr);
351 * uaddr += PAGE_SIZE;
352 * } while (!ret && uaddr < end);
353 *
354 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
355 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
356 * this is only a read and not a write, and since it is still in the same page,
357 * it should not matter and this makes the code much simpler.
358 */
359static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
360 int bytes)
361{
362 const char __user *end;
363 volatile char c;
364
365 /* Set @end to the first byte outside the last page we care about. */
366 end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes);
367
368 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
369 ;
370}
371
372/**
373 * ntfs_fault_in_pages_readable_iovec -
374 *
375 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
376 */
377static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
378 size_t iov_ofs, int bytes)
379{
380 do {
381 const char __user *buf;
382 unsigned len;
383
384 buf = iov->iov_base + iov_ofs;
385 len = iov->iov_len - iov_ofs;
386 if (len > bytes)
387 len = bytes;
388 ntfs_fault_in_pages_readable(buf, len);
389 bytes -= len;
390 iov++;
391 iov_ofs = 0;
392 } while (bytes);
393}
394
395/**
396 * __ntfs_grab_cache_pages - obtain a number of locked pages
397 * @mapping: address space mapping from which to obtain page cache pages
398 * @index: starting index in @mapping at which to begin obtaining pages
399 * @nr_pages: number of page cache pages to obtain
400 * @pages: array of pages in which to return the obtained page cache pages
401 * @cached_page: allocated but as yet unused page
402 * @lru_pvec: lru-buffering pagevec of caller
403 *
 404 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
405 * starting at index @index.
406 *
407 * If a page is newly created, increment its refcount and add it to the
408 * caller's lru-buffering pagevec @lru_pvec.
409 *
410 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
411 * are obtained at once instead of just one page and that 0 is returned on
412 * success and -errno on error.
413 *
414 * Note, the page locks are obtained in ascending page index order.
415 */
416static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
417 pgoff_t index, const unsigned nr_pages, struct page **pages,
418 struct page **cached_page, struct pagevec *lru_pvec)
419{
420 int err, nr;
421
422 BUG_ON(!nr_pages);
423 err = nr = 0;
424 do {
425 pages[nr] = find_lock_page(mapping, index);
426 if (!pages[nr]) {
427 if (!*cached_page) {
428 *cached_page = page_cache_alloc(mapping);
429 if (unlikely(!*cached_page)) {
430 err = -ENOMEM;
431 goto err_out;
432 }
433 }
434 err = add_to_page_cache(*cached_page, mapping, index,
435 GFP_KERNEL);
436 if (unlikely(err)) {
437 if (err == -EEXIST)
438 continue;
439 goto err_out;
440 }
441 pages[nr] = *cached_page;
442 page_cache_get(*cached_page);
443 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
444 __pagevec_lru_add(lru_pvec);
445 *cached_page = NULL;
446 }
447 index++;
448 nr++;
449 } while (nr < nr_pages);
450out:
451 return err;
452err_out:
453 while (nr > 0) {
454 unlock_page(pages[--nr]);
455 page_cache_release(pages[nr]);
456 }
457 goto out;
458}
459
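A hedged sketch of how a caller might drive __ntfs_grab_cache_pages(); the function name and the elided copy step are assumptions, and the real caller in this file is ntfs_file_buffered_write():

/* Hypothetical caller, not the actual ntfs_file_buffered_write(). */
static int example_fill_cluster(struct address_space *mapping, pgoff_t index,
		unsigned nr_pages)
{
	struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
	struct page *cached_page = NULL;
	struct pagevec lru_pvec;
	unsigned u;
	int err;

	pagevec_init(&lru_pvec, 0);
	err = __ntfs_grab_cache_pages(mapping, index, nr_pages, pages,
			&cached_page, &lru_pvec);
	if (likely(!err)) {
		/* ... copy the source data into the locked pages here ... */
		for (u = 0; u < nr_pages; u++) {
			unlock_page(pages[u]);
			page_cache_release(pages[u]);
		}
	}
	if (cached_page)
		page_cache_release(cached_page);
	/* Push any lru-buffered pages to the lru lists. */
	pagevec_lru_add(&lru_pvec);
	return err;
}
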
460static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
461{
462 lock_buffer(bh);
463 get_bh(bh);
464 bh->b_end_io = end_buffer_read_sync;
465 return submit_bh(READ, bh);
466}
467
468/**
469 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
470 * @pages: array of destination pages
471 * @nr_pages: number of pages in @pages
472 * @pos: byte position in file at which the write begins
473 * @bytes: number of bytes to be written
474 *
475 * This is called for non-resident attributes from ntfs_file_buffered_write()
476 * with i_sem held on the inode (@pages[0]->mapping->host). There are
477 * @nr_pages pages in @pages which are locked but not kmap()ped. The source
478 * data has not yet been copied into the @pages.
479 *
480 * Need to fill any holes with actual clusters, allocate buffers if necessary,
481 * ensure all the buffers are mapped, and bring uptodate any buffers that are
482 * only partially being written to.
483 *
484 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
485 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
486 * the same cluster and that they are the entirety of that cluster, and that
487 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
488 *
489 * i_size is not to be modified yet.
490 *
491 * Return 0 on success or -errno on error.
492 */
493static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
494 unsigned nr_pages, s64 pos, size_t bytes)
495{
496 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
497 LCN lcn;
498 s64 bh_pos, vcn_len, end, initialized_size;
499 sector_t lcn_block;
500 struct page *page;
501 struct inode *vi;
502 ntfs_inode *ni, *base_ni = NULL;
503 ntfs_volume *vol;
504 runlist_element *rl, *rl2;
505 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
506 ntfs_attr_search_ctx *ctx = NULL;
507 MFT_RECORD *m = NULL;
508 ATTR_RECORD *a = NULL;
509 unsigned long flags;
510 u32 attr_rec_len = 0;
511 unsigned blocksize, u;
512 int err, mp_size;
513 BOOL rl_write_locked, was_hole, is_retry;
514 unsigned char blocksize_bits;
515 struct {
516 u8 runlist_merged:1;
517 u8 mft_attr_mapped:1;
518 u8 mp_rebuilt:1;
519 u8 attr_switched:1;
520 } status = { 0, 0, 0, 0 };
521
522 BUG_ON(!nr_pages);
523 BUG_ON(!pages);
524 BUG_ON(!*pages);
525 vi = pages[0]->mapping->host;
526 ni = NTFS_I(vi);
527 vol = ni->vol;
528 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
529 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
530 vi->i_ino, ni->type, pages[0]->index, nr_pages,
531 (long long)pos, bytes);
532 blocksize_bits = vi->i_blkbits;
533 blocksize = 1 << blocksize_bits;
534 u = 0;
535 do {
536 struct page *page = pages[u];
537 /*
538 * create_empty_buffers() will create uptodate/dirty buffers if
539 * the page is uptodate/dirty.
540 */
541 if (!page_has_buffers(page)) {
542 create_empty_buffers(page, blocksize, 0);
543 if (unlikely(!page_has_buffers(page)))
544 return -ENOMEM;
545 }
546 } while (++u < nr_pages);
547 rl_write_locked = FALSE;
548 rl = NULL;
549 err = 0;
550 vcn = lcn = -1;
551 vcn_len = 0;
552 lcn_block = -1;
553 was_hole = FALSE;
554 cpos = pos >> vol->cluster_size_bits;
555 end = pos + bytes;
556 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
557 /*
558 * Loop over each page and for each page over each buffer. Use goto to
559 * reduce indentation.
560 */
561 u = 0;
562do_next_page:
563 page = pages[u];
564 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
565 bh = head = page_buffers(page);
566 do {
567 VCN cdelta;
568 s64 bh_end;
569 unsigned bh_cofs;
570
571 /* Clear buffer_new on all buffers to reinitialise state. */
572 if (buffer_new(bh))
573 clear_buffer_new(bh);
574 bh_end = bh_pos + blocksize;
575 bh_cpos = bh_pos >> vol->cluster_size_bits;
576 bh_cofs = bh_pos & vol->cluster_size_mask;
577 if (buffer_mapped(bh)) {
578 /*
579 * The buffer is already mapped. If it is uptodate,
580 * ignore it.
581 */
582 if (buffer_uptodate(bh))
583 continue;
584 /*
585 * The buffer is not uptodate. If the page is uptodate
586 * set the buffer uptodate and otherwise ignore it.
587 */
588 if (PageUptodate(page)) {
589 set_buffer_uptodate(bh);
590 continue;
591 }
592 /*
593 * Neither the page nor the buffer are uptodate. If
594 * the buffer is only partially being written to, we
595 * need to read it in before the write, i.e. now.
596 */
597 if ((bh_pos < pos && bh_end > pos) ||
598 (bh_pos < end && bh_end > end)) {
599 /*
600 * If the buffer is fully or partially within
601 * the initialized size, do an actual read.
602 * Otherwise, simply zero the buffer.
603 */
604 read_lock_irqsave(&ni->size_lock, flags);
605 initialized_size = ni->initialized_size;
606 read_unlock_irqrestore(&ni->size_lock, flags);
607 if (bh_pos < initialized_size) {
608 ntfs_submit_bh_for_read(bh);
609 *wait_bh++ = bh;
610 } else {
611 u8 *kaddr = kmap_atomic(page, KM_USER0);
612 memset(kaddr + bh_offset(bh), 0,
613 blocksize);
614 kunmap_atomic(kaddr, KM_USER0);
615 flush_dcache_page(page);
616 set_buffer_uptodate(bh);
617 }
618 }
619 continue;
620 }
621 /* Unmapped buffer. Need to map it. */
622 bh->b_bdev = vol->sb->s_bdev;
623 /*
624 * If the current buffer is in the same clusters as the map
625 * cache, there is no need to check the runlist again. The
626 * map cache is made up of @vcn, which is the first cached file
627 * cluster, @vcn_len which is the number of cached file
628 * clusters, @lcn is the device cluster corresponding to @vcn,
629 * and @lcn_block is the block number corresponding to @lcn.
630 */
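		/*
		 * Worked example (illustrative numbers, not from this patch):
		 * with 4096 byte clusters and 512 byte blocks we have
		 * cluster_size_bits = 12 and blocksize_bits = 9, i.e. 8
		 * blocks per cluster.  If lcn_block = 800, cdelta = 2, and
		 * bh_cofs = 1024, the mapping below yields
		 * b_blocknr = 800 + (2 << 3) + (1024 >> 9) = 818.
		 */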
631 cdelta = bh_cpos - vcn;
632 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
633map_buffer_cached:
634 BUG_ON(lcn < 0);
635 bh->b_blocknr = lcn_block +
636 (cdelta << (vol->cluster_size_bits -
637 blocksize_bits)) +
638 (bh_cofs >> blocksize_bits);
639 set_buffer_mapped(bh);
640 /*
641 * If the page is uptodate so is the buffer. If the
642 * buffer is fully outside the write, we ignore it if
643 * it was already allocated and we mark it dirty so it
644 * gets written out if we allocated it. On the other
645 * hand, if we allocated the buffer but we are not
646 * marking it dirty we set buffer_new so we can do
647 * error recovery.
648 */
649 if (PageUptodate(page)) {
650 if (!buffer_uptodate(bh))
651 set_buffer_uptodate(bh);
652 if (unlikely(was_hole)) {
653 /* We allocated the buffer. */
654 unmap_underlying_metadata(bh->b_bdev,
655 bh->b_blocknr);
656 if (bh_end <= pos || bh_pos >= end)
657 mark_buffer_dirty(bh);
658 else
659 set_buffer_new(bh);
660 }
661 continue;
662 }
663 /* Page is _not_ uptodate. */
664 if (likely(!was_hole)) {
665 /*
666 * Buffer was already allocated. If it is not
667 * uptodate and is only partially being written
668 * to, we need to read it in before the write,
669 * i.e. now.
670 */
671 if (!buffer_uptodate(bh) && ((bh_pos < pos &&
672 bh_end > pos) ||
 673 					(bh_pos < end &&
674 bh_end > end))) {
675 /*
676 * If the buffer is fully or partially
677 * within the initialized size, do an
678 * actual read. Otherwise, simply zero
679 * the buffer.
680 */
681 read_lock_irqsave(&ni->size_lock,
682 flags);
683 initialized_size = ni->initialized_size;
684 read_unlock_irqrestore(&ni->size_lock,
685 flags);
686 if (bh_pos < initialized_size) {
687 ntfs_submit_bh_for_read(bh);
688 *wait_bh++ = bh;
689 } else {
690 u8 *kaddr = kmap_atomic(page,
691 KM_USER0);
692 memset(kaddr + bh_offset(bh),
693 0, blocksize);
694 kunmap_atomic(kaddr, KM_USER0);
695 flush_dcache_page(page);
696 set_buffer_uptodate(bh);
697 }
698 }
699 continue;
700 }
701 /* We allocated the buffer. */
702 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
703 /*
704 * If the buffer is fully outside the write, zero it,
705 * set it uptodate, and mark it dirty so it gets
706 * written out. If it is partially being written to,
707 * zero region surrounding the write but leave it to
708 * commit write to do anything else. Finally, if the
709 * buffer is fully being overwritten, do nothing.
710 */
711 if (bh_end <= pos || bh_pos >= end) {
712 if (!buffer_uptodate(bh)) {
713 u8 *kaddr = kmap_atomic(page, KM_USER0);
714 memset(kaddr + bh_offset(bh), 0,
715 blocksize);
716 kunmap_atomic(kaddr, KM_USER0);
717 flush_dcache_page(page);
718 set_buffer_uptodate(bh);
719 }
720 mark_buffer_dirty(bh);
721 continue;
722 }
723 set_buffer_new(bh);
724 if (!buffer_uptodate(bh) &&
725 (bh_pos < pos || bh_end > end)) {
726 u8 *kaddr;
727 unsigned pofs;
728
729 kaddr = kmap_atomic(page, KM_USER0);
730 if (bh_pos < pos) {
731 pofs = bh_pos & ~PAGE_CACHE_MASK;
732 memset(kaddr + pofs, 0, pos - bh_pos);
733 }
734 if (bh_end > end) {
735 pofs = end & ~PAGE_CACHE_MASK;
736 memset(kaddr + pofs, 0, bh_end - end);
737 }
738 kunmap_atomic(kaddr, KM_USER0);
739 flush_dcache_page(page);
740 }
741 continue;
742 }
743 /*
744 * Slow path: this is the first buffer in the cluster. If it
745 * is outside allocated size and is not uptodate, zero it and
746 * set it uptodate.
747 */
748 read_lock_irqsave(&ni->size_lock, flags);
749 initialized_size = ni->allocated_size;
750 read_unlock_irqrestore(&ni->size_lock, flags);
751 if (bh_pos > initialized_size) {
752 if (PageUptodate(page)) {
753 if (!buffer_uptodate(bh))
754 set_buffer_uptodate(bh);
755 } else if (!buffer_uptodate(bh)) {
756 u8 *kaddr = kmap_atomic(page, KM_USER0);
757 memset(kaddr + bh_offset(bh), 0, blocksize);
758 kunmap_atomic(kaddr, KM_USER0);
759 flush_dcache_page(page);
760 set_buffer_uptodate(bh);
761 }
762 continue;
763 }
764 is_retry = FALSE;
765 if (!rl) {
766 down_read(&ni->runlist.lock);
767retry_remap:
768 rl = ni->runlist.rl;
769 }
770 if (likely(rl != NULL)) {
771 /* Seek to element containing target cluster. */
772 while (rl->length && rl[1].vcn <= bh_cpos)
773 rl++;
774 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
775 if (likely(lcn >= 0)) {
776 /*
777 * Successful remap, setup the map cache and
778 * use that to deal with the buffer.
779 */
780 was_hole = FALSE;
781 vcn = bh_cpos;
782 vcn_len = rl[1].vcn - vcn;
783 lcn_block = lcn << (vol->cluster_size_bits -
784 blocksize_bits);
785 cdelta = 0;
786 /*
787 * If the number of remaining clusters in the
788 * @pages is smaller or equal to the number of
789 * cached clusters, unlock the runlist as the
790 * map cache will be used from now on.
791 */
792 if (likely(vcn + vcn_len >= cend)) {
793 if (rl_write_locked) {
794 up_write(&ni->runlist.lock);
795 rl_write_locked = FALSE;
796 } else
797 up_read(&ni->runlist.lock);
798 rl = NULL;
799 }
800 goto map_buffer_cached;
801 }
802 } else
803 lcn = LCN_RL_NOT_MAPPED;
804 /*
805 * If it is not a hole and not out of bounds, the runlist is
806 * probably unmapped so try to map it now.
807 */
808 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
809 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
810 /* Attempt to map runlist. */
811 if (!rl_write_locked) {
812 /*
813 * We need the runlist locked for
814 * writing, so if it is locked for
815 * reading relock it now and retry in
816 * case it changed whilst we dropped
817 * the lock.
818 */
819 up_read(&ni->runlist.lock);
820 down_write(&ni->runlist.lock);
821 rl_write_locked = TRUE;
822 goto retry_remap;
823 }
824 err = ntfs_map_runlist_nolock(ni, bh_cpos,
825 NULL);
826 if (likely(!err)) {
827 is_retry = TRUE;
828 goto retry_remap;
829 }
830 /*
831 * If @vcn is out of bounds, pretend @lcn is
832 * LCN_ENOENT. As long as the buffer is out
833 * of bounds this will work fine.
834 */
835 if (err == -ENOENT) {
836 lcn = LCN_ENOENT;
837 err = 0;
838 goto rl_not_mapped_enoent;
839 }
840 } else
841 err = -EIO;
842 /* Failed to map the buffer, even after retrying. */
843 bh->b_blocknr = -1;
844 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
845 "attribute type 0x%x, vcn 0x%llx, "
846 "vcn offset 0x%x, because its "
847 "location on disk could not be "
848 "determined%s (error code %i).",
849 ni->mft_no, ni->type,
850 (unsigned long long)bh_cpos,
851 (unsigned)bh_pos &
852 vol->cluster_size_mask,
853 is_retry ? " even after retrying" : "",
854 err);
855 break;
856 }
857rl_not_mapped_enoent:
858 /*
859 * The buffer is in a hole or out of bounds. We need to fill
860 * the hole, unless the buffer is in a cluster which is not
861 * touched by the write, in which case we just leave the buffer
862 * unmapped. This can only happen when the cluster size is
863 * less than the page cache size.
864 */
865 if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
866 bh_cend = (bh_end + vol->cluster_size - 1) >>
867 vol->cluster_size_bits;
868 if ((bh_cend <= cpos || bh_cpos >= cend)) {
869 bh->b_blocknr = -1;
870 /*
871 * If the buffer is uptodate we skip it. If it
872 * is not but the page is uptodate, we can set
873 * the buffer uptodate. If the page is not
874 * uptodate, we can clear the buffer and set it
875 * uptodate. Whether this is worthwhile is
876 * debatable and this could be removed.
877 */
878 if (PageUptodate(page)) {
879 if (!buffer_uptodate(bh))
880 set_buffer_uptodate(bh);
881 } else if (!buffer_uptodate(bh)) {
882 u8 *kaddr = kmap_atomic(page, KM_USER0);
883 memset(kaddr + bh_offset(bh), 0,
884 blocksize);
885 kunmap_atomic(kaddr, KM_USER0);
886 flush_dcache_page(page);
887 set_buffer_uptodate(bh);
888 }
889 continue;
890 }
891 }
892 /*
893 * Out of bounds buffer is invalid if it was not really out of
894 * bounds.
895 */
896 BUG_ON(lcn != LCN_HOLE);
897 /*
898 * We need the runlist locked for writing, so if it is locked
899 * for reading relock it now and retry in case it changed
900 * whilst we dropped the lock.
901 */
902 BUG_ON(!rl);
903 if (!rl_write_locked) {
904 up_read(&ni->runlist.lock);
905 down_write(&ni->runlist.lock);
906 rl_write_locked = TRUE;
907 goto retry_remap;
908 }
909 /* Find the previous last allocated cluster. */
910 BUG_ON(rl->lcn != LCN_HOLE);
911 lcn = -1;
912 rl2 = rl;
913 while (--rl2 >= ni->runlist.rl) {
914 if (rl2->lcn >= 0) {
915 lcn = rl2->lcn + rl2->length;
916 break;
917 }
918 }
919 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
920 FALSE);
921 if (IS_ERR(rl2)) {
922 err = PTR_ERR(rl2);
923 ntfs_debug("Failed to allocate cluster, error code %i.",
924 err);
925 break;
926 }
927 lcn = rl2->lcn;
928 rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
929 if (IS_ERR(rl)) {
930 err = PTR_ERR(rl);
931 if (err != -ENOMEM)
932 err = -EIO;
933 if (ntfs_cluster_free_from_rl(vol, rl2)) {
934 ntfs_error(vol->sb, "Failed to release "
935 "allocated cluster in error "
936 "code path. Run chkdsk to "
937 "recover the lost cluster.");
938 NVolSetErrors(vol);
939 }
940 ntfs_free(rl2);
941 break;
942 }
943 ni->runlist.rl = rl;
944 status.runlist_merged = 1;
945 ntfs_debug("Allocated cluster, lcn 0x%llx.", lcn);
946 /* Map and lock the mft record and get the attribute record. */
947 if (!NInoAttr(ni))
948 base_ni = ni;
949 else
950 base_ni = ni->ext.base_ntfs_ino;
951 m = map_mft_record(base_ni);
952 if (IS_ERR(m)) {
953 err = PTR_ERR(m);
954 break;
955 }
956 ctx = ntfs_attr_get_search_ctx(base_ni, m);
957 if (unlikely(!ctx)) {
958 err = -ENOMEM;
959 unmap_mft_record(base_ni);
960 break;
961 }
962 status.mft_attr_mapped = 1;
963 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
964 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
965 if (unlikely(err)) {
966 if (err == -ENOENT)
967 err = -EIO;
968 break;
969 }
970 m = ctx->mrec;
971 a = ctx->attr;
972 /*
973 * Find the runlist element with which the attribute extent
974 * starts. Note, we cannot use the _attr_ version because we
975 * have mapped the mft record. That is ok because we know the
976 * runlist fragment must be mapped already to have ever gotten
977 * here, so we can just use the _rl_ version.
978 */
979 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
980 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
981 BUG_ON(!rl2);
982 BUG_ON(!rl2->length);
983 BUG_ON(rl2->lcn < LCN_HOLE);
984 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
985 /*
986 * If @highest_vcn is zero, calculate the real highest_vcn
987 * (which can really be zero).
988 */
989 if (!highest_vcn)
990 highest_vcn = (sle64_to_cpu(
991 a->data.non_resident.allocated_size) >>
992 vol->cluster_size_bits) - 1;
993 /*
994 * Determine the size of the mapping pairs array for the new
995 * extent, i.e. the old extent with the hole filled.
996 */
997 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
998 highest_vcn);
999 if (unlikely(mp_size <= 0)) {
1000 if (!(err = mp_size))
1001 err = -EIO;
1002 ntfs_debug("Failed to get size for mapping pairs "
1003 "array, error code %i.", err);
1004 break;
1005 }
1006 /*
1007 * Resize the attribute record to fit the new mapping pairs
1008 * array.
1009 */
1010 attr_rec_len = le32_to_cpu(a->length);
1011 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
1012 a->data.non_resident.mapping_pairs_offset));
1013 if (unlikely(err)) {
1014 BUG_ON(err != -ENOSPC);
1015 // TODO: Deal with this by using the current attribute
1016 // and fill it with as much of the mapping pairs
1017 // array as possible. Then loop over each attribute
1018 // extent rewriting the mapping pairs arrays as we go
1019 // along and if when we reach the end we have not
1020 // enough space, try to resize the last attribute
1021 // extent and if even that fails, add a new attribute
1022 // extent.
1023 // We could also try to resize at each step in the hope
1024 // that we will not need to rewrite every single extent.
1025 // Note, we may need to decompress some extents to fill
1026 // the runlist as we are walking the extents...
1027 ntfs_error(vol->sb, "Not enough space in the mft "
1028 "record for the extended attribute "
1029 "record. This case is not "
1030 "implemented yet.");
1031 err = -EOPNOTSUPP;
1032 break ;
1033 }
1034 status.mp_rebuilt = 1;
1035 /*
1036 * Generate the mapping pairs array directly into the attribute
1037 * record.
1038 */
1039 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1040 a->data.non_resident.mapping_pairs_offset),
1041 mp_size, rl2, vcn, highest_vcn, NULL);
1042 if (unlikely(err)) {
1043 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1044 "attribute type 0x%x, because building "
1045 "the mapping pairs failed with error "
1046 "code %i.", vi->i_ino,
1047 (unsigned)le32_to_cpu(ni->type), err);
1048 err = -EIO;
1049 break;
1050 }
1051 /* Update the highest_vcn but only if it was not set. */
1052 if (unlikely(!a->data.non_resident.highest_vcn))
1053 a->data.non_resident.highest_vcn =
1054 cpu_to_sle64(highest_vcn);
1055 /*
1056 * If the attribute is sparse/compressed, update the compressed
1057 * size in the ntfs_inode structure and the attribute record.
1058 */
1059 if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1060 /*
1061 * If we are not in the first attribute extent, switch
1062 * to it, but first ensure the changes will make it to
1063 * disk later.
1064 */
1065 if (a->data.non_resident.lowest_vcn) {
1066 flush_dcache_mft_record_page(ctx->ntfs_ino);
1067 mark_mft_record_dirty(ctx->ntfs_ino);
1068 ntfs_attr_reinit_search_ctx(ctx);
1069 err = ntfs_attr_lookup(ni->type, ni->name,
1070 ni->name_len, CASE_SENSITIVE,
1071 0, NULL, 0, ctx);
1072 if (unlikely(err)) {
1073 status.attr_switched = 1;
1074 break;
1075 }
1076 /* @m is not used any more so do not set it. */
1077 a = ctx->attr;
1078 }
1079 write_lock_irqsave(&ni->size_lock, flags);
1080 ni->itype.compressed.size += vol->cluster_size;
1081 a->data.non_resident.compressed_size =
1082 cpu_to_sle64(ni->itype.compressed.size);
1083 write_unlock_irqrestore(&ni->size_lock, flags);
1084 }
1085 /* Ensure the changes make it to disk. */
1086 flush_dcache_mft_record_page(ctx->ntfs_ino);
1087 mark_mft_record_dirty(ctx->ntfs_ino);
1088 ntfs_attr_put_search_ctx(ctx);
1089 unmap_mft_record(base_ni);
1090 /* Successfully filled the hole. */
1091 status.runlist_merged = 0;
1092 status.mft_attr_mapped = 0;
1093 status.mp_rebuilt = 0;
1094 /* Setup the map cache and use that to deal with the buffer. */
1095 was_hole = TRUE;
1096 vcn = bh_cpos;
1097 vcn_len = 1;
1098 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1099 cdelta = 0;
1100 /*
1101 * If the number of remaining clusters in the @pages is smaller
1102 * or equal to the number of cached clusters, unlock the
1103 * runlist as the map cache will be used from now on.
1104 */
1105 if (likely(vcn + vcn_len >= cend)) {
1106 up_write(&ni->runlist.lock);
1107 rl_write_locked = FALSE;
1108 rl = NULL;
1109 }
1110 goto map_buffer_cached;
1111 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1112 /* If there are no errors, do the next page. */
1113 if (likely(!err && ++u < nr_pages))
1114 goto do_next_page;
1115 /* If there are no errors, release the runlist lock if we took it. */
1116 if (likely(!err)) {
1117 if (unlikely(rl_write_locked)) {
1118 up_write(&ni->runlist.lock);
1119 rl_write_locked = FALSE;
1120 } else if (unlikely(rl))
1121 up_read(&ni->runlist.lock);
1122 rl = NULL;
1123 }
1124 /* If we issued read requests, let them complete. */
1125 read_lock_irqsave(&ni->size_lock, flags);
1126 initialized_size = ni->initialized_size;
1127 read_unlock_irqrestore(&ni->size_lock, flags);
1128 while (wait_bh > wait) {
1129 bh = *--wait_bh;
1130 wait_on_buffer(bh);
1131 if (likely(buffer_uptodate(bh))) {
1132 page = bh->b_page;
1133 bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
1134 bh_offset(bh);
1135 /*
1136 * If the buffer overflows the initialized size, need
1137 * to zero the overflowing region.
1138 */
1139 if (unlikely(bh_pos + blocksize > initialized_size)) {
1140 u8 *kaddr;
1141 int ofs = 0;
1142
1143 if (likely(bh_pos < initialized_size))
1144 ofs = initialized_size - bh_pos;
1145 kaddr = kmap_atomic(page, KM_USER0);
1146 memset(kaddr + bh_offset(bh) + ofs, 0,
1147 blocksize - ofs);
1148 kunmap_atomic(kaddr, KM_USER0);
1149 flush_dcache_page(page);
1150 }
1151 } else /* if (unlikely(!buffer_uptodate(bh))) */
1152 err = -EIO;
1153 }
1154 if (likely(!err)) {
1155 /* Clear buffer_new on all buffers. */
1156 u = 0;
1157 do {
1158 bh = head = page_buffers(pages[u]);
1159 do {
1160 if (buffer_new(bh))
1161 clear_buffer_new(bh);
1162 } while ((bh = bh->b_this_page) != head);
1163 } while (++u < nr_pages);
1164 ntfs_debug("Done.");
1165 return err;
1166 }
1167 if (status.attr_switched) {
1168 /* Get back to the attribute extent we modified. */
1169 ntfs_attr_reinit_search_ctx(ctx);
1170 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1171 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1172 ntfs_error(vol->sb, "Failed to find required "
1173 "attribute extent of attribute in "
1174 "error code path. Run chkdsk to "
1175 "recover.");
1176 write_lock_irqsave(&ni->size_lock, flags);
1177 ni->itype.compressed.size += vol->cluster_size;
1178 write_unlock_irqrestore(&ni->size_lock, flags);
1179 flush_dcache_mft_record_page(ctx->ntfs_ino);
1180 mark_mft_record_dirty(ctx->ntfs_ino);
1181 /*
1182 * The only thing that is now wrong is the compressed
1183 * size of the base attribute extent which chkdsk
1184 * should be able to fix.
1185 */
1186 NVolSetErrors(vol);
1187 } else {
1188 m = ctx->mrec;
1189 a = ctx->attr;
1190 status.attr_switched = 0;
1191 }
1192 }
1193 /*
1194 * If the runlist has been modified, need to restore it by punching a
1195 * hole into it and we then need to deallocate the on-disk cluster as
1196 * well. Note, we only modify the runlist if we are able to generate a
1197 * new mapping pairs array, i.e. only when the mapped attribute extent
1198 * is not switched.
1199 */
1200 if (status.runlist_merged && !status.attr_switched) {
1201 BUG_ON(!rl_write_locked);
1202 /* Make the file cluster we allocated sparse in the runlist. */
1203 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1204 ntfs_error(vol->sb, "Failed to punch hole into "
1205 "attribute runlist in error code "
1206 "path. Run chkdsk to recover the "
1207 "lost cluster.");
1208 make_bad_inode(vi);
1209 make_bad_inode(VFS_I(base_ni));
1210 NVolSetErrors(vol);
1211 } else /* if (success) */ {
1212 status.runlist_merged = 0;
1213 /*
1214 * Deallocate the on-disk cluster we allocated but only
1215 * if we succeeded in punching its vcn out of the
1216 * runlist.
1217 */
1218 down_write(&vol->lcnbmp_lock);
1219 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1220 ntfs_error(vol->sb, "Failed to release "
1221 "allocated cluster in error "
1222 "code path. Run chkdsk to "
1223 "recover the lost cluster.");
1224 NVolSetErrors(vol);
1225 }
1226 up_write(&vol->lcnbmp_lock);
1227 }
1228 }
1229 /*
1230 * Resize the attribute record to its old size and rebuild the mapping
1231 * pairs array. Note, we only can do this if the runlist has been
1232 * restored to its old state which also implies that the mapped
1233 * attribute extent is not switched.
1234 */
1235 if (status.mp_rebuilt && !status.runlist_merged) {
1236 if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1237 ntfs_error(vol->sb, "Failed to restore attribute "
1238 "record in error code path. Run "
1239 "chkdsk to recover.");
1240 make_bad_inode(vi);
1241 make_bad_inode(VFS_I(base_ni));
1242 NVolSetErrors(vol);
1243 } else /* if (success) */ {
1244 if (ntfs_mapping_pairs_build(vol, (u8*)a +
1245 le16_to_cpu(a->data.non_resident.
1246 mapping_pairs_offset), attr_rec_len -
1247 le16_to_cpu(a->data.non_resident.
1248 mapping_pairs_offset), ni->runlist.rl,
1249 vcn, highest_vcn, NULL)) {
1250 ntfs_error(vol->sb, "Failed to restore "
1251 "mapping pairs array in error "
1252 "code path. Run chkdsk to "
1253 "recover.");
1254 make_bad_inode(vi);
1255 make_bad_inode(VFS_I(base_ni));
1256 NVolSetErrors(vol);
1257 }
1258 flush_dcache_mft_record_page(ctx->ntfs_ino);
1259 mark_mft_record_dirty(ctx->ntfs_ino);
1260 }
1261 }
1262 /* Release the mft record and the attribute. */
1263 if (status.mft_attr_mapped) {
1264 ntfs_attr_put_search_ctx(ctx);
1265 unmap_mft_record(base_ni);
1266 }
1267 /* Release the runlist lock. */
1268 if (rl_write_locked)
1269 up_write(&ni->runlist.lock);
1270 else if (rl)
1271 up_read(&ni->runlist.lock);
1272 /*
1273 * Zero out any newly allocated blocks to avoid exposing stale data.
1274 * If BH_New is set, we know that the block was newly allocated above
1275 * and that it has not been fully zeroed and marked dirty yet.
1276 */
1277 nr_pages = u;
1278 u = 0;
1279 end = bh_cpos << vol->cluster_size_bits;
1280 do {
1281 page = pages[u];
1282 bh = head = page_buffers(page);
1283 do {
1284 if (u == nr_pages &&
1285 ((s64)page->index << PAGE_CACHE_SHIFT) +
1286 bh_offset(bh) >= end)
1287 break;
1288 if (!buffer_new(bh))
1289 continue;
1290 clear_buffer_new(bh);
1291 if (!buffer_uptodate(bh)) {
1292 if (PageUptodate(page))
1293 set_buffer_uptodate(bh);
1294 else {
1295 u8 *kaddr = kmap_atomic(page, KM_USER0);
1296 memset(kaddr + bh_offset(bh), 0,
1297 blocksize);
1298 kunmap_atomic(kaddr, KM_USER0);
1299 flush_dcache_page(page);
1300 set_buffer_uptodate(bh);
1301 }
1302 }
1303 mark_buffer_dirty(bh);
1304 } while ((bh = bh->b_this_page) != head);
1305 } while (++u <= nr_pages);
1306 ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
1307 return err;
1308}
1309
1310/*
1311 * Copy as much as we can into the pages and return the number of bytes which
1312 * were successfully copied. If a fault is encountered then clear the pages
1313 * out to (ofs + bytes) and return the number of bytes which were copied.
1314 */
1315static inline size_t ntfs_copy_from_user(struct page **pages,
1316 unsigned nr_pages, unsigned ofs, const char __user *buf,
1317 size_t bytes)
1318{
1319 struct page **last_page = pages + nr_pages;
1320 char *kaddr;
1321 size_t total = 0;
1322 unsigned len;
1323 int left;
1324
1325 do {
1326 len = PAGE_CACHE_SIZE - ofs;
1327 if (len > bytes)
1328 len = bytes;
1329 kaddr = kmap_atomic(*pages, KM_USER0);
1330 left = __copy_from_user_inatomic(kaddr + ofs, buf, len);
1331 kunmap_atomic(kaddr, KM_USER0);
1332 if (unlikely(left)) {
1333 /* Do it the slow way. */
1334 kaddr = kmap(*pages);
1335 left = __copy_from_user(kaddr + ofs, buf, len);
1336 kunmap(*pages);
1337 if (unlikely(left))
1338 goto err_out;
1339 }
1340 total += len;
1341 bytes -= len;
1342 if (!bytes)
1343 break;
1344 buf += len;
1345 ofs = 0;
1346 } while (++pages < last_page);
1347out:
1348 return total;
1349err_out:
1350 total += len - left;
1351 /* Zero the rest of the target like __copy_from_user(). */
1352 while (++pages < last_page) {
1353 bytes -= len;
1354 if (!bytes)
1355 break;
1356 len = PAGE_CACHE_SIZE;
1357 if (len > bytes)
1358 len = bytes;
1359 kaddr = kmap_atomic(*pages, KM_USER0);
1360 memset(kaddr, 0, len);
1361 kunmap_atomic(kaddr, KM_USER0);
1362 }
1363 goto out;
1364}
1365
1366static size_t __ntfs_copy_from_user_iovec(char *vaddr,
1367 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1368{
1369 size_t total = 0;
1370
1371 while (1) {
1372 const char __user *buf = iov->iov_base + iov_ofs;
1373 unsigned len;
1374 size_t left;
1375
1376 len = iov->iov_len - iov_ofs;
1377 if (len > bytes)
1378 len = bytes;
1379 left = __copy_from_user_inatomic(vaddr, buf, len);
1380 total += len;
1381 bytes -= len;
1382 vaddr += len;
1383 if (unlikely(left)) {
1384 /*
1385 * Zero the rest of the target like __copy_from_user().
1386 */
1387 memset(vaddr, 0, bytes);
1388 total -= left;
1389 break;
1390 }
1391 if (!bytes)
1392 break;
1393 iov++;
1394 iov_ofs = 0;
1395 }
1396 return total;
1397}
1398
1399static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1400 size_t *iov_ofsp, size_t bytes)
1401{
1402 const struct iovec *iov = *iovp;
1403 size_t iov_ofs = *iov_ofsp;
1404
1405 while (bytes) {
1406 unsigned len;
1407
1408 len = iov->iov_len - iov_ofs;
1409 if (len > bytes)
1410 len = bytes;
1411 bytes -= len;
1412 iov_ofs += len;
1413 if (iov->iov_len == iov_ofs) {
1414 iov++;
1415 iov_ofs = 0;
1416 }
1417 }
1418 *iovp = iov;
1419 *iov_ofsp = iov_ofs;
1420}
1421
1422/*
1423 * This has the same side-effects and return value as ntfs_copy_from_user().
1424 * The difference is that on a fault we need to memset the remainder of the
1425 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1426 * single-segment behaviour.
1427 *
1428 * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
1429 * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls
1430 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1431 * fact, the only difference between __copy_from_user_inatomic() and
1432 * __copy_from_user() is that the latter calls might_sleep(). And on many
1433 * architectures __copy_from_user_inatomic() is just defined to
1434 * __copy_from_user() so it makes no difference at all on those architectures.
1435 */
1436static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1437 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1438 size_t *iov_ofs, size_t bytes)
1439{
1440 struct page **last_page = pages + nr_pages;
1441 char *kaddr;
1442 size_t copied, len, total = 0;
1443
1444 do {
1445 len = PAGE_CACHE_SIZE - ofs;
1446 if (len > bytes)
1447 len = bytes;
1448 kaddr = kmap_atomic(*pages, KM_USER0);
1449 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1450 *iov, *iov_ofs, len);
1451 kunmap_atomic(kaddr, KM_USER0);
1452 if (unlikely(copied != len)) {
1453 /* Do it the slow way. */
1454 kaddr = kmap(*pages);
1455 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1456 *iov, *iov_ofs, len);
1457 kunmap(*pages);
1458 if (unlikely(copied != len))
1459 goto err_out;
1460 }
1461 total += len;
1462 bytes -= len;
1463 if (!bytes)
1464 break;
1465 ntfs_set_next_iovec(iov, iov_ofs, len);
1466 ofs = 0;
1467 } while (++pages < last_page);
1468out:
1469 return total;
1470err_out:
1471 total += copied;
1472 /* Zero the rest of the target like __copy_from_user(). */
1473 while (++pages < last_page) {
1474 bytes -= len;
1475 if (!bytes)
1476 break;
1477 len = PAGE_CACHE_SIZE;
1478 if (len > bytes)
1479 len = bytes;
1480 kaddr = kmap_atomic(*pages, KM_USER0);
1481 memset(kaddr, 0, len);
1482 kunmap_atomic(kaddr, KM_USER0);
1483 }
1484 goto out;
1485}
1486
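For context, this iovec path is what a multi-segment writev(2) from userspace ends up exercising; a minimal illustration (the file path is assumed):

#include <sys/uio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char a[] = "hello ", b[] = "world\n";
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) - 1 },
		{ .iov_base = b, .iov_len = sizeof(b) - 1 },
	};
	/* Assumed path; any file on an ntfs mount will do. */
	int fd = open("/mnt/ntfs/example.txt", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	/* Two segments, so the kernel sees nr_segs == 2. */
	if (writev(fd, iov, 2) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
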
1487static inline void ntfs_flush_dcache_pages(struct page **pages,
1488 unsigned nr_pages)
1489{
1490 BUG_ON(!nr_pages);
1491 do {
1492 /*
1493 * Warning: Do not do the decrement at the same time as the
1494 * call because flush_dcache_page() is a NULL macro on i386
1495 * and hence the decrement never happens.
1496 */
1497 		flush_dcache_page(pages[nr_pages - 1]);
1498 } while (--nr_pages > 0);
1499}
1500
1501/**
1502 * ntfs_commit_pages_after_non_resident_write - commit the received data
1503 * @pages: array of destination pages
1504 * @nr_pages: number of pages in @pages
1505 * @pos: byte position in file at which the write begins
1506 * @bytes: number of bytes to be written
1507 *
1508 * See description of ntfs_commit_pages_after_write(), below.
1509 */
1510static inline int ntfs_commit_pages_after_non_resident_write(
1511 struct page **pages, const unsigned nr_pages,
1512 s64 pos, size_t bytes)
1513{
1514 s64 end, initialized_size;
1515 struct inode *vi;
1516 ntfs_inode *ni, *base_ni;
1517 struct buffer_head *bh, *head;
1518 ntfs_attr_search_ctx *ctx;
1519 MFT_RECORD *m;
1520 ATTR_RECORD *a;
1521 unsigned long flags;
1522 unsigned blocksize, u;
1523 int err;
1524
1525 vi = pages[0]->mapping->host;
1526 ni = NTFS_I(vi);
1527 blocksize = 1 << vi->i_blkbits;
1528 end = pos + bytes;
1529 u = 0;
1530 do {
1531 s64 bh_pos;
1532 struct page *page;
1533 BOOL partial;
1534
1535 page = pages[u];
1536 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1537 bh = head = page_buffers(page);
1538 partial = FALSE;
1539 do {
1540 s64 bh_end;
1541
1542 bh_end = bh_pos + blocksize;
1543 if (bh_end <= pos || bh_pos >= end) {
1544 if (!buffer_uptodate(bh))
1545 partial = TRUE;
1546 } else {
1547 set_buffer_uptodate(bh);
1548 mark_buffer_dirty(bh);
1549 }
1550 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1551 /*
1552 * If all buffers are now uptodate but the page is not, set the
1553 * page uptodate.
1554 */
1555 if (!partial && !PageUptodate(page))
1556 SetPageUptodate(page);
1557 } while (++u < nr_pages);
1558 /*
1559 * Finally, if we do not need to update initialized_size or i_size we
1560 * are finished.
1561 */
1562 read_lock_irqsave(&ni->size_lock, flags);
1563 initialized_size = ni->initialized_size;
1564 read_unlock_irqrestore(&ni->size_lock, flags);
1565 if (end <= initialized_size) {
1566 ntfs_debug("Done.");
1567 return 0;
1568 }
1569 /*
1570 * Update initialized_size/i_size as appropriate, both in the inode and
1571 * the mft record.
1572 */
1573 if (!NInoAttr(ni))
1574 base_ni = ni;
1575 else
1576 base_ni = ni->ext.base_ntfs_ino;
1577 /* Map, pin, and lock the mft record. */
1578 m = map_mft_record(base_ni);
1579 if (IS_ERR(m)) {
1580 err = PTR_ERR(m);
1581 m = NULL;
1582 ctx = NULL;
1583 goto err_out;
1584 }
1585 BUG_ON(!NInoNonResident(ni));
1586 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1587 if (unlikely(!ctx)) {
1588 err = -ENOMEM;
1589 goto err_out;
1590 }
1591 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1592 CASE_SENSITIVE, 0, NULL, 0, ctx);
1593 if (unlikely(err)) {
1594 if (err == -ENOENT)
1595 err = -EIO;
1596 goto err_out;
1597 }
1598 a = ctx->attr;
1599 BUG_ON(!a->non_resident);
1600 write_lock_irqsave(&ni->size_lock, flags);
1601 BUG_ON(end > ni->allocated_size);
1602 ni->initialized_size = end;
1603 a->data.non_resident.initialized_size = cpu_to_sle64(end);
1604 if (end > i_size_read(vi)) {
1605 i_size_write(vi, end);
1606 a->data.non_resident.data_size =
1607 a->data.non_resident.initialized_size;
1608 }
1609 write_unlock_irqrestore(&ni->size_lock, flags);
1610 /* Mark the mft record dirty, so it gets written back. */
1611 flush_dcache_mft_record_page(ctx->ntfs_ino);
1612 mark_mft_record_dirty(ctx->ntfs_ino);
1613 ntfs_attr_put_search_ctx(ctx);
1614 unmap_mft_record(base_ni);
1615 ntfs_debug("Done.");
1616 return 0;
1617err_out:
1618 if (ctx)
1619 ntfs_attr_put_search_ctx(ctx);
1620 if (m)
1621 unmap_mft_record(base_ni);
1622 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1623 "code %i).", err);
1624 if (err != -ENOMEM) {
1625 NVolSetErrors(ni->vol);
1626 make_bad_inode(VFS_I(base_ni));
1627 make_bad_inode(vi);
1628 }
1629 return err;
1630}
1631
1632/**
1633 * ntfs_commit_pages_after_write - commit the received data
1634 * @pages: array of destination pages
1635 * @nr_pages: number of pages in @pages
1636 * @pos: byte position in file at which the write begins
1637 * @bytes: number of bytes to be written
1638 *
1639 * This is called from ntfs_file_buffered_write() with i_sem held on the inode
1640 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
1641 * locked but not kmap()ped. The source data has already been copied into the
1642 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
1643 * the data was copied (for non-resident attributes only) and it returned
1644 * success.
1645 *
1646 * Need to set uptodate and mark dirty all buffers within the boundary of the
1647 * write. If all buffers in a page are uptodate we set the page uptodate, too.
1648 *
1649 * Setting the buffers dirty ensures that they get written out later when
1650 * ntfs_writepage() is invoked by the VM.
1651 *
1652 * Finally, we need to update i_size and initialized_size as appropriate both
1653 * in the inode and the mft record.
1654 *
1655 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1656 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1657 * page are uptodate, and updates i_size if the end of io is beyond i_size. In
1658 * that case, it also marks the inode dirty.
1659 *
1660 * If things have gone as outlined in
1661 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1662 * content modifications here for non-resident attributes. For resident
1663 * attributes we need to do the uptodate bringing here which we combine with
1664 * the copying into the mft record which means we save one atomic kmap.
1665 *
1666 * Return 0 on success or -errno on error.
1667 */
1668static int ntfs_commit_pages_after_write(struct page **pages,
1669 const unsigned nr_pages, s64 pos, size_t bytes)
1670{
1671 s64 end, initialized_size;
1672 loff_t i_size;
1673 struct inode *vi;
1674 ntfs_inode *ni, *base_ni;
1675 struct page *page;
1676 ntfs_attr_search_ctx *ctx;
1677 MFT_RECORD *m;
1678 ATTR_RECORD *a;
1679 char *kattr, *kaddr;
1680 unsigned long flags;
1681 u32 attr_len;
1682 int err;
1683
1684 BUG_ON(!nr_pages);
1685 BUG_ON(!pages);
1686 page = pages[0];
1687 BUG_ON(!page);
1688 vi = page->mapping->host;
1689 ni = NTFS_I(vi);
1690 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1691 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
1692 vi->i_ino, ni->type, page->index, nr_pages,
1693 (long long)pos, bytes);
1694 if (NInoNonResident(ni))
1695 return ntfs_commit_pages_after_non_resident_write(pages,
1696 nr_pages, pos, bytes);
1697 BUG_ON(nr_pages > 1);
1698 /*
1699 * Attribute is resident, implying it is not compressed, encrypted, or
1700 * sparse.
1701 */
1702 if (!NInoAttr(ni))
1703 base_ni = ni;
1704 else
1705 base_ni = ni->ext.base_ntfs_ino;
1706 BUG_ON(NInoNonResident(ni));
1707 /* Map, pin, and lock the mft record. */
1708 m = map_mft_record(base_ni);
1709 if (IS_ERR(m)) {
1710 err = PTR_ERR(m);
1711 m = NULL;
1712 ctx = NULL;
1713 goto err_out;
1714 }
1715 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1716 if (unlikely(!ctx)) {
1717 err = -ENOMEM;
1718 goto err_out;
1719 }
1720 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1721 CASE_SENSITIVE, 0, NULL, 0, ctx);
1722 if (unlikely(err)) {
1723 if (err == -ENOENT)
1724 err = -EIO;
1725 goto err_out;
1726 }
1727 a = ctx->attr;
1728 BUG_ON(a->non_resident);
1729 /* The total length of the attribute value. */
1730 attr_len = le32_to_cpu(a->data.resident.value_length);
1731 i_size = i_size_read(vi);
1732 BUG_ON(attr_len != i_size);
1733 BUG_ON(pos > attr_len);
1734 end = pos + bytes;
1735 BUG_ON(end > le32_to_cpu(a->length) -
1736 le16_to_cpu(a->data.resident.value_offset));
1737 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1738 kaddr = kmap_atomic(page, KM_USER0);
1739 /* Copy the received data from the page to the mft record. */
1740 memcpy(kattr + pos, kaddr + pos, bytes);
1741 /* Update the attribute length if necessary. */
1742 if (end > attr_len) {
1743 attr_len = end;
1744 a->data.resident.value_length = cpu_to_le32(attr_len);
1745 }
1746 /*
1747 * If the page is not uptodate, bring the out of bounds area(s)
1748 * uptodate by copying data from the mft record to the page.
1749 */
1750 if (!PageUptodate(page)) {
1751 if (pos > 0)
1752 memcpy(kaddr, kattr, pos);
1753 if (end < attr_len)
1754 memcpy(kaddr + end, kattr + end, attr_len - end);
1755 /* Zero the region outside the end of the attribute value. */
1756 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1757 flush_dcache_page(page);
1758 SetPageUptodate(page);
1759 }
1760 kunmap_atomic(kaddr, KM_USER0);
1761 /* Update initialized_size/i_size if necessary. */
1762 read_lock_irqsave(&ni->size_lock, flags);
1763 initialized_size = ni->initialized_size;
1764 BUG_ON(end > ni->allocated_size);
1765 read_unlock_irqrestore(&ni->size_lock, flags);
1766 BUG_ON(initialized_size != i_size);
1767 if (end > initialized_size) {
1768 unsigned long flags;
1769
1770 write_lock_irqsave(&ni->size_lock, flags);
1771 ni->initialized_size = end;
1772 i_size_write(vi, end);
1773 write_unlock_irqrestore(&ni->size_lock, flags);
1774 }
1775 /* Mark the mft record dirty, so it gets written back. */
1776 flush_dcache_mft_record_page(ctx->ntfs_ino);
1777 mark_mft_record_dirty(ctx->ntfs_ino);
1778 ntfs_attr_put_search_ctx(ctx);
1779 unmap_mft_record(base_ni);
1780 ntfs_debug("Done.");
1781 return 0;
1782err_out:
1783 if (err == -ENOMEM) {
1784 ntfs_warning(vi->i_sb, "Error allocating memory required to "
1785 "commit the write.");
1786 if (PageUptodate(page)) {
1787 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1788 "dirty so the write will be retried "
1789 "later on by the VM.");
1790 /*
1791 * Put the page on mapping->dirty_pages, but leave its
1792 * buffers' dirty state as-is.
1793 */
1794 __set_page_dirty_nobuffers(page);
1795 err = 0;
1796 } else
1797 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
1798 "data has been lost.");
1799 } else {
1800 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1801 "with error %i.", err);
1802 NVolSetErrors(ni->vol);
1803 make_bad_inode(VFS_I(base_ni));
1804 make_bad_inode(vi);
1805 }
1806 if (ctx)
1807 ntfs_attr_put_search_ctx(ctx);
1808 if (m)
1809 unmap_mft_record(base_ni);
1810 return err;
1811}
1812
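For reference, the resident path of ntfs_commit_pages_after_write() above reduces to plain memory copies between the page and the attribute value held inside the mft record. The userspace sketch below illustrates only that bookkeeping; struct fake_resident_attr, commit_resident_write() and DEMO_PAGE_SIZE are hypothetical stand-ins, not the kernel structures, and locking, mft mapping and error handling are omitted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096

/* Hypothetical stand-in for a resident attribute value in an mft record. */
struct fake_resident_attr {
	uint8_t value[DEMO_PAGE_SIZE];	/* attribute value buffer */
	uint32_t value_length;		/* current length of the value */
};

/*
 * Commit @bytes at offset @pos from @page into @attr, then make the whole
 * @page consistent with the attribute value.  @pos + @bytes must not exceed
 * DEMO_PAGE_SIZE.
 */
static void commit_resident_write(struct fake_resident_attr *attr,
		uint8_t *page, int page_uptodate, size_t pos, size_t bytes)
{
	size_t end = pos + bytes;

	/* Copy the newly written data from the page into the record. */
	memcpy(attr->value + pos, page + pos, bytes);
	/* Grow the attribute value if the write extended it. */
	if (end > attr->value_length)
		attr->value_length = end;
	/*
	 * If the page was not uptodate, fill the regions outside the write
	 * from the record and zero everything beyond the end of the value.
	 */
	if (!page_uptodate) {
		if (pos > 0)
			memcpy(page, attr->value, pos);
		if (end < attr->value_length)
			memcpy(page + end, attr->value + end,
					attr->value_length - end);
		memset(page + attr->value_length, 0,
				DEMO_PAGE_SIZE - attr->value_length);
	}
}

int main(void)
{
	static struct fake_resident_attr attr = { .value_length = 100 };
	static uint8_t page[DEMO_PAGE_SIZE];

	memcpy(page + 50, "hello", 5);
	commit_resident_write(&attr, page, 0, 50, 5);
	printf("value_length is now %u\n", (unsigned)attr.value_length);
	return 0;
}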
1813/**
1814 * ntfs_file_buffered_write - write data to an ntfs file via the page cache
1815 *
1816 * Locking: The vfs is holding ->i_sem on the inode.
1817 */
1818static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1819 const struct iovec *iov, unsigned long nr_segs,
1820 loff_t pos, loff_t *ppos, size_t count)
1821{
1822 struct file *file = iocb->ki_filp;
1823 struct address_space *mapping = file->f_mapping;
1824 struct inode *vi = mapping->host;
1825 ntfs_inode *ni = NTFS_I(vi);
1826 ntfs_volume *vol = ni->vol;
1827 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1828 struct page *cached_page = NULL;
1829 char __user *buf = NULL;
1830 s64 end, ll;
1831 VCN last_vcn;
1832 LCN lcn;
1833 unsigned long flags;
1834 size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
1835 ssize_t status, written;
1836 unsigned nr_pages;
1837 int err;
1838 struct pagevec lru_pvec;
1839
1840 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1841 "pos 0x%llx, count 0x%lx.",
1842 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
1843 (unsigned long long)pos, (unsigned long)count);
1844 if (unlikely(!count))
1845 return 0;
1846 BUG_ON(NInoMstProtected(ni));
1847 /*
1848 * If the attribute is not an index root and it is encrypted or
1849 * compressed, we cannot write to it yet. Note we need to check for
1850 * AT_INDEX_ALLOCATION since this is the type of both directory and
1851 * index inodes.
1852 */
1853 if (ni->type != AT_INDEX_ALLOCATION) {
1854 /* If file is encrypted, deny access, just like NT4. */
1855 if (NInoEncrypted(ni)) {
1856 /*
1857 * Reminder for later: Encrypted files are _always_
1858 * non-resident so that the content can always be
1859 * encrypted.
1860 */
1861 ntfs_debug("Denying write access to encrypted file.");
1862 return -EACCES;
1863 }
1864 if (NInoCompressed(ni)) {
1865 /* Only unnamed $DATA attribute can be compressed. */
1866 BUG_ON(ni->type != AT_DATA);
1867 BUG_ON(ni->name_len);
1868 /*
1869 * Reminder for later: If resident, the data is not
1870 * actually compressed. Only on the switch to non-
1871 * resident does compression kick in. This is in
1872 * contrast to encrypted files (see above).
1873 */
1874 ntfs_error(vi->i_sb, "Writing to compressed files is "
1875 "not implemented yet. Sorry.");
1876 return -EOPNOTSUPP;
1877 }
1878 }
1879 /*
1880 * If a previous ntfs_truncate() failed, repeat it and abort if it
1881 * fails again.
1882 */
1883 if (unlikely(NInoTruncateFailed(ni))) {
1884 down_write(&vi->i_alloc_sem);
1885 err = ntfs_truncate(vi);
1886 up_write(&vi->i_alloc_sem);
1887 if (err || NInoTruncateFailed(ni)) {
1888 if (!err)
1889 err = -EIO;
1890 ntfs_error(vol->sb, "Cannot perform write to inode "
1891 "0x%lx, attribute type 0x%x, because "
1892 "ntfs_truncate() failed (error code "
1893 "%i).", vi->i_ino,
1894 (unsigned)le32_to_cpu(ni->type), err);
1895 return err;
1896 }
1897 }
1898 /* The first byte after the write. */
1899 end = pos + count;
1900 /*
1901 * If the write goes beyond the allocated size, extend the allocation
1902 * to cover the whole of the write, rounded up to the nearest cluster.
1903 */
1904 read_lock_irqsave(&ni->size_lock, flags);
1905 ll = ni->allocated_size;
1906 read_unlock_irqrestore(&ni->size_lock, flags);
1907 if (end > ll) {
1908 /* Extend the allocation without changing the data size. */
1909 ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1910 if (likely(ll >= 0)) {
1911 BUG_ON(pos >= ll);
1912 /* If the extension was partial, truncate the write. */
1913 if (end > ll) {
1914 ntfs_debug("Truncating write to inode 0x%lx, "
1915 "attribute type 0x%x, because "
1916 "the allocation was only "
1917 "partially extended.",
1918 vi->i_ino, (unsigned)
1919 le32_to_cpu(ni->type));
1920 end = ll;
1921 count = ll - pos;
1922 }
1923 } else {
1924 err = ll;
1925 read_lock_irqsave(&ni->size_lock, flags);
1926 ll = ni->allocated_size;
1927 read_unlock_irqrestore(&ni->size_lock, flags);
1928 /* Perform a partial write if possible or fail. */
1929 if (pos < ll) {
1930 ntfs_debug("Truncating write to inode 0x%lx, "
1931 "attribute type 0x%x, because "
1932 "extending the allocation "
1933 "failed (error code %i).",
1934 vi->i_ino, (unsigned)
1935 le32_to_cpu(ni->type), err);
1936 end = ll;
1937 count = ll - pos;
1938 } else {
1939 ntfs_error(vol->sb, "Cannot perform write to "
1940 "inode 0x%lx, attribute type "
1941 "0x%x, because extending the "
1942 "allocation failed (error "
1943 "code %i).", vi->i_ino,
1944 (unsigned)
1945 le32_to_cpu(ni->type), err);
1946 return err;
1947 }
1948 }
1949 }
1950 pagevec_init(&lru_pvec, 0);
1951 written = 0;
1952 /*
1953 * If the write starts beyond the initialized size, extend it up to the
1954 * beginning of the write and initialize all non-sparse space between
1955 * the old initialized size and the new one. This automatically also
1956 * increments the vfs inode->i_size to keep it above or equal to the
1957 * initialized_size.
1958 */
1959 read_lock_irqsave(&ni->size_lock, flags);
1960 ll = ni->initialized_size;
1961 read_unlock_irqrestore(&ni->size_lock, flags);
1962 if (pos > ll) {
1963 err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
1964 &lru_pvec);
1965 if (err < 0) {
1966 ntfs_error(vol->sb, "Cannot perform write to inode "
1967 "0x%lx, attribute type 0x%x, because "
1968 "extending the initialized size "
1969 "failed (error code %i).", vi->i_ino,
1970 (unsigned)le32_to_cpu(ni->type), err);
1971 status = err;
1972 goto err_out;
1973 }
1974 }
1975 /*
1976 * Determine the number of pages per cluster for non-resident
1977 * attributes.
1978 */
1979 nr_pages = 1;
1980 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1981 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
1982 /* Finally, perform the actual write. */
1983 last_vcn = -1;
1984 if (likely(nr_segs == 1))
1985 buf = iov->iov_base;
1986 do {
1987 VCN vcn;
1988 pgoff_t idx, start_idx;
1989 unsigned ofs, do_pages, u;
1990 size_t copied;
1991
1992 start_idx = idx = pos >> PAGE_CACHE_SHIFT;
1993 ofs = pos & ~PAGE_CACHE_MASK;
1994 bytes = PAGE_CACHE_SIZE - ofs;
1995 do_pages = 1;
1996 if (nr_pages > 1) {
1997 vcn = pos >> vol->cluster_size_bits;
1998 if (vcn != last_vcn) {
1999 last_vcn = vcn;
2000 /*
2001 * Get the lcn of the vcn the write is in. If
2002 * it is a hole, we need to lock down all pages in
2003 * the cluster.
2004 */
2005 down_read(&ni->runlist.lock);
2006 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
2007 vol->cluster_size_bits, FALSE);
2008 up_read(&ni->runlist.lock);
2009 if (unlikely(lcn < LCN_HOLE)) {
2010 status = -EIO;
2011 if (lcn == LCN_ENOMEM)
2012 status = -ENOMEM;
2013 else
2014 ntfs_error(vol->sb, "Cannot "
2015 "perform write to "
2016 "inode 0x%lx, "
2017 "attribute type 0x%x, "
2018 "because the attribute "
2019 "is corrupt.",
2020 vi->i_ino, (unsigned)
2021 le32_to_cpu(ni->type));
2022 break;
2023 }
2024 if (lcn == LCN_HOLE) {
2025 start_idx = (pos & ~(s64)
2026 vol->cluster_size_mask)
2027 >> PAGE_CACHE_SHIFT;
2028 bytes = vol->cluster_size - (pos &
2029 vol->cluster_size_mask);
2030 do_pages = nr_pages;
2031 }
2032 }
2033 }
2034 if (bytes > count)
2035 bytes = count;
2036 /*
2037 * Bring in the user page(s) that we will copy from _first_.
2038 * Otherwise there is a nasty deadlock on copying from the same
2039 * page(s) as we are writing to, without it/them being marked
2040 * up-to-date. Note, at present there is nothing to stop the
2041 * pages being swapped out between us bringing them into memory
2042 * and doing the actual copying.
2043 */
2044 if (likely(nr_segs == 1))
2045 ntfs_fault_in_pages_readable(buf, bytes);
2046 else
2047 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2048 /* Get and lock @do_pages starting at index @start_idx. */
2049 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2050 pages, &cached_page, &lru_pvec);
2051 if (unlikely(status))
2052 break;
2053 /*
2054 * For non-resident attributes, we need to fill any holes with
2055 * actual clusters and ensure all buffers are mapped. We also
2056 * need to bring uptodate any buffers that are only partially
2057 * being written to.
2058 */
2059 if (NInoNonResident(ni)) {
2060 status = ntfs_prepare_pages_for_non_resident_write(
2061 pages, do_pages, pos, bytes);
2062 if (unlikely(status)) {
2063 loff_t i_size;
2064
2065 do {
2066 unlock_page(pages[--do_pages]);
2067 page_cache_release(pages[do_pages]);
2068 } while (do_pages);
2069 /*
2070 * The write preparation may have instantiated
2071 * allocated space outside i_size. Trim this
2072 * off again. We can ignore any errors in this
2073 * case as we will just be wasting a bit of
2074 * allocated space, which is not a disaster.
2075 */
2076 i_size = i_size_read(vi);
2077 if (pos + bytes > i_size)
2078 vmtruncate(vi, i_size);
2079 break;
2080 }
2081 }
2082 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2083 if (likely(nr_segs == 1)) {
2084 copied = ntfs_copy_from_user(pages + u, do_pages - u,
2085 ofs, buf, bytes);
2086 buf += copied;
2087 } else
2088 copied = ntfs_copy_from_user_iovec(pages + u,
2089 do_pages - u, ofs, &iov, &iov_ofs,
2090 bytes);
2091 ntfs_flush_dcache_pages(pages + u, do_pages - u);
2092 status = ntfs_commit_pages_after_write(pages, do_pages, pos,
2093 bytes);
2094 if (likely(!status)) {
2095 written += copied;
2096 count -= copied;
2097 pos += copied;
2098 if (unlikely(copied != bytes))
2099 status = -EFAULT;
2100 }
2101 do {
2102 unlock_page(pages[--do_pages]);
2103 mark_page_accessed(pages[do_pages]);
2104 page_cache_release(pages[do_pages]);
2105 } while (do_pages);
2106 if (unlikely(status))
2107 break;
2108 balance_dirty_pages_ratelimited(mapping);
2109 cond_resched();
2110 } while (count);
2111err_out:
2112 *ppos = pos;
2113 if (cached_page)
2114 page_cache_release(cached_page);
2115 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2116 if (likely(!status)) {
2117 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2118 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2119 status = generic_osync_inode(vi, mapping,
2120 OSYNC_METADATA|OSYNC_DATA);
2121 }
2122 }
2123 pagevec_lru_add(&lru_pvec);
2124 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2125 written ? "written" : "status", (unsigned long)written,
2126 (long)status);
2127 return written ? written : status;
2128}
2129
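The heart of the write loop above is the per-iteration arithmetic: each pass handles at most one page (or one whole cluster when filling a hole), clamped to the page boundary and to the bytes still outstanding. The small userspace sketch below shows just that arithmetic, assuming the common 4096-byte page size and leaving out the page cache, the fault-in of the source pages and the hole handling entirely.

#include <stddef.h>
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))

/*
 * Walk a write of @count bytes at @pos the way the loop in
 * ntfs_file_buffered_write() does for the simple (non-hole) case:
 * one page at a time, each chunk limited by the page boundary and
 * by the number of bytes that are still left to write.
 */
static void walk_write(long long pos, size_t count)
{
	while (count) {
		unsigned long idx = pos >> PAGE_CACHE_SHIFT;
		unsigned ofs = pos & ~PAGE_CACHE_MASK;
		size_t bytes = PAGE_CACHE_SIZE - ofs;

		if (bytes > count)
			bytes = count;
		printf("page index %lu, offset %u, %zu bytes\n",
				idx, ofs, bytes);
		/* ...fault in the source, grab/lock the page, copy, commit... */
		pos += bytes;
		count -= bytes;
	}
}

int main(void)
{
	walk_write(4000, 10000);	/* spans four 4096-byte pages */
	return 0;
}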
2130/**
2131 * ntfs_file_aio_write_nolock - write to a file; the caller must hold i_sem
2132 */
2133static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2134 const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
2135{
2136 struct file *file = iocb->ki_filp;
2137 struct address_space *mapping = file->f_mapping;
2138 struct inode *inode = mapping->host;
2139 loff_t pos;
2140 unsigned long seg;
2141 size_t count; /* after file limit checks */
2142 ssize_t written, err;
2143
2144 count = 0;
2145 for (seg = 0; seg < nr_segs; seg++) {
2146 const struct iovec *iv = &iov[seg];
2147 /*
2148 * If any segment has a negative length, or the cumulative
2149 * length ever wraps negative, then return -EINVAL.
2150 */
2151 count += iv->iov_len;
2152 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
2153 return -EINVAL;
2154 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2155 continue;
2156 if (!seg)
2157 return -EFAULT;
2158 nr_segs = seg;
2159 count -= iv->iov_len; /* This segment is no good */
2160 break;
2161 }
2162 pos = *ppos;
2163 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2164 /* We can write back this queue in page reclaim. */
2165 current->backing_dev_info = mapping->backing_dev_info;
2166 written = 0;
2167 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2168 if (err)
2169 goto out;
2170 if (!count)
2171 goto out;
2172 err = remove_suid(file->f_dentry);
2173 if (err)
2174 goto out;
2175 inode_update_time(inode, 1);
2176 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2177 count);
2178out:
2179 current->backing_dev_info = NULL;
2180 return written ? written : err;
2181}
2182
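The segment checking at the top of ntfs_file_aio_write_nolock() is the usual writev()-style validation: accumulate the lengths, fail on negative or wrapping totals, and shorten the vector at the first unreadable segment unless it is the very first one. The userspace rendition below mirrors that check; readable_ok() is a made-up stand-in for the kernel's access_ok() and simply accepts everything.

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Stand-in for the kernel's access_ok(); assume every segment is readable. */
static int readable_ok(const void *base, size_t len)
{
	(void)base;
	(void)len;
	return 1;
}

/*
 * Validate an iovec array: reject negative or wrapping cumulative lengths
 * and shorten the vector at the first segment that cannot be read (unless
 * it is the first one, in which case the whole write fails).  Returns the
 * total byte count or a negative error.
 */
static ssize_t validate_iov(const struct iovec *iov, unsigned long *nr_segs)
{
	size_t count = 0;
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		count += iv->iov_len;
		if ((ssize_t)(count | iv->iov_len) < 0)
			return -EINVAL;
		if (readable_ok(iv->iov_base, iv->iov_len))
			continue;
		if (!seg)
			return -EFAULT;
		*nr_segs = seg;
		count -= iv->iov_len;	/* This segment is no good. */
		break;
	}
	return count;
}

int main(void)
{
	char a[8], b[16];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	unsigned long nr_segs = 2;

	printf("total count: %zd\n", validate_iov(iov, &nr_segs));
	return 0;
}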
2183/**
2184 * ntfs_file_aio_write - write to a file, taking i_sem and syncing if O_SYNC is set
2185 */
2186static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
2187 size_t count, loff_t pos)
2188{
2189 struct file *file = iocb->ki_filp;
2190 struct address_space *mapping = file->f_mapping;
2191 struct inode *inode = mapping->host;
2192 ssize_t ret;
2193 struct iovec local_iov = { .iov_base = (void __user *)buf,
2194 .iov_len = count };
2195
2196 BUG_ON(iocb->ki_pos != pos);
2197
2198 down(&inode->i_sem);
2199 ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
2200 up(&inode->i_sem);
2201 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2202 int err = sync_page_range(inode, mapping, pos, ret);
2203 if (err < 0)
2204 ret = err;
2205 }
2206 return ret;
2207}
2208
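ntfs_file_aio_write() wraps the single user buffer in a one-element iovec, runs the unlocked worker under i_sem, and only syncs the written range afterwards if O_SYNC semantics were requested. The sketch below is a rough userspace analogue of that shape only: ordinary writev() and fdatasync() stand in for the buffered write and for sync_page_range(), which is an approximation, not the kernel behaviour.

#include <fcntl.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

/*
 * Write @count bytes from @buf to @fd via a one-element iovec, then flush
 * to stable storage if the caller asked for synchronous semantics -- the
 * buffered write happens first, the sync only on success.
 */
static ssize_t write_maybe_sync(int fd, const char *buf, size_t count,
		int want_sync)
{
	struct iovec local_iov = {
		.iov_base = (void *)buf,
		.iov_len = count,
	};
	ssize_t ret = writev(fd, &local_iov, 1);

	if (ret > 0 && want_sync && fdatasync(fd) < 0)
		ret = -1;
	return ret;
}

int main(void)
{
	int fd = open("demo_sync_write.tmp", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	write_maybe_sync(fd, "hello\n", 6, 1);
	close(fd);
	return 0;
}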
2209/**
2210 * ntfs_file_writev - vectored write to a file
2211 *
2212 * Basically the same as generic_file_writev() except that it ends up calling
2213 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2214 */
2215static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2216 unsigned long nr_segs, loff_t *ppos)
2217{
2218 struct address_space *mapping = file->f_mapping;
2219 struct inode *inode = mapping->host;
2220 struct kiocb kiocb;
2221 ssize_t ret;
2222
2223 down(&inode->i_sem);
2224 init_sync_kiocb(&kiocb, file);
2225 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2226 if (ret == -EIOCBQUEUED)
2227 ret = wait_on_sync_kiocb(&kiocb);
2228 up(&inode->i_sem);
2229 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2230 int err = sync_page_range(inode, mapping, *ppos - ret, ret);
2231 if (err < 0)
2232 ret = err;
2233 }
2234 return ret;
2235}
2236
2237/**
2238 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2239 */
2240static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2241 size_t count, loff_t *ppos)
2242{
2243 struct iovec local_iov = { .iov_base = (void __user *)buf,
2244 .iov_len = count };
2245
2246 return ntfs_file_writev(file, &local_iov, 1, ppos);
2247}
2248
2249/**
59 * ntfs_file_fsync - sync a file to disk 2250 * ntfs_file_fsync - sync a file to disk
60 * @filp: file to be synced 2251 * @filp: file to be synced
61 * @dentry: dentry describing the file to sync 2252 * @dentry: dentry describing the file to sync
@@ -113,39 +2304,39 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
113#endif /* NTFS_RW */ 2304#endif /* NTFS_RW */
114 2305
115struct file_operations ntfs_file_ops = { 2306struct file_operations ntfs_file_ops = {
116 .llseek = generic_file_llseek, /* Seek inside file. */ 2307 .llseek = generic_file_llseek, /* Seek inside file. */
117 .read = generic_file_read, /* Read from file. */ 2308 .read = generic_file_read, /* Read from file. */
118 .aio_read = generic_file_aio_read, /* Async read from file. */ 2309 .aio_read = generic_file_aio_read, /* Async read from file. */
119 .readv = generic_file_readv, /* Read from file. */ 2310 .readv = generic_file_readv, /* Read from file. */
120#ifdef NTFS_RW 2311#ifdef NTFS_RW
121 .write = generic_file_write, /* Write to file. */ 2312 .write = ntfs_file_write, /* Write to file. */
122 .aio_write = generic_file_aio_write, /* Async write to file. */ 2313 .aio_write = ntfs_file_aio_write, /* Async write to file. */
123 .writev = generic_file_writev, /* Write to file. */ 2314 .writev = ntfs_file_writev, /* Write to file. */
124 /*.release = ,*/ /* Last file is closed. See 2315 /*.release = ,*/ /* Last file is closed. See
125 fs/ext2/file.c:: 2316 fs/ext2/file.c::
126 ext2_release_file() for 2317 ext2_release_file() for
127 how to use this to discard 2318 how to use this to discard
128 preallocated space for 2319 preallocated space for
129 write opened files. */ 2320 write opened files. */
130 .fsync = ntfs_file_fsync, /* Sync a file to disk. */ 2321 .fsync = ntfs_file_fsync, /* Sync a file to disk. */
131 /*.aio_fsync = ,*/ /* Sync all outstanding async 2322 /*.aio_fsync = ,*/ /* Sync all outstanding async
132 i/o operations on a 2323 i/o operations on a
133 kiocb. */ 2324 kiocb. */
134#endif /* NTFS_RW */ 2325#endif /* NTFS_RW */
135 /*.ioctl = ,*/ /* Perform function on the 2326 /*.ioctl = ,*/ /* Perform function on the
136 mounted filesystem. */ 2327 mounted filesystem. */
137 .mmap = generic_file_mmap, /* Mmap file. */ 2328 .mmap = generic_file_mmap, /* Mmap file. */
138 .open = ntfs_file_open, /* Open file. */ 2329 .open = ntfs_file_open, /* Open file. */
139 .sendfile = generic_file_sendfile, /* Zero-copy data send with 2330 .sendfile = generic_file_sendfile, /* Zero-copy data send with
140 the data source being on 2331 the data source being on
141 the ntfs partition. We 2332 the ntfs partition. We do
142 do not need to care about 2333 not need to care about the
143 the data destination. */ 2334 data destination. */
144 /*.sendpage = ,*/ /* Zero-copy data send with 2335 /*.sendpage = ,*/ /* Zero-copy data send with
145 the data destination being 2336 the data destination being
146 on the ntfs partition. We 2337 on the ntfs partition. We
147 do not need to care about 2338 do not need to care about
148 the data source. */ 2339 the data source. */
149}; 2340};
150 2341
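The ntfs_file_ops table above is the standard kernel dispatch idiom: a struct of function pointers filled in with designated initializers, with unimplemented hooks left NULL (or commented out, as here). The tiny userspace illustration below shows the same idiom with hypothetical names; struct demo_file_ops and demo_write() are invented for this example and are not part of the driver.

#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

/* Hypothetical ops table in the style of struct file_operations. */
struct demo_file_ops {
	ssize_t (*read)(const char *path, char *buf, size_t count);
	ssize_t (*write)(const char *path, const char *buf, size_t count);
	int (*fsync)(const char *path);	/* left NULL if not supported */
};

static ssize_t demo_write(const char *path, const char *buf, size_t count)
{
	(void)buf;
	printf("write %zu bytes to %s\n", count, path);
	return (ssize_t)count;
}

/* Designated initializers; hooks that are not set stay NULL. */
static const struct demo_file_ops demo_ops = {
	.write = demo_write,
};

int main(void)
{
	if (demo_ops.write)
		demo_ops.write("demo.txt", "hi", 2);
	if (!demo_ops.fsync)
		printf("fsync not implemented for this file type\n");
	return 0;
}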
151struct inode_operations ntfs_file_inode_ops = { 2342struct inode_operations ntfs_file_inode_ops = {