Diffstat (limited to 'fs/ntfs/file.c')
-rw-r--r--  fs/ntfs/file.c  2256
1 files changed, 2224 insertions, 32 deletions
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index be9fd1dd423d..727533891813 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,11 +19,24 @@
19 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 20 */
21 21
22#include <linux/pagemap.h>
23 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h>
24#include <linux/pagevec.h>
25#include <linux/sched.h>
26#include <linux/swap.h>
27#include <linux/uio.h>
28#include <linux/writeback.h>
24 29
30#include <asm/page.h>
31#include <asm/uaccess.h>
32
33#include "attrib.h"
34#include "bitmap.h"
25 35#include "inode.h"
26 36#include "debug.h"
37#include "lcnalloc.h"
38#include "malloc.h"
39#include "mft.h"
27 40#include "ntfs.h"
28 41
29 42/**
@@ -56,6 +69,2185 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
56 69#ifdef NTFS_RW
57 70
58 71/**
72 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
73 * @ni: ntfs inode of the attribute to extend
74 * @new_init_size: requested new initialized size in bytes
75 * @cached_page: store any allocated but unused page here
76 * @lru_pvec: lru-buffering pagevec of the caller
77 *
78 * Extend the initialized size of an attribute described by the ntfs inode @ni
79 * to @new_init_size bytes. This involves zeroing any non-sparse space between
80 * the old initialized size and @new_init_size both in the page cache and on
81 * disk (if relevant complete pages are already uptodate in the page cache then
82 * these are simply marked dirty).
83 *
84 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
85 * in the resident attribute case, it is tied to the initialized size and, in
86 * the non-resident attribute case, it may not fall below the initialized size.
87 *
88 * Note that if the attribute is resident, we do not need to touch the page
89 * cache at all. This is because if the page cache page is not uptodate we
90 * bring it uptodate later, when doing the write to the mft record since we
91 * then already have the page mapped. And if the page is uptodate, the
92 * non-initialized region will already have been zeroed when the page was
93 * brought uptodate and the region may in fact already have been overwritten
94 * with new data via mmap() based writes, so we cannot just zero it. And since
95 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
96 * is unspecified, we choose not to do zeroing and thus we do not need to touch
97 * the page at all. For a more detailed explanation see ntfs_truncate() in
98 * fs/ntfs/inode.c.
99 *
100 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
101 * pages.
102 *
103 * Return 0 on success and -errno on error. In the case that an error is
104 * encountered it is possible that the initialized size will already have been
105 * incremented some way towards @new_init_size but it is guaranteed that if
106 * this is the case, the necessary zeroing will also have happened and that all
107 * metadata is self-consistent.
108 *
109 * Locking: i_sem on the vfs inode corresponding to the ntfs inode @ni must be
110 * held by the caller.
111 */
112static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
113 struct page **cached_page, struct pagevec *lru_pvec)
114{
115 s64 old_init_size;
116 loff_t old_i_size;
117 pgoff_t index, end_index;
118 unsigned long flags;
119 struct inode *vi = VFS_I(ni);
120 ntfs_inode *base_ni;
121 MFT_RECORD *m = NULL;
122 ATTR_RECORD *a;
123 ntfs_attr_search_ctx *ctx = NULL;
124 struct address_space *mapping;
125 struct page *page = NULL;
126 u8 *kattr;
127 int err;
128 u32 attr_len;
129
130 read_lock_irqsave(&ni->size_lock, flags);
131 old_init_size = ni->initialized_size;
132 old_i_size = i_size_read(vi);
133 BUG_ON(new_init_size > ni->allocated_size);
134 read_unlock_irqrestore(&ni->size_lock, flags);
135 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
136 "old_initialized_size 0x%llx, "
137 "new_initialized_size 0x%llx, i_size 0x%llx.",
138 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
139 (unsigned long long)old_init_size,
140 (unsigned long long)new_init_size, old_i_size);
141 if (!NInoAttr(ni))
142 base_ni = ni;
143 else
144 base_ni = ni->ext.base_ntfs_ino;
145 /* Use goto to reduce indentation and we need the label below anyway. */
146 if (NInoNonResident(ni))
147 goto do_non_resident_extend;
148 BUG_ON(old_init_size != old_i_size);
149 m = map_mft_record(base_ni);
150 if (IS_ERR(m)) {
151 err = PTR_ERR(m);
152 m = NULL;
153 goto err_out;
154 }
155 ctx = ntfs_attr_get_search_ctx(base_ni, m);
156 if (unlikely(!ctx)) {
157 err = -ENOMEM;
158 goto err_out;
159 }
160 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
161 CASE_SENSITIVE, 0, NULL, 0, ctx);
162 if (unlikely(err)) {
163 if (err == -ENOENT)
164 err = -EIO;
165 goto err_out;
166 }
167 m = ctx->mrec;
168 a = ctx->attr;
169 BUG_ON(a->non_resident);
170 /* The total length of the attribute value. */
171 attr_len = le32_to_cpu(a->data.resident.value_length);
172 BUG_ON(old_i_size != (loff_t)attr_len);
173 /*
174 * Do the zeroing in the mft record and update the attribute size in
175 * the mft record.
176 */
177 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
178 memset(kattr + attr_len, 0, new_init_size - attr_len);
179 a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
180 /* Finally, update the sizes in the vfs and ntfs inodes. */
181 write_lock_irqsave(&ni->size_lock, flags);
182 i_size_write(vi, new_init_size);
183 ni->initialized_size = new_init_size;
184 write_unlock_irqrestore(&ni->size_lock, flags);
185 goto done;
186do_non_resident_extend:
187 /*
188 * If the new initialized size @new_init_size exceeds the current file
189 * size (vfs inode->i_size), we need to extend the file size to the
190 * new initialized size.
191 */
192 if (new_init_size > old_i_size) {
193 m = map_mft_record(base_ni);
194 if (IS_ERR(m)) {
195 err = PTR_ERR(m);
196 m = NULL;
197 goto err_out;
198 }
199 ctx = ntfs_attr_get_search_ctx(base_ni, m);
200 if (unlikely(!ctx)) {
201 err = -ENOMEM;
202 goto err_out;
203 }
204 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
205 CASE_SENSITIVE, 0, NULL, 0, ctx);
206 if (unlikely(err)) {
207 if (err == -ENOENT)
208 err = -EIO;
209 goto err_out;
210 }
211 m = ctx->mrec;
212 a = ctx->attr;
213 BUG_ON(!a->non_resident);
214 BUG_ON(old_i_size != (loff_t)
215 sle64_to_cpu(a->data.non_resident.data_size));
216 a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
217 flush_dcache_mft_record_page(ctx->ntfs_ino);
218 mark_mft_record_dirty(ctx->ntfs_ino);
219 /* Update the file size in the vfs inode. */
220 i_size_write(vi, new_init_size);
221 ntfs_attr_put_search_ctx(ctx);
222 ctx = NULL;
223 unmap_mft_record(base_ni);
224 m = NULL;
225 }
226 mapping = vi->i_mapping;
227 index = old_init_size >> PAGE_CACHE_SHIFT;
228 end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
229 do {
230 /*
231 * Read the page. If the page is not present, this will zero
232 * the uninitialized regions for us.
233 */
234 page = read_cache_page(mapping, index,
235 (filler_t*)mapping->a_ops->readpage, NULL);
236 if (IS_ERR(page)) {
237 err = PTR_ERR(page);
238 goto init_err_out;
239 }
240 wait_on_page_locked(page);
241 if (unlikely(!PageUptodate(page) || PageError(page))) {
242 page_cache_release(page);
243 err = -EIO;
244 goto init_err_out;
245 }
246 /*
247 * Update the initialized size in the ntfs inode. This is
248 * enough to make ntfs_writepage() work.
249 */
250 write_lock_irqsave(&ni->size_lock, flags);
251 ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT;
252 if (ni->initialized_size > new_init_size)
253 ni->initialized_size = new_init_size;
254 write_unlock_irqrestore(&ni->size_lock, flags);
255 /* Set the page dirty so it gets written out. */
256 set_page_dirty(page);
257 page_cache_release(page);
258 /*
259 * Play nice with the vm and the rest of the system. This is
260 * very much needed as we can potentially be modifying the
261 * initialised size from a very small value to a really huge
262 * value, e.g.
263 * f = open(somefile, O_TRUNC);
264 * truncate(f, 10GiB);
265 * seek(f, 10GiB);
266 * write(f, 1);
267 * And this would mean we would be marking dirty hundreds of
268 * thousands of pages or as in the above example more than
269 * two and a half million pages!
270 *
271 * TODO: For sparse pages could optimize this workload by using
272 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
273 * would be set in readpage for sparse pages and here we would
274 * not need to mark dirty any pages which have this bit set.
275 * The only caveat is that we have to clear the bit everywhere
276 * where we allocate any clusters that lie in the page or that
277 * contain the page.
278 *
279 * TODO: An even greater optimization would be for us to only
280 * call readpage() on pages which are not in sparse regions as
281 * determined from the runlist. This would greatly reduce the
282 * number of pages we read and make dirty in the case of sparse
283 * files.
284 */
285 balance_dirty_pages_ratelimited(mapping);
286 cond_resched();
287 } while (++index < end_index);
288 read_lock_irqsave(&ni->size_lock, flags);
289 BUG_ON(ni->initialized_size != new_init_size);
290 read_unlock_irqrestore(&ni->size_lock, flags);
291 /* Now bring in sync the initialized_size in the mft record. */
292 m = map_mft_record(base_ni);
293 if (IS_ERR(m)) {
294 err = PTR_ERR(m);
295 m = NULL;
296 goto init_err_out;
297 }
298 ctx = ntfs_attr_get_search_ctx(base_ni, m);
299 if (unlikely(!ctx)) {
300 err = -ENOMEM;
301 goto init_err_out;
302 }
303 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
304 CASE_SENSITIVE, 0, NULL, 0, ctx);
305 if (unlikely(err)) {
306 if (err == -ENOENT)
307 err = -EIO;
308 goto init_err_out;
309 }
310 m = ctx->mrec;
311 a = ctx->attr;
312 BUG_ON(!a->non_resident);
313 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
314done:
315 flush_dcache_mft_record_page(ctx->ntfs_ino);
316 mark_mft_record_dirty(ctx->ntfs_ino);
317 if (ctx)
318 ntfs_attr_put_search_ctx(ctx);
319 if (m)
320 unmap_mft_record(base_ni);
321 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
322 (unsigned long long)new_init_size, i_size_read(vi));
323 return 0;
324init_err_out:
325 write_lock_irqsave(&ni->size_lock, flags);
326 ni->initialized_size = old_init_size;
327 write_unlock_irqrestore(&ni->size_lock, flags);
328err_out:
329 if (ctx)
330 ntfs_attr_put_search_ctx(ctx);
331 if (m)
332 unmap_mft_record(base_ni);
333 ntfs_debug("Failed. Returning error code %i.", err);
334 return err;
335}
336
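A minimal sketch, not part of the patch, of how a caller such as the buffered write path might use this helper before writing beyond the initialized size; pos, err, flags, cached_page and lru_pvec are assumed to be set up by the caller:

        s64 init_size;

        read_lock_irqsave(&ni->size_lock, flags);
        init_size = ni->initialized_size;
        read_unlock_irqrestore(&ni->size_lock, flags);
        if (pos > init_size) {
                /* Zero the region between the old initialized size and pos. */
                err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
                                &lru_pvec);
                if (unlikely(err))
                        goto abort_write;       /* hypothetical error label */
        }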
337/**
338 * ntfs_fault_in_pages_readable -
339 *
340 * Fault a number of userspace pages into pagetables.
341 *
342 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
343 * with more than two userspace pages as well as handling the single page case
344 * elegantly.
345 *
346 * If you find this difficult to understand, then think of the while loop being
347 * the following code, except that we do without the integer variable ret:
348 *
349 * do {
350 * ret = __get_user(c, uaddr);
351 * uaddr += PAGE_SIZE;
352 * } while (!ret && uaddr < end);
353 *
354 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
355 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
356 * this is only a read and not a write, and since it is still in the same page,
357 * it should not matter and this makes the code much simpler.
358 */
359static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
360 int bytes)
361{
362 const char __user *end;
363 volatile char c;
364
365 /* Set @end to the first byte outside the last page we care about. */
366 end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes);
367
368 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
369 ;
370}
371
372/**
373 * ntfs_fault_in_pages_readable_iovec -
374 *
375 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
376 */
377static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
378 size_t iov_ofs, int bytes)
379{
380 do {
381 const char __user *buf;
382 unsigned len;
383
384 buf = iov->iov_base + iov_ofs;
385 len = iov->iov_len - iov_ofs;
386 if (len > bytes)
387 len = bytes;
388 ntfs_fault_in_pages_readable(buf, len);
389 bytes -= len;
390 iov++;
391 iov_ofs = 0;
392 } while (bytes);
393}
394
395/**
396 * __ntfs_grab_cache_pages - obtain a number of locked pages
397 * @mapping: address space mapping from which to obtain page cache pages
398 * @index: starting index in @mapping at which to begin obtaining pages
399 * @nr_pages: number of page cache pages to obtain
400 * @pages: array of pages in which to return the obtained page cache pages
401 * @cached_page: allocated but as yet unused page
402 * @lru_pvec: lru-buffering pagevec of caller
403 *
404 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
405 * starting at index @index.
406 *
407 * If a page is newly created, increment its refcount and add it to the
408 * caller's lru-buffering pagevec @lru_pvec.
409 *
410 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
411 * are obtained at once instead of just one page and that 0 is returned on
412 * success and -errno on error.
413 *
414 * Note, the page locks are obtained in ascending page index order.
415 */
416static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
417 pgoff_t index, const unsigned nr_pages, struct page **pages,
418 struct page **cached_page, struct pagevec *lru_pvec)
419{
420 int err, nr;
421
422 BUG_ON(!nr_pages);
423 err = nr = 0;
424 do {
425 pages[nr] = find_lock_page(mapping, index);
426 if (!pages[nr]) {
427 if (!*cached_page) {
428 *cached_page = page_cache_alloc(mapping);
429 if (unlikely(!*cached_page)) {
430 err = -ENOMEM;
431 goto err_out;
432 }
433 }
434 err = add_to_page_cache(*cached_page, mapping, index,
435 GFP_KERNEL);
436 if (unlikely(err)) {
437 if (err == -EEXIST)
438 continue;
439 goto err_out;
440 }
441 pages[nr] = *cached_page;
442 page_cache_get(*cached_page);
443 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
444 __pagevec_lru_add(lru_pvec);
445 *cached_page = NULL;
446 }
447 index++;
448 nr++;
449 } while (nr < nr_pages);
450out:
451 return err;
452err_out:
453 while (nr > 0) {
454 unlock_page(pages[--nr]);
455 page_cache_release(pages[nr]);
456 }
457 goto out;
458}
459
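A hedged usage sketch, not from the patch, showing the grab/cleanup pairing; mapping, start_idx and nr_pages are illustrative, and NTFS_MAX_PAGES_PER_CLUSTER is taken from ntfs.h:

        struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
        struct page *cached_page = NULL;
        struct pagevec lru_pvec;
        int err;

        pagevec_init(&lru_pvec, 0);
        err = __ntfs_grab_cache_pages(mapping, start_idx, nr_pages, pages,
                        &cached_page, &lru_pvec);
        if (!err) {
                /* All nr_pages pages are locked; drop them when finished. */
                while (nr_pages > 0) {
                        unlock_page(pages[--nr_pages]);
                        page_cache_release(pages[nr_pages]);
                }
        }
        if (cached_page)
                page_cache_release(cached_page);
        pagevec_lru_add(&lru_pvec);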
460static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
461{
462 lock_buffer(bh);
463 get_bh(bh);
464 bh->b_end_io = end_buffer_read_sync;
465 return submit_bh(READ, bh);
466}
467
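The submit/wait pairing used further down in this file can be sketched as follows (a fragment, not from the patch; bh and err are assumed to be declared by the caller):

        struct buffer_head *wait[2], **wait_bh = wait;

        ntfs_submit_bh_for_read(bh);    /* queue the read i/o */
        *wait_bh++ = bh;                /* remember the buffer to wait on */
        /* ... */
        while (wait_bh > wait) {
                bh = *--wait_bh;
                wait_on_buffer(bh);
                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;
        }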
468/**
469 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
470 * @pages: array of destination pages
471 * @nr_pages: number of pages in @pages
472 * @pos: byte position in file at which the write begins
473 * @bytes: number of bytes to be written
474 *
475 * This is called for non-resident attributes from ntfs_file_buffered_write()
476 * with i_sem held on the inode (@pages[0]->mapping->host). There are
477 * @nr_pages pages in @pages which are locked but not kmap()ped. The source
478 * data has not yet been copied into the @pages.
479 *
480 * Need to fill any holes with actual clusters, allocate buffers if necessary,
481 * ensure all the buffers are mapped, and bring uptodate any buffers that are
482 * only partially being written to.
483 *
484 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
485 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
486 * the same cluster and that they are the entirety of that cluster, and that
487 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
488 *
489 * i_size is not to be modified yet.
490 *
491 * Return 0 on success or -errno on error.
492 */
493static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
494 unsigned nr_pages, s64 pos, size_t bytes)
495{
496 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
497 LCN lcn;
498 s64 bh_pos, vcn_len, end, initialized_size;
499 sector_t lcn_block;
500 struct page *page;
501 struct inode *vi;
502 ntfs_inode *ni, *base_ni = NULL;
503 ntfs_volume *vol;
504 runlist_element *rl, *rl2;
505 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
506 ntfs_attr_search_ctx *ctx = NULL;
507 MFT_RECORD *m = NULL;
508 ATTR_RECORD *a = NULL;
509 unsigned long flags;
510 u32 attr_rec_len = 0;
511 unsigned blocksize, u;
512 int err, mp_size;
513 BOOL rl_write_locked, was_hole, is_retry;
514 unsigned char blocksize_bits;
515 struct {
516 u8 runlist_merged:1;
517 u8 mft_attr_mapped:1;
518 u8 mp_rebuilt:1;
519 u8 attr_switched:1;
520 } status = { 0, 0, 0, 0 };
521
522 BUG_ON(!nr_pages);
523 BUG_ON(!pages);
524 BUG_ON(!*pages);
525 vi = pages[0]->mapping->host;
526 ni = NTFS_I(vi);
527 vol = ni->vol;
528 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
529 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
530 vi->i_ino, ni->type, pages[0]->index, nr_pages,
531 (long long)pos, bytes);
532 blocksize_bits = vi->i_blkbits;
533 blocksize = 1 << blocksize_bits;
534 u = 0;
535 do {
536 struct page *page = pages[u];
537 /*
538 * create_empty_buffers() will create uptodate/dirty buffers if
539 * the page is uptodate/dirty.
540 */
541 if (!page_has_buffers(page)) {
542 create_empty_buffers(page, blocksize, 0);
543 if (unlikely(!page_has_buffers(page)))
544 return -ENOMEM;
545 }
546 } while (++u < nr_pages);
547 rl_write_locked = FALSE;
548 rl = NULL;
549 err = 0;
550 vcn = lcn = -1;
551 vcn_len = 0;
552 lcn_block = -1;
553 was_hole = FALSE;
554 cpos = pos >> vol->cluster_size_bits;
555 end = pos + bytes;
556 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
557 /*
558 * Loop over each page and for each page over each buffer. Use goto to
559 * reduce indentation.
560 */
561 u = 0;
562do_next_page:
563 page = pages[u];
564 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
565 bh = head = page_buffers(page);
566 do {
567 VCN cdelta;
568 s64 bh_end;
569 unsigned bh_cofs;
570
571 /* Clear buffer_new on all buffers to reinitialise state. */
572 if (buffer_new(bh))
573 clear_buffer_new(bh);
574 bh_end = bh_pos + blocksize;
575 bh_cpos = bh_pos >> vol->cluster_size_bits;
576 bh_cofs = bh_pos & vol->cluster_size_mask;
577 if (buffer_mapped(bh)) {
578 /*
579 * The buffer is already mapped. If it is uptodate,
580 * ignore it.
581 */
582 if (buffer_uptodate(bh))
583 continue;
584 /*
585 * The buffer is not uptodate. If the page is uptodate
586 * set the buffer uptodate and otherwise ignore it.
587 */
588 if (PageUptodate(page)) {
589 set_buffer_uptodate(bh);
590 continue;
591 }
592 /*
593 * Neither the page nor the buffer are uptodate. If
594 * the buffer is only partially being written to, we
595 * need to read it in before the write, i.e. now.
596 */
597 if ((bh_pos < pos && bh_end > pos) ||
598 (bh_pos < end && bh_end > end)) {
599 /*
600 * If the buffer is fully or partially within
601 * the initialized size, do an actual read.
602 * Otherwise, simply zero the buffer.
603 */
604 read_lock_irqsave(&ni->size_lock, flags);
605 initialized_size = ni->initialized_size;
606 read_unlock_irqrestore(&ni->size_lock, flags);
607 if (bh_pos < initialized_size) {
608 ntfs_submit_bh_for_read(bh);
609 *wait_bh++ = bh;
610 } else {
611 u8 *kaddr = kmap_atomic(page, KM_USER0);
612 memset(kaddr + bh_offset(bh), 0,
613 blocksize);
614 kunmap_atomic(kaddr, KM_USER0);
615 flush_dcache_page(page);
616 set_buffer_uptodate(bh);
617 }
618 }
619 continue;
620 }
621 /* Unmapped buffer. Need to map it. */
622 bh->b_bdev = vol->sb->s_bdev;
623 /*
624 * If the current buffer is in the same clusters as the map
625 * cache, there is no need to check the runlist again. The
626 * map cache is made up of @vcn, which is the first cached file
627 * cluster, @vcn_len which is the number of cached file
628 * clusters, @lcn is the device cluster corresponding to @vcn,
629 * and @lcn_block is the block number corresponding to @lcn.
630 */
631 cdelta = bh_cpos - vcn;
632 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
633map_buffer_cached:
634 BUG_ON(lcn < 0);
635 bh->b_blocknr = lcn_block +
636 (cdelta << (vol->cluster_size_bits -
637 blocksize_bits)) +
638 (bh_cofs >> blocksize_bits);
639 set_buffer_mapped(bh);
640 /*
641 * If the page is uptodate so is the buffer. If the
642 * buffer is fully outside the write, we ignore it if
643 * it was already allocated and we mark it dirty so it
644 * gets written out if we allocated it. On the other
645 * hand, if we allocated the buffer but we are not
646 * marking it dirty we set buffer_new so we can do
647 * error recovery.
648 */
649 if (PageUptodate(page)) {
650 if (!buffer_uptodate(bh))
651 set_buffer_uptodate(bh);
652 if (unlikely(was_hole)) {
653 /* We allocated the buffer. */
654 unmap_underlying_metadata(bh->b_bdev,
655 bh->b_blocknr);
656 if (bh_end <= pos || bh_pos >= end)
657 mark_buffer_dirty(bh);
658 else
659 set_buffer_new(bh);
660 }
661 continue;
662 }
663 /* Page is _not_ uptodate. */
664 if (likely(!was_hole)) {
665 /*
666 * Buffer was already allocated. If it is not
667 * uptodate and is only partially being written
668 * to, we need to read it in before the write,
669 * i.e. now.
670 */
671 if (!buffer_uptodate(bh) && bh_pos < end &&
672 bh_end > pos &&
673 (bh_pos < pos ||
674 bh_end > end)) {
675 /*
676 * If the buffer is fully or partially
677 * within the initialized size, do an
678 * actual read. Otherwise, simply zero
679 * the buffer.
680 */
681 read_lock_irqsave(&ni->size_lock,
682 flags);
683 initialized_size = ni->initialized_size;
684 read_unlock_irqrestore(&ni->size_lock,
685 flags);
686 if (bh_pos < initialized_size) {
687 ntfs_submit_bh_for_read(bh);
688 *wait_bh++ = bh;
689 } else {
690 u8 *kaddr = kmap_atomic(page,
691 KM_USER0);
692 memset(kaddr + bh_offset(bh),
693 0, blocksize);
694 kunmap_atomic(kaddr, KM_USER0);
695 flush_dcache_page(page);
696 set_buffer_uptodate(bh);
697 }
698 }
699 continue;
700 }
701 /* We allocated the buffer. */
702 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
703 /*
704 * If the buffer is fully outside the write, zero it,
705 * set it uptodate, and mark it dirty so it gets
706 * written out. If it is partially being written to,
707 * zero region surrounding the write but leave it to
708 * commit write to do anything else. Finally, if the
709 * buffer is fully being overwritten, do nothing.
710 */
711 if (bh_end <= pos || bh_pos >= end) {
712 if (!buffer_uptodate(bh)) {
713 u8 *kaddr = kmap_atomic(page, KM_USER0);
714 memset(kaddr + bh_offset(bh), 0,
715 blocksize);
716 kunmap_atomic(kaddr, KM_USER0);
717 flush_dcache_page(page);
718 set_buffer_uptodate(bh);
719 }
720 mark_buffer_dirty(bh);
721 continue;
722 }
723 set_buffer_new(bh);
724 if (!buffer_uptodate(bh) &&
725 (bh_pos < pos || bh_end > end)) {
726 u8 *kaddr;
727 unsigned pofs;
728
729 kaddr = kmap_atomic(page, KM_USER0);
730 if (bh_pos < pos) {
731 pofs = bh_pos & ~PAGE_CACHE_MASK;
732 memset(kaddr + pofs, 0, pos - bh_pos);
733 }
734 if (bh_end > end) {
735 pofs = end & ~PAGE_CACHE_MASK;
736 memset(kaddr + pofs, 0, bh_end - end);
737 }
738 kunmap_atomic(kaddr, KM_USER0);
739 flush_dcache_page(page);
740 }
741 continue;
742 }
743 /*
744 * Slow path: this is the first buffer in the cluster. If it
745 * is outside allocated size and is not uptodate, zero it and
746 * set it uptodate.
747 */
748 read_lock_irqsave(&ni->size_lock, flags);
749 initialized_size = ni->allocated_size;
750 read_unlock_irqrestore(&ni->size_lock, flags);
751 if (bh_pos > initialized_size) {
752 if (PageUptodate(page)) {
753 if (!buffer_uptodate(bh))
754 set_buffer_uptodate(bh);
755 } else if (!buffer_uptodate(bh)) {
756 u8 *kaddr = kmap_atomic(page, KM_USER0);
757 memset(kaddr + bh_offset(bh), 0, blocksize);
758 kunmap_atomic(kaddr, KM_USER0);
759 flush_dcache_page(page);
760 set_buffer_uptodate(bh);
761 }
762 continue;
763 }
764 is_retry = FALSE;
765 if (!rl) {
766 down_read(&ni->runlist.lock);
767retry_remap:
768 rl = ni->runlist.rl;
769 }
770 if (likely(rl != NULL)) {
771 /* Seek to element containing target cluster. */
772 while (rl->length && rl[1].vcn <= bh_cpos)
773 rl++;
774 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
775 if (likely(lcn >= 0)) {
776 /*
777 * Successful remap, setup the map cache and
778 * use that to deal with the buffer.
779 */
780 was_hole = FALSE;
781 vcn = bh_cpos;
782 vcn_len = rl[1].vcn - vcn;
783 lcn_block = lcn << (vol->cluster_size_bits -
784 blocksize_bits);
785 cdelta = 0;
786 /*
787 * If the number of remaining clusters touched
788 * by the write is smaller or equal to the
789 * number of cached clusters, unlock the
790 * runlist as the map cache will be used from
791 * now on.
792 */
793 if (likely(vcn + vcn_len >= cend)) {
794 if (rl_write_locked) {
795 up_write(&ni->runlist.lock);
796 rl_write_locked = FALSE;
797 } else
798 up_read(&ni->runlist.lock);
799 rl = NULL;
800 }
801 goto map_buffer_cached;
802 }
803 } else
804 lcn = LCN_RL_NOT_MAPPED;
805 /*
806 * If it is not a hole and not out of bounds, the runlist is
807 * probably unmapped so try to map it now.
808 */
809 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
810 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
811 /* Attempt to map runlist. */
812 if (!rl_write_locked) {
813 /*
814 * We need the runlist locked for
815 * writing, so if it is locked for
816 * reading relock it now and retry in
817 * case it changed whilst we dropped
818 * the lock.
819 */
820 up_read(&ni->runlist.lock);
821 down_write(&ni->runlist.lock);
822 rl_write_locked = TRUE;
823 goto retry_remap;
824 }
825 err = ntfs_map_runlist_nolock(ni, bh_cpos,
826 NULL);
827 if (likely(!err)) {
828 is_retry = TRUE;
829 goto retry_remap;
830 }
831 /*
832 * If @vcn is out of bounds, pretend @lcn is
833 * LCN_ENOENT. As long as the buffer is out
834 * of bounds this will work fine.
835 */
836 if (err == -ENOENT) {
837 lcn = LCN_ENOENT;
838 err = 0;
839 goto rl_not_mapped_enoent;
840 }
841 } else
842 err = -EIO;
843 /* Failed to map the buffer, even after retrying. */
844 bh->b_blocknr = -1;
845 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
846 "attribute type 0x%x, vcn 0x%llx, "
847 "vcn offset 0x%x, because its "
848 "location on disk could not be "
849 "determined%s (error code %i).",
850 ni->mft_no, ni->type,
851 (unsigned long long)bh_cpos,
852 (unsigned)bh_pos &
853 vol->cluster_size_mask,
854 is_retry ? " even after retrying" : "",
855 err);
856 break;
857 }
858rl_not_mapped_enoent:
859 /*
860 * The buffer is in a hole or out of bounds. We need to fill
861 * the hole, unless the buffer is in a cluster which is not
862 * touched by the write, in which case we just leave the buffer
863 * unmapped. This can only happen when the cluster size is
864 * less than the page cache size.
865 */
866 if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
867 bh_cend = (bh_end + vol->cluster_size - 1) >>
868 vol->cluster_size_bits;
869 if ((bh_cend <= cpos || bh_cpos >= cend)) {
870 bh->b_blocknr = -1;
871 /*
872 * If the buffer is uptodate we skip it. If it
873 * is not but the page is uptodate, we can set
874 * the buffer uptodate. If the page is not
875 * uptodate, we can clear the buffer and set it
876 * uptodate. Whether this is worthwhile is
877 * debatable and this could be removed.
878 */
879 if (PageUptodate(page)) {
880 if (!buffer_uptodate(bh))
881 set_buffer_uptodate(bh);
882 } else if (!buffer_uptodate(bh)) {
883 u8 *kaddr = kmap_atomic(page, KM_USER0);
884 memset(kaddr + bh_offset(bh), 0,
885 blocksize);
886 kunmap_atomic(kaddr, KM_USER0);
887 flush_dcache_page(page);
888 set_buffer_uptodate(bh);
889 }
890 continue;
891 }
892 }
893 /*
894 * Out of bounds buffer is invalid if it was not really out of
895 * bounds.
896 */
897 BUG_ON(lcn != LCN_HOLE);
898 /*
899 * We need the runlist locked for writing, so if it is locked
900 * for reading relock it now and retry in case it changed
901 * whilst we dropped the lock.
902 */
903 BUG_ON(!rl);
904 if (!rl_write_locked) {
905 up_read(&ni->runlist.lock);
906 down_write(&ni->runlist.lock);
907 rl_write_locked = TRUE;
908 goto retry_remap;
909 }
910 /* Find the previous last allocated cluster. */
911 BUG_ON(rl->lcn != LCN_HOLE);
912 lcn = -1;
913 rl2 = rl;
914 while (--rl2 >= ni->runlist.rl) {
915 if (rl2->lcn >= 0) {
916 lcn = rl2->lcn + rl2->length;
917 break;
918 }
919 }
920 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
921 FALSE);
922 if (IS_ERR(rl2)) {
923 err = PTR_ERR(rl2);
924 ntfs_debug("Failed to allocate cluster, error code %i.",
925 err);
926 break;
927 }
928 lcn = rl2->lcn;
929 rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
930 if (IS_ERR(rl)) {
931 err = PTR_ERR(rl);
932 if (err != -ENOMEM)
933 err = -EIO;
934 if (ntfs_cluster_free_from_rl(vol, rl2)) {
935 ntfs_error(vol->sb, "Failed to release "
936 "allocated cluster in error "
937 "code path. Run chkdsk to "
938 "recover the lost cluster.");
939 NVolSetErrors(vol);
940 }
941 ntfs_free(rl2);
942 break;
943 }
944 ni->runlist.rl = rl;
945 status.runlist_merged = 1;
946 ntfs_debug("Allocated cluster, lcn 0x%llx.", lcn);
947 /* Map and lock the mft record and get the attribute record. */
948 if (!NInoAttr(ni))
949 base_ni = ni;
950 else
951 base_ni = ni->ext.base_ntfs_ino;
952 m = map_mft_record(base_ni);
953 if (IS_ERR(m)) {
954 err = PTR_ERR(m);
955 break;
956 }
957 ctx = ntfs_attr_get_search_ctx(base_ni, m);
958 if (unlikely(!ctx)) {
959 err = -ENOMEM;
960 unmap_mft_record(base_ni);
961 break;
962 }
963 status.mft_attr_mapped = 1;
964 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
965 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
966 if (unlikely(err)) {
967 if (err == -ENOENT)
968 err = -EIO;
969 break;
970 }
971 m = ctx->mrec;
972 a = ctx->attr;
973 /*
974 * Find the runlist element with which the attribute extent
975 * starts. Note, we cannot use the _attr_ version because we
976 * have mapped the mft record. That is ok because we know the
977 * runlist fragment must be mapped already to have ever gotten
978 * here, so we can just use the _rl_ version.
979 */
980 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
981 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
982 BUG_ON(!rl2);
983 BUG_ON(!rl2->length);
984 BUG_ON(rl2->lcn < LCN_HOLE);
985 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
986 /*
987 * If @highest_vcn is zero, calculate the real highest_vcn
988 * (which can really be zero).
989 */
990 if (!highest_vcn)
991 highest_vcn = (sle64_to_cpu(
992 a->data.non_resident.allocated_size) >>
993 vol->cluster_size_bits) - 1;
994 /*
995 * Determine the size of the mapping pairs array for the new
996 * extent, i.e. the old extent with the hole filled.
997 */
998 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
999 highest_vcn);
1000 if (unlikely(mp_size <= 0)) {
1001 if (!(err = mp_size))
1002 err = -EIO;
1003 ntfs_debug("Failed to get size for mapping pairs "
1004 "array, error code %i.", err);
1005 break;
1006 }
1007 /*
1008 * Resize the attribute record to fit the new mapping pairs
1009 * array.
1010 */
1011 attr_rec_len = le32_to_cpu(a->length);
1012 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
1013 a->data.non_resident.mapping_pairs_offset));
1014 if (unlikely(err)) {
1015 BUG_ON(err != -ENOSPC);
1016 // TODO: Deal with this by using the current attribute
1017 // and fill it with as much of the mapping pairs
1018 // array as possible. Then loop over each attribute
1019 // extent rewriting the mapping pairs arrays as we go
1020 // along and if when we reach the end we have not
1021 // enough space, try to resize the last attribute
1022 // extent and if even that fails, add a new attribute
1023 // extent.
1024 // We could also try to resize at each step in the hope
1025 // that we will not need to rewrite every single extent.
1026 // Note, we may need to decompress some extents to fill
1027 // the runlist as we are walking the extents...
1028 ntfs_error(vol->sb, "Not enough space in the mft "
1029 "record for the extended attribute "
1030 "record. This case is not "
1031 "implemented yet.");
1032 err = -EOPNOTSUPP;
1033 break ;
1034 }
1035 status.mp_rebuilt = 1;
1036 /*
1037 * Generate the mapping pairs array directly into the attribute
1038 * record.
1039 */
1040 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1041 a->data.non_resident.mapping_pairs_offset),
1042 mp_size, rl2, vcn, highest_vcn, NULL);
1043 if (unlikely(err)) {
1044 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1045 "attribute type 0x%x, because building "
1046 "the mapping pairs failed with error "
1047 "code %i.", vi->i_ino,
1048 (unsigned)le32_to_cpu(ni->type), err);
1049 err = -EIO;
1050 break;
1051 }
1052 /* Update the highest_vcn but only if it was not set. */
1053 if (unlikely(!a->data.non_resident.highest_vcn))
1054 a->data.non_resident.highest_vcn =
1055 cpu_to_sle64(highest_vcn);
1056 /*
1057 * If the attribute is sparse/compressed, update the compressed
1058 * size in the ntfs_inode structure and the attribute record.
1059 */
1060 if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1061 /*
1062 * If we are not in the first attribute extent, switch
1063 * to it, but first ensure the changes will make it to
1064 * disk later.
1065 */
1066 if (a->data.non_resident.lowest_vcn) {
1067 flush_dcache_mft_record_page(ctx->ntfs_ino);
1068 mark_mft_record_dirty(ctx->ntfs_ino);
1069 ntfs_attr_reinit_search_ctx(ctx);
1070 err = ntfs_attr_lookup(ni->type, ni->name,
1071 ni->name_len, CASE_SENSITIVE,
1072 0, NULL, 0, ctx);
1073 if (unlikely(err)) {
1074 status.attr_switched = 1;
1075 break;
1076 }
1077 /* @m is not used any more so do not set it. */
1078 a = ctx->attr;
1079 }
1080 write_lock_irqsave(&ni->size_lock, flags);
1081 ni->itype.compressed.size += vol->cluster_size;
1082 a->data.non_resident.compressed_size =
1083 cpu_to_sle64(ni->itype.compressed.size);
1084 write_unlock_irqrestore(&ni->size_lock, flags);
1085 }
1086 /* Ensure the changes make it to disk. */
1087 flush_dcache_mft_record_page(ctx->ntfs_ino);
1088 mark_mft_record_dirty(ctx->ntfs_ino);
1089 ntfs_attr_put_search_ctx(ctx);
1090 unmap_mft_record(base_ni);
1091 /* Successfully filled the hole. */
1092 status.runlist_merged = 0;
1093 status.mft_attr_mapped = 0;
1094 status.mp_rebuilt = 0;
1095 /* Setup the map cache and use that to deal with the buffer. */
1096 was_hole = TRUE;
1097 vcn = bh_cpos;
1098 vcn_len = 1;
1099 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1100 cdelta = 0;
1101 /*
1102 * If the number of remaining clusters in the @pages is smaller
1103 * or equal to the number of cached clusters, unlock the
1104 * runlist as the map cache will be used from now on.
1105 */
1106 if (likely(vcn + vcn_len >= cend)) {
1107 up_write(&ni->runlist.lock);
1108 rl_write_locked = FALSE;
1109 rl = NULL;
1110 }
1111 goto map_buffer_cached;
1112 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1113 /* If there are no errors, do the next page. */
1114 if (likely(!err && ++u < nr_pages))
1115 goto do_next_page;
1116 /* If there are no errors, release the runlist lock if we took it. */
1117 if (likely(!err)) {
1118 if (unlikely(rl_write_locked)) {
1119 up_write(&ni->runlist.lock);
1120 rl_write_locked = FALSE;
1121 } else if (unlikely(rl))
1122 up_read(&ni->runlist.lock);
1123 rl = NULL;
1124 }
1125 /* If we issued read requests, let them complete. */
1126 read_lock_irqsave(&ni->size_lock, flags);
1127 initialized_size = ni->initialized_size;
1128 read_unlock_irqrestore(&ni->size_lock, flags);
1129 while (wait_bh > wait) {
1130 bh = *--wait_bh;
1131 wait_on_buffer(bh);
1132 if (likely(buffer_uptodate(bh))) {
1133 page = bh->b_page;
1134 bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
1135 bh_offset(bh);
1136 /*
1137 * If the buffer overflows the initialized size, need
1138 * to zero the overflowing region.
1139 */
1140 if (unlikely(bh_pos + blocksize > initialized_size)) {
1141 u8 *kaddr;
1142 int ofs = 0;
1143
1144 if (likely(bh_pos < initialized_size))
1145 ofs = initialized_size - bh_pos;
1146 kaddr = kmap_atomic(page, KM_USER0);
1147 memset(kaddr + bh_offset(bh) + ofs, 0,
1148 blocksize - ofs);
1149 kunmap_atomic(kaddr, KM_USER0);
1150 flush_dcache_page(page);
1151 }
1152 } else /* if (unlikely(!buffer_uptodate(bh))) */
1153 err = -EIO;
1154 }
1155 if (likely(!err)) {
1156 /* Clear buffer_new on all buffers. */
1157 u = 0;
1158 do {
1159 bh = head = page_buffers(pages[u]);
1160 do {
1161 if (buffer_new(bh))
1162 clear_buffer_new(bh);
1163 } while ((bh = bh->b_this_page) != head);
1164 } while (++u < nr_pages);
1165 ntfs_debug("Done.");
1166 return err;
1167 }
1168 if (status.attr_switched) {
1169 /* Get back to the attribute extent we modified. */
1170 ntfs_attr_reinit_search_ctx(ctx);
1171 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1172 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1173 ntfs_error(vol->sb, "Failed to find required "
1174 "attribute extent of attribute in "
1175 "error code path. Run chkdsk to "
1176 "recover.");
1177 write_lock_irqsave(&ni->size_lock, flags);
1178 ni->itype.compressed.size += vol->cluster_size;
1179 write_unlock_irqrestore(&ni->size_lock, flags);
1180 flush_dcache_mft_record_page(ctx->ntfs_ino);
1181 mark_mft_record_dirty(ctx->ntfs_ino);
1182 /*
1183 * The only thing that is now wrong is the compressed
1184 * size of the base attribute extent which chkdsk
1185 * should be able to fix.
1186 */
1187 NVolSetErrors(vol);
1188 } else {
1189 m = ctx->mrec;
1190 a = ctx->attr;
1191 status.attr_switched = 0;
1192 }
1193 }
1194 /*
1195 * If the runlist has been modified, need to restore it by punching a
1196 * hole into it and we then need to deallocate the on-disk cluster as
1197 * well. Note, we only modify the runlist if we are able to generate a
1198 * new mapping pairs array, i.e. only when the mapped attribute extent
1199 * is not switched.
1200 */
1201 if (status.runlist_merged && !status.attr_switched) {
1202 BUG_ON(!rl_write_locked);
1203 /* Make the file cluster we allocated sparse in the runlist. */
1204 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1205 ntfs_error(vol->sb, "Failed to punch hole into "
1206 "attribute runlist in error code "
1207 "path. Run chkdsk to recover the "
1208 "lost cluster.");
1209 make_bad_inode(vi);
1210 make_bad_inode(VFS_I(base_ni));
1211 NVolSetErrors(vol);
1212 } else /* if (success) */ {
1213 status.runlist_merged = 0;
1214 /*
1215 * Deallocate the on-disk cluster we allocated but only
1216 * if we succeeded in punching its vcn out of the
1217 * runlist.
1218 */
1219 down_write(&vol->lcnbmp_lock);
1220 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1221 ntfs_error(vol->sb, "Failed to release "
1222 "allocated cluster in error "
1223 "code path. Run chkdsk to "
1224 "recover the lost cluster.");
1225 NVolSetErrors(vol);
1226 }
1227 up_write(&vol->lcnbmp_lock);
1228 }
1229 }
1230 /*
1231 * Resize the attribute record to its old size and rebuild the mapping
1232 * pairs array. Note, we only can do this if the runlist has been
1233 * restored to its old state which also implies that the mapped
1234 * attribute extent is not switched.
1235 */
1236 if (status.mp_rebuilt && !status.runlist_merged) {
1237 if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1238 ntfs_error(vol->sb, "Failed to restore attribute "
1239 "record in error code path. Run "
1240 "chkdsk to recover.");
1241 make_bad_inode(vi);
1242 make_bad_inode(VFS_I(base_ni));
1243 NVolSetErrors(vol);
1244 } else /* if (success) */ {
1245 if (ntfs_mapping_pairs_build(vol, (u8*)a +
1246 le16_to_cpu(a->data.non_resident.
1247 mapping_pairs_offset), attr_rec_len -
1248 le16_to_cpu(a->data.non_resident.
1249 mapping_pairs_offset), ni->runlist.rl,
1250 vcn, highest_vcn, NULL)) {
1251 ntfs_error(vol->sb, "Failed to restore "
1252 "mapping pairs array in error "
1253 "code path. Run chkdsk to "
1254 "recover.");
1255 make_bad_inode(vi);
1256 make_bad_inode(VFS_I(base_ni));
1257 NVolSetErrors(vol);
1258 }
1259 flush_dcache_mft_record_page(ctx->ntfs_ino);
1260 mark_mft_record_dirty(ctx->ntfs_ino);
1261 }
1262 }
1263 /* Release the mft record and the attribute. */
1264 if (status.mft_attr_mapped) {
1265 ntfs_attr_put_search_ctx(ctx);
1266 unmap_mft_record(base_ni);
1267 }
1268 /* Release the runlist lock. */
1269 if (rl_write_locked)
1270 up_write(&ni->runlist.lock);
1271 else if (rl)
1272 up_read(&ni->runlist.lock);
1273 /*
1274 * Zero out any newly allocated blocks to avoid exposing stale data.
1275 * If BH_New is set, we know that the block was newly allocated above
1276 * and that it has not been fully zeroed and marked dirty yet.
1277 */
1278 nr_pages = u;
1279 u = 0;
1280 end = bh_cpos << vol->cluster_size_bits;
1281 do {
1282 page = pages[u];
1283 bh = head = page_buffers(page);
1284 do {
1285 if (u == nr_pages &&
1286 ((s64)page->index << PAGE_CACHE_SHIFT) +
1287 bh_offset(bh) >= end)
1288 break;
1289 if (!buffer_new(bh))
1290 continue;
1291 clear_buffer_new(bh);
1292 if (!buffer_uptodate(bh)) {
1293 if (PageUptodate(page))
1294 set_buffer_uptodate(bh);
1295 else {
1296 u8 *kaddr = kmap_atomic(page, KM_USER0);
1297 memset(kaddr + bh_offset(bh), 0,
1298 blocksize);
1299 kunmap_atomic(kaddr, KM_USER0);
1300 flush_dcache_page(page);
1301 set_buffer_uptodate(bh);
1302 }
1303 }
1304 mark_buffer_dirty(bh);
1305 } while ((bh = bh->b_this_page) != head);
1306 } while (++u <= nr_pages);
1307 ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
1308 return err;
1309}
1310
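A worked example of the map-cache arithmetic in the map_buffer_cached path above, using illustrative values (4096-byte clusters, 512-byte blocks):

        /*
         * With cluster_size_bits = 12 and blocksize_bits = 9:
         *   lcn = 100      =>  lcn_block = 100 << (12 - 9) = 800
         *   cdelta = 2         (buffer lies two clusters past the cached vcn)
         *   bh_cofs = 1024 =>  1024 >> 9 = 2 blocks into the cluster
         * so bh->b_blocknr = 800 + (2 << 3) + 2 = 818.
         */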
1311/*
1312 * Copy as much as we can into the pages and return the number of bytes which
1313 * were successfully copied. If a fault is encountered then clear the pages
1314 * out to (ofs + bytes) and return the number of bytes which were copied.
1315 */
1316static inline size_t ntfs_copy_from_user(struct page **pages,
1317 unsigned nr_pages, unsigned ofs, const char __user *buf,
1318 size_t bytes)
1319{
1320 struct page **last_page = pages + nr_pages;
1321 char *kaddr;
1322 size_t total = 0;
1323 unsigned len;
1324 int left;
1325
1326 do {
1327 len = PAGE_CACHE_SIZE - ofs;
1328 if (len > bytes)
1329 len = bytes;
1330 kaddr = kmap_atomic(*pages, KM_USER0);
1331 left = __copy_from_user_inatomic(kaddr + ofs, buf, len);
1332 kunmap_atomic(kaddr, KM_USER0);
1333 if (unlikely(left)) {
1334 /* Do it the slow way. */
1335 kaddr = kmap(*pages);
1336 left = __copy_from_user(kaddr + ofs, buf, len);
1337 kunmap(*pages);
1338 if (unlikely(left))
1339 goto err_out;
1340 }
1341 total += len;
1342 bytes -= len;
1343 if (!bytes)
1344 break;
1345 buf += len;
1346 ofs = 0;
1347 } while (++pages < last_page);
1348out:
1349 return total;
1350err_out:
1351 total += len - left;
1352 /* Zero the rest of the target like __copy_from_user(). */
1353 while (++pages < last_page) {
1354 bytes -= len;
1355 if (!bytes)
1356 break;
1357 len = PAGE_CACHE_SIZE;
1358 if (len > bytes)
1359 len = bytes;
1360 kaddr = kmap_atomic(*pages, KM_USER0);
1361 memset(kaddr, 0, len);
1362 kunmap_atomic(kaddr, KM_USER0);
1363 }
1364 goto out;
1365}
1366
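A short usage sketch, not from the patch: copy user data into previously grabbed and locked pages and detect a short copy; buf, ofs, bytes and nr_pages are illustrative:

        size_t copied;

        copied = ntfs_copy_from_user(pages, nr_pages, ofs, buf, bytes);
        if (unlikely(copied != bytes)) {
                /*
                 * A fault occurred. The pages have been cleared out to
                 * ofs + bytes, so only copied bytes may be committed.
                 */
                bytes = copied;
        }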
1367static size_t __ntfs_copy_from_user_iovec(char *vaddr,
1368 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1369{
1370 size_t total = 0;
1371
1372 while (1) {
1373 const char __user *buf = iov->iov_base + iov_ofs;
1374 unsigned len;
1375 size_t left;
1376
1377 len = iov->iov_len - iov_ofs;
1378 if (len > bytes)
1379 len = bytes;
1380 left = __copy_from_user_inatomic(vaddr, buf, len);
1381 total += len;
1382 bytes -= len;
1383 vaddr += len;
1384 if (unlikely(left)) {
1385 /*
1386 * Zero the rest of the target like __copy_from_user().
1387 */
1388 memset(vaddr, 0, bytes);
1389 total -= left;
1390 break;
1391 }
1392 if (!bytes)
1393 break;
1394 iov++;
1395 iov_ofs = 0;
1396 }
1397 return total;
1398}
1399
1400static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1401 size_t *iov_ofsp, size_t bytes)
1402{
1403 const struct iovec *iov = *iovp;
1404 size_t iov_ofs = *iov_ofsp;
1405
1406 while (bytes) {
1407 unsigned len;
1408
1409 len = iov->iov_len - iov_ofs;
1410 if (len > bytes)
1411 len = bytes;
1412 bytes -= len;
1413 iov_ofs += len;
1414 if (iov->iov_len == iov_ofs) {
1415 iov++;
1416 iov_ofs = 0;
1417 }
1418 }
1419 *iovp = iov;
1420 *iov_ofsp = iov_ofs;
1421}
1422
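For illustration (hypothetical values), advancing by 15 bytes over two segments of 10 and 20 bytes consumes the whole first segment and 5 bytes of the second:

        const struct iovec *iov = my_iov;       /* my_iov[0].iov_len == 10 */
        size_t iov_ofs = 0;                     /* my_iov[1].iov_len == 20 */

        ntfs_set_next_iovec(&iov, &iov_ofs, 15);
        /* Now iov == &my_iov[1] and iov_ofs == 5. */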
1423/*
1424 * This has the same side-effects and return value as ntfs_copy_from_user().
1425 * The difference is that on a fault we need to memset the remainder of the
1426 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1427 * single-segment behaviour.
1428 *
1429 * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
1430 * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls
1431 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1432 * fact, the only difference between __copy_from_user_inatomic() and
1433 * __copy_from_user() is that the latter calls might_sleep(). And on many
1434 * architectures __copy_from_user_inatomic() is just defined to
1435 * __copy_from_user() so it makes no difference at all on those architectures.
1436 */
1437static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1438 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1439 size_t *iov_ofs, size_t bytes)
1440{
1441 struct page **last_page = pages + nr_pages;
1442 char *kaddr;
1443 size_t copied, len, total = 0;
1444
1445 do {
1446 len = PAGE_CACHE_SIZE - ofs;
1447 if (len > bytes)
1448 len = bytes;
1449 kaddr = kmap_atomic(*pages, KM_USER0);
1450 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1451 *iov, *iov_ofs, len);
1452 kunmap_atomic(kaddr, KM_USER0);
1453 if (unlikely(copied != len)) {
1454 /* Do it the slow way. */
1455 kaddr = kmap(*pages);
1456 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1457 *iov, *iov_ofs, len);
1458 kunmap(*pages);
1459 if (unlikely(copied != len))
1460 goto err_out;
1461 }
1462 total += len;
1463 bytes -= len;
1464 if (!bytes)
1465 break;
1466 ntfs_set_next_iovec(iov, iov_ofs, len);
1467 ofs = 0;
1468 } while (++pages < last_page);
1469out:
1470 return total;
1471err_out:
1472 total += copied;
1473 /* Zero the rest of the target like __copy_from_user(). */
1474 while (++pages < last_page) {
1475 bytes -= len;
1476 if (!bytes)
1477 break;
1478 len = PAGE_CACHE_SIZE;
1479 if (len > bytes)
1480 len = bytes;
1481 kaddr = kmap_atomic(*pages, KM_USER0);
1482 memset(kaddr, 0, len);
1483 kunmap_atomic(kaddr, KM_USER0);
1484 }
1485 goto out;
1486}
1487
1488static inline void ntfs_flush_dcache_pages(struct page **pages,
1489 unsigned nr_pages)
1490{
1491 BUG_ON(!nr_pages);
1492 do {
1493 /*
1494 * Warning: Keep the decrement separate from the call because
1495 * flush_dcache_page() is a NULL macro on i386.
1496 */
1497 --nr_pages;
1498 flush_dcache_page(pages[nr_pages]);
1499 } while (nr_pages > 0);
1500}
1501
1502/**
1503 * ntfs_commit_pages_after_non_resident_write - commit the received data
1504 * @pages: array of destination pages
1505 * @nr_pages: number of pages in @pages
1506 * @pos: byte position in file at which the write begins
1507 * @bytes: number of bytes to be written
1508 *
1509 * See description of ntfs_commit_pages_after_write(), below.
1510 */
1511static inline int ntfs_commit_pages_after_non_resident_write(
1512 struct page **pages, const unsigned nr_pages,
1513 s64 pos, size_t bytes)
1514{
1515 s64 end, initialized_size;
1516 struct inode *vi;
1517 ntfs_inode *ni, *base_ni;
1518 struct buffer_head *bh, *head;
1519 ntfs_attr_search_ctx *ctx;
1520 MFT_RECORD *m;
1521 ATTR_RECORD *a;
1522 unsigned long flags;
1523 unsigned blocksize, u;
1524 int err;
1525
1526 vi = pages[0]->mapping->host;
1527 ni = NTFS_I(vi);
1528 blocksize = 1 << vi->i_blkbits;
1529 end = pos + bytes;
1530 u = 0;
1531 do {
1532 s64 bh_pos;
1533 struct page *page;
1534 BOOL partial;
1535
1536 page = pages[u];
1537 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1538 bh = head = page_buffers(page);
1539 partial = FALSE;
1540 do {
1541 s64 bh_end;
1542
1543 bh_end = bh_pos + blocksize;
1544 if (bh_end <= pos || bh_pos >= end) {
1545 if (!buffer_uptodate(bh))
1546 partial = TRUE;
1547 } else {
1548 set_buffer_uptodate(bh);
1549 mark_buffer_dirty(bh);
1550 }
1551 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1552 /*
1553 * If all buffers are now uptodate but the page is not, set the
1554 * page uptodate.
1555 */
1556 if (!partial && !PageUptodate(page))
1557 SetPageUptodate(page);
1558 } while (++u < nr_pages);
1559 /*
1560 * Finally, if we do not need to update initialized_size or i_size we
1561 * are finished.
1562 */
1563 read_lock_irqsave(&ni->size_lock, flags);
1564 initialized_size = ni->initialized_size;
1565 read_unlock_irqrestore(&ni->size_lock, flags);
1566 if (end <= initialized_size) {
1567 ntfs_debug("Done.");
1568 return 0;
1569 }
1570 /*
1571 * Update initialized_size/i_size as appropriate, both in the inode and
1572 * the mft record.
1573 */
1574 if (!NInoAttr(ni))
1575 base_ni = ni;
1576 else
1577 base_ni = ni->ext.base_ntfs_ino;
1578 /* Map, pin, and lock the mft record. */
1579 m = map_mft_record(base_ni);
1580 if (IS_ERR(m)) {
1581 err = PTR_ERR(m);
1582 m = NULL;
1583 ctx = NULL;
1584 goto err_out;
1585 }
1586 BUG_ON(!NInoNonResident(ni));
1587 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1588 if (unlikely(!ctx)) {
1589 err = -ENOMEM;
1590 goto err_out;
1591 }
1592 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1593 CASE_SENSITIVE, 0, NULL, 0, ctx);
1594 if (unlikely(err)) {
1595 if (err == -ENOENT)
1596 err = -EIO;
1597 goto err_out;
1598 }
1599 a = ctx->attr;
1600 BUG_ON(!a->non_resident);
1601 write_lock_irqsave(&ni->size_lock, flags);
1602 BUG_ON(end > ni->allocated_size);
1603 ni->initialized_size = end;
1604 a->data.non_resident.initialized_size = cpu_to_sle64(end);
1605 if (end > i_size_read(vi)) {
1606 i_size_write(vi, end);
1607 a->data.non_resident.data_size =
1608 a->data.non_resident.initialized_size;
1609 }
1610 write_unlock_irqrestore(&ni->size_lock, flags);
1611 /* Mark the mft record dirty, so it gets written back. */
1612 flush_dcache_mft_record_page(ctx->ntfs_ino);
1613 mark_mft_record_dirty(ctx->ntfs_ino);
1614 ntfs_attr_put_search_ctx(ctx);
1615 unmap_mft_record(base_ni);
1616 ntfs_debug("Done.");
1617 return 0;
1618err_out:
1619 if (ctx)
1620 ntfs_attr_put_search_ctx(ctx);
1621 if (m)
1622 unmap_mft_record(base_ni);
1623 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1624 "code %i).", err);
1625 if (err != -ENOMEM) {
1626 NVolSetErrors(ni->vol);
1627 make_bad_inode(VFS_I(base_ni));
1628 make_bad_inode(vi);
1629 }
1630 return err;
1631}
1632
1633/**
1634 * ntfs_commit_pages_after_write - commit the received data
1635 * @pages: array of destination pages
1636 * @nr_pages: number of pages in @pages
1637 * @pos: byte position in file at which the write begins
1638 * @bytes: number of bytes to be written
1639 *
1640 * This is called from ntfs_file_buffered_write() with i_sem held on the inode
1641 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
1642 * locked but not kmap()ped. The source data has already been copied into the
1643 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
1644 * the data was copied (for non-resident attributes only) and it returned
1645 * success.
1646 *
1647 * Need to set uptodate and mark dirty all buffers within the boundary of the
1648 * write. If all buffers in a page are uptodate we set the page uptodate, too.
1649 *
1650 * Setting the buffers dirty ensures that they get written out later when
1651 * ntfs_writepage() is invoked by the VM.
1652 *
1653 * Finally, we need to update i_size and initialized_size as appropriate both
1654 * in the inode and the mft record.
1655 *
1656 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1657 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1658 * page are uptodate, and updates i_size if the end of io is beyond i_size. In
1659 * that case, it also marks the inode dirty.
1660 *
1661 * If things have gone as outlined in
1662 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1663 * content modifications here for non-resident attributes. For resident
1664 * attributes we need to do the uptodate bringing here which we combine with
1665 * the copying into the mft record which means we save one atomic kmap.
1666 *
1667 * Return 0 on success or -errno on error.
1668 */
1669static int ntfs_commit_pages_after_write(struct page **pages,
1670 const unsigned nr_pages, s64 pos, size_t bytes)
1671{
1672 s64 end, initialized_size;
1673 loff_t i_size;
1674 struct inode *vi;
1675 ntfs_inode *ni, *base_ni;
1676 struct page *page;
1677 ntfs_attr_search_ctx *ctx;
1678 MFT_RECORD *m;
1679 ATTR_RECORD *a;
1680 char *kattr, *kaddr;
1681 unsigned long flags;
1682 u32 attr_len;
1683 int err;
1684
1685 BUG_ON(!nr_pages);
1686 BUG_ON(!pages);
1687 page = pages[0];
1688 BUG_ON(!page);
1689 vi = page->mapping->host;
1690 ni = NTFS_I(vi);
1691 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1692 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
1693 vi->i_ino, ni->type, page->index, nr_pages,
1694 (long long)pos, bytes);
1695 if (NInoNonResident(ni))
1696 return ntfs_commit_pages_after_non_resident_write(pages,
1697 nr_pages, pos, bytes);
1698 BUG_ON(nr_pages > 1);
1699 /*
1700 * Attribute is resident, implying it is not compressed, encrypted, or
1701 * sparse.
1702 */
1703 if (!NInoAttr(ni))
1704 base_ni = ni;
1705 else
1706 base_ni = ni->ext.base_ntfs_ino;
1707 BUG_ON(NInoNonResident(ni));
1708 /* Map, pin, and lock the mft record. */
1709 m = map_mft_record(base_ni);
1710 if (IS_ERR(m)) {
1711 err = PTR_ERR(m);
1712 m = NULL;
1713 ctx = NULL;
1714 goto err_out;
1715 }
1716 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1717 if (unlikely(!ctx)) {
1718 err = -ENOMEM;
1719 goto err_out;
1720 }
1721 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1722 CASE_SENSITIVE, 0, NULL, 0, ctx);
1723 if (unlikely(err)) {
1724 if (err == -ENOENT)
1725 err = -EIO;
1726 goto err_out;
1727 }
1728 a = ctx->attr;
1729 BUG_ON(a->non_resident);
1730 /* The total length of the attribute value. */
1731 attr_len = le32_to_cpu(a->data.resident.value_length);
1732 i_size = i_size_read(vi);
1733 BUG_ON(attr_len != i_size);
1734 BUG_ON(pos > attr_len);
1735 end = pos + bytes;
1736 BUG_ON(end > le32_to_cpu(a->length) -
1737 le16_to_cpu(a->data.resident.value_offset));
1738 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1739 kaddr = kmap_atomic(page, KM_USER0);
1740 /* Copy the received data from the page to the mft record. */
1741 memcpy(kattr + pos, kaddr + pos, bytes);
1742 /* Update the attribute length if necessary. */
1743 if (end > attr_len) {
1744 attr_len = end;
1745 a->data.resident.value_length = cpu_to_le32(attr_len);
1746 }
1747 /*
1748 * If the page is not uptodate, bring the out of bounds area(s)
1749 * uptodate by copying data from the mft record to the page.
1750 */
1751 if (!PageUptodate(page)) {
1752 if (pos > 0)
1753 memcpy(kaddr, kattr, pos);
1754 if (end < attr_len)
1755 memcpy(kaddr + end, kattr + end, attr_len - end);
1756 /* Zero the region outside the end of the attribute value. */
1757 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1758 flush_dcache_page(page);
1759 SetPageUptodate(page);
1760 }
1761 kunmap_atomic(kaddr, KM_USER0);
1762 /* Update initialized_size/i_size if necessary. */
1763 read_lock_irqsave(&ni->size_lock, flags);
1764 initialized_size = ni->initialized_size;
1765 BUG_ON(end > ni->allocated_size);
1766 read_unlock_irqrestore(&ni->size_lock, flags);
1767 BUG_ON(initialized_size != i_size);
1768 if (end > initialized_size) {
1769 unsigned long flags;
1770
1771 write_lock_irqsave(&ni->size_lock, flags);
1772 ni->initialized_size = end;
1773 i_size_write(vi, end);
1774 write_unlock_irqrestore(&ni->size_lock, flags);
1775 }
1776 /* Mark the mft record dirty, so it gets written back. */
1777 flush_dcache_mft_record_page(ctx->ntfs_ino);
1778 mark_mft_record_dirty(ctx->ntfs_ino);
1779 ntfs_attr_put_search_ctx(ctx);
1780 unmap_mft_record(base_ni);
1781 ntfs_debug("Done.");
1782 return 0;
1783err_out:
1784 if (err == -ENOMEM) {
1785 ntfs_warning(vi->i_sb, "Error allocating memory required to "
1786 "commit the write.");
1787 if (PageUptodate(page)) {
1788 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1789 "dirty so the write will be retried "
1790 "later on by the VM.");
1791 /*
1792 * Put the page on mapping->dirty_pages, but leave its
1793 * buffers' dirty state as-is.
1794 */
1795 __set_page_dirty_nobuffers(page);
1796 err = 0;
1797 } else
1798 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
1799 "data has been lost.");
1800 } else {
1801 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1802 "with error %i.", err);
1803 NVolSetErrors(ni->vol);
1804 make_bad_inode(VFS_I(base_ni));
1805 make_bad_inode(vi);
1806 }
1807 if (ctx)
1808 ntfs_attr_put_search_ctx(ctx);
1809 if (m)
1810 unmap_mft_record(base_ni);
1811 return err;
1812}
1813
1814/**
1815 * ntfs_file_buffered_write - write data to an ntfs file via the page cache
1816 *
1817 * Locking: The vfs is holding ->i_sem on the inode.
1818 */
1819static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1820 const struct iovec *iov, unsigned long nr_segs,
1821 loff_t pos, loff_t *ppos, size_t count)
1822{
1823 struct file *file = iocb->ki_filp;
1824 struct address_space *mapping = file->f_mapping;
1825 struct inode *vi = mapping->host;
1826 ntfs_inode *ni = NTFS_I(vi);
1827 ntfs_volume *vol = ni->vol;
1828 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1829 struct page *cached_page = NULL;
1830 char __user *buf = NULL;
1831 s64 end, ll;
1832 VCN last_vcn;
1833 LCN lcn;
1834 unsigned long flags;
1835 size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
1836 ssize_t status, written;
1837 unsigned nr_pages;
1838 int err;
1839 struct pagevec lru_pvec;
1840
1841 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1842 "pos 0x%llx, count 0x%lx.",
1843 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
1844 (unsigned long long)pos, (unsigned long)count);
1845 if (unlikely(!count))
1846 return 0;
1847 BUG_ON(NInoMstProtected(ni));
1848 /*
1849 	 * If the attribute is not an index allocation and it is encrypted or
1850 	 * compressed, we cannot write to it yet. Note we check against
1851 	 * AT_INDEX_ALLOCATION since this is the attribute type of both
1852 	 * directory and index inodes.
1853 */
1854 if (ni->type != AT_INDEX_ALLOCATION) {
1855 /* If file is encrypted, deny access, just like NT4. */
1856 if (NInoEncrypted(ni)) {
1857 /*
1858 * Reminder for later: Encrypted files are _always_
1859 * non-resident so that the content can always be
1860 * encrypted.
1861 */
1862 ntfs_debug("Denying write access to encrypted file.");
1863 return -EACCES;
1864 }
1865 if (NInoCompressed(ni)) {
1866 /* Only unnamed $DATA attribute can be compressed. */
1867 BUG_ON(ni->type != AT_DATA);
1868 BUG_ON(ni->name_len);
1869 /*
1870 * Reminder for later: If resident, the data is not
1871 * actually compressed. Only on the switch to non-
1872 * resident does compression kick in. This is in
1873 * contrast to encrypted files (see above).
1874 */
1875 ntfs_error(vi->i_sb, "Writing to compressed files is "
1876 "not implemented yet. Sorry.");
1877 return -EOPNOTSUPP;
1878 }
1879 }
1880 /*
1881 * If a previous ntfs_truncate() failed, repeat it and abort if it
1882 * fails again.
1883 */
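	/*
	 * NInoTruncateFailed means an earlier size change could not be fully
	 * propagated to the attribute, so the ntfs sizes may be out of step
	 * with i_size.  Retrying the truncate, and re-checking the flag,
	 * ensures the write below only runs against self-consistent sizes.
	 */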
1884 if (unlikely(NInoTruncateFailed(ni))) {
1885 down_write(&vi->i_alloc_sem);
1886 err = ntfs_truncate(vi);
1887 up_write(&vi->i_alloc_sem);
1888 if (err || NInoTruncateFailed(ni)) {
1889 if (!err)
1890 err = -EIO;
1891 ntfs_error(vol->sb, "Cannot perform write to inode "
1892 "0x%lx, attribute type 0x%x, because "
1893 "ntfs_truncate() failed (error code "
1894 "%i).", vi->i_ino,
1895 (unsigned)le32_to_cpu(ni->type), err);
1896 return err;
1897 }
1898 }
1899 /* The first byte after the write. */
1900 end = pos + count;
1901 /*
1902 * If the write goes beyond the allocated size, extend the allocation
1903 * to cover the whole of the write, rounded up to the nearest cluster.
1904 */
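	/*
	 * For example, with a hypothetical 4096-byte cluster size and an
	 * allocated_size of 8192, a write ending at byte offset 10000 asks
	 * ntfs_attr_extend_allocation() for 10000 bytes, which is rounded up
	 * to the next cluster boundary, i.e. an allocation of 12288 bytes.
	 */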
1905 read_lock_irqsave(&ni->size_lock, flags);
1906 ll = ni->allocated_size;
1907 read_unlock_irqrestore(&ni->size_lock, flags);
1908 if (end > ll) {
1909 /* Extend the allocation without changing the data size. */
1910 ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1911 if (likely(ll >= 0)) {
1912 BUG_ON(pos >= ll);
1913 /* If the extension was partial truncate the write. */
1914 if (end > ll) {
1915 ntfs_debug("Truncating write to inode 0x%lx, "
1916 "attribute type 0x%x, because "
1917 "the allocation was only "
1918 "partially extended.",
1919 vi->i_ino, (unsigned)
1920 le32_to_cpu(ni->type));
1921 end = ll;
1922 count = ll - pos;
1923 }
1924 } else {
1925 err = ll;
1926 read_lock_irqsave(&ni->size_lock, flags);
1927 ll = ni->allocated_size;
1928 read_unlock_irqrestore(&ni->size_lock, flags);
1929 /* Perform a partial write if possible or fail. */
1930 if (pos < ll) {
1931 ntfs_debug("Truncating write to inode 0x%lx, "
1932 "attribute type 0x%x, because "
1933 "extending the allocation "
1934 "failed (error code %i).",
1935 vi->i_ino, (unsigned)
1936 le32_to_cpu(ni->type), err);
1937 end = ll;
1938 count = ll - pos;
1939 } else {
1940 ntfs_error(vol->sb, "Cannot perform write to "
1941 "inode 0x%lx, attribute type "
1942 "0x%x, because extending the "
1943 "allocation failed (error "
1944 "code %i).", vi->i_ino,
1945 (unsigned)
1946 le32_to_cpu(ni->type), err);
1947 return err;
1948 }
1949 }
1950 }
1951 pagevec_init(&lru_pvec, 0);
1952 written = 0;
1953 /*
1954 * If the write starts beyond the initialized size, extend it up to the
1955 * beginning of the write and initialize all non-sparse space between
1956 * the old initialized size and the new one. This automatically also
1957 * increments the vfs inode->i_size to keep it above or equal to the
1958 * initialized_size.
1959 */
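	/*
	 * For example, if initialized_size is 1000 and the write starts at
	 * pos 5000, bytes 1000-4999 are zeroed (in the page cache and, where
	 * the space is not sparse, on disk) before any new data is copied in,
	 * so a later read of that gap returns zeroes rather than stale data.
	 */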
1960 read_lock_irqsave(&ni->size_lock, flags);
1961 ll = ni->initialized_size;
1962 read_unlock_irqrestore(&ni->size_lock, flags);
1963 if (pos > ll) {
1964 err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
1965 &lru_pvec);
1966 if (err < 0) {
1967 ntfs_error(vol->sb, "Cannot perform write to inode "
1968 "0x%lx, attribute type 0x%x, because "
1969 "extending the initialized size "
1970 "failed (error code %i).", vi->i_ino,
1971 (unsigned)le32_to_cpu(ni->type), err);
1972 status = err;
1973 goto err_out;
1974 }
1975 }
1976 /*
1977 * Determine the number of pages per cluster for non-resident
1978 * attributes.
1979 */
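	/*
	 * E.g. hypothetical 64kiB clusters with 4kiB pages give nr_pages = 16;
	 * if the cluster size does not exceed PAGE_CACHE_SIZE, or the
	 * attribute is resident, nr_pages stays 1.
	 */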
1980 nr_pages = 1;
1981 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1982 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
1983 /* Finally, perform the actual write. */
1984 last_vcn = -1;
1985 if (likely(nr_segs == 1))
1986 buf = iov->iov_base;
1987 do {
1988 VCN vcn;
1989 pgoff_t idx, start_idx;
1990 unsigned ofs, do_pages, u;
1991 size_t copied;
1992
1993 start_idx = idx = pos >> PAGE_CACHE_SHIFT;
1994 ofs = pos & ~PAGE_CACHE_MASK;
1995 bytes = PAGE_CACHE_SIZE - ofs;
1996 do_pages = 1;
1997 if (nr_pages > 1) {
1998 vcn = pos >> vol->cluster_size_bits;
1999 if (vcn != last_vcn) {
2000 last_vcn = vcn;
2001 /*
2002 * Get the lcn of the vcn the write is in. If
2003 * it is a hole, need to lock down all pages in
2004 * the cluster.
2005 */
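				/*
				 * A hole can only be filled a whole cluster
				 * at a time, so if this vcn is sparse, every
				 * page backed by the cluster is grabbed and
				 * locked below, not just the pages covered by
				 * this write.  Negative lcn values are special
				 * codes; anything below LCN_HOLE is an error
				 * rather than sparse space.
				 */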
2006 down_read(&ni->runlist.lock);
2007 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
2008 vol->cluster_size_bits, FALSE);
2009 up_read(&ni->runlist.lock);
2010 if (unlikely(lcn < LCN_HOLE)) {
2011 status = -EIO;
2012 if (lcn == LCN_ENOMEM)
2013 status = -ENOMEM;
2014 else
2015 ntfs_error(vol->sb, "Cannot "
2016 "perform write to "
2017 "inode 0x%lx, "
2018 "attribute type 0x%x, "
2019 "because the attribute "
2020 "is corrupt.",
2021 vi->i_ino, (unsigned)
2022 le32_to_cpu(ni->type));
2023 break;
2024 }
2025 if (lcn == LCN_HOLE) {
2026 start_idx = (pos & ~(s64)
2027 vol->cluster_size_mask)
2028 >> PAGE_CACHE_SHIFT;
2029 bytes = vol->cluster_size - (pos &
2030 vol->cluster_size_mask);
2031 do_pages = nr_pages;
2032 }
2033 }
2034 }
2035 if (bytes > count)
2036 bytes = count;
2037 /*
2038 * Bring in the user page(s) that we will copy from _first_.
2039 		 * Otherwise we can deadlock when copying from the same page(s)
2040 		 * we are writing to while they are not yet marked uptodate.
2041 		 * Note, at present there is nothing to stop the
2042 * pages being swapped out between us bringing them into memory
2043 * and doing the actual copying.
2044 */
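		/*
		 * The deadlock being guarded against: if the user buffer is an
		 * mmap() of one of the not yet uptodate pages we are about to
		 * hold locked, faulting it in during the copy would block on
		 * the very page lock we hold.  Faulting the source in first
		 * avoids taking that fault while the destination is locked.
		 */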
2045 if (likely(nr_segs == 1))
2046 ntfs_fault_in_pages_readable(buf, bytes);
2047 else
2048 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2049 /* Get and lock @do_pages starting at index @start_idx. */
2050 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2051 pages, &cached_page, &lru_pvec);
2052 if (unlikely(status))
2053 break;
2054 /*
2055 * For non-resident attributes, we need to fill any holes with
2056 		 * actual clusters and ensure all buffers are mapped. We also
2057 * need to bring uptodate any buffers that are only partially
2058 * being written to.
2059 */
2060 if (NInoNonResident(ni)) {
2061 status = ntfs_prepare_pages_for_non_resident_write(
2062 pages, do_pages, pos, bytes);
2063 if (unlikely(status)) {
2064 loff_t i_size;
2065
2066 do {
2067 unlock_page(pages[--do_pages]);
2068 page_cache_release(pages[do_pages]);
2069 } while (do_pages);
2070 /*
2071 * The write preparation may have instantiated
2072 * allocated space outside i_size. Trim this
2073 * off again. We can ignore any errors in this
2074 				 * case as we will just be wasting a bit of
2075 * allocated space, which is not a disaster.
2076 */
2077 i_size = i_size_read(vi);
2078 if (pos + bytes > i_size)
2079 vmtruncate(vi, i_size);
2080 break;
2081 }
2082 }
2083 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2084 if (likely(nr_segs == 1)) {
2085 copied = ntfs_copy_from_user(pages + u, do_pages - u,
2086 ofs, buf, bytes);
2087 buf += copied;
2088 } else
2089 copied = ntfs_copy_from_user_iovec(pages + u,
2090 do_pages - u, ofs, &iov, &iov_ofs,
2091 bytes);
2092 ntfs_flush_dcache_pages(pages + u, do_pages - u);
2093 status = ntfs_commit_pages_after_write(pages, do_pages, pos,
2094 bytes);
2095 if (likely(!status)) {
2096 written += copied;
2097 count -= copied;
2098 pos += copied;
2099 if (unlikely(copied != bytes))
2100 status = -EFAULT;
2101 }
2102 do {
2103 unlock_page(pages[--do_pages]);
2104 mark_page_accessed(pages[do_pages]);
2105 page_cache_release(pages[do_pages]);
2106 } while (do_pages);
2107 if (unlikely(status))
2108 break;
2109 balance_dirty_pages_ratelimited(mapping);
2110 cond_resched();
2111 } while (count);
2112err_out:
2113 *ppos = pos;
2114 if (cached_page)
2115 page_cache_release(cached_page);
2116 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2117 if (likely(!status)) {
2118 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2119 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2120 status = generic_osync_inode(vi, mapping,
2121 OSYNC_METADATA|OSYNC_DATA);
2122 }
2123 }
2124 pagevec_lru_add(&lru_pvec);
2125 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2126 written ? "written" : "status", (unsigned long)written,
2127 (long)status);
2128 return written ? written : status;
2129}
2130
2131/**
2132 * ntfs_file_aio_write_nolock - write to an ntfs file (caller holds ->i_sem)
2133 */
2134static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2135 const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
2136{
2137 struct file *file = iocb->ki_filp;
2138 struct address_space *mapping = file->f_mapping;
2139 struct inode *inode = mapping->host;
2140 loff_t pos;
2141 unsigned long seg;
2142 size_t count; /* after file limit checks */
2143 ssize_t written, err;
2144
2145 count = 0;
2146 for (seg = 0; seg < nr_segs; seg++) {
2147 const struct iovec *iv = &iov[seg];
2148 /*
2149 * If any segment has a negative length, or the cumulative
2150 * length ever wraps negative then return -EINVAL.
2151 */
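		/*
		 * Both conditions are caught by the single test below: OR-ing
		 * the running total with the segment length and casting to
		 * ssize_t is negative if either value has its sign bit set,
		 * e.g. an iov_len of (size_t)-1 or a sum exceeding SSIZE_MAX.
		 */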
2152 count += iv->iov_len;
2153 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
2154 return -EINVAL;
2155 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2156 continue;
2157 if (!seg)
2158 return -EFAULT;
2159 nr_segs = seg;
2160 count -= iv->iov_len; /* This segment is no good */
2161 break;
2162 }
2163 pos = *ppos;
2164 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2165 /* We can write back this queue in page reclaim. */
2166 current->backing_dev_info = mapping->backing_dev_info;
2167 written = 0;
2168 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2169 if (err)
2170 goto out;
2171 if (!count)
2172 goto out;
2173 err = remove_suid(file->f_dentry);
2174 if (err)
2175 goto out;
2176 inode_update_time(inode, 1);
2177 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2178 count);
2179out:
2180 current->backing_dev_info = NULL;
2181 return written ? written : err;
2182}
2183
2184/**
2185 * ntfs_file_aio_write - write data to an ntfs file (->aio_write entry point)
2186 */
2187static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
2188 size_t count, loff_t pos)
2189{
2190 struct file *file = iocb->ki_filp;
2191 struct address_space *mapping = file->f_mapping;
2192 struct inode *inode = mapping->host;
2193 ssize_t ret;
2194 struct iovec local_iov = { .iov_base = (void __user *)buf,
2195 .iov_len = count };
2196
2197 BUG_ON(iocb->ki_pos != pos);
2198
2199 down(&inode->i_sem);
2200 ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
2201 up(&inode->i_sem);
2202 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2203 int err = sync_page_range(inode, mapping, pos, ret);
2204 if (err < 0)
2205 ret = err;
2206 }
2207 return ret;
2208}
2209
2210/**
2211 * ntfs_file_writev - write data to an ntfs file from a vector of buffers
2212 *
2213 * Basically the same as generic_file_writev() except that it ends up calling
2214 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2215 */
2216static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2217 unsigned long nr_segs, loff_t *ppos)
2218{
2219 struct address_space *mapping = file->f_mapping;
2220 struct inode *inode = mapping->host;
2221 struct kiocb kiocb;
2222 ssize_t ret;
2223
2224 down(&inode->i_sem);
2225 init_sync_kiocb(&kiocb, file);
2226 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2227 if (ret == -EIOCBQUEUED)
2228 ret = wait_on_sync_kiocb(&kiocb);
2229 up(&inode->i_sem);
2230 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2231 int err = sync_page_range(inode, mapping, *ppos - ret, ret);
2232 if (err < 0)
2233 ret = err;
2234 }
2235 return ret;
2236}
2237
2238/**
2239 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2240 */
2241static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2242 size_t count, loff_t *ppos)
2243{
2244 struct iovec local_iov = { .iov_base = (void __user *)buf,
2245 .iov_len = count };
2246
2247 return ntfs_file_writev(file, &local_iov, 1, ppos);
2248}
2249
2250/**
59 * ntfs_file_fsync - sync a file to disk 2251 * ntfs_file_fsync - sync a file to disk
60 * @filp: file to be synced 2252 * @filp: file to be synced
61 * @dentry: dentry describing the file to sync 2253 * @dentry: dentry describing the file to sync
@@ -113,39 +2305,39 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
113#endif /* NTFS_RW */ 2305#endif /* NTFS_RW */
114 2306
115struct file_operations ntfs_file_ops = { 2307struct file_operations ntfs_file_ops = {
116 .llseek = generic_file_llseek, /* Seek inside file. */ 2308 .llseek = generic_file_llseek, /* Seek inside file. */
117 .read = generic_file_read, /* Read from file. */ 2309 .read = generic_file_read, /* Read from file. */
118 .aio_read = generic_file_aio_read, /* Async read from file. */ 2310 .aio_read = generic_file_aio_read, /* Async read from file. */
119 .readv = generic_file_readv, /* Read from file. */ 2311 .readv = generic_file_readv, /* Read from file. */
120#ifdef NTFS_RW 2312#ifdef NTFS_RW
121 .write = generic_file_write, /* Write to file. */ 2313 .write = ntfs_file_write, /* Write to file. */
122 .aio_write = generic_file_aio_write, /* Async write to file. */ 2314 .aio_write = ntfs_file_aio_write, /* Async write to file. */
123 .writev = generic_file_writev, /* Write to file. */ 2315 .writev = ntfs_file_writev, /* Write to file. */
124 /*.release = ,*/ /* Last file is closed. See 2316 /*.release = ,*/ /* Last file is closed. See
125 fs/ext2/file.c:: 2317 fs/ext2/file.c::
126 ext2_release_file() for 2318 ext2_release_file() for
127 how to use this to discard 2319 how to use this to discard
128 preallocated space for 2320 preallocated space for
129 write opened files. */ 2321 write opened files. */
130 .fsync = ntfs_file_fsync, /* Sync a file to disk. */ 2322 .fsync = ntfs_file_fsync, /* Sync a file to disk. */
131 /*.aio_fsync = ,*/ /* Sync all outstanding async 2323 /*.aio_fsync = ,*/ /* Sync all outstanding async
132 i/o operations on a 2324 i/o operations on a
133 kiocb. */ 2325 kiocb. */
134#endif /* NTFS_RW */ 2326#endif /* NTFS_RW */
135 /*.ioctl = ,*/ /* Perform function on the 2327 /*.ioctl = ,*/ /* Perform function on the
136 mounted filesystem. */ 2328 mounted filesystem. */
137 .mmap = generic_file_mmap, /* Mmap file. */ 2329 .mmap = generic_file_mmap, /* Mmap file. */
138 .open = ntfs_file_open, /* Open file. */ 2330 .open = ntfs_file_open, /* Open file. */
139 .sendfile = generic_file_sendfile, /* Zero-copy data send with 2331 .sendfile = generic_file_sendfile, /* Zero-copy data send with
140 the data source being on 2332 the data source being on
141 the ntfs partition. We 2333 the ntfs partition. We do
142 do not need to care about 2334 not need to care about the
143 the data destination. */ 2335 data destination. */
144 /*.sendpage = ,*/ /* Zero-copy data send with 2336 /*.sendpage = ,*/ /* Zero-copy data send with
145 the data destination being 2337 the data destination being
146 on the ntfs partition. We 2338 on the ntfs partition. We
147 do not need to care about 2339 do not need to care about
148 the data source. */ 2340 the data source. */
149}; 2341};
150 2342
151struct inode_operations ntfs_file_inode_ops = { 2343struct inode_operations ntfs_file_inode_ops = {