aboutsummaryrefslogtreecommitdiffstats
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c781
1 files changed, 548 insertions, 233 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413..79f24a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
-#include "filemap.h"
+#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include "internal.h"
/*
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
+ * ->zone.lock
*
* ->i_mutex
* ->i_mmap_lock (truncate->unmap_mapping_range)
@@ -593,7 +594,7 @@ void fastcall __lock_page_nosync(struct page *page)
* Is there a pagecache struct page at the given (mapping, offset) tuple?
* If yes, increment its refcount and return it; if no, return NULL.
*/
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
{
struct page *page;
@@ -617,30 +618,31 @@ EXPORT_SYMBOL(find_get_page);
* Returns zero if the page was not present. find_lock_page() may sleep.
*/
struct page *find_lock_page(struct address_space *mapping,
- unsigned long offset)
+ pgoff_t offset)
{
struct page *page;
- read_lock_irq(&mapping->tree_lock);
repeat:
+ read_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page) {
page_cache_get(page);
if (TestSetPageLocked(page)) {
read_unlock_irq(&mapping->tree_lock);
__lock_page(page);
- read_lock_irq(&mapping->tree_lock);
/* Has the page been truncated while we slept? */
- if (unlikely(page->mapping != mapping ||
- page->index != offset)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
page_cache_release(page);
goto repeat;
}
+ VM_BUG_ON(page->index != offset);
+ goto out;
}
}
read_unlock_irq(&mapping->tree_lock);
+out:
return page;
}
EXPORT_SYMBOL(find_lock_page);
@@ -663,29 +665,24 @@ EXPORT_SYMBOL(find_lock_page);
* memory exhaustion.
*/
struct page *find_or_create_page(struct address_space *mapping,
- unsigned long index, gfp_t gfp_mask)
+ pgoff_t index, gfp_t gfp_mask)
{
- struct page *page, *cached_page = NULL;
+ struct page *page;
int err;
repeat:
page = find_lock_page(mapping, index);
if (!page) {
- if (!cached_page) {
- cached_page =
- __page_cache_alloc(gfp_mask);
- if (!cached_page)
- return NULL;
+ page = __page_cache_alloc(gfp_mask);
+ if (!page)
+ return NULL;
+ err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ if (unlikely(err)) {
+ page_cache_release(page);
+ page = NULL;
+ if (err == -EEXIST)
+ goto repeat;
}
- err = add_to_page_cache_lru(cached_page, mapping,
- index, gfp_mask);
- if (!err) {
- page = cached_page;
- cached_page = NULL;
- } else if (err == -EEXIST)
- goto repeat;
}
- if (cached_page)
- page_cache_release(cached_page);
return page;
}
EXPORT_SYMBOL(find_or_create_page);
@@ -797,7 +794,7 @@ EXPORT_SYMBOL(find_get_pages_tag);
* and deadlock against the caller's locked page.
*/
struct page *
-grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
+grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
struct page *page = find_get_page(mapping, index);
@@ -859,34 +856,29 @@ static void shrink_readahead_size_eio(struct file *filp,
* It may be NULL.
*/
void do_generic_mapping_read(struct address_space *mapping,
- struct file_ra_state *_ra,
+ struct file_ra_state *ra,
struct file *filp,
loff_t *ppos,
read_descriptor_t *desc,
read_actor_t actor)
{
struct inode *inode = mapping->host;
- unsigned long index;
- unsigned long offset;
- unsigned long last_index;
- unsigned long next_index;
- unsigned long prev_index;
+ pgoff_t index;
+ pgoff_t last_index;
+ pgoff_t prev_index;
+ unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
- struct page *cached_page;
int error;
- struct file_ra_state ra = *_ra;
- cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
- next_index = index;
- prev_index = ra.prev_index;
- prev_offset = ra.prev_offset;
+ prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
+ prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
for (;;) {
struct page *page;
- unsigned long end_index;
+ pgoff_t end_index;
loff_t isize;
unsigned long nr, ret;
@@ -895,7 +887,7 @@ find_page:
page = find_get_page(mapping, index);
if (!page) {
page_cache_sync_readahead(mapping,
- &ra, filp,
+ ra, filp,
index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL))
@@ -903,7 +895,7 @@ find_page:
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
- &ra, filp, page,
+ ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page))
@@ -966,7 +958,6 @@ page_ok:
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
- ra.prev_offset = offset;
page_cache_release(page);
if (ret == nr && desc->count)
@@ -1015,7 +1006,7 @@ readpage:
}
unlock_page(page);
error = -EIO;
- shrink_readahead_size_eio(filp, &ra);
+ shrink_readahead_size_eio(filp, ra);
goto readpage_error;
}
unlock_page(page);
@@ -1034,33 +1025,29 @@ no_cached_page:
* Ok, it wasn't cached, so we need to create a new
* page..
*/
- if (!cached_page) {
- cached_page = page_cache_alloc_cold(mapping);
- if (!cached_page) {
- desc->error = -ENOMEM;
- goto out;
- }
+ page = page_cache_alloc_cold(mapping);
+ if (!page) {
+ desc->error = -ENOMEM;
+ goto out;
}
- error = add_to_page_cache_lru(cached_page, mapping,
+ error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
+ page_cache_release(page);
if (error == -EEXIST)
goto find_page;
desc->error = error;
goto out;
}
- page = cached_page;
- cached_page = NULL;
goto readpage;
}
out:
- *_ra = ra;
- _ra->prev_index = prev_index;
+ ra->prev_pos = prev_index;
+ ra->prev_pos <<= PAGE_CACHE_SHIFT;
+ ra->prev_pos |= prev_offset;
- *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
- if (cached_page)
- page_cache_release(cached_page);
+ *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
if (filp)
file_accessed(filp);
}
@@ -1220,7 +1207,7 @@ EXPORT_SYMBOL(generic_file_aio_read);
static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
- unsigned long index, unsigned long nr)
+ pgoff_t index, unsigned long nr)
{
if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
return -EINVAL;
@@ -1240,8 +1227,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
if (file) {
if (file->f_mode & FMODE_READ) {
struct address_space *mapping = file->f_mapping;
- unsigned long start = offset >> PAGE_CACHE_SHIFT;
- unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t start = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
unsigned long len = end - start + 1;
ret = do_readahead(mapping, file, start, len);
}
@@ -1251,7 +1238,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
}
#ifdef CONFIG_MMU
-static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
/**
* page_cache_read - adds requested page to the page cache if not already there
* @file: file to read
@@ -1260,7 +1246,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static int fastcall page_cache_read(struct file * file, unsigned long offset)
+static int fastcall page_cache_read(struct file * file, pgoff_t offset)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
@@ -1349,7 +1335,7 @@ retry_find:
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
*/
- if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
+ if (ra->mmap_miss > MMAP_LOTSAMISS)
goto no_cached_page;
/*
@@ -1375,7 +1361,7 @@ retry_find:
}
if (!did_readaround)
- ra->mmap_hit++;
+ ra->mmap_miss--;
/*
* We have a locked page in the page cache, now we need to check
@@ -1396,7 +1382,7 @@ retry_find:
* Found the page and have a reference on it.
*/
mark_page_accessed(page);
- ra->prev_index = page->index;
+ ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1501,39 +1487,32 @@ EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *__read_cache_page(struct address_space *mapping,
- unsigned long index,
+ pgoff_t index,
int (*filler)(void *,struct page*),
void *data)
{
- struct page *page, *cached_page = NULL;
+ struct page *page;
int err;
repeat:
page = find_get_page(mapping, index);
if (!page) {
- if (!cached_page) {
- cached_page = page_cache_alloc_cold(mapping);
- if (!cached_page)
- return ERR_PTR(-ENOMEM);
- }
- err = add_to_page_cache_lru(cached_page, mapping,
- index, GFP_KERNEL);
- if (err == -EEXIST)
- goto repeat;
- if (err < 0) {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+ err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+ if (unlikely(err)) {
+ page_cache_release(page);
+ if (err == -EEXIST)
+ goto repeat;
/* Presumably ENOMEM for radix tree node */
- page_cache_release(cached_page);
return ERR_PTR(err);
}
- page = cached_page;
- cached_page = NULL;
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
page = ERR_PTR(err);
}
}
- if (cached_page)
- page_cache_release(cached_page);
return page;
}
@@ -1542,7 +1521,7 @@ repeat:
* after submitting it to the filler.
*/
struct page *read_cache_page_async(struct address_space *mapping,
- unsigned long index,
+ pgoff_t index,
int (*filler)(void *,struct page*),
void *data)
{
@@ -1590,7 +1569,7 @@ EXPORT_SYMBOL(read_cache_page_async);
* If the page does not get brought uptodate, return -EIO.
*/
struct page *read_cache_page(struct address_space *mapping,
- unsigned long index,
+ pgoff_t index,
int (*filler)(void *,struct page*),
void *data)
{
@@ -1610,40 +1589,6 @@ struct page *read_cache_page(struct address_space *mapping,
EXPORT_SYMBOL(read_cache_page);
/*
- * If the page was newly created, increment its refcount and add it to the
- * caller's lru-buffering pagevec. This function is specifically for
- * generic_file_write().
- */
-static inline struct page *
-__grab_cache_page(struct address_space *mapping, unsigned long index,
- struct page **cached_page, struct pagevec *lru_pvec)
-{
- int err;
- struct page *page;
-repeat:
- page = find_lock_page(mapping, index);
- if (!page) {
- if (!*cached_page) {
- *cached_page = page_cache_alloc(mapping);
- if (!*cached_page)
- return NULL;
- }
- err = add_to_page_cache(*cached_page, mapping,
- index, GFP_KERNEL);
- if (err == -EEXIST)
- goto repeat;
- if (err == 0) {
- page = *cached_page;
- page_cache_get(page);
- if (!pagevec_add(lru_pvec, page))
- __pagevec_lru_add(lru_pvec);
- *cached_page = NULL;
- }
- }
- return page;
-}
-
-/*
* The logic we want is
*
* if suid or (sgid and xgrp)
@@ -1682,17 +1627,22 @@ int __remove_suid(struct dentry *dentry, int kill)
int remove_suid(struct dentry *dentry)
{
- int kill = should_remove_suid(dentry);
+ int killsuid = should_remove_suid(dentry);
+ int killpriv = security_inode_need_killpriv(dentry);
+ int error = 0;
- if (unlikely(kill))
- return __remove_suid(dentry, kill);
+ if (killpriv < 0)
+ return killpriv;
+ if (killpriv)
+ error = security_inode_killpriv(dentry);
+ if (!error && killsuid)
+ error = __remove_suid(dentry, killsuid);
- return 0;
+ return error;
}
EXPORT_SYMBOL(remove_suid);
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
size_t copied = 0, left = 0;
@@ -1715,6 +1665,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr,
}
/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were sucessfully copied. If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ BUG_ON(!in_atomic());
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user_inatomic_nocache(kaddr + offset,
+ buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr, KM_USER0);
+
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+/*
+ * This has the same sideeffects and return value as
+ * iov_iter_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iov_iter_copy_from_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap(page);
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user);
+
+static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
+{
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ } else {
+ const struct iovec *iov = i->iov;
+ size_t base = i->iov_offset;
+
+ while (bytes) {
+ int copy = min(bytes, iov->iov_len - base);
+
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ base = 0;
+ }
+ }
+ i->iov = iov;
+ i->iov_offset = base;
+ }
+}
+
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+
+ __iov_iter_advance_iov(i, bytes);
+ i->count -= bytes;
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+ return fault_in_pages_readable(buf, bytes);
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+size_t iov_iter_single_seg_count(struct iov_iter *i)
+{
+ const struct iovec *iov = i->iov;
+ if (i->nr_segs == 1)
+ return i->count;
+ else
+ return min(i->count, iov->iov_len - i->iov_offset);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
+
+/*
* Performs necessary checks before doing a write
*
* Can adjust writing position or amount of bytes to write.
@@ -1796,6 +1864,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
}
EXPORT_SYMBOL(generic_write_checks);
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+
+ if (aops->write_begin) {
+ return aops->write_begin(file, mapping, pos, len, flags,
+ pagep, fsdata);
+ } else {
+ int ret;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ struct inode *inode = mapping->host;
+ struct page *page;
+again:
+ page = __grab_cache_page(mapping, index);
+ *pagep = page;
+ if (!page)
+ return -ENOMEM;
+
+ if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
+ /*
+ * There is no way to resolve a short write situation
+ * for a !Uptodate page (except by double copying in
+ * the caller done by generic_perform_write_2copy).
+ *
+ * Instead, we have to bring it uptodate here.
+ */
+ ret = aops->readpage(file, page);
+ page_cache_release(page);
+ if (ret) {
+ if (ret == AOP_TRUNCATED_PAGE)
+ goto again;
+ return ret;
+ }
+ goto again;
+ }
+
+ ret = aops->prepare_write(file, page, offset, offset+len);
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+ }
+ return ret;
+ }
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+ int ret;
+
+ if (aops->write_end) {
+ mark_page_accessed(page);
+ ret = aops->write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ } else {
+ unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ struct inode *inode = mapping->host;
+
+ flush_dcache_page(page);
+ ret = aops->commit_write(file, page, offset, offset+len);
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
+
+ if (ret < 0) {
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+ } else if (ret > 0)
+ ret = min_t(size_t, copied, ret);
+ else
+ ret = copied;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1835,151 +1988,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/*
+ * Find or create a page at the given pagecache position. Return the locked
+ * page. This function is specifically for buffered writes.
+ */
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
-
- pagevec_init(&lru_pvec, 0);
+ int status;
+ struct page *page;
+repeat:
+ page = find_lock_page(mapping, index);
+ if (likely(page))
+ return page;
- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
+ page = page_cache_alloc(mapping);
+ if (!page)
+ return NULL;
+ status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+ if (unlikely(status)) {
+ page_cache_release(page);
+ if (status == -EEXIST)
+ goto repeat;
+ return NULL;
}
+ return page;
+}
+EXPORT_SYMBOL(__grab_cache_page);
+
+static ssize_t generic_perform_write_2copy(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status = 0;
+ ssize_t written = 0;
do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
+ struct page *src_page;
+ struct page *page;
+ pgoff_t index; /* Pagecache index for current page */
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ offset = (pos & (PAGE_CACHE_SIZE - 1));
index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_count(i));
- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
-
- /* We only need to worry about prefaulting when writes are from
- * user-space. NFSd uses vfs_writev with several non-aligned
- * segments in the vector, and limiting to one segment a time is
- * a noticeable performance for re-write
+ /*
+ * a non-NULL src_page indicates that we're doing the
+ * copy via get_user_pages and kmap.
*/
- if (!segment_eq(get_fs(), KERNEL_DS)) {
- /*
- * Limit the size of the copy to that of the current
- * segment, because fault_in_pages_readable() doesn't
- * know how to walk segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ src_page = NULL;
- /*
- * Bring in the user page that we will copy from
- * _first_. Otherwise there's a nasty deadlock on
- * copying from the same page as we're writing to,
- * without it being marked up-to-date.
- */
- fault_in_pages_readable(buf, bytes);
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
}
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
+
+ page = __grab_cache_page(mapping, index);
if (!page) {
status = -ENOMEM;
break;
}
- if (unlikely(bytes == 0)) {
- status = 0;
- copied = 0;
- goto zero_length_segment;
- }
+ /*
+ * non-uptodate pages cannot cope with short copies, and we
+ * cannot take a pagefault with the destination page locked.
+ * So pin the source page to copy it.
+ */
+ if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
+ unlock_page(page);
- status = a_ops->prepare_write(file, page, offset, offset+bytes);
- if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ src_page = alloc_page(GFP_KERNEL);
+ if (!src_page) {
+ page_cache_release(page);
+ status = -ENOMEM;
+ break;
+ }
- if (status != AOP_TRUNCATED_PAGE)
+ /*
+ * Cannot get_user_pages with a page locked for the
+ * same reason as we can't take a page fault with a
+ * page locked (as explained below).
+ */
+ copied = iov_iter_copy_from_user(src_page, i,
+ offset, bytes);
+ if (unlikely(copied == 0)) {
+ status = -EFAULT;
+ page_cache_release(page);
+ page_cache_release(src_page);
+ break;
+ }
+ bytes = copied;
+
+ lock_page(page);
+ /*
+ * Can't handle the page going uptodate here, because
+ * that means we would use non-atomic usercopies, which
+ * zero out the tail of the page, which can cause
+ * zeroes to become transiently visible. We could just
+ * use a non-zeroing copy, but the APIs aren't too
+ * consistent.
+ */
+ if (unlikely(!page->mapping || PageUptodate(page))) {
unlock_page(page);
- page_cache_release(page);
- if (status == AOP_TRUNCATED_PAGE)
+ page_cache_release(page);
+ page_cache_release(src_page);
continue;
+ }
+ }
+
+ status = a_ops->prepare_write(file, page, offset, offset+bytes);
+ if (unlikely(status))
+ goto fs_write_aop_error;
+
+ if (!src_page) {
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
+ * Must not enter the pagefault handler here, because
+ * we hold the page lock, so we might recursively
+ * deadlock on the same lock, or get an ABBA deadlock
+ * against a different lock, or against the mmap_sem
+ * (which nests outside the page lock). So increment
+ * preempt count, and use _atomic usercopies.
+ *
+ * The page is uptodate so we are OK to encounter a
+ * short copy: if unmodified parts of the page are
+ * marked dirty and written out to disk, it doesn't
+ * really matter.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i,
+ offset, bytes);
+ pagefault_enable();
+ } else {
+ void *src, *dst;
+ src = kmap_atomic(src_page, KM_USER0);
+ dst = kmap_atomic(page, KM_USER1);
+ memcpy(dst + offset, src + offset, bytes);
+ kunmap_atomic(dst, KM_USER1);
+ kunmap_atomic(src, KM_USER0);
+ copied = bytes;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
flush_dcache_page(page);
+
status = a_ops->commit_write(file, page, offset, offset+bytes);
- if (status == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- continue;
- }
-zero_length_segment:
- if (likely(copied >= 0)) {
- if (!status)
- status = copied;
-
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
- filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
+ if (unlikely(status < 0))
+ goto fs_write_aop_error;
+ if (unlikely(status > 0)) /* filesystem did partial write */
+ copied = min_t(size_t, copied, status);
+
unlock_page(page);
mark_page_accessed(page);
page_cache_release(page);
- if (status < 0)
- break;
+ if (src_page)
+ page_cache_release(src_page);
+
+ iov_iter_advance(i, copied);
+ pos += copied;
+ written += copied;
+
balance_dirty_pages_ratelimited(mapping);
cond_resched();
- } while (count);
- *ppos = pos;
+ continue;
- if (cached_page)
- page_cache_release(cached_page);
+fs_write_aop_error:
+ unlock_page(page);
+ page_cache_release(page);
+ if (src_page)
+ page_cache_release(src_page);
+
+ /*
+ * prepare_write() may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + bytes > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+ break;
+ } while (iov_iter_count(i));
+
+ return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = 0;
/*
- * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
+ * Copies from kernel address space cannot fail (NFSD is a big user).
*/
+ if (segment_eq(get_fs(), KERNEL_DS))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ do {
+ struct page *page;
+ pgoff_t index; /* Pagecache index for current page */
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE - 1));
+ index = pos >> PAGE_CACHE_SHIFT;
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_count(i));
+
+again:
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+
+ status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ &page, &fsdata);
+ if (unlikely(status))
+ break;
+
+ pagefault_disable();
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
+ flush_dcache_page(page);
+
+ status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ page, fsdata);
+ if (unlikely(status < 0))
+ break;
+ copied = status;
+
+ cond_resched();
+
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ iov_iter_advance(i, copied);
+ pos += copied;
+ written += copied;
+
+ balance_dirty_pages_ratelimited(mapping);
+
+ } while (iov_iter_count(i));
+
+ return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ ssize_t status;
+ struct iov_iter i;
+
+ iov_iter_init(&i, iov, nr_segs, count, written);
+ if (a_ops->write_begin)
+ status = generic_perform_write(file, &i, pos);
+ else
+ status = generic_perform_write_2copy(file, &i, pos);
+
if (likely(status >= 0)) {
+ written += status;
+ *ppos = pos + status;
+
+ /*
+ * For now, when the user asks for O_SYNC, we'll actually give
+ * O_DSYNC
+ */
if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (!a_ops->writepage || !is_sync_kiocb(iocb))
status = generic_osync_inode(inode, mapping,
@@ -1995,7 +2311,6 @@ zero_length_segment:
if (unlikely(file->f_flags & O_DIRECT) && written)
status = filemap_write_and_wait(mapping);
- pagevec_lru_add(&lru_pvec);
return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);