diff options
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- | fs/fs-writeback.c | 169 |
1 files changed, 108 insertions, 61 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 30ac305..2f76c4a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,15 +26,9 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <linux/tracepoint.h> #include "internal.h" -#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) - -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -50,6 +44,21 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include <trace/events/writeback.h> + +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -65,22 +74,21 @@ int writeback_in_progress(struct backing_dev_info *bdi) static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { - spin_lock(&bdi->wb_lock); - list_add_tail(&work->list, &bdi->work_list); - spin_unlock(&bdi->wb_lock); + trace_writeback_queue(bdi, work); - /* - * If the default thread isn't there, make sure we add it. When - * it gets created and wakes up, we'll run this work. - */ - if (unlikely(list_empty_careful(&bdi->wb_list))) + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * The bdi thread isn't there, wake up the forker thread which + * will create and run it. + */ + trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); - else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); } + spin_unlock_bh(&bdi->wb_lock); } static void @@ -95,8 +103,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) + if (bdi->wb.task) { + trace_writeback_nowork(bdi); wake_up_process(bdi->wb.task); + } return; } @@ -352,7 +362,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; - if (!(inode->i_state & (I_FREEING | I_CLEAR))) { + if (!(inode->i_state & I_FREEING)) { if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { /* * More pages get dirtied by a fast dirtier. @@ -499,7 +509,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, if (inode_dirtied_after(inode, wbc->wb_start)) return 1; - BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); + BUG_ON(inode->i_state & I_FREEING); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); @@ -643,10 +653,14 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) __writeback_inodes_sb(work->sb, wb, &wbc); else writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; @@ -674,6 +688,7 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = list_entry(wb->b_more_io.prev, struct inode, i_list); + trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } spin_unlock(&inode_lock); @@ -686,17 +701,17 @@ static long wb_writeback(struct bdi_writeback *wb, * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) +get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); if (!list_empty(&bdi->work_list)) { work = list_entry(bdi->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); return work; } @@ -744,7 +759,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) struct wb_writeback_work *work; long wrote = 0; - while ((work = get_next_work_item(bdi, wb)) != NULL) { + while ((work = get_next_work_item(bdi)) != NULL) { /* * Override sync mode, in case we must wait for completion * because this thread is exiting now. @@ -752,6 +767,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->sync_mode = WB_SYNC_ALL; + trace_writeback_exec(bdi, work); + wrote += wb_writeback(wb, work); /* @@ -776,47 +793,66 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Handle writeback of dirty data for the device backed by this bdi. Also * wakes up periodically and does kupdated style flushing. */ -int bdi_writeback_task(struct bdi_writeback *wb) +int bdi_writeback_thread(void *data) { - unsigned long last_active = jiffies; - unsigned long wait_jiffies = -1UL; + struct bdi_writeback *wb = data; + struct backing_dev_info *bdi = wb->bdi; long pages_written; + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + wb->last_active = jiffies; + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); + + trace_writeback_thread_start(bdi); + while (!kthread_should_stop()) { + /* + * Remove own delayed wake-up timer, since we are already awake + * and we'll take care of the preriodic write-back. + */ + del_timer(&wb->wakeup_timer); + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + if (pages_written) - last_active = jiffies; - else if (wait_jiffies != -1UL) { - unsigned long max_idle; + wb->last_active = jiffies; - /* - * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the task will get - * recreated automatically. - */ - max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + last_active)) - break; + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&bdi->work_list)) { + __set_current_state(TASK_RUNNING); + continue; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); - } else { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty_careful(&wb->bdi->work_list) && - !kthread_should_stop()) - schedule(); - __set_current_state(TASK_RUNNING); + if (wb_has_dirty_io(wb) && dirty_writeback_interval) + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + else { + /* + * We have nothing to do, so can go sleep without any + * timeout and save power. When a work is queued or + * something is made dirty - we will be woken up. + */ + schedule(); } try_to_freeze(); } + /* Flush any work that raced with us exiting */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); + + trace_writeback_thread_stop(bdi); return 0; } + /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. @@ -891,6 +927,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -936,7 +974,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (hlist_unhashed(&inode->i_hash)) goto out; } - if (inode->i_state & (I_FREEING|I_CLEAR)) + if (inode->i_state & I_FREEING) goto out; /* @@ -944,22 +982,31 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - struct backing_dev_info *bdi = wb->bdi; - - if (bdi_cap_writeback_dirty(bdi) && - !test_bit(BDI_registered, &bdi->state)) { - WARN_ON(1); - printk(KERN_ERR "bdi-%s not registered\n", - bdi->name); + bdi = inode_to_bdi(inode); + + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. + */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; } inode->dirtied_when = jiffies; - list_move(&inode->i_list, &wb->b_dirty); + list_move(&inode->i_list, &bdi->wb.b_dirty); } } out: spin_unlock(&inode_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1002,7 +1049,7 @@ static void wait_sb_inodes(struct super_block *sb) list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; mapping = inode->i_mapping; if (mapping->nrpages == 0) |