diff options
47 files changed, 3183 insertions, 1550 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback new file mode 100644 index 0000000..8bb43b6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback @@ -0,0 +1,17 @@ +What: /sys/module/xen_blkback/parameters/max_buffer_pages +Date: March 2013 +KernelVersion: 3.11 +Contact: Roger Pau MonnĂ© <roger.pau@citrix.com> +Description: + Maximum number of free pages to keep in each block + backend buffer. + +What: /sys/module/xen_blkback/parameters/max_persistent_grants +Date: March 2013 +KernelVersion: 3.11 +Contact: Roger Pau MonnĂ© <roger.pau@citrix.com> +Description: + Maximum number of grants to map persistently in + blkback. If the frontend tries to use more than + max_persistent_grants, the LRU kicks in and starts + removing 5% of max_persistent_grants every 100ms. diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront new file mode 100644 index 0000000..c0a6cb7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront @@ -0,0 +1,10 @@ +What: /sys/module/xen_blkfront/parameters/max +Date: June 2013 +KernelVersion: 3.11 +Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Description: + Maximum number of segments that the frontend will negotiate + with the backend for indirect descriptors. The default value + is 32 - higher value means more potential throughput but more + memory usage. The backend picks the minimum of the frontend + and its default backend value. diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt index c3365f2..32b6c31 100644 --- a/Documentation/bcache.txt +++ b/Documentation/bcache.txt @@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't have to manually attach: make-bcache -B /dev/sda /dev/sdb -C /dev/sdc -To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: +bcache-tools now ships udev rules, and bcache devices are known to the kernel +immediately. Without udev, you can manually register devices like this: echo /dev/sdb > /sys/fs/bcache/register echo /dev/sdc > /sys/fs/bcache/register -To register your bcache devices automatically, you could add something like -this to an init script: +Registering the backing device makes the bcache device show up in /dev; you can +now format it and use it as normal. But the first time using a new bcache +device, it'll be running in passthrough mode until you attach it to a cache. +See the section on attaching. - echo /dev/sd* > /sys/fs/bcache/register_quiet +The devices show up as: -It'll look for bcache superblocks and ignore everything that doesn't have one. + /dev/bcache<N> -Registering the backing device makes the bcache show up in /dev; you can now -format it and use it as normal. But the first time using a new bcache device, -it'll be running in passthrough mode until you attach it to a cache. See the -section on attaching. +As well as (with udev): -The devices show up at /dev/bcacheN, and can be controlled via sysfs from -/sys/block/bcacheN/bcache: + /dev/bcache/by-uuid/<uuid> + /dev/bcache/by-label/<label> + +To get started: mkfs.ext4 /dev/bcache0 mount /dev/bcache0 /mnt +You can control bcache devices through sysfs at /sys/block/bcache<N>/bcache . + Cache devices are managed as sets; multiple caches per set isn't supported yet but will allow for mirroring of metadata and dirty data in the future. Your new cache set shows up as /sys/fs/bcache/<UUID> @@ -80,11 +84,11 @@ must be attached to your cache set to enable caching. Attaching a backing device to a cache set is done thusly, with the UUID of the cache set in /sys/fs/bcache: - echo <UUID> > /sys/block/bcache0/bcache/attach + echo <CSET-UUID> > /sys/block/bcache0/bcache/attach This only has to be done once. The next time you reboot, just reregister all your bcache devices. If a backing device has data in a cache somewhere, the -/dev/bcache# device won't be created until the cache shows up - particularly +/dev/bcache<N> device won't be created until the cache shows up - particularly important if you have writeback caching turned on. If you're booting up and your cache device is gone and never coming back, you @@ -191,6 +195,9 @@ want for getting the best possible numbers when benchmarking. SYSFS - BACKING DEVICE: +Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and +(if attached) /sys/fs/bcache/<cset-uuid>/bdev* + attach Echo the UUID of a cache set to this file to enable caching. @@ -300,6 +307,8 @@ cache_readaheads SYSFS - CACHE SET: +Available at /sys/fs/bcache/<cset-uuid> + average_key_size Average data per key in the btree. @@ -390,6 +399,8 @@ trigger_gc SYSFS - CACHE DEVICE: +Available at /sys/block/<cdev>/bcache + block_size Minimum granularity of writes - should match hardware sector size. diff --git a/MAINTAINERS b/MAINTAINERS index bf61e04..5d3facf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1642,7 +1642,7 @@ S: Maintained F: drivers/net/hamradio/baycom* BCACHE (BLOCK LAYER CACHE) -M: Kent Overstreet <koverstreet@google.com> +M: Kent Overstreet <kmo@daterainc.com> L: linux-bcache@vger.kernel.org W: http://bcache.evilpiepirate.org S: Maintained: @@ -3346,7 +3346,7 @@ F: Documentation/firmware_class/ F: drivers/base/firmware*.c F: include/linux/firmware.h -FLASHSYSTEM DRIVER (IBM FlashSystem 70/80 PCI SSD Flash Card) +FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card) M: Joshua Morris <josh.h.morris@us.ibm.com> M: Philip Kelleher <pjk1939@linux.vnet.ibm.com> S: Maintained diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index b81ddfe..e07a5fd 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -532,11 +532,11 @@ config BLK_DEV_RBD If unsure, say N. config BLK_DEV_RSXX - tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver" + tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver" depends on PCI help Device driver for IBM's high speed PCIe SSD - storage devices: FlashSystem-70 and FlashSystem-80. + storage device: Flash Adapter 900GB Full Height. To compile this driver as a module, choose M here: the module will be called rsxx. diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 6608076..28c73ca 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -659,6 +659,27 @@ void drbd_al_shrink(struct drbd_conf *mdev) wake_up(&mdev->al_wait); } +int drbd_initialize_al(struct drbd_conf *mdev, void *buffer) +{ + struct al_transaction_on_disk *al = buffer; + struct drbd_md *md = &mdev->ldev->md; + sector_t al_base = md->md_offset + md->al_offset; + int al_size_4k = md->al_stripes * md->al_stripe_size_4k; + int i; + + memset(al, 0, 4096); + al->magic = cpu_to_be32(DRBD_AL_MAGIC); + al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); + al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + + for (i = 0; i < al_size_4k; i++) { + int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE); + if (err) + return err; + } + return 0; +} + static int w_update_odbm(struct drbd_work *w, int unused) { struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f943aac..2d7f608 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -832,6 +832,7 @@ struct drbd_tconn { /* is a resource from the config file */ unsigned susp_nod:1; /* IO suspended because no data */ unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ struct mutex cstate_mutex; /* Protects graceful disconnects */ + unsigned int connect_cnt; /* Inc each time a connection is established */ unsigned long flags; struct net_conf *net_conf; /* content protected by rcu */ @@ -1132,6 +1133,7 @@ extern void drbd_mdev_cleanup(struct drbd_conf *mdev); void drbd_print_uuids(struct drbd_conf *mdev, const char *text); extern void conn_md_sync(struct drbd_tconn *tconn); +extern void drbd_md_write(struct drbd_conf *mdev, void *buffer); extern void drbd_md_sync(struct drbd_conf *mdev); extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); @@ -1466,8 +1468,16 @@ extern void drbd_suspend_io(struct drbd_conf *mdev); extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); -enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); +enum determine_dev_size { + DS_ERROR_SHRINK = -3, + DS_ERROR_SPACE_MD = -2, + DS_ERROR = -1, + DS_UNCHANGED = 0, + DS_SHRUNK = 1, + DS_GREW = 2 +}; +extern enum determine_dev_size +drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, @@ -1633,6 +1643,7 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, #define drbd_set_out_of_sync(mdev, sector, size) \ __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) extern void drbd_al_shrink(struct drbd_conf *mdev); +extern int drbd_initialize_al(struct drbd_conf *, void *); /* drbd_nl.c */ /* state info broadcast */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a5dca6a..55635ed 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2762,8 +2762,6 @@ int __init drbd_init(void) /* * allocate all necessary structs */ - err = -ENOMEM; - init_waitqueue_head(&drbd_pp_wait); drbd_proc = NULL; /* play safe for drbd_cleanup */ @@ -2773,6 +2771,7 @@ int __init drbd_init(void) if (err) goto fail; + err = -ENOMEM; drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); if (!drbd_proc) { printk(KERN_ERR "drbd: unable to register proc file\n"); @@ -2803,7 +2802,6 @@ int __init drbd_init(void) fail: drbd_cleanup(); if (err == -ENOMEM) - /* currently always the case */ printk(KERN_ERR "drbd: ran out of memory\n"); else printk(KERN_ERR "drbd: initialization failure\n"); @@ -2881,34 +2879,14 @@ struct meta_data_on_disk { u8 reserved_u8[4096 - (7*8 + 10*4)]; } __packed; -/** - * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set - * @mdev: DRBD device. - */ -void drbd_md_sync(struct drbd_conf *mdev) + + +void drbd_md_write(struct drbd_conf *mdev, void *b) { - struct meta_data_on_disk *buffer; + struct meta_data_on_disk *buffer = b; sector_t sector; int i; - /* Don't accidentally change the DRBD meta data layout. */ - BUILD_BUG_ON(UI_SIZE != 4); - BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); - - del_timer(&mdev->md_sync_timer); - /* timer may be rearmed by drbd_md_mark_dirty() now. */ - if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) - return; - - /* We use here D_FAILED and not D_ATTACHING because we try to write - * metadata even if we detach due to a disk failure! */ - if (!get_ldev_if_state(mdev, D_FAILED)) - return; - - buffer = drbd_md_get_buffer(mdev); - if (!buffer) - goto out; - memset(buffer, 0, sizeof(*buffer)); buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); @@ -2937,6 +2915,35 @@ void drbd_md_sync(struct drbd_conf *mdev) dev_err(DEV, "meta data update failed!\n"); drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); } +} + +/** + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set + * @mdev: DRBD device. + */ +void drbd_md_sync(struct drbd_conf *mdev) +{ + struct meta_data_on_disk *buffer; + + /* Don't accidentally change the DRBD meta data layout. */ + BUILD_BUG_ON(UI_SIZE != 4); + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); + + del_timer(&mdev->md_sync_timer); + /* timer may be rearmed by drbd_md_mark_dirty() now. */ + if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) + return; + + /* We use here D_FAILED and not D_ATTACHING because we try to write + * metadata even if we detach due to a disk failure! */ + if (!get_ldev_if_state(mdev, D_FAILED)) + return; + + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; + + drbd_md_write(mdev, buffer); /* Update mdev->ldev->md.la_size_sect, * since we updated it on metadata. */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9e3f441..8cc1e64 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -417,6 +417,7 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) bool conn_try_outdate_peer(struct drbd_tconn *tconn) { + unsigned int connect_cnt; union drbd_state mask = { }; union drbd_state val = { }; enum drbd_fencing_p fp; @@ -428,6 +429,10 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn) return false; } + spin_lock_irq(&tconn->req_lock); + connect_cnt = tconn->connect_cnt; + spin_unlock_irq(&tconn->req_lock); + fp = highest_fencing_policy(tconn); switch (fp) { case FP_NOT_AVAIL: @@ -492,8 +497,14 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn) here, because we might were able to re-establish the connection in the meantime. */ spin_lock_irq(&tconn->req_lock); - if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) - _conn_request_state(tconn, mask, val, CS_VERBOSE); + if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) { + if (tconn->connect_cnt != connect_cnt) + /* In case the connection was established and droped + while the fence-peer handler was running, ignore it */ + conn_info(tconn, "Ignoring fence-peer exit code\n"); + else + _conn_request_state(tconn, mask, val, CS_VERBOSE); + } spin_unlock_irq(&tconn->req_lock); return conn_highest_pdsk(tconn) <= D_OUTDATED; @@ -816,15 +827,20 @@ void drbd_resume_io(struct drbd_conf *mdev) * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function. */ -enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) +enum determine_dev_size +drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ sector_t la_size_sect, u_size; + struct drbd_md *md = &mdev->ldev->md; + u32 prev_al_stripe_size_4k; + u32 prev_al_stripes; sector_t size; char ppb[10]; + void *buffer; int md_moved, la_size_changed; - enum determine_dev_size rv = unchanged; + enum determine_dev_size rv = DS_UNCHANGED; /* race: * application request passes inc_ap_bio, @@ -836,6 +852,11 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds * still lock the act_log to not trigger ASSERTs there. */ drbd_suspend_io(mdev); + buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */ + if (!buffer) { + drbd_resume_io(mdev); + return DS_ERROR; + } /* no wait necessary anymore, actually we could assert that */ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); @@ -844,7 +865,17 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds prev_size = mdev->ldev->md.md_size_sect; la_size_sect = mdev->ldev->md.la_size_sect; - /* TODO: should only be some assert here, not (re)init... */ + if (rs) { + /* rs is non NULL if we should change the AL layout only */ + + prev_al_stripes = md->al_stripes; + prev_al_stripe_size_4k = md->al_stripe_size_4k; + + md->al_stripes = rs->al_stripes; + md->al_stripe_size_4k = rs->al_stripe_size / 4; + md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; + } + drbd_md_set_sector_offsets(mdev, mdev->ldev); rcu_read_lock(); @@ -852,6 +883,21 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds rcu_read_unlock(); size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); + if (size < la_size_sect) { + if (rs && u_size == 0) { + /* Remove "rs &&" later. This check should always be active, but + right now the receiver expects the permissive behavior */ + dev_warn(DEV, "Implicit shrink not allowed. " + "Use --size=%llus for explicit shrink.\n", + (unsigned long long)size); + rv = DS_ERROR_SHRINK; + } + if (u_size > size) + rv = DS_ERROR_SPACE_MD; + if (rv != DS_UNCHANGED) + goto err_out; + } + if (drbd_get_capacity(mdev->this_bdev) != size || drbd_bm_capacity(mdev) != size) { int err; @@ -867,7 +913,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds "Leaving size unchanged at size = %lu KB\n", (unsigned long)size); } - rv = dev_size_error; + rv = DS_ERROR; } /* racy, see comments above. */ drbd_set_my_capacity(mdev, size); @@ -875,38 +921,57 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), (unsigned long long)size>>1); } - if (rv == dev_size_error) - goto out; + if (rv <= DS_ERROR) + goto err_out; la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) || prev_size != mdev->ldev->md.md_size_sect; - if (la_size_changed || md_moved) { - int err; + if (la_size_changed || md_moved || rs) { + u32 prev_flags; drbd_al_shrink(mdev); /* All extents inactive. */ + + prev_flags = md->flags; + md->flags &= ~MDF_PRIMARY_IND; + drbd_md_write(mdev, buffer); + dev_info(DEV, "Writing the whole bitmap, %s\n", la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ - err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, - "size changed", BM_LOCKED_MASK); - if (err) { - rv = dev_size_error; - goto out; - } - drbd_md_mark_dirty(mdev); + drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, + "size changed", BM_LOCKED_MASK); + drbd_initialize_al(mdev, buffer); + + md->flags = prev_flags; + drbd_md_write(mdev, buffer); + + if (rs) + dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n", + md->al_stripes, md->al_stripe_size_4k * 4); } if (size > la_size_sect) - rv = grew; + rv = DS_GREW; if (size < la_size_sect) - rv = shrunk; -out: + rv = DS_SHRUNK; + + if (0) { + err_out: + if (rs) { + md->al_stripes = prev_al_stripes; + md->al_stripe_size_4k = prev_al_stripe_size_4k; + md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; + + drbd_md_set_sector_offsets(mdev, mdev->ldev); + } + } lc_unlock(mdev->act_log); wake_up(&mdev->al_wait); + drbd_md_put_buffer(mdev); drbd_resume_io(mdev); return rv; @@ -1607,11 +1672,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) set_bit(USE_DEGR_WFC_T, &mdev->flags); - dd = drbd_determine_dev_size(mdev, 0); - if (dd == dev_size_error) { + dd = drbd_determine_dev_size(mdev, 0, NULL); + if (dd <= DS_ERROR) { retcode = ERR_NOMEM_BITMAP; goto force_diskless_dec; - } else if (dd == grew) + } else if (dd == DS_GREW) set_bit(RESYNC_AFTER_NEG, &mdev->flags); if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || @@ -2305,6 +2370,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) struct drbd_conf *mdev; enum drbd_ret_code retcode; enum determine_dev_size dd; + bool change_al_layout = false; enum dds_flags ddsf; sector_t u_size; int err; @@ -2315,31 +2381,33 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) if (retcode != NO_ERROR) goto fail; + mdev = adm_ctx.mdev; + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; + goto fail; + } + memset(&rs, 0, sizeof(struct resize_parms)); + rs.al_stripes = mdev->ldev->md.al_stripes; + rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4; if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { err = resize_parms_from_attrs(&rs, info); if (err) { retcode = ERR_MANDATORY_TAG; drbd_msg_put_info(from_attrs_err_to_txt(err)); - goto fail; + goto fail_ldev; } } - mdev = adm_ctx.mdev; if (mdev->state.conn > C_CONNECTED) { retcode = ERR_RESIZE_RESYNC; - goto fail; + goto fail_ldev; } if (mdev->state.role == R_SECONDARY && mdev->state.peer == R_SECONDARY) { retcode = ERR_NO_PRIMARY; - goto fail; - } - - if (!get_ldev(mdev)) { - retcode = ERR_NO_DISK; - goto fail; + goto fail_ldev; } if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { @@ -2358,6 +2426,28 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } } + if (mdev->ldev->md.al_stripes != rs.al_stripes || + mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { + u32 al_size_k = rs.al_stripes * rs.al_stripe_size; + + if (al_size_k > (16 * 1024 * 1024)) { + retcode = ERR_MD_LAYOUT_TOO_BIG; + goto fail_ldev; + } + + if (al_size_k < MD_32kB_SECT/2) { + retcode = ERR_MD_LAYOUT_TOO_SMALL; + goto fail_ldev; + } + + if (mdev->state.conn != C_CONNECTED) { + retcode = ERR_MD_LAYOUT_CONNECTED; + goto fail_ldev; + } + + change_al_layout = true; + } + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); @@ -2373,16 +2463,22 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); - dd = drbd_determine_dev_size(mdev, ddsf); + dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL); drbd_md_sync(mdev); put_ldev(mdev); - if (dd == dev_size_error) { + if (dd == DS_ERROR) { retcode = ERR_NOMEM_BITMAP; goto fail; + } else if (dd == DS_ERROR_SPACE_MD) { + retcode = ERR_MD_LAYOUT_NO_FIT; + goto fail; + } else if (dd == DS_ERROR_SHRINK) { + retcode = ERR_IMPLICIT_SHRINK; + goto fail; } if (mdev->state.conn == C_CONNECTED) { - if (dd == grew) + if (dd == DS_GREW) set_bit(RESIZE_PENDING, &mdev->flags); drbd_send_uuids(mdev); @@ -2658,7 +2754,6 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, const struct sib_info *sib) { struct state_info *si = NULL; /* for sizeof(si->member); */ - struct net_conf *nc; struct nlattr *nla; int got_ldev; int err = 0; @@ -2688,13 +2783,19 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, goto nla_put_failure; rcu_read_lock(); - if (got_ldev) - if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) - goto nla_put_failure; + if (got_ldev) { + struct disk_conf *disk_conf; - nc = rcu_dereference(mdev->tconn->net_conf); - if (nc) - err = net_conf_to_skb(skb, nc, exclude_sensitive); + disk_conf = rcu_dereference(mdev->ldev->disk_conf); + err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive); + } + if (!err) { + struct net_conf *nc; + + nc = rcu_dereference(mdev->tconn->net_conf); + if (nc) + err = net_conf_to_skb(skb, nc, exclude_sensitive); + } rcu_read_unlock(); if (err) goto nla_put_failure; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 4222aff..cc29cd3 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1039,6 +1039,8 @@ randomize: rcu_read_lock(); idr_for_each_entry(&tconn->volumes, mdev, vnr) { kref_get(&mdev->kref); + rcu_read_unlock(); + /* Prevent a race between resync-handshake and * being promoted to Primary. * @@ -1049,8 +1051,6 @@ randomize: mutex_lock(mdev->state_mutex); mutex_unlock(mdev->state_mutex); - rcu_read_unlock(); - if (discard_my_data) set_bit(DISCARD_MY_DATA, &mdev->flags); else @@ -3545,7 +3545,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) { struct drbd_conf *mdev; struct p_sizes *p = pi->data; - enum determine_dev_size dd = unchanged; + enum determine_dev_size dd = DS_UNCHANGED; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; @@ -3617,9 +3617,9 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(mdev)) { - dd = drbd_determine_dev_size(mdev, ddsf); + dd = drbd_determine_dev_size(mdev, ddsf, NULL); put_ldev(mdev); - if (dd == dev_size_error) + if (dd == DS_ERROR) return -EIO; drbd_md_sync(mdev); } else { @@ -3647,7 +3647,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) drbd_send_sizes(mdev, 0, ddsf); } if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || - (dd == grew && mdev->state.conn == C_CONNECTED)) { + (dd == DS_GREW && mdev->state.conn == C_CONNECTED)) { if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.disk >= D_INCONSISTENT) { if (ddsf & DDSF_NO_RESYNC) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 90c5be2..216d47b 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1115,8 +1115,10 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, drbd_thread_restart_nowait(&mdev->tconn->receiver); /* Resume AL writing if we get a connection */ - if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { drbd_resume_al(mdev); + mdev->tconn->connect_cnt++; + } /* remember last attach time so request_timer_fn() won't * kill newly established sessions while we are still trying to thaw diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 5af21f2..6e85e21 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -31,6 +31,8 @@ #include <linux/slab.h> #include <linux/bitops.h> #include <linux/delay.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/genhd.h> #include <linux/idr.h> @@ -39,8 +41,9 @@ #include "rsxx_cfg.h" #define NO_LEGACY 0 +#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */ -MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver"); +MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver"); MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRIVER_VERSION); @@ -49,9 +52,282 @@ static unsigned int force_legacy = NO_LEGACY; module_param(force_legacy, uint, 0444); MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); +static unsigned int sync_start = 1; +module_param(sync_start, uint, 0444); +MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete " + "until the card startup has completed."); + static DEFINE_IDA(rsxx_disk_ida); static DEFINE_SPINLOCK(rsxx_ida_lock); +/* --------------------Debugfs Setup ------------------- */ + +struct rsxx_cram { + u32 f_pos; + u32 offset; + void *i_private; +}; + +static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p) +{ + struct rsxx_cardinfo *card = m->private; + + seq_printf(m, "HWID 0x%08x\n", + ioread32(card->regmap + HWID)); + seq_printf(m, "SCRATCH 0x%08x\n", + ioread32(card->regmap + SCRATCH)); + seq_printf(m, "IER 0x%08x\n", + ioread32(card->regmap + IER)); + seq_printf(m, "IPR 0x%08x\n", + ioread32(card->regmap + IPR)); + seq_printf(m, "CREG_CMD 0x%08x\n", + ioread32(card->regmap + CREG_CMD)); + seq_printf(m, "CREG_ADD 0x%08x\n", + ioread32(card->regmap + CREG_ADD)); + seq_printf(m, "CREG_CNT 0x%08x\n", + ioread32(card->regmap + CREG_CNT)); + seq_printf(m, "CREG_STAT 0x%08x\n", + ioread32(card->regmap + CREG_STAT)); + seq_printf(m, "CREG_DATA0 0x%08x\n", + ioread32(card->regmap + CREG_DATA0)); + seq_printf(m, "CREG_DATA1 0x%08x\n", + ioread32(card->regmap + CREG_DATA1)); + seq_printf(m, "CREG_DATA2 0x%08x\n", + ioread32(card->regmap + CREG_DATA2)); + seq_printf(m, "CREG_DATA3 0x%08x\n", + ioread32(card->regmap + CREG_DATA3)); + seq_printf(m, "CREG_DATA4 0x%08x\n", + ioread32(card->regmap + CREG_DATA4)); + seq_printf(m, "CREG_DATA5 0x%08x\n", + ioread32(card->regmap + CREG_DATA5)); + seq_printf(m, "CREG_DATA6 0x%08x\n", + ioread32(card->regmap + CREG_DATA6)); + seq_printf(m, "CREG_DATA7 0x%08x\n", + ioread32(card->regmap + CREG_DATA7)); + seq_printf(m, "INTR_COAL 0x%08x\n", + ioread32(card->regmap + INTR_COAL)); + seq_printf(m, "HW_ERROR 0x%08x\n", + ioread32(card->regmap + HW_ERROR)); + seq_printf(m, "DEBUG0 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG0)); + seq_printf(m, "DEBUG1 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG1)); + seq_printf(m, "DEBUG2 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG2)); + seq_printf(m, "DEBUG3 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG3)); + seq_printf(m, "DEBUG4 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG4)); + seq_printf(m, "DEBUG5 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG5)); + seq_printf(m, "DEBUG6 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG6)); + seq_printf(m, "DEBUG7 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG7)); + seq_printf(m, "RECONFIG 0x%08x\n", + ioread32(card->regmap + PCI_RECONFIG)); + + return 0; +} + +static int rsxx_attr_stats_show(struct seq_file *m, void *p) +{ + struct rsxx_cardinfo *card = m->private; + int i; + + for (i = 0; i < card->n_targets; i++) { + seq_printf(m, "Ctrl %d CRC Errors = %d\n", + i, card->ctrl[i].stats.crc_errors); + seq_printf(m, "Ctrl %d Hard Errors = %d\n", + i, card->ctrl[i].stats.hard_errors); + seq_printf(m, "Ctrl %d Soft Errors = %d\n", + i, card->ctrl[i].stats.soft_errors); + seq_printf(m, "Ctrl %d Writes Issued = %d\n", + i, card->ctrl[i].stats.writes_issued); + seq_printf(m, "Ctrl %d Writes Failed = %d\n", + i, card->ctrl[i].stats.writes_failed); + seq_printf(m, "Ctrl %d Reads Issued = %d\n", + i, card->ctrl[i].stats.reads_issued); + seq_printf(m, "Ctrl %d Reads Failed = %d\n", + i, card->ctrl[i].stats.reads_failed); + seq_printf(m, "Ctrl %d Reads Retried = %d\n", + i, card->ctrl[i].stats.reads_retried); + seq_printf(m, "Ctrl %d Discards Issued = %d\n", + i, card->ctrl[i].stats.discards_issued); + seq_printf(m, "Ctrl %d Discards Failed = %d\n", + i, card->ctrl[i].stats.discards_failed); + seq_printf(m, "Ctrl %d DMA SW Errors = %d\n", + i, card->ctrl[i].stats.dma_sw_err); + seq_printf(m, "Ctrl %d DMA HW Faults = %d\n", + i, card->ctrl[i].stats.dma_hw_fault); + seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n", + i, card->ctrl[i].stats.dma_cancelled); + seq_printf(m, "Ctrl %d SW Queue Depth = %d\n", + i, card->ctrl[i].stats.sw_q_depth); + seq_printf(m, "Ctrl %d HW Queue Depth = %d\n", + i, atomic_read(&card->ctrl[i].stats.hw_q_depth)); + } + + return 0; +} + +static int rsxx_attr_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rsxx_attr_stats_show, inode->i_private); +} + +static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file) +{ + return single_open(file, rsxx_attr_pci_regs_show, inode->i_private); +} + +static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct rsxx_cram *info = fp->private_data; + struct rsxx_cardinfo *card = info->i_private; + char *buf; + int st; + + buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + info->f_pos = (u32)*ppos + info->offset; + + st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); + if (st) + return st; + + st = copy_to_user(ubuf, buf, cnt); + if (st) + return st; + + info->offset += cnt; + + kfree(buf); + + return cnt; +} + +static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct rsxx_cram *info = fp->private_data; + struct rsxx_cardinfo *card = info->i_private; + char *buf; + int st; + + buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + st = copy_from_user(buf, ubuf, cnt); + if (st) + return st; + + info->f_pos = (u32)*ppos + info->offset; + + st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); + if (st) + return st; + + info->offset += cnt; + + kfree(buf); + + return cnt; +} + +static int rsxx_cram_open(struct inode *inode, struct file *file) +{ + struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->i_private = inode->i_private; + info->f_pos = file->f_pos; + file->private_data = info; + + return 0; +} + +static int rsxx_cram_release(struct inode *inode, struct file *file) +{ + struct rsxx_cram *info = file->private_data; + + if (!info) + return 0; + + kfree(info); + file->private_data = NULL; + + return 0; +} + +static const struct file_operations debugfs_cram_fops = { + .owner = THIS_MODULE, + .open = rsxx_cram_open, + .read = rsxx_cram_read, + .write = rsxx_cram_write, + .release = rsxx_cram_release, +}; + +static const struct file_operations debugfs_stats_fops = { + .owner = THIS_MODULE, + .open = rsxx_attr_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations debugfs_pci_regs_fops = { + .owner = THIS_MODULE, + .open = rsxx_attr_pci_regs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card) +{ + struct dentry *debugfs_stats; + struct dentry *debugfs_pci_regs; + struct dentry *debugfs_cram; + + card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL); + if (IS_ERR_OR_NULL(card->debugfs_dir)) + goto failed_debugfs_dir; + + debugfs_stats = debugfs_create_file("stats", S_IRUGO, + card->debugfs_dir, card, + &debugfs_stats_fops); + if (IS_ERR_OR_NULL(debugfs_stats)) + goto failed_debugfs_stats; + + debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO, + card->debugfs_dir, card, + &debugfs_pci_regs_fops); + if (IS_ERR_OR_NULL(debugfs_pci_regs)) + goto failed_debugfs_pci_regs; + + debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR, + card->debugfs_dir, card, + &debugfs_cram_fops); + if (IS_ERR_OR_NULL(debugfs_cram)) + goto failed_debugfs_cram; + + return; +failed_debugfs_cram: + debugfs_remove(debugfs_pci_regs); +failed_debugfs_pci_regs: + debugfs_remove(debugfs_stats); +failed_debugfs_stats: + debugfs_remove(card->debugfs_dir); +failed_debugfs_dir: + card->debugfs_dir = NULL; +} + /*----------------- Interrupt Control & Handling -------------------*/ static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) @@ -163,12 +439,13 @@ static irqreturn_t rsxx_isr(int irq, void *pdata) } if (isr & CR_INTR_CREG) { - schedule_work(&card->creg_ctrl.done_work); + queue_work(card->creg_ctrl.creg_wq, + &card->creg_ctrl.done_work); handled++; } if (isr & CR_INTR_EVENT) { - schedule_work(&card->event_work); + queue_work(card->event_wq, &card->event_work); rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); handled++; } @@ -329,7 +606,7 @@ static int rsxx_eeh_frozen(struct pci_dev *dev) int i; int st; - dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n"); + dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n"); card->eeh_state = 1; rsxx_mask_interrupts(card); @@ -367,15 +644,26 @@ static void rsxx_eeh_failure(struct pci_dev *dev) { struct rsxx_cardinfo *card = pci_get_drvdata(dev); int i; + int cnt = 0; - dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n"); + dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n"); card->eeh_state = 1; + card->halt = 1; - for (i = 0; i < card->n_targets; i++) - del_timer_sync(&card->ctrl[i].activity_timer); + for (i = 0; i < card->n_targets; i++) { + spin_lock_bh(&card->ctrl[i].queue_lock); + cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], + &card->ctrl[i].queue); + spin_unlock_bh(&card->ctrl[i].queue_lock); + + cnt += rsxx_dma_cancel(&card->ctrl[i]); - rsxx_eeh_cancel_dmas(card); + if (cnt) + dev_info(CARD_TO_DEV(card), + "Freed %d queued DMAs on channel %d\n", + cnt, card->ctrl[i].id); + } } static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) @@ -432,7 +720,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) int st; dev_warn(&dev->dev, - "IBM FlashSystem PCI: recovering from slot reset.\n"); + "IBM Flash Adapter PCI: recovering from slot reset.\n"); st = pci_enable_device(dev); if (st) @@ -485,7 +773,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) &card->ctrl[i].issue_dma_work); } - dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n"); + dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n"); return PCI_ERS_RESULT_RECOVERED; @@ -528,6 +816,7 @@ static int rsxx_pci_probe(struct pci_dev *dev, { struct rsxx_cardinfo *card; int st; + unsigned int sync_timeout; dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); @@ -610,7 +899,11 @@ static int rsxx_pci_probe(struct pci_dev *dev, } /************* Setup Processor Command Interface *************/ - rsxx_creg_setup(card); + st = rsxx_creg_setup(card); + if (st) { + dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n"); + goto failed_creg_setup; + } spin_lock_irq(&card->irq_lock); rsxx_enable_ier_and_isr(card, CR_INTR_CREG); @@ -650,6 +943,12 @@ static int rsxx_pci_probe(struct pci_dev *dev, } /************* Setup Card Event Handler *************/ + card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event"); + if (!card->event_wq) { + dev_err(CARD_TO_DEV(card), "Failed card event setup.\n"); + goto failed_event_handler; + } + INIT_WORK(&card->event_work, card_event_handler); st = rsxx_setup_dev(card); @@ -676,6 +975,33 @@ static int rsxx_pci_probe(struct pci_dev *dev, if (st) dev_crit(CARD_TO_DEV(card), "Failed issuing card startup\n"); + if (sync_start) { + sync_timeout = SYNC_START_TIMEOUT; + + dev_info(CARD_TO_DEV(card), + "Waiting for card to startup\n"); + + do { + ssleep(1); + sync_timeout--; + + rsxx_get_card_state(card, &card->state); + } while (sync_timeout && + (card->state == CARD_STATE_STARTING)); + + if (card->state == CARD_STATE_STARTING) { + dev_warn(CARD_TO_DEV(card), + "Card startup timed out\n"); + card->size8 = 0; + } else { + dev_info(CARD_TO_DEV(card), + "card state: %s\n", + rsxx_card_state_to_str(card->state)); + st = rsxx_get_card_size8(card, &card->size8); + if (st) + card->size8 = 0; + } + } } else if (card->state == CARD_STATE_GOOD || card->state == CARD_STATE_RD_ONLY_FAULT) { st = rsxx_get_card_size8(card, &card->size8); @@ -685,12 +1011,21 @@ static int rsxx_pci_probe(struct pci_dev *dev, rsxx_attach_dev(card); + /************* Setup Debugfs *************/ + rsxx_debugfs_dev_new(card); + return 0; failed_create_dev: + destroy_workqueue(card->event_wq); + card->event_wq = NULL; +failed_event_handler: rsxx_dma_destroy(card); failed_dma_setup: failed_compatiblity_check: + destroy_workqueue(card->creg_ctrl.creg_wq); + card->creg_ctrl.creg_wq = NULL; +failed_creg_setup: spin_lock_irq(&card->irq_lock); rsxx_disable_ier_and_isr(card, CR_INTR_ALL); spin_unlock_irq(&card->irq_lock); @@ -756,6 +1091,8 @@ static void rsxx_pci_remove(struct pci_dev *dev) /* Prevent work_structs from re-queuing themselves. */ card->halt = 1; + debugfs_remove_recursive(card->debugfs_dir); + free_irq(dev->irq, card); if (!force_legacy) diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c index 4b5c020..926dce9 100644 --- a/drivers/block/rsxx/cregs.c +++ b/drivers/block/rsxx/cregs.c @@ -431,6 +431,15 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card, *hw_stat = completion.creg_status; if (completion.st) { + /* + * This read is needed to verify that there has not been any + * extreme errors that might have occurred, i.e. EEH. The + * function iowrite32 will not detect EEH errors, so it is + * necessary that we recover if such an error is the reason + * for the timeout. This is a dummy read. + */ + ioread32(card->regmap + SCRATCH); + dev_warn(CARD_TO_DEV(card), "creg command failed(%d x%08x)\n", completion.st, addr); @@ -727,6 +736,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card) { card->creg_ctrl.active_cmd = NULL; + card->creg_ctrl.creg_wq = + create_singlethread_workqueue(DRIVER_NAME"_creg"); + if (!card->creg_ctrl.creg_wq) + return -ENOMEM; + INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); mutex_init(&card->creg_ctrl.reset_lock); INIT_LIST_HEAD(&card->creg_ctrl.queue); diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 4346d17..d7af441 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -155,7 +155,8 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, atomic_set(&meta->error, 1); if (atomic_dec_and_test(&meta->pending_dmas)) { - disk_stats_complete(card, meta->bio, meta->start_time); + if (!card->eeh_state && card->gendisk) + disk_stats_complete(card, meta->bio, meta->start_time); bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); kmem_cache_free(bio_meta_pool, meta); @@ -170,6 +171,12 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) might_sleep(); + if (!card) + goto req_err; + + if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk)) + goto req_err; + if (unlikely(card->halt)) { st = -EFAULT; goto req_err; @@ -196,7 +203,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) atomic_set(&bio_meta->pending_dmas, 0); bio_meta->start_time = jiffies; - disk_stats_start(card, bio); + if (!unlikely(card->halt)) + disk_stats_start(card, bio); dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", bio_data_dir(bio) ? 'W' : 'R', bio_meta, @@ -225,24 +233,6 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card) return (pci_rev >= RSXX_DISCARD_SUPPORT); } -static unsigned short rsxx_get_logical_block_size( - struct rsxx_cardinfo *card) -{ - u32 capabilities = 0; - int st; - - st = rsxx_get_card_capabilities(card, &capabilities); - if (st) - dev_warn(CARD_TO_DEV(card), - "Failed reading card capabilities register\n"); - - /* Earlier firmware did not have support for 512 byte accesses */ - if (capabilities & CARD_CAP_SUBPAGE_WRITES) - return 512; - else - return RSXX_HW_BLK_SIZE; -} - int rsxx_attach_dev(struct rsxx_cardinfo *card) { mutex_lock(&card->dev_lock); @@ -305,7 +295,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) return -ENOMEM; } - blk_size = rsxx_get_logical_block_size(card); + blk_size = card->config.data.block_size; blk_queue_make_request(card->queue, rsxx_make_request); blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); @@ -347,6 +337,7 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card) card->gendisk = NULL; blk_cleanup_queue(card->queue); + card->queue->queuedata = NULL; unregister_blkdev(card->major, DRIVER_NAME); } diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 0607513..bed32f1 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -245,6 +245,22 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, kmem_cache_free(rsxx_dma_pool, dma); } +int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, + struct list_head *q) +{ + struct rsxx_dma *dma; + struct rsxx_dma *tmp; + int cnt = 0; + + list_for_each_entry_safe(dma, tmp, q, list) { + list_del(&dma->list); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + cnt++; + } + + return cnt; +} + static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma) { @@ -252,9 +268,10 @@ static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, * Requeued DMAs go to the front of the queue so they are issued * first. */ - spin_lock(&ctrl->queue_lock); + spin_lock_bh(&ctrl->queue_lock); + ctrl->stats.sw_q_depth++; list_add(&dma->list, &ctrl->queue); - spin_unlock(&ctrl->queue_lock); + spin_unlock_bh(&ctrl->queue_lock); } static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, @@ -329,6 +346,7 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, static void dma_engine_stalled(unsigned long data) { struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; + int cnt; if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || unlikely(ctrl->card->eeh_state)) @@ -349,18 +367,28 @@ static void dma_engine_stalled(unsigned long data) "DMA channel %d has stalled, faulting interface.\n", ctrl->id); ctrl->card->dma_fault = 1; + + /* Clean up the DMA queue */ + spin_lock(&ctrl->queue_lock); + cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); + spin_unlock(&ctrl->queue_lock); + + cnt += rsxx_dma_cancel(ctrl); + + if (cnt) + dev_info(CARD_TO_DEV(ctrl->card), + "Freed %d queued DMAs on channel %d\n", + cnt, ctrl->id); } } -static void rsxx_issue_dmas(struct work_struct *work) +static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) { - struct rsxx_dma_ctrl *ctrl; struct rsxx_dma *dma; int tag; int cmds_pending = 0; struct hw_cmd *hw_cmd_buf; - ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); hw_cmd_buf = ctrl->cmd.buf; if (unlikely(ctrl->card->halt) || @@ -368,22 +396,22 @@ static void rsxx_issue_dmas(struct work_struct *work) return; while (1) { - spin_lock(&ctrl->queue_lock); + spin_lock_bh(&ctrl->queue_lock); if (list_empty(&ctrl->queue)) { - spin_unlock(&ctrl->queue_lock); + spin_unlock_bh(&ctrl->queue_lock); break; } - spin_unlock(&ctrl->queue_lock); + spin_unlock_bh(&ctrl->queue_lock); tag = pop_tracker(ctrl->trackers); if (tag == -1) break; - spin_lock(&ctrl->queue_lock); + spin_lock_bh(&ctrl->queue_lock); dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); list_del(&dma->list); ctrl->stats.sw_q_depth--; - spin_unlock(&ctrl->queue_lock); + spin_unlock_bh(&ctrl->queue_lock); /* * This will catch any DMAs that slipped in right before the @@ -440,9 +468,8 @@ static void rsxx_issue_dmas(struct work_struct *work) } } -static void rsxx_dma_done(struct work_struct *work) +static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl) { - struct rsxx_dma_ctrl *ctrl; struct rsxx_dma *dma; unsigned long flags; u16 count; @@ -450,7 +477,6 @@ static void rsxx_dma_done(struct work_struct *work) u8 tag; struct hw_status *hw_st_buf; - ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); hw_st_buf = ctrl->status.buf; if (unlikely(ctrl->card->halt) || @@ -520,33 +546,32 @@ static void rsxx_dma_done(struct work_struct *work) rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); - spin_lock(&ctrl->queue_lock); + spin_lock_bh(&ctrl->queue_lock); if (ctrl->stats.sw_q_depth) queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); - spin_unlock(&ctrl->queue_lock); + spin_unlock_bh(&ctrl->queue_lock); } -static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card, - struct list_head *q) +static void rsxx_schedule_issue(struct work_struct *work) { - struct rsxx_dma *dma; - struct rsxx_dma *tmp; - int cnt = 0; + struct rsxx_dma_ctrl *ctrl; - list_for_each_entry_safe(dma, tmp, q, list) { - list_del(&dma->list); + ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); - if (dma->dma_addr) - pci_unmap_page(card->dev, dma->dma_addr, - get_dma_size(dma), - (dma->cmd == HW_CMD_BLK_WRITE) ? - PCI_DMA_TODEVICE : - PCI_DMA_FROMDEVICE); - kmem_cache_free(rsxx_dma_pool, dma); - cnt++; - } + mutex_lock(&ctrl->work_lock); + rsxx_issue_dmas(ctrl); + mutex_unlock(&ctrl->work_lock); +} - return cnt; +static void rsxx_schedule_done(struct work_struct *work) +{ + struct rsxx_dma_ctrl *ctrl; + + ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); + + mutex_lock(&ctrl->work_lock); + rsxx_dma_done(ctrl); + mutex_unlock(&ctrl->work_lock); } static int rsxx_queue_discard(struct rsxx_cardinfo *card, @@ -698,10 +723,10 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, for (i = 0; i < card->n_targets; i++) { if (!list_empty(&dma_list[i])) { - spin_lock(&card->ctrl[i].queue_lock); + spin_lock_bh(&card->ctrl[i].queue_lock); card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; list_splice_tail(&dma_list[i], &card->ctrl[i].queue); - spin_unlock(&card->ctrl[i].queue_lock); + spin_unlock_bh(&card->ctrl[i].queue_lock); queue_work(card->ctrl[i].issue_wq, &card->ctrl[i].issue_dma_work); @@ -711,8 +736,11 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, return 0; bvec_err: - for (i = 0; i < card->n_targets; i++) - rsxx_cleanup_dma_queue(card, &dma_list[i]); + for (i = 0; i < card->n_targets; i++) { + spin_lock_bh(&card->ctrl[i].queue_lock); + rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]); + spin_unlock_bh(&card->ctrl[i].queue_lock); + } return st; } @@ -780,6 +808,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, spin_lock_init(&ctrl->trackers->lock); spin_lock_init(&ctrl->queue_lock); + mutex_init(&ctrl->work_lock); INIT_LIST_HEAD(&ctrl->queue); setup_timer(&ctrl->activity_timer, dma_engine_stalled, @@ -793,8 +822,8 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, if (!ctrl->done_wq) return -ENOMEM; - INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas); - INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done); + INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue); + INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done); st = rsxx_hw_buffers_init(dev, ctrl); if (st) @@ -918,13 +947,30 @@ failed_dma_setup: return st; } +int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl) +{ + struct rsxx_dma *dma; + int i; + int cnt = 0; + + /* Clean up issued DMAs */ + for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { + dma = get_tracker_dma(ctrl->trackers, i); + if (dma) { + atomic_dec(&ctrl->stats.hw_q_depth); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + push_tracker(ctrl->trackers, i); + cnt++; + } + } + + return cnt; +} void rsxx_dma_destroy(struct rsxx_cardinfo *card) { struct rsxx_dma_ctrl *ctrl; - struct rsxx_dma *dma; - int i, j; - int cnt = 0; + int i; for (i = 0; i < card->n_targets; i++) { ctrl = &card->ctrl[i]; @@ -943,33 +989,11 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card) del_timer_sync(&ctrl->activity_timer); /* Clean up the DMA queue */ - spin_lock(&ctrl->queue_lock); - cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue); - spin_unlock(&ctrl->queue_lock); - - if (cnt) - dev_info(CARD_TO_DEV(card), - "Freed %d queued DMAs on channel %d\n", - cnt, i); - - /* Clean up issued DMAs */ - for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { - dma = get_tracker_dma(ctrl->trackers, j); - if (dma) { - pci_unmap_page(card->dev, dma->dma_addr, - get_dma_size(dma), - (dma->cmd == HW_CMD_BLK_WRITE) ? - PCI_DMA_TODEVICE : - PCI_DMA_FROMDEVICE); - kmem_cache_free(rsxx_dma_pool, dma); - cnt++; - } - } + spin_lock_bh(&ctrl->queue_lock); + rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); + spin_unlock_bh(&ctrl->queue_lock); - if (cnt) - dev_info(CARD_TO_DEV(card), - "Freed %d pending DMAs on channel %d\n", - cnt, i); + rsxx_dma_cancel(ctrl); vfree(ctrl->trackers); @@ -1013,7 +1037,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) cnt++; } - spin_lock(&card->ctrl[i].queue_lock); + spin_lock_bh(&card->ctrl[i].queue_lock); list_splice(&issued_dmas[i], &card->ctrl[i].queue); atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); @@ -1028,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); } - spin_unlock(&card->ctrl[i].queue_lock); + spin_unlock_bh(&card->ctrl[i].queue_lock); } kfree(issued_dmas); @@ -1036,30 +1060,13 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) return 0; } -void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) -{ - struct rsxx_dma *dma; - struct rsxx_dma *tmp; - int i; - - for (i = 0; i < card->n_targets; i++) { - spin_lock(&card->ctrl[i].queue_lock); - list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) { - list_del(&dma->list); - - rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED); - } - spin_unlock(&card->ctrl[i].queue_lock); - } -} - int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) { struct rsxx_dma *dma; int i; for (i = 0; i < card->n_targets; i++) { - spin_lock(&card->ctrl[i].queue_lock); + spin_lock_bh(&card->ctrl[i].queue_lock); list_for_each_entry(dma, &card->ctrl[i].queue, list) { dma->dma_addr = pci_map_page(card->dev, dma->page, dma->pg_off, get_dma_size(dma), @@ -1067,12 +1074,12 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (!dma->dma_addr) { - spin_unlock(&card->ctrl[i].queue_lock); + spin_unlock_bh(&card->ctrl[i].queue_lock); kmem_cache_free(rsxx_dma_pool, dma); return -ENOMEM; } } - spin_unlock(&card->ctrl[i].queue_lock); + spin_unlock_bh(&card->ctrl[i].queue_lock); } return 0; diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 382e8bf..5ad5055 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -39,6 +39,7 @@ #include <linux/vmalloc.h> #include <linux/timer.h> #include <linux/ioctl.h> +#include <linux/delay.h> #include "rsxx.h" #include "rsxx_cfg.h" @@ -114,6 +115,7 @@ struct rsxx_dma_ctrl { struct timer_list activity_timer; struct dma_tracker_list *trackers; struct rsxx_dma_stats stats; + struct mutex work_lock; }; struct rsxx_cardinfo { @@ -134,6 +136,7 @@ struct rsxx_cardinfo { spinlock_t lock; bool active; struct creg_cmd *active_cmd; + struct workqueue_struct *creg_wq; struct work_struct done_work; struct list_head queue; unsigned int q_depth; @@ -154,6 +157,7 @@ struct rsxx_cardinfo { int buf_len; } log; + struct workqueue_struct *event_wq; struct work_struct event_work; unsigned int state; u64 size8; @@ -181,6 +185,8 @@ struct rsxx_cardinfo { int n_targets; struct rsxx_dma_ctrl *ctrl; + + struct dentry *debugfs_dir; }; enum rsxx_pci_regmap { @@ -283,6 +289,7 @@ enum rsxx_creg_addr { CREG_ADD_CAPABILITIES = 0x80001050, CREG_ADD_LOG = 0x80002000, CREG_ADD_NUM_TARGETS = 0x80003000, + CREG_ADD_CRAM = 0xA0000000, CREG_ADD_CONFIG = 0xB0000000, }; @@ -372,6 +379,8 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, int rsxx_dma_setup(struct rsxx_cardinfo *card); void rsxx_dma_destroy(struct rsxx_cardinfo *card); int rsxx_dma_init(void); +int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q); +int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); void rsxx_dma_cleanup(void); void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); int rsxx_dma_configure(struct rsxx_cardinfo *card); @@ -382,7 +391,6 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, void *cb_data); int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); -void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card); int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); /***** cregs.c *****/ diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index dd5b2fe..bf4b9d2 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -50,110 +50,118 @@ #include "common.h" /* - * These are rather arbitrary. They are fairly large because adjacent requests - * pulled from a communication ring are quite likely to end up being part of - * the same scatter/gather request at the disc. + * Maximum number of unused free pages to keep in the internal buffer. + * Setting this to a value too low will reduce memory used in each backend, + * but can have a performance penalty. * - * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** - * - * This will increase the chances of being able to write whole tracks. - * 64 should be enough to keep us competitive with Linux. + * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can + * be set to a lower value that might degrade performance on some intensive + * IO workloads. */ -static int xen_blkif_reqs = 64; -module_param_named(reqs, xen_blkif_reqs, int, 0); -MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); -/* Run-time switchable: /sys/module/blkback/parameters/ */ -static unsigned int log_stats; -module_param(log_stats, int, 0644); +static int xen_blkif_max_buffer_pages = 1024; +module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644); +MODULE_PARM_DESC(max_buffer_pages, +"Maximum number of free pages to keep in each block backend buffer"); /* - * Each outstanding request that we've passed to the lower device layers has a - * 'pending_req' allocated to it. Each buffer_head that completes decrements - * the pendcnt towards zero. When it hits zero, the specified domain has a - * response queued for it, with the saved 'id' passed back. + * Maximum number of grants to map persistently in blkback. For maximum + * performance this should be the total numbers of grants that can be used + * to fill the ring, but since this might become too high, specially with + * the use of indirect descriptors, we set it to a value that provides good + * performance without using too much memory. + * + * When the list of persistent grants is full we clean it up using a LRU + * algorithm. */ -struct pending_req { - struct xen_blkif *blkif; - u64 id; - int nr_pages; - atomic_t pendcnt; - unsigned short operation; - int status; - struct list_head free_list; - DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); -}; -#define BLKBACK_INVALID_HANDLE (~0) +static int xen_blkif_max_pgrants = 1056; +module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); +MODULE_PARM_DESC(max_persistent_grants, + "Maximum number of grants to map persistently"); -struct xen_blkbk { - struct pending_req *pending_reqs; - /* List of all 'pending_req' available */ - struct list_head pending_free; - /* And its spinlock. */ - spinlock_t pending_free_lock; - wait_queue_head_t pending_free_wq; - /* The list of all pages that are available. */ - struct page **pending_pages; - /* And the grant handles that are available. */ - grant_handle_t *pending_grant_handles; -}; - -static struct xen_blkbk *blkbk; +/* + * The LRU mechanism to clean the lists of persistent grants needs to + * be executed periodically. The time interval between consecutive executions + * of the purge mechanism is set in ms. + */ +#define LRU_INTERVAL 100 /* - * Maximum number of grant pages that can be mapped in blkback. - * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of - * pages that blkback will persistently map. - * Currently, this is: - * RING_SIZE = 32 (for all known ring types) - * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 - * sizeof(struct persistent_gnt) = 48 - * So the maximum memory used to store the grants is: - * 32 * 11 * 48 = 16896 bytes + * When the persistent grants list is full we will remove unused grants + * from the list. The percent number of grants to be removed at each LRU + * execution. */ -static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) +#define LRU_PERCENT_CLEAN 5 + +/* Run-time switchable: /sys/module/blkback/parameters/ */ +static unsigned int log_stats; +module_param(log_stats, int, 0644); + +#define BLKBACK_INVALID_HANDLE (~0) + +/* Number of free pages to remove on each call to free_xenballooned_pages */ +#define NUM_BATCH_FREE_PAGES 10 + +static inline int get_free_page(struct xen_blkif *blkif, struct page **page) { - switch (protocol) { - case BLKIF_PROTOCOL_NATIVE: - return __CONST_RING_SIZE(blkif, PAGE_SIZE) * - BLKIF_MAX_SEGMENTS_PER_REQUEST; - case BLKIF_PROTOCOL_X86_32: - return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * - BLKIF_MAX_SEGMENTS_PER_REQUEST; - case BLKIF_PROTOCOL_X86_64: - return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) * - BLKIF_MAX_SEGMENTS_PER_REQUEST; - default: - BUG(); + unsigned long flags; + + spin_lock_irqsave(&blkif->free_pages_lock, flags); + if (list_empty(&blkif->free_pages)) { + BUG_ON(blkif->free_pages_num != 0); + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + return alloc_xenballooned_pages(1, page, false); } + BUG_ON(blkif->free_pages_num == 0); + page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + list_del(&page[0]->lru); + blkif->free_pages_num--; + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + return 0; } - -/* - * Little helpful macro to figure out the index and virtual address of the - * pending_pages[..]. For each 'pending_req' we have have up to - * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through - * 10 and would index in the pending_pages[..]. - */ -static inline int vaddr_pagenr(struct pending_req *req, int seg) +static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, + int num) { - return (req - blkbk->pending_reqs) * - BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; -} + unsigned long flags; + int i; -#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] + spin_lock_irqsave(&blkif->free_pages_lock, flags); + for (i = 0; i < num; i++) + list_add(&page[i]->lru, &blkif->free_pages); + blkif->free_pages_num += num; + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); +} -static inline unsigned long vaddr(struct pending_req *req, int seg) +static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) { - unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); - return (unsigned long)pfn_to_kaddr(pfn); -} + /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ + struct page *page[NUM_BATCH_FREE_PAGES]; + unsigned int num_pages = 0; + unsigned long flags; -#define pending_handle(_req, _seg) \ - (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) + spin_lock_irqsave(&blkif->free_pages_lock, flags); + while (blkif->free_pages_num > num) { + BUG_ON(list_empty(&blkif->free_pages)); + page[num_pages] = list_first_entry(&blkif->free_pages, + struct page, lru); + list_del(&page[num_pages]->lru); + blkif->free_pages_num--; + if (++num_pages == NUM_BATCH_FREE_PAGES) { + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + free_xenballooned_pages(num_pages, page); + spin_lock_irqsave(&blkif->free_pages_lock, flags); + num_pages = 0; + } + } + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + if (num_pages != 0) + free_xenballooned_pages(num_pages, page); +} +#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) static int do_block_io_op(struct xen_blkif *blkif); static int dispatch_rw_block_io(struct xen_blkif *blkif, @@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id, (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) -static void add_persistent_gnt(struct rb_root *root, +/* + * We don't need locking around the persistent grant helpers + * because blkback uses a single-thread for each backed, so we + * can be sure that this functions will never be called recursively. + * + * The only exception to that is put_persistent_grant, that can be called + * from interrupt context (by xen_blkbk_unmap), so we have to use atomic + * bit operations to modify the flags of a persistent grant and to count + * the number of used grants. + */ +static int add_persistent_gnt(struct xen_blkif *blkif, struct persistent_gnt *persistent_gnt) { - struct rb_node **new = &(root->rb_node), *parent = NULL; + struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (!blkif->vbd.overflow_max_grants) + blkif->vbd.overflow_max_grants = 1; + return -EBUSY; + } /* Figure out where to put new node */ + new = &blkif->persistent_gnts.rb_node; while (*new) { this = container_of(*new, struct persistent_gnt, node); @@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root, else if (persistent_gnt->gnt > this->gnt) new = &((*new)->rb_right); else { - pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); - BUG(); + pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n"); + return -EINVAL; } } + bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE); + set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); /* Add new node and rebalance tree. */ rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), root); + rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); + blkif->persistent_gnt_c++; + atomic_inc(&blkif->persistent_gnt_in_use); + return 0; } -static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, grant_ref_t gref) { struct persistent_gnt *data; - struct rb_node *node = root->rb_node; + struct rb_node *node = NULL; + node = blkif->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, node = node->rb_left; else if (gref > data->gnt) node = node->rb_right; - else + else { + if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) { + pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n"); + return NULL; + } + set_bit(PERSISTENT_GNT_ACTIVE, data->flags); + atomic_inc(&blkif->persistent_gnt_in_use); return data; + } } return NULL; } -static void free_persistent_gnts(struct rb_root *root, unsigned int num) +static void put_persistent_gnt(struct xen_blkif *blkif, + struct persistent_gnt *persistent_gnt) +{ + if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) + pr_alert_ratelimited(DRV_PFX " freeing a grant already unused"); + set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); + clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); + atomic_dec(&blkif->persistent_gnt_in_use); +} + +static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, + unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); BUG_ON(ret); - free_xenballooned_pages(segs_to_unmap, pages); + put_free_pages(blkif, pages, segs_to_unmap); segs_to_unmap = 0; } @@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) BUG_ON(num != 0); } +static void unmap_purged_grants(struct work_struct *work) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct persistent_gnt *persistent_gnt; + int ret, segs_to_unmap = 0; + struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + + while(!list_empty(&blkif->persistent_purge_list)) { + persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + struct persistent_gnt, + remove_node); + list_del(&persistent_gnt->remove_node); + + gnttab_set_unmap_op(&unmap[segs_to_unmap], + vaddr(persistent_gnt->page), + GNTMAP_host_map, + persistent_gnt->handle); + + pages[segs_to_unmap] = persistent_gnt->page; + + if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { + ret = gnttab_unmap_refs(unmap, NULL, pages, + segs_to_unmap); + BUG_ON(ret); + put_free_pages(blkif, pages, segs_to_unmap); + segs_to_unmap = 0; + } + kfree(persistent_gnt); + } + if (segs_to_unmap > 0) { + ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); + BUG_ON(ret); + put_free_pages(blkif, pages, segs_to_unmap); + } +} + +static void purge_persistent_gnt(struct xen_blkif *blkif) +{ + struct persistent_gnt *persistent_gnt; + struct rb_node *n; + unsigned int num_clean, total; + bool scan_used = false, clean_used = false; + struct rb_root *root; + + if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || + (blkif->persistent_gnt_c == xen_blkif_max_pgrants && + !blkif->vbd.overflow_max_grants)) { + return; + } + + if (work_pending(&blkif->persistent_purge_work)) { + pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n"); + return; + } + + num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; + num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(blkif->persistent_gnt_c, num_clean); + if ((num_clean == 0) || + (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) + return; + + /* + * At this point, we can assure that there will be no calls + * to get_persistent_grant (because we are executing this code from + * xen_blkif_schedule), there can only be calls to put_persistent_gnt, + * which means that the number of currently used grants will go down, + * but never up, so we will always be able to remove the requested + * number of grants. + */ + + total = num_clean; + + pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean); + + INIT_LIST_HEAD(&blkif->persistent_purge_list); + root = &blkif->persistent_gnts; +purge_list: + foreach_grant_safe(persistent_gnt, n, root, node) { + BUG_ON(persistent_gnt->handle == + BLKBACK_INVALID_HANDLE); + + if (clean_used) { + clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); + continue; + } + + if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) + continue; + if (!scan_used && + (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags))) + continue; + + rb_erase(&persistent_gnt->node, root); + list_add(&persistent_gnt->remove_node, + &blkif->persistent_purge_list); + if (--num_clean == 0) + goto finished; + } + /* + * If we get here it means we also need to start cleaning + * grants that were used since last purge in order to cope + * with the requested num + */ + if (!scan_used && !clean_used) { + pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean); + scan_used = true; + goto purge_list; + } +finished: + if (!clean_used) { + pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n"); + clean_used = true; + goto purge_list; + } + + blkif->persistent_gnt_c -= (total - num_clean); + blkif->vbd.overflow_max_grants = 0; + + /* We can defer this work */ + INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants); + schedule_work(&blkif->persistent_purge_work); + pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total); + return; +} + /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static struct pending_req *alloc_req(void) +static struct pending_req *alloc_req(struct xen_blkif *blkif) { struct pending_req *req = NULL; unsigned long flags; - spin_lock_irqsave(&blkbk->pending_free_lock, flags); - if (!list_empty(&blkbk->pending_free)) { - req = list_entry(blkbk->pending_free.next, struct pending_req, + spin_lock_irqsave(&blkif->pending_free_lock, flags); + if (!list_empty(&blkif->pending_free)) { + req = list_entry(blkif->pending_free.next, struct pending_req, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); + spin_unlock_irqrestore(&blkif->pending_free_lock, flags); return req; } @@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(struct pending_req *req) +static void free_req(struct xen_blkif *blkif, struct pending_req *req) { unsigned long flags; int was_empty; - spin_lock_irqsave(&blkbk->pending_free_lock, flags); - was_empty = list_empty(&blkbk->pending_free); - list_add(&req->free_list, &blkbk->pending_free); - spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); + spin_lock_irqsave(&blkif->pending_free_lock, flags); + was_empty = list_empty(&blkif->pending_free); + list_add(&req->free_list, &blkif->pending_free); + spin_unlock_irqrestore(&blkif->pending_free_lock, flags); if (was_empty) - wake_up(&blkbk->pending_free_wq); + wake_up(&blkif->pending_free_wq); } /* @@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct xen_blkif *blkif) { pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" - " | ds %4llu\n", + " | ds %4llu | pg: %4u/%4d\n", current->comm, blkif->st_oo_req, blkif->st_rd_req, blkif->st_wr_req, - blkif->st_f_req, blkif->st_ds_req); + blkif->st_f_req, blkif->st_ds_req, + blkif->persistent_gnt_c, + xen_blkif_max_pgrants); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; @@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg) { struct xen_blkif *blkif = arg; struct xen_vbd *vbd = &blkif->vbd; + unsigned long timeout; + int ret; xen_blkif_get(blkif); @@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg) if (unlikely(vbd->size != vbd_sz(vbd))) xen_vbd_resize(blkif); - wait_event_interruptible( + timeout = msecs_to_jiffies(LRU_INTERVAL); + + timeout = wait_event_interruptible_timeout( blkif->wq, - blkif->waiting_reqs || kthread_should_stop()); - wait_event_interruptible( - blkbk->pending_free_wq, - !list_empty(&blkbk->pending_free) || - kthread_should_stop()); + blkif->waiting_reqs || kthread_should_stop(), + timeout); + if (timeout == 0) + goto purge_gnt_list; + timeout = wait_event_interruptible_timeout( + blkif->pending_free_wq, + !list_empty(&blkif->pending_free) || + kthread_should_stop(), + timeout); + if (timeout == 0) + goto purge_gnt_list; blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - if (do_block_io_op(blkif)) + ret = do_block_io_op(blkif); + if (ret > 0) blkif->waiting_reqs = 1; + if (ret == -EACCES) + wait_event_interruptible(blkif->shutdown_wq, + kthread_should_stop()); + +purge_gnt_list: + if (blkif->vbd.feature_gnt_persistent && + time_after(jiffies, blkif->next_lru)) { + purge_persistent_gnt(blkif); + blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + } + + /* Shrink if we have more than xen_blkif_max_buffer_pages */ + shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); if (log_stats && time_after(jiffies, blkif->st_print)) print_stats(blkif); } + /* Since we are shutting down remove all pages from the buffer */ + shrink_free_pagepool(blkif, 0 /* All */); + /* Free all persistent grant pages */ if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(&blkif->persistent_gnts, + free_persistent_gnts(blkif, &blkif->persistent_gnts, blkif->persistent_gnt_c); BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); @@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg) return 0; } -struct seg_buf { - unsigned int offset; - unsigned int nsec; -}; /* * Unmap the grant references, and also remove the M2P over-rides * used in the 'pending_req'. */ -static void xen_blkbk_unmap(struct pending_req *req) +static void xen_blkbk_unmap(struct xen_blkif *blkif, + struct grant_page *pages[], + int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int i, invcount = 0; - grant_handle_t handle; int ret; - for (i = 0; i < req->nr_pages; i++) { - if (!test_bit(i, req->unmap_seg)) + for (i = 0; i < num; i++) { + if (pages[i]->persistent_gnt != NULL) { + put_persistent_gnt(blkif, pages[i]->persistent_gnt); continue; - handle = pending_handle(req, i); - if (handle == BLKBACK_INVALID_HANDLE) + } + if (pages[i]->handle == BLKBACK_INVALID_HANDLE) continue; - gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), - GNTMAP_host_map, handle); - pending_handle(req, i) = BLKBACK_INVALID_HANDLE; - pages[invcount] = virt_to_page(vaddr(req, i)); - invcount++; + unmap_pages[invcount] = pages[i]->page; + gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page), + GNTMAP_host_map, pages[i]->handle); + pages[i]->handle = BLKBACK_INVALID_HANDLE; + if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) { + ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, + invcount); + BUG_ON(ret); + put_free_pages(blkif, unmap_pages, invcount); + invcount = 0; + } + } + if (invcount) { + ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); + BUG_ON(ret); + put_free_pages(blkif, unmap_pages, invcount); } - - ret = gnttab_unmap_refs(unmap, NULL, pages, invcount); - BUG_ON(ret); } -static int xen_blkbk_map(struct blkif_request *req, - struct pending_req *pending_req, - struct seg_buf seg[], - struct page *pages[]) +static int xen_blkbk_map(struct xen_blkif *blkif, + struct grant_page *pages[], + int num, bool ro) { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt = NULL; - struct xen_blkif *blkif = pending_req->blkif; phys_addr_t addr = 0; - int i, j; - bool new_map; - int nseg = req->u.rw.nr_segments; + int i, seg_idx, new_map_idx; int segs_to_map = 0; int ret = 0; + int last_map = 0, map_until = 0; int use_persistent_gnts; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); - BUG_ON(blkif->persistent_gnt_c > - max_mapped_grant_pages(pending_req->blkif->blk_protocol)); - /* * Fill out preq.nr_sects with proper amount of sectors, and setup * assign map[..] with the PFN of the page in our domain with the * corresponding grant reference for each page. */ - for (i = 0; i < nseg; i++) { +again: + for (i = map_until; i < num; i++) { uint32_t flags; if (use_persistent_gnts) persistent_gnt = get_persistent_gnt( - &blkif->persistent_gnts, - req->u.rw.seg[i].gref); + blkif, + pages[i]->gref); if (persistent_gnt) { /* * We are using persistent grants and * the grant is already mapped */ - new_map = false; - } else if (use_persistent_gnts && - blkif->persistent_gnt_c < - max_mapped_grant_pages(blkif->blk_protocol)) { - /* - * We are using persistent grants, the grant is - * not mapped but we have room for it - */ - new_map = true; - persistent_gnt = kmalloc( - sizeof(struct persistent_gnt), - GFP_KERNEL); - if (!persistent_gnt) - return -ENOMEM; - if (alloc_xenballooned_pages(1, &persistent_gnt->page, - false)) { - kfree(persistent_gnt); - return -ENOMEM; - } - persistent_gnt->gnt = req->u.rw.seg[i].gref; - persistent_gnt->handle = BLKBACK_INVALID_HANDLE; - - pages_to_gnt[segs_to_map] = - persistent_gnt->page; - addr = (unsigned long) pfn_to_kaddr( - page_to_pfn(persistent_gnt->page)); - - add_persistent_gnt(&blkif->persistent_gnts, - persistent_gnt); - blkif->persistent_gnt_c++; - pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, - max_mapped_grant_pages(blkif->blk_protocol)); + pages[i]->page = persistent_gnt->page; + pages[i]->persistent_gnt = persistent_gnt; } else { - /* - * We are either using persistent grants and - * hit the maximum limit of grants mapped, - * or we are not using persistent grants. - */ - if (use_persistent_gnts && - !blkif->vbd.overflow_max_grants) { - blkif->vbd.overflow_max_grants = 1; - pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", - blkif->domid, blkif->vbd.handle); - } - new_map = true; - pages[i] = blkbk->pending_page(pending_req, i); - addr = vaddr(pending_req, i); - pages_to_gnt[segs_to_map] = - blkbk->pending_page(pending_req, i); - } - - if (persistent_gnt) { - pages[i] = persistent_gnt->page; - persistent_gnts[i] = persistent_gnt; - } else { - persistent_gnts[i] = NULL; - } - - if (new_map) { + if (get_free_page(blkif, &pages[i]->page)) + goto out_of_memory; + addr = vaddr(pages[i]->page); + pages_to_gnt[segs_to_map] = pages[i]->page; + pages[i]->persistent_gnt = NULL; flags = GNTMAP_host_map; - if (!persistent_gnt && - (pending_req->operation != BLKIF_OP_READ)) + if (!use_persistent_gnts && ro) flags |= GNTMAP_readonly; gnttab_set_map_op(&map[segs_to_map++], addr, - flags, req->u.rw.seg[i].gref, + flags, pages[i]->gref, blkif->domid); } + map_until = i + 1; + if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST) + break; } if (segs_to_map) { @@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req, * so that when we access vaddr(pending_req,i) it has the contents of * the page from the other domain. */ - bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); - for (i = 0, j = 0; i < nseg; i++) { - if (!persistent_gnts[i] || - persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) { + for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) { + if (!pages[seg_idx]->persistent_gnt) { /* This is a newly mapped grant */ - BUG_ON(j >= segs_to_map); - if (unlikely(map[j].status != 0)) { + BUG_ON(new_map_idx >= segs_to_map); + if (unlikely(map[new_map_idx].status != 0)) { pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); - map[j].handle = BLKBACK_INVALID_HANDLE; + pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; - if (persistent_gnts[i]) { - rb_erase(&persistent_gnts[i]->node, - &blkif->persistent_gnts); - blkif->persistent_gnt_c--; - kfree(persistent_gnts[i]); - persistent_gnts[i] = NULL; - } + goto next; } + pages[seg_idx]->handle = map[new_map_idx].handle; + } else { + continue; } - if (persistent_gnts[i]) { - if (persistent_gnts[i]->handle == - BLKBACK_INVALID_HANDLE) { + if (use_persistent_gnts && + blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + /* + * We are using persistent grants, the grant is + * not mapped but we might have room for it. + */ + persistent_gnt = kmalloc(sizeof(struct persistent_gnt), + GFP_KERNEL); + if (!persistent_gnt) { /* - * If this is a new persistent grant - * save the handler + * If we don't have enough memory to + * allocate the persistent_gnt struct + * map this grant non-persistenly */ - persistent_gnts[i]->handle = map[j++].handle; + goto next; } - pending_handle(pending_req, i) = - persistent_gnts[i]->handle; + persistent_gnt->gnt = map[new_map_idx].ref; + persistent_gnt->handle = map[new_map_idx].handle; + persistent_gnt->page = pages[seg_idx]->page; + if (add_persistent_gnt(blkif, + persistent_gnt)) { + kfree(persistent_gnt); + persistent_gnt = NULL; + goto next; + } + pages[seg_idx]->persistent_gnt = persistent_gnt; + pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", + persistent_gnt->gnt, blkif->persistent_gnt_c, + xen_blkif_max_pgrants); + goto next; + } + if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { + blkif->vbd.overflow_max_grants = 1; + pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", + blkif->domid, blkif->vbd.handle); + } + /* + * We could not map this grant persistently, so use it as + * a non-persistent grant. + */ +next: + new_map_idx++; + } + segs_to_map = 0; + last_map = map_until; + if (map_until != num) + goto again; - if (ret) - continue; - } else { - pending_handle(pending_req, i) = map[j++].handle; - bitmap_set(pending_req->unmap_seg, i, 1); + return ret; + +out_of_memory: + pr_alert(DRV_PFX "%s: out of memory\n", __func__); + put_free_pages(blkif, pages_to_gnt, segs_to_map); + return -ENOMEM; +} + +static int xen_blkbk_map_seg(struct pending_req *pending_req) +{ + int rc; + + rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + pending_req->nr_pages, + (pending_req->operation != BLKIF_OP_READ)); + + return rc; +} - if (ret) - continue; +static int xen_blkbk_parse_indirect(struct blkif_request *req, + struct pending_req *pending_req, + struct seg_buf seg[], + struct phys_req *preq) +{ + struct grant_page **pages = pending_req->indirect_pages; + struct xen_blkif *blkif = pending_req->blkif; + int indirect_grefs, rc, n, nseg, i; + struct blkif_request_segment_aligned *segments = NULL; + + nseg = pending_req->nr_pages; + indirect_grefs = INDIRECT_PAGES(nseg); + BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); + + for (i = 0; i < indirect_grefs; i++) + pages[i]->gref = req->u.indirect.indirect_grefs[i]; + + rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + if (rc) + goto unmap; + + for (n = 0, i = 0; n < nseg; n++) { + if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { + /* Map indirect segments */ + if (segments) + kunmap_atomic(segments); + segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page); + } + i = n % SEGS_PER_INDIRECT_FRAME; + pending_req->segments[n]->gref = segments[i].gref; + seg[n].nsec = segments[i].last_sect - + segments[i].first_sect + 1; + seg[n].offset = (segments[i].first_sect << 9); + if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || + (segments[i].last_sect < segments[i].first_sect)) { + rc = -EINVAL; + goto unmap; } - seg[i].offset = (req->u.rw.seg[i].first_sect << 9); + preq->nr_sects += seg[n].nsec; } - return ret; + +unmap: + if (segments) + kunmap_atomic(segments); + xen_blkbk_unmap(blkif, pages, indirect_grefs); + return rc; } static int dispatch_discard_io(struct xen_blkif *blkif, @@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif, int status = BLKIF_RSP_OKAY; struct block_device *bdev = blkif->vbd.bdev; unsigned long secure; + struct phys_req preq; + + preq.sector_number = req->u.discard.sector_number; + preq.nr_sects = req->u.discard.nr_sectors; + err = xen_vbd_translate(&preq, blkif, WRITE); + if (err) { + pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n", + preq.sector_number, + preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); + goto fail_response; + } blkif->st_ds_req++; xen_blkif_get(blkif); @@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, err = blkdev_issue_discard(bdev, req->u.discard.sector_number, req->u.discard.nr_sectors, GFP_KERNEL, secure); - +fail_response: if (err == -EOPNOTSUPP) { pr_debug(DRV_PFX "discard op failed, not supported\n"); status = BLKIF_RSP_EOPNOTSUPP; @@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif, struct blkif_request *req, struct pending_req *pending_req) { - free_req(pending_req); + free_req(blkif, pending_req); make_response(blkif, req->u.other.id, req->operation, BLKIF_RSP_EOPNOTSUPP); return -EIO; @@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) * the proper response on the ring. */ if (atomic_dec_and_test(&pending_req->pendcnt)) { - xen_blkbk_unmap(pending_req); + xen_blkbk_unmap(pending_req->blkif, + pending_req->segments, + pending_req->nr_pages); make_response(pending_req->blkif, pending_req->id, pending_req->operation, pending_req->status); xen_blkif_put(pending_req->blkif); @@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if (atomic_read(&pending_req->blkif->drain)) complete(&pending_req->blkif->drain_complete); } - free_req(pending_req); + free_req(pending_req->blkif, pending_req); } } @@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif) rp = blk_rings->common.sring->req_prod; rmb(); /* Ensure we see queued requests up to 'rp'. */ + if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { + rc = blk_rings->common.rsp_prod_pvt; + pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", + rp, rc, rp - rc, blkif->vbd.pdevice); + return -EACCES; + } while (rc != rp) { if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) @@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif) break; } - pending_req = alloc_req(); + pending_req = alloc_req(blkif); if (NULL == pending_req) { blkif->st_oo_req++; more_to_do = 1; @@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif) case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: + case BLKIF_OP_INDIRECT: if (dispatch_rw_block_io(blkif, &req, pending_req)) goto done; break; case BLKIF_OP_DISCARD: - free_req(pending_req); + free_req(blkif, pending_req); if (dispatch_discard_io(blkif, &req)) goto done; break; @@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, struct pending_req *pending_req) { struct phys_req preq; - struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct seg_buf *seg = pending_req->seg; unsigned int nseg; struct bio *bio = NULL; - struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct bio **biolist = pending_req->biolist; int i, nbio = 0; int operation; struct blk_plug plug; bool drain = false; - struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct grant_page **pages = pending_req->segments; + unsigned short req_operation; + + req_operation = req->operation == BLKIF_OP_INDIRECT ? + req->u.indirect.indirect_op : req->operation; + if ((req->operation == BLKIF_OP_INDIRECT) && + (req_operation != BLKIF_OP_READ) && + (req_operation != BLKIF_OP_WRITE)) { + pr_debug(DRV_PFX "Invalid indirect operation (%u)\n", + req_operation); + goto fail_response; + } - switch (req->operation) { + switch (req_operation) { case BLKIF_OP_READ: blkif->st_rd_req++; operation = READ; @@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, } /* Check that the number of segments is sane. */ - nseg = req->u.rw.nr_segments; + nseg = req->operation == BLKIF_OP_INDIRECT ? + req->u.indirect.nr_segments : req->u.rw.nr_segments; if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || - unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + unlikely((req->operation != BLKIF_OP_INDIRECT) && + (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || + unlikely((req->operation == BLKIF_OP_INDIRECT) && + (nseg > MAX_INDIRECT_SEGMENTS))) { pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", nseg); /* Haven't submitted any bio's yet. */ goto fail_response; } - preq.sector_number = req->u.rw.sector_number; preq.nr_sects = 0; pending_req->blkif = blkif; pending_req->id = req->u.rw.id; - pending_req->operation = req->operation; + pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; - for (i = 0; i < nseg; i++) { - seg[i].nsec = req->u.rw.seg[i].last_sect - - req->u.rw.seg[i].first_sect + 1; - if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || - (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) + if (req->operation != BLKIF_OP_INDIRECT) { + preq.dev = req->u.rw.handle; + preq.sector_number = req->u.rw.sector_number; + for (i = 0; i < nseg; i++) { + pages[i]->gref = req->u.rw.seg[i].gref; + seg[i].nsec = req->u.rw.seg[i].last_sect - + req->u.rw.seg[i].first_sect + 1; + seg[i].offset = (req->u.rw.seg[i].first_sect << 9); + if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (req->u.rw.seg[i].last_sect < + req->u.rw.seg[i].first_sect)) + goto fail_response; + preq.nr_sects += seg[i].nsec; + } + } else { + preq.dev = req->u.indirect.handle; + preq.sector_number = req->u.indirect.sector_number; + if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq)) goto fail_response; - preq.nr_sects += seg[i].nsec; - } if (xen_vbd_translate(&preq, blkif, operation) != 0) { @@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * the hypercall to unmap the grants - that is all done in * xen_blkbk_unmap. */ - if (xen_blkbk_map(req, pending_req, seg, pages)) + if (xen_blkbk_map_seg(pending_req)) goto fail_flush; /* @@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, for (i = 0; i < nseg; i++) { while ((bio == NULL) || (bio_add_page(bio, - pages[i], + pages[i]->page, seg[i].nsec << 9, seg[i].offset) == 0)) { - bio = bio_alloc(GFP_KERNEL, nseg-i); + int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES); + bio = bio_alloc(GFP_KERNEL, nr_iovecs); if (unlikely(bio == NULL)) goto fail_put_bio; @@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, return 0; fail_flush: - xen_blkbk_unmap(pending_req); + xen_blkbk_unmap(blkif, pending_req->segments, + pending_req->nr_pages); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); - free_req(pending_req); + make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(blkif, pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, static int __init xen_blkif_init(void) { - int i, mmap_pages; int rc = 0; if (!xen_domain()) return -ENODEV; - blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); - if (!blkbk) { - pr_alert(DRV_PFX "%s: out of memory!\n", __func__); - return -ENOMEM; - } - - mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - - blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) * - xen_blkif_reqs, GFP_KERNEL); - blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * - mmap_pages, GFP_KERNEL); - blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * - mmap_pages, GFP_KERNEL); - - if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || - !blkbk->pending_pages) { - rc = -ENOMEM; - goto out_of_memory; - } - - for (i = 0; i < mmap_pages; i++) { - blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; - blkbk->pending_pages[i] = alloc_page(GFP_KERNEL); - if (blkbk->pending_pages[i] == NULL) { - rc = -ENOMEM; - goto out_of_memory; - } - } rc = xen_blkif_interface_init(); if (rc) goto failed_init; - INIT_LIST_HEAD(&blkbk->pending_free); - spin_lock_init(&blkbk->pending_free_lock); - init_waitqueue_head(&blkbk->pending_free_wq); - - for (i = 0; i < xen_blkif_reqs; i++) - list_add_tail(&blkbk->pending_reqs[i].free_list, - &blkbk->pending_free); - rc = xen_blkif_xenbus_init(); if (rc) goto failed_init; - return 0; - - out_of_memory: - pr_alert(DRV_PFX "%s: out of memory\n", __func__); failed_init: - kfree(blkbk->pending_reqs); - kfree(blkbk->pending_grant_handles); - if (blkbk->pending_pages) { - for (i = 0; i < mmap_pages; i++) { - if (blkbk->pending_pages[i]) - __free_page(blkbk->pending_pages[i]); - } - kfree(blkbk->pending_pages); - } - kfree(blkbk); - blkbk = NULL; return rc; } diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 60103e2..8d88075 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -50,6 +50,19 @@ __func__, __LINE__, ##args) +/* + * This is the maximum number of segments that would be allowed in indirect + * requests. This value will also be passed to the frontend. + */ +#define MAX_INDIRECT_SEGMENTS 256 + +#define SEGS_PER_INDIRECT_FRAME \ + (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) +#define MAX_INDIRECT_PAGES \ + ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +#define INDIRECT_PAGES(_segs) \ + ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) + /* Not a real protocol. Used to generate ring structs which contain * the elements common to all protocols only. This way we get a * compiler-checkable way to use common struct elements, so we can @@ -83,12 +96,31 @@ struct blkif_x86_32_request_other { uint64_t id; /* private guest value, echoed in resp */ } __attribute__((__packed__)); +struct blkif_x86_32_request_indirect { + uint8_t indirect_op; + uint16_t nr_segments; + uint64_t id; + blkif_sector_t sector_number; + blkif_vdev_t handle; + uint16_t _pad1; + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; + /* + * The maximum number of indirect segments (and pages) that will + * be used is determined by MAX_INDIRECT_SEGMENTS, this value + * is also exported to the guest (via xenstore + * feature-max-indirect-segments entry), so the frontend knows how + * many indirect segments the backend supports. + */ + uint64_t _pad2; /* make it 64 byte aligned */ +} __attribute__((__packed__)); + struct blkif_x86_32_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_x86_32_request_rw rw; struct blkif_x86_32_request_discard discard; struct blkif_x86_32_request_other other; + struct blkif_x86_32_request_indirect indirect; } u; } __attribute__((__packed__)); @@ -127,12 +159,32 @@ struct blkif_x86_64_request_other { uint64_t id; /* private guest value, echoed in resp */ } __attribute__((__packed__)); +struct blkif_x86_64_request_indirect { + uint8_t indirect_op; + uint16_t nr_segments; + uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */ + uint64_t id; + blkif_sector_t sector_number; + blkif_vdev_t handle; + uint16_t _pad2; + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; + /* + * The maximum number of indirect segments (and pages) that will + * be used is determined by MAX_INDIRECT_SEGMENTS, this value + * is also exported to the guest (via xenstore + * feature-max-indirect-segments entry), so the frontend knows how + * many indirect segments the backend supports. + */ + uint32_t _pad3; /* make it 64 byte aligned */ +} __attribute__((__packed__)); + struct blkif_x86_64_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_x86_64_request_rw rw; struct blkif_x86_64_request_discard discard; struct blkif_x86_64_request_other other; + struct blkif_x86_64_request_indirect indirect; } u; } __attribute__((__packed__)); @@ -182,12 +234,26 @@ struct xen_vbd { struct backend_info; +/* Number of available flags */ +#define PERSISTENT_GNT_FLAGS_SIZE 2 +/* This persistent grant is currently in use */ +#define PERSISTENT_GNT_ACTIVE 0 +/* + * This persistent grant has been used, this flag is set when we remove the + * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently. + */ +#define PERSISTENT_GNT_WAS_ACTIVE 1 + +/* Number of requests that we can fit in a ring */ +#define XEN_BLKIF_REQS 32 struct persistent_gnt { struct page *page; grant_ref_t gnt; grant_handle_t handle; + DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE); struct rb_node node; + struct list_head remove_node; }; struct xen_blkif { @@ -219,6 +285,23 @@ struct xen_blkif { /* tree to store persistent grants */ struct rb_root persistent_gnts; unsigned int persistent_gnt_c; + atomic_t persistent_gnt_in_use; + unsigned long next_lru; + + /* used by the kworker that offload work from the persistent purge */ + struct list_head persistent_purge_list; + struct work_struct persistent_purge_work; + + /* buffer of free pages to map grant refs */ + spinlock_t free_pages_lock; + int free_pages_num; + struct list_head free_pages; + + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; /* statistics */ unsigned long st_print; @@ -231,6 +314,41 @@ struct xen_blkif { unsigned long long st_wr_sect; wait_queue_head_t waiting_to_free; + /* Thread shutdown wait queue. */ + wait_queue_head_t shutdown_wq; +}; + +struct seg_buf { + unsigned long offset; + unsigned int nsec; +}; + +struct grant_page { + struct page *page; + struct persistent_gnt *persistent_gnt; + grant_handle_t handle; + grant_ref_t gref; +}; + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. + */ +struct pending_req { + struct xen_blkif *blkif; + u64 id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; + struct grant_page *segments[MAX_INDIRECT_SEGMENTS]; + /* Indirect descriptors */ + struct grant_page *indirect_pages[MAX_INDIRECT_PAGES]; + struct seg_buf seg[MAX_INDIRECT_SEGMENTS]; + struct bio *biolist[MAX_INDIRECT_SEGMENTS]; }; @@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); +int xen_blkif_purge_persistent(void *arg); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); @@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); static inline void blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) { - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; dst->operation = src->operation; switch (src->operation) { case BLKIF_OP_READ: @@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; + case BLKIF_OP_INDIRECT: + dst->u.indirect.indirect_op = src->u.indirect.indirect_op; + dst->u.indirect.nr_segments = src->u.indirect.nr_segments; + dst->u.indirect.handle = src->u.indirect.handle; + dst->u.indirect.id = src->u.indirect.id; + dst->u.indirect.sector_number = src->u.indirect.sector_number; + barrier(); + j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); + for (i = 0; i < j; i++) + dst->u.indirect.indirect_grefs[i] = + src->u.indirect.indirect_grefs[i]; + break; default: /* * Don't know how to translate this op. Only get the @@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, static inline void blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) { - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; dst->operation = src->operation; switch (src->operation) { case BLKIF_OP_READ: @@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; + case BLKIF_OP_INDIRECT: + dst->u.indirect.indirect_op = src->u.indirect.indirect_op; + dst->u.indirect.nr_segments = src->u.indirect.nr_segments; + dst->u.indirect.handle = src->u.indirect.handle; + dst->u.indirect.id = src->u.indirect.id; + dst->u.indirect.sector_number = src->u.indirect.sector_number; + barrier(); + j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); + for (i = 0; i < j; i++) + dst->u.indirect.indirect_grefs[i] = + src->u.indirect.indirect_grefs[i]; + break; default: /* * Don't know how to translate this op. Only get the diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 04608a6..fe5c3cd 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) err = PTR_ERR(blkif->xenblkd); blkif->xenblkd = NULL; xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); + return; } } static struct xen_blkif *xen_blkif_alloc(domid_t domid) { struct xen_blkif *blkif; + struct pending_req *req, *n; + int i, j; + + BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); if (!blkif) @@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) blkif->st_print = jiffies; init_waitqueue_head(&blkif->waiting_to_free); blkif->persistent_gnts.rb_node = NULL; + spin_lock_init(&blkif->free_pages_lock); + INIT_LIST_HEAD(&blkif->free_pages); + blkif->free_pages_num = 0; + atomic_set(&blkif->persistent_gnt_in_use, 0); + + INIT_LIST_HEAD(&blkif->pending_free); + + for (i = 0; i < XEN_BLKIF_REQS; i++) { + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + goto fail; + list_add_tail(&req->free_list, + &blkif->pending_free); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + req->segments[j] = kzalloc(sizeof(*req->segments[0]), + GFP_KERNEL); + if (!req->segments[j]) + goto fail; + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), + GFP_KERNEL); + if (!req->indirect_pages[j]) + goto fail; + } + } + spin_lock_init(&blkif->pending_free_lock); + init_waitqueue_head(&blkif->pending_free_wq); + init_waitqueue_head(&blkif->shutdown_wq); return blkif; + +fail: + list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { + list_del(&req->free_list); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + if (!req->segments[j]) + break; + kfree(req->segments[j]); + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + if (!req->indirect_pages[j]) + break; + kfree(req->indirect_pages[j]); + } + kfree(req); + } + + kmem_cache_free(xen_blkif_cachep, blkif); + + return ERR_PTR(-ENOMEM); } static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, @@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) { if (blkif->xenblkd) { kthread_stop(blkif->xenblkd); + wake_up(&blkif->shutdown_wq); blkif->xenblkd = NULL; } @@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) static void xen_blkif_free(struct xen_blkif *blkif) { + struct pending_req *req, *n; + int i = 0, j; + if (!atomic_dec_and_test(&blkif->refcnt)) BUG(); + + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { + list_del(&req->free_list); + + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); + + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } + + WARN_ON(i != XEN_BLKIF_REQS); + kmem_cache_free(xen_blkif_cachep, blkif); } @@ -678,6 +753,11 @@ again: dev->nodename); goto abort; } + err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u", + MAX_INDIRECT_SEGMENTS); + if (err) + dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)", + dev->nodename, err); err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(&be->blkif->vbd)); @@ -704,6 +784,11 @@ again: dev->nodename); goto abort; } + err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u", + bdev_physical_block_size(be->blkif->vbd.bdev)); + if (err) + xenbus_dev_error(dev, err, "writing %s/physical-sector-size", + dev->nodename); err = xenbus_transaction_end(xbt, 0); if (err == -EAGAIN) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d89ef86..a4660bb 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -74,12 +74,30 @@ struct grant { struct blk_shadow { struct blkif_request req; struct request *request; - struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct grant **grants_used; + struct grant **indirect_grants; + struct scatterlist *sg; +}; + +struct split_bio { + struct bio *bio; + atomic_t pending; + int err; }; static DEFINE_MUTEX(blkfront_mutex); static const struct block_device_operations xlvbd_block_fops; +/* + * Maximum number of segments in indirect requests, the actual value used by + * the frontend driver is the minimum of this value and the value provided + * by the backend driver. + */ + +static unsigned int xen_blkif_max_segments = 32; +module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); +MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); + #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) /* @@ -98,7 +116,6 @@ struct blkfront_info enum blkif_state connected; int ring_ref; struct blkif_front_ring ring; - struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; @@ -114,6 +131,7 @@ struct blkfront_info unsigned int discard_granularity; unsigned int discard_alignment; unsigned int feature_persistent:1; + unsigned int max_indirect_segments; int is_ready; }; @@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ +#define SEGS_PER_INDIRECT_FRAME \ + (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) +#define INDIRECT_GREFS(_segs) \ + ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) + +static int blkfront_setup_indirect(struct blkfront_info *info); + static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; @@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req) struct blkif_request *ring_req; unsigned long id; unsigned int fsect, lsect; - int i, ref; + int i, ref, n; + struct blkif_request_segment_aligned *segments = NULL; /* * Used to store if we are able to queue the request by just using @@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req) grant_ref_t gref_head; struct grant *gnt_list_entry = NULL; struct scatterlist *sg; + int nseg, max_grefs; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return 1; - /* Check if we have enought grants to allocate a requests */ - if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { + max_grefs = info->max_indirect_segments ? + info->max_indirect_segments + + INDIRECT_GREFS(info->max_indirect_segments) : + BLKIF_MAX_SEGMENTS_PER_REQUEST; + + /* Check if we have enough grants to allocate a requests */ + if (info->persistent_gnts_c < max_grefs) { new_persistent_gnts = 1; if (gnttab_alloc_grant_references( - BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, + max_grefs - info->persistent_gnts_c, &gref_head) < 0) { gnttab_request_free_callback( &info->callback, blkif_restart_queue_callback, info, - BLKIF_MAX_SEGMENTS_PER_REQUEST); + max_grefs); return 1; } } else @@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req) id = get_id_from_freelist(info); info->shadow[id].request = req; - ring_req->u.rw.id = id; - ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.rw.handle = info->handle; - - ring_req->operation = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { - /* - * Ideally we can do an unordered flush-to-disk. In case the - * backend onlysupports barriers, use that. A barrier request - * a superset of FUA, so we can implement it the same - * way. (It's also a FLUSH+FUA, since it is - * guaranteed ordered WRT previous writes.) - */ - ring_req->operation = info->flush_op; - } - if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { - /* id, sector_number and handle are set above. */ ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + ring_req->u.discard.id = id; + ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; else ring_req->u.discard.flag = 0; } else { - ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, - info->sg); - BUG_ON(ring_req->u.rw.nr_segments > - BLKIF_MAX_SEGMENTS_PER_REQUEST); - - for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { + BUG_ON(info->max_indirect_segments == 0 && + req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + BUG_ON(info->max_indirect_segments && + req->nr_phys_segments > info->max_indirect_segments); + nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + ring_req->u.rw.id = id; + if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + /* + * The indirect operation can only be a BLKIF_OP_READ or + * BLKIF_OP_WRITE + */ + BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->u.indirect.indirect_op = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.indirect.handle = info->handle; + ring_req->u.indirect.nr_segments = nseg; + } else { + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + /* + * Ideally we can do an unordered flush-to-disk. In case the + * backend onlysupports barriers, use that. A barrier request + * a superset of FUA, so we can implement it the same + * way. (It's also a FLUSH+FUA, since it is + * guaranteed ordered WRT previous writes.) + */ + ring_req->operation = info->flush_op; + } + ring_req->u.rw.nr_segments = nseg; + } + for_each_sg(info->shadow[id].sg, sg, nseg, i) { fsect = sg->offset >> 9; lsect = fsect + (sg->length >> 9) - 1; + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (i % SEGS_PER_INDIRECT_FRAME == 0)) { + if (segments) + kunmap_atomic(segments); + + n = i / SEGS_PER_INDIRECT_FRAME; + gnt_list_entry = get_grant(&gref_head, info); + info->shadow[id].indirect_grants[n] = gnt_list_entry; + segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } + gnt_list_entry = get_grant(&gref_head, info); ref = gnt_list_entry->gref; @@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req) BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic( - pfn_to_page(gnt_list_entry->pfn)); + shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); bvec_data = kmap_atomic(sg_page(sg)); /* @@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req) kunmap_atomic(bvec_data); kunmap_atomic(shared_data); } - - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + n = i % SEGS_PER_INDIRECT_FRAME; + segments[n] = + (struct blkif_request_segment_aligned) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } } + if (segments) + kunmap_atomic(segments); } info->ring.req_prod_pvt++; @@ -542,7 +608,9 @@ wait: flush_requests(info); } -static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, + unsigned int physical_sector_size, + unsigned int segments) { struct request_queue *rq; struct blkfront_info *info = gd->private_data; @@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); - blk_queue_max_hw_sectors(rq, 512); + blk_queue_physical_block_size(rq, physical_sector_size); + blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); /* Each segment in a request is up to an aligned page in size. */ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); blk_queue_max_segment_size(rq, PAGE_SIZE); /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_segments(rq, segments); /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); @@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: %s: %s %s\n", + printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", info->gd->disk_name, info->flush_op == BLKIF_OP_WRITE_BARRIER ? "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? "flush diskcache" : "barrier or flush"), - info->feature_flush ? "enabled" : "disabled", - info->feature_persistent ? "using persistent grants" : ""); + info->feature_flush ? "enabled;" : "disabled;", + "persistent grants:", + info->feature_persistent ? "enabled;" : "disabled;", + "indirect descriptors:", + info->max_indirect_segments ? "enabled;" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, - u16 vdisk_info, u16 sector_size) + u16 vdisk_info, u16 sector_size, + unsigned int physical_sector_size) { struct gendisk *gd; int nr_minors = 1; @@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, gd->driverfs_dev = &(info->xbdev->dev); set_capacity(gd, capacity); - if (xlvbd_init_blk_queue(gd, sector_size)) { + if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, + info->max_indirect_segments ? : + BLKIF_MAX_SEGMENTS_PER_REQUEST)) { del_gendisk(gd); goto release; } @@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) { struct grant *persistent_gnt; struct grant *n; + int i, j, segs; /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); @@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend) } BUG_ON(info->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE; i++) { + /* + * Clear persistent grants present in requests already + * on the shared ring + */ + if (!info->shadow[i].request) + goto free_shadow; + + segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? + info->shadow[i].req.u.indirect.nr_segments : + info->shadow[i].req.u.rw.nr_segments; + for (j = 0; j < segs; j++) { + persistent_gnt = info->shadow[i].grants_used[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + + if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + /* + * If this is not an indirect operation don't try to + * free indirect segments + */ + goto free_shadow; + + for (j = 0; j < INDIRECT_GREFS(segs); j++) { + persistent_gnt = info->shadow[i].indirect_grants[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + +free_shadow: + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + } + /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); spin_unlock_irq(&info->io_lock); @@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, struct blkif_response *bret) { int i = 0; - struct bio_vec *bvec; - struct req_iterator iter; - unsigned long flags; + struct scatterlist *sg; char *bvec_data; void *shared_data; - unsigned int offset = 0; + int nseg; + + nseg = s->req.operation == BLKIF_OP_INDIRECT ? + s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; if (bret->operation == BLKIF_OP_READ) { /* @@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, * than PAGE_SIZE, we have to keep track of the current offset, * to be sure we are copying the data from the right shared page. */ - rq_for_each_segment(bvec, s->request, iter) { - BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); - if (bvec->bv_offset < offset) - i++; - BUG_ON(i >= s->req.u.rw.nr_segments); + for_each_sg(s->sg, sg, nseg, i) { + BUG_ON(sg->offset + sg->length > PAGE_SIZE); shared_data = kmap_atomic( pfn_to_page(s->grants_used[i]->pfn)); - bvec_data = bvec_kmap_irq(bvec, &flags); - memcpy(bvec_data, shared_data + bvec->bv_offset, - bvec->bv_len); - bvec_kunmap_irq(bvec_data, &flags); + bvec_data = kmap_atomic(sg_page(sg)); + memcpy(bvec_data + sg->offset, + shared_data + sg->offset, + sg->length); + kunmap_atomic(bvec_data); kunmap_atomic(shared_data); - offset = bvec->bv_offset + bvec->bv_len; } } /* Add the persistent grant into the list of free grants */ - for (i = 0; i < s->req.u.rw.nr_segments; i++) { + for (i = 0; i < nseg; i++) { list_add(&s->grants_used[i]->node, &info->persistent_gnts); info->persistent_gnts_c++; } + if (s->req.operation == BLKIF_OP_INDIRECT) { + for (i = 0; i < INDIRECT_GREFS(nseg); i++) { + list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); + info->persistent_gnts_c++; + } + } } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev, SHARED_RING_INIT(sring); FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); - sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - /* Allocate memory for grants */ - err = fill_grant_buffer(info, BLK_RING_SIZE * - BLKIF_MAX_SEGMENTS_PER_REQUEST); - if (err) - goto fail; - err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); if (err < 0) { free_page((unsigned long)sring); @@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev, return 0; } +/* + * This is a clone of md_trim_bio, used to split a bio into smaller ones + */ +static void trim_bio(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. + * This requires adjusting bi_sector, bi_size, and bi_io_vec + */ + int i; + struct bio_vec *bvec; + int sofar = 0; + + size <<= 9; + if (offset == 0 && size == bio->bi_size) + return; + + bio->bi_sector += offset; + bio->bi_size = size; + offset <<= 9; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + + while (bio->bi_idx < bio->bi_vcnt && + bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { + /* remove this whole bio_vec */ + offset -= bio->bi_io_vec[bio->bi_idx].bv_len; + bio->bi_idx++; + } + if (bio->bi_idx < bio->bi_vcnt) { + bio->bi_io_vec[bio->bi_idx].bv_offset += offset; + bio->bi_io_vec[bio->bi_idx].bv_len -= offset; + } + /* avoid any complications with bi_idx being non-zero*/ + if (bio->bi_idx) { + memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, + (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); + bio->bi_vcnt -= bio->bi_idx; + bio->bi_idx = 0; + } + /* Make sure vcnt and last bv are not too big */ + bio_for_each_segment(bvec, bio, i) { + if (sofar + bvec->bv_len > size) + bvec->bv_len = size - sofar; + if (bvec->bv_len == 0) { + bio->bi_vcnt = i; + break; + } + sofar += bvec->bv_len; + } +} + +static void split_bio_end(struct bio *bio, int error) +{ + struct split_bio *split_bio = bio->bi_private; + + if (error) + split_bio->err = error; + + if (atomic_dec_and_test(&split_bio->pending)) { + split_bio->bio->bi_phys_segments = 0; + bio_endio(split_bio->bio, split_bio->err); + kfree(split_bio); + } + bio_put(bio); +} static int blkif_recover(struct blkfront_info *info) { int i; - struct blkif_request *req; + struct request *req, *n; struct blk_shadow *copy; - int j; + int rc; + struct bio *bio, *cloned_bio; + struct bio_list bio_list, merge_bio; + unsigned int segs, offset; + int pending, size; + struct split_bio *split_bio; + struct list_head requests; /* Stage 1: Make a safe copy of the shadow state. */ copy = kmemdup(info->shadow, sizeof(info->shadow), @@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info) info->shadow_free = info->ring.req_prod_pvt; info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; - /* Stage 3: Find pending requests and requeue them. */ + rc = blkfront_setup_indirect(info); + if (rc) { + kfree(copy); + return rc; + } + + segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; + blk_queue_max_segments(info->rq, segs); + bio_list_init(&bio_list); + INIT_LIST_HEAD(&requests); for (i = 0; i < BLK_RING_SIZE; i++) { /* Not in use? */ if (!copy[i].request) continue; - /* Grab a request slot and copy shadow state into it. */ - req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - *req = copy[i].req; - - /* We get a new request id, and must reset the shadow state. */ - req->u.rw.id = get_id_from_freelist(info); - memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); - - if (req->operation != BLKIF_OP_DISCARD) { - /* Rewrite any grant references invalidated by susp/resume. */ - for (j = 0; j < req->u.rw.nr_segments; j++) - gnttab_grant_foreign_access_ref( - req->u.rw.seg[j].gref, - info->xbdev->otherend_id, - pfn_to_mfn(copy[i].grants_used[j]->pfn), - 0); + /* + * Get the bios in the request so we can re-queue them. + */ + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; } - info->shadow[req->u.rw.id].req = *req; - - info->ring.req_prod_pvt++; + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_put_request(copy[i].request); } kfree(copy); + /* + * Empty the queue, this is important because we might have + * requests in the queue with more segments than what we + * can handle now. + */ + spin_lock_irq(&info->io_lock); + while ((req = blk_fetch_request(info->rq)) != NULL) { + if (req->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + list_add(&req->queuelist, &requests); + continue; + } + merge_bio.head = req->bio; + merge_bio.tail = req->biotail; + bio_list_merge(&bio_list, &merge_bio); + req->bio = NULL; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) + pr_alert("diskcache flush request found!\n"); + __blk_put_request(info->rq, req); + } + spin_unlock_irq(&info->io_lock); + xenbus_switch_state(info->xbdev, XenbusStateConnected); spin_lock_irq(&info->io_lock); @@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info) /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Send off requeued requests */ - flush_requests(info); - /* Kick any other new requests queued since we resumed */ kick_pending_request_queues(info); + list_for_each_entry_safe(req, n, &requests, queuelist) { + /* Requeue pending requests (flush or discard) */ + list_del_init(&req->queuelist); + BUG_ON(req->nr_phys_segments > segs); + blk_requeue_request(info->rq, req); + } spin_unlock_irq(&info->io_lock); + while ((bio = bio_list_pop(&bio_list)) != NULL) { + /* Traverse the list of pending bios and re-queue them */ + if (bio_segments(bio) > segs) { + /* + * This bio has more segments than what we can + * handle, we have to split it. + */ + pending = (bio_segments(bio) + segs - 1) / segs; + split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); + BUG_ON(split_bio == NULL); + atomic_set(&split_bio->pending, pending); + split_bio->bio = bio; + for (i = 0; i < pending; i++) { + offset = (i * segs * PAGE_SIZE) >> 9; + size = min((unsigned int)(segs * PAGE_SIZE) >> 9, + (unsigned int)(bio->bi_size >> 9) - offset); + cloned_bio = bio_clone(bio, GFP_NOIO); + BUG_ON(cloned_bio == NULL); + trim_bio(cloned_bio, offset, size); + cloned_bio->bi_private = split_bio; + cloned_bio->bi_end_io = split_bio_end; + submit_bio(cloned_bio->bi_rw, cloned_bio); + } + /* + * Now we have to wait for all those smaller bios to + * end, so we can also end the "parent" bio. + */ + continue; + } + /* We don't need to split this bio */ + submit_bio(bio->bi_rw, bio); + } + return 0; } @@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev) blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); err = talk_to_blkback(dev, info); - if (info->connected == BLKIF_STATE_SUSPENDED && !err) - err = blkif_recover(info); + + /* + * We have to wait for the backend to switch to + * connected state, since we want to read which + * features it supports. + */ return err; } @@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info) kfree(type); } +static int blkfront_setup_indirect(struct blkfront_info *info) +{ + unsigned int indirect_segments, segs; + int err, i; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &indirect_segments, + NULL); + if (err) { + info->max_indirect_segments = 0; + segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; + } else { + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + segs = info->max_indirect_segments; + } + + err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); + if (err) + goto out_of_memory; + + for (i = 0; i < BLK_RING_SIZE; i++) { + info->shadow[i].grants_used = kzalloc( + sizeof(info->shadow[i].grants_used[0]) * segs, + GFP_NOIO); + info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); + if (info->max_indirect_segments) + info->shadow[i].indirect_grants = kzalloc( + sizeof(info->shadow[i].indirect_grants[0]) * + INDIRECT_GREFS(segs), + GFP_NOIO); + if ((info->shadow[i].grants_used == NULL) || + (info->shadow[i].sg == NULL) || + (info->max_indirect_segments && + (info->shadow[i].indirect_grants == NULL))) + goto out_of_memory; + sg_init_table(info->shadow[i].sg, segs); + } + + + return 0; + +out_of_memory: + for (i = 0; i < BLK_RING_SIZE; i++) { + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + } + return -ENOMEM; +} + /* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). @@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info) { unsigned long long sectors; unsigned long sector_size; + unsigned int physical_sector_size; unsigned int binfo; int err; int barrier, flush, discard, persistent; @@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info) set_capacity(info->gd, sectors); revalidate_disk(info->gd); - /* fall through */ + return; case BLKIF_STATE_SUSPENDED: + /* + * If we are recovering from suspension, we need to wait + * for the backend to announce it's features before + * reconnecting, at least we need to know if the backend + * supports indirect descriptors, and how many. + */ + blkif_recover(info); return; default: @@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info) return; } + /* + * physcial-sector-size is a newer field, so old backends may not + * provide this. Assume physical sector size to be the same as + * sector_size in that case. + */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "physical-sector-size", "%u", &physical_sector_size); + if (err != 1) + physical_sector_size = sector_size; + info->feature_flush = 0; info->flush_op = 0; @@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info) else info->feature_persistent = persistent; - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); + err = blkfront_setup_indirect(info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + return; + } + + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, + physical_sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", info->xbdev->otherend); diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 048f294..e45f557 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,7 +63,10 @@ #include "bcache.h" #include "btree.h" +#include <linux/freezer.h> +#include <linux/kthread.h> #include <linux/random.h> +#include <trace/events/bcache.h> #define MAX_IN_FLIGHT_DISCARDS 8U @@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w) mutex_unlock(&ca->set->bucket_lock); closure_wake_up(&ca->set->bucket_wait); - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); closure_put(&ca->set->cl); } @@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca) break; } - pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", - fifo_used(&ca->free), ca->free.size, - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->unused), ca->unused.size); + trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ do { \ - DEFINE_WAIT(__wait); \ - \ while (1) { \ - prepare_to_wait(&ca->set->alloc_wait, \ - &__wait, TASK_INTERRUPTIBLE); \ + set_current_state(TASK_INTERRUPTIBLE); \ if (cond) \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ - if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ - finish_wait(&ca->set->alloc_wait, &__wait); \ - closure_return(cl); \ - } \ + if (kthread_should_stop()) \ + return 0; \ \ + try_to_freeze(); \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ } \ - \ - finish_wait(&ca->set->alloc_wait, &__wait); \ + __set_current_state(TASK_RUNNING); \ } while (0) -void bch_allocator_thread(struct closure *cl) +static int bch_allocator_thread(void *arg) { - struct cache *ca = container_of(cl, struct cache, alloc); + struct cache *ca = arg; mutex_lock(&ca->set->bucket_lock); @@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) { long r = -1; again: - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); if (fifo_used(&ca->free) > ca->watermark[watermark] && fifo_pop(&ca->free, r)) { @@ -476,9 +471,7 @@ again: return r; } - pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", - atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + trace_bcache_alloc_fail(ca); if (cl) { closure_wait(&ca->set->bucket_wait, cl); @@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, /* Init */ +int bch_cache_allocator_start(struct cache *ca) +{ + struct task_struct *k = kthread_run(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(k)) + return PTR_ERR(k); + + ca->alloc_thread = k; + return 0; +} + void bch_cache_allocator_exit(struct cache *ca) { struct discard *d; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3e15b4..b39f6f0 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -178,7 +178,6 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ #include <linux/bio.h> -#include <linux/blktrace_api.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mutex.h> @@ -388,8 +387,6 @@ struct keybuf_key { typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); struct keybuf { - keybuf_pred_fn *key_predicate; - struct bkey last_scanned; spinlock_t lock; @@ -437,9 +434,12 @@ struct bcache_device { /* If nonzero, we're detaching/unregistering from cache set */ atomic_t detaching; + int flush_done; + + uint64_t nr_stripes; + unsigned stripe_size_bits; + atomic_t *stripe_sectors_dirty; - atomic_long_t sectors_dirty; - unsigned long sectors_dirty_gc; unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -531,6 +531,7 @@ struct cached_dev { unsigned sequential_merge:1; unsigned verify:1; + unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; unsigned writeback_running:1; unsigned char writeback_percent; @@ -565,8 +566,7 @@ struct cache { unsigned watermark[WATERMARK_MAX]; - struct closure alloc; - struct workqueue_struct *alloc_workqueue; + struct task_struct *alloc_thread; struct closure prio; struct prio_set *disk_buckets; @@ -664,13 +664,9 @@ struct gc_stat { * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. * flushing dirty data). - * - * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down - * the allocation thread. */ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 -#define CACHE_SET_STOPPING_2 2 struct cache_set { struct closure cl; @@ -703,9 +699,6 @@ struct cache_set { /* For the btree cache */ struct shrinker shrink; - /* For the allocator itself */ - wait_queue_head_t alloc_wait; - /* For the btree cache and anything allocation related */ struct mutex bucket_lock; @@ -823,10 +816,9 @@ struct cache_set { /* * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - this is a single element mempool for btree_read_work() + * on the stack - have to dynamically allocate them */ - struct mutex fill_lock; - struct btree_iter *fill_iter; + mempool_t *fill_iter; /* * btree_sort() is a merge sort and requires temporary space - single @@ -834,6 +826,7 @@ struct cache_set { */ struct mutex sort_lock; struct bset *sort; + unsigned sort_crit_factor; /* List of buckets we're currently writing data to */ struct list_head data_buckets; @@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void) return local_clock() >> 10; } -#define MAX_BSETS 4U - #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768 @@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k) atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } -/* Blktrace macros */ - -#define blktrace_msg(c, fmt, ...) \ -do { \ - struct request_queue *q = bdev_get_queue(c->bdev); \ - if (q) \ - blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \ -} while (0) - -#define blktrace_msg_all(s, fmt, ...) \ -do { \ - struct cache *_c; \ - unsigned i; \ - for_each_cache(_c, (s), i) \ - blktrace_msg(_c, fmt, ##__VA_ARGS__); \ -} while (0) - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) @@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b) static struct kobj_attribute ksysfs_##n = \ __ATTR(n, S_IWUSR|S_IRUSR, show, store) -/* Forward declarations */ +static inline void wake_up_allocators(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + for_each_cache(ca, c, i) + wake_up_process(ca->alloc_thread); +} -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *, unsigned); +/* Forward declarations */ void bch_count_io_errors(struct cache *, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, @@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -void bch_allocator_thread(struct closure *); long bch_bucket_alloc(struct cache *, unsigned, struct closure *); void bch_bucket_free(struct cache_set *, struct bkey *); @@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); -void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); +int bch_cache_allocator_start(struct cache *ca); void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 1d27d3a..8010eed 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) { unsigned i; + char buf[80]; if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) goto bad; @@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) return false; bad: - cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); return true; } @@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) #ifdef CONFIG_BCACHE_EDEBUG bug: mutex_unlock(&b->c->bucket_lock); - btree_bug(b, + + { + char buf[80]; + + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + } return true; #endif } @@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) new->sets->size = 0; } +#define SORT_CRIT (4096 / sizeof(uint64_t)) + void bch_btree_sort_lazy(struct btree *b) { - if (b->nsets) { - unsigned i, j, keys = 0, total; + unsigned crit = SORT_CRIT; + int i; - for (i = 0; i <= b->nsets; i++) - keys += b->sets[i].data->keys; - - total = keys; + /* Don't sort if nothing to do */ + if (!b->nsets) + goto out; - for (j = 0; j < b->nsets; j++) { - if (keys * 2 < total || - keys < 1000) { - bch_btree_sort_partial(b, j); - return; - } + /* If not a leaf node, always sort */ + if (b->level) { + bch_btree_sort(b); + return; + } - keys -= b->sets[j].data->keys; - } + for (i = b->nsets - 1; i >= 0; --i) { + crit *= b->c->sort_crit_factor; - /* Must sort if b->nsets == 3 or we'll overflow */ - if (b->nsets >= (MAX_BSETS - 1) - b->level) { - bch_btree_sort(b); + if (b->sets[i].data->keys < crit) { + bch_btree_sort_partial(b, i); return; } } + /* Sort if we'd overflow */ + if (b->nsets + 1 == MAX_BSETS) { + bch_btree_sort(b); + return; + } + +out: bset_build_written_tree(b); } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 57a9cff..ae115a2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,6 +1,8 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H +#include <linux/slab.h> + /* * BKEYS: * @@ -142,6 +144,8 @@ /* Btree key comparison/iteration */ +#define MAX_BSETS 4U + struct btree_iter { size_t size, used; struct btree_iter_set { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f..ee37288 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -24,6 +24,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include <linux/slab.h> #include <linux/bitops.h> @@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void btree_bio_endio(struct bio *bio, int error) +static void bch_btree_node_read_done(struct btree *b) { - struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); - - if (error) - set_btree_node_io_error(b); - - bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) - ? "writing btree" : "reading btree"); - closure_put(cl); -} - -static void btree_bio_init(struct btree *b) -{ - BUG_ON(b->bio); - b->bio = bch_bbio_alloc(b->c); - - b->bio->bi_end_io = btree_bio_endio; - b->bio->bi_private = &b->io.cl; -} - -void bch_btree_read_done(struct closure *cl) -{ - struct btree *b = container_of(cl, struct btree, io.cl); - struct bset *i = b->sets[0].data; - struct btree_iter *iter = b->c->fill_iter; const char *err = "bad btree header"; - BUG_ON(b->nsets || b->written); - - bch_bbio_free(b->bio, b->c); - b->bio = NULL; + struct bset *i = b->sets[0].data; + struct btree_iter *iter; - mutex_lock(&b->c->fill_lock); + iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; - if (btree_node_io_error(b) || - !i->seq) + if (!i->seq) goto err; for (; @@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl) if (b->written < btree_blocks(b)) bch_bset_init_next(b); out: - - mutex_unlock(&b->c->fill_lock); - - spin_lock(&b->c->btree_read_time_lock); - bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); - spin_unlock(&b->c->btree_read_time_lock); - - smp_wmb(); /* read_done is our write lock */ - set_btree_node_read_done(b); - - closure_return(cl); + mempool_free(iter, b->c->fill_iter); + return; err: set_btree_node_io_error(b); bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", @@ -247,48 +212,69 @@ err: goto out; } -void bch_btree_read(struct btree *b) +static void btree_node_read_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +void bch_btree_node_read(struct btree *b) { - BUG_ON(b->nsets || b->written); + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + + trace_bcache_btree_read(b); + + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); + bio->bi_rw = REQ_META|READ_SYNC; + bio->bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; + + bch_bio_map(bio, b->sets[0].data); + + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); - if (!closure_trylock(&b->io.cl, &b->c->cl)) - BUG(); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_btree_node_io_error(b); - b->io_start_time = local_clock(); + bch_bbio_free(bio, b->c); - btree_bio_init(b); - b->bio->bi_rw = REQ_META|READ_SYNC; - b->bio->bi_size = KEY_SIZE(&b->key) << 9; + if (btree_node_io_error(b)) + goto err; - bch_bio_map(b->bio, b->sets[0].data); + bch_btree_node_read_done(b); - pr_debug("%s", pbtree(b)); - trace_bcache_btree_read(b->bio); - bch_submit_bbio(b->bio, b->c, &b->key, 0); + spin_lock(&b->c->btree_read_time_lock); + bch_time_stats_update(&b->c->btree_read_time, start_time); + spin_unlock(&b->c->btree_read_time_lock); - continue_at(&b->io.cl, bch_btree_read_done, system_wq); + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %lu", + PTR_BUCKET_NR(b->c, &b->key, 0)); } static void btree_complete_write(struct btree *b, struct btree_write *w) { if (w->prio_blocked && !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); + wake_up_allocators(b->c); if (w->journal) { atomic_dec_bug(w->journal); __closure_wake_up(&b->c->journal.wait); } - if (w->owner) - closure_put(w->owner); - w->prio_blocked = 0; w->journal = NULL; - w->owner = NULL; } -static void __btree_write_done(struct closure *cl) +static void __btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct btree_write *w = btree_prev_write(b); @@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl) closure_return(cl); } -static void btree_write_done(struct closure *cl) +static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct bio_vec *bv; @@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl) __bio_for_each_segment(bv, b->bio, n, 0) __free_page(bv->bv_page); - __btree_write_done(cl); + __btree_node_write_done(cl); } -static void do_btree_write(struct btree *b) +static void btree_node_write_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io.cl; struct bset *i = b->sets[b->nsets].data; @@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b) i->version = BCACHE_BSET_VERSION; i->csum = btree_csum_set(b, i); - btree_bio_init(b); - b->bio->bi_rw = REQ_META|WRITE_SYNC; - b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = &b->io.cl; + b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; + b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); - if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { + if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); - continue_at(cl, btree_write_done, NULL); + continue_at(cl, btree_node_write_done, NULL); } else { b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_write_done(cl); + __btree_node_write_done(cl); } } -static void __btree_write(struct btree *b) +void bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = b->sets[b->nsets].data; + trace_bcache_btree_write(b); + BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(b->sets->data->seq != i->seq); + bch_check_key_order(b, i); - closure_lock(&b->io, &b->c->cl); cancel_delayed_work(&b->work); + /* If caller isn't waiting for write, parent refcount is cache set */ + closure_lock(&b->io, parent ?: &b->c->cl); + clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); - bch_check_key_order(b, i); - BUG_ON(b->written && !i->keys); - - do_btree_write(b); - - pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); + do_btree_node_write(b); b->written += set_blocks(i, b->c); atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, @@ -387,37 +405,31 @@ static void __btree_write(struct btree *b) bch_bset_init_next(b); } -static void btree_write_work(struct work_struct *w) +static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - down_write(&b->lock); + rw_lock(true, b, b->level); if (btree_node_dirty(b)) - __btree_write(b); - up_write(&b->lock); + bch_btree_node_write(b, NULL); + rw_unlock(true, b); } -void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); - BUG_ON(b->written && - (b->written >= btree_blocks(b) || - i->seq != b->sets[0].data->seq || - !i->keys)); + BUG_ON(!b->written); + BUG_ON(!i->keys); - if (!btree_node_dirty(b)) { - set_btree_node_dirty(b); - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); - } + if (!btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); - w->prio_blocked += b->prio_blocked; - b->prio_blocked = 0; + set_btree_node_dirty(b); - if (op && op->journal && !b->level) { + if (op && op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op) } } - if (current->bio_list) - return; - /* Force write if set is too big */ - if (now || - b->level || - set_bytes(i) > PAGE_SIZE - 48) { - if (op && now) { - /* Must wait on multiple writes */ - BUG_ON(w->owner); - w->owner = &op->cl; - closure_get(&op->cl); - } - - __btree_write(b); - } - BUG_ON(!b->written); + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); } /* @@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); INIT_LIST_HEAD(&b->list); - INIT_DELAYED_WORK(&b->work, btree_write_work); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; closure_init_unlocked(&b->io); @@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) BUG_ON(btree_node_dirty(b) && !b->sets[0].data); if (cl && btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); if (cl) closure_wait_event_async(&b->io.wait, cl, @@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) else if (!mutex_trylock(&c->bucket_lock)) return -1; + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ nr /= c->btree_pages; nr = min_t(unsigned long, nr, mca_can_free(c)); @@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, int ret = -ENOMEM; struct btree *i; + trace_bcache_btree_cache_cannibalize(c); + if (!cl) return ERR_PTR(-ENOMEM); @@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, return ERR_PTR(-EAGAIN); } - /* XXX: tracepoint */ c->try_harder = cl; c->try_harder_start = local_clock(); retry: @@ -905,6 +912,9 @@ retry: b = mca_find(c, k); if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + mutex_lock(&c->bucket_lock); b = mca_alloc(c, k, level, &op->cl); mutex_unlock(&c->bucket_lock); @@ -914,7 +924,7 @@ retry: if (IS_ERR(b)) return b; - bch_btree_read(b); + bch_btree_node_read(b); if (!write) downgrade_write(&b->lock); @@ -937,15 +947,12 @@ retry: for (; i <= b->nsets; i++) prefetch(b->sets[i].data); - if (!closure_wait_event(&b->io.wait, &op->cl, - btree_node_read_done(b))) { - rw_unlock(write, b); - b = ERR_PTR(-EAGAIN); - } else if (btree_node_io_error(b)) { + if (btree_node_io_error(b)) { rw_unlock(write, b); - b = ERR_PTR(-EIO); - } else - BUG_ON(!b->written); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); return b; } @@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { - bch_btree_read(b); + bch_btree_node_read(b); rw_unlock(true, b); } } @@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op) { unsigned i; + trace_bcache_btree_node_free(b); + /* * The BUG_ON() in btree_node_get() implies that we must have a write * lock on parent to free or even invalidate a node */ BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); - pr_debug("bucket %s", pbtree(b)); if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); - if (b->prio_blocked && - !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); - - b->prio_blocked = 0; - cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); @@ -1028,17 +1030,20 @@ retry: goto retry; } - set_btree_node_read_done(b); b->accessed = 1; bch_bset_init_next(b); mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); return b; err_free: bch_bucket_free(c, &k.key); __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(b); return b; } @@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nkeys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) { + if (KEY_DIRTY(k)) gc->dirty += KEY_SIZE(k); - if (d) - d->sectors_dirty_gc += KEY_SIZE(k); - } } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, if (!IS_ERR_OR_NULL(n)) { swap(b, n); + __bkey_put(b->c, &b->key); memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - __bkey_put(b->c, &b->key); - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - btree_node_free(n, op); up_write(&n->lock); } @@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, btree_node_free(r->b, op); up_write(&r->b->lock); - pr_debug("coalesced %u nodes", nodes); + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; nodes--; @@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, void write(struct btree *r) { if (!r->written) - bch_btree_write(r, true, op); - else if (btree_node_dirty(r)) { - BUG_ON(btree_current_write(r)->owner); - btree_current_write(r)->owner = writes; - closure_get(writes); - - bch_btree_write(r, true, NULL); - } + bch_btree_node_write(r, &op->cl); + else if (btree_node_dirty(r)) + bch_btree_node_write(r, writes); up_write(&r->lock); } @@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ret = btree_gc_recurse(b, op, writes, gc); if (!b->written || btree_node_dirty(b)) { - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - bch_btree_write(b, true, n ? op : NULL); + bch_btree_node_write(b, n ? &op->cl : NULL); } if (!IS_ERR_OR_NULL(n)) { @@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - struct bcache_device **d; unsigned i; if (!c->gc_mark_valid) @@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { b->gc_gen = b->gen; - if (!atomic_read(&b->pin)) + if (!atomic_read(&b->pin)) { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_SECTORS_USED(b, 0); + } } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) - (*d)->sectors_dirty_gc = 0; - mutex_unlock(&c->bucket_lock); } @@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) size_t available = 0; struct bucket *b; struct cache *ca; - struct bcache_device **d; unsigned i; mutex_lock(&c->bucket_lock); @@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) } } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) { - unsigned long last = - atomic_long_read(&((*d)->sectors_dirty)); - long difference = (*d)->sectors_dirty_gc - last; - - pr_debug("sectors dirty off by %li", difference); - - (*d)->sectors_dirty_last += difference; - - atomic_long_set(&((*d)->sectors_dirty), - (*d)->sectors_dirty_gc); - } - mutex_unlock(&c->bucket_lock); return available; } @@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl) struct gc_stat stats; struct closure writes; struct btree_op op; - uint64_t start_time = local_clock(); - trace_bcache_gc_start(c->sb.set_uuid); - blktrace_msg_all(c, "Starting gc"); + + trace_bcache_gc_start(c); memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); @@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl) btree_gc_start(c); + atomic_inc(&c->prio_blocked); + ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&op.cl); closure_sync(&writes); if (ret) { - blktrace_msg_all(c, "Stopped gc"); pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); } @@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl) available = bch_btree_gc_finish(c); + atomic_dec(&c->prio_blocked); + wake_up_allocators(c); + bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); @@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl) stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); - blktrace_msg_all(c, "Finished gc"); - trace_bcache_gc_end(c->sb.set_uuid); - wake_up(&c->alloc_wait); + trace_bcache_gc_end(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } @@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b, struct btree_iter *iter, struct btree_op *op) { - void subtract_dirty(struct bkey *k, int sectors) + void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) { - struct bcache_device *d = b->c->devices[KEY_INODE(k)]; - - if (KEY_DIRTY(k) && d) - atomic_long_sub(sectors, &d->sectors_dirty); + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + offset, -sectors); } + uint64_t old_offset; unsigned old_size, sectors_found = 0; while (1) { @@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b, if (bkey_cmp(k, &START_KEY(insert)) <= 0) continue; + old_offset = KEY_START(k); old_size = KEY_SIZE(k); /* @@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *top; - subtract_dirty(k, KEY_SIZE(insert)); + subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); if (bkey_written(b, k)) { /* @@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b, } } - subtract_dirty(k, old_size - KEY_SIZE(k)); + subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); } check_failed: @@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, { struct bset *i = b->sets[b->nsets].data; struct bkey *m, *prev; - const char *status = "insert"; + unsigned status = BTREE_INSERT_STATUS_INSERT; BUG_ON(bkey_cmp(k, &b->key) > 0); BUG_ON(b->level && !KEY_PTRS(k)); @@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, goto insert; /* prev is in the tree, if we merge we're done */ - status = "back merging"; + status = BTREE_INSERT_STATUS_BACK_MERGE; if (prev && bch_bkey_try_merge(b, prev, k)) goto merged; - status = "overwrote front"; + status = BTREE_INSERT_STATUS_OVERWROTE; if (m != end(i) && KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) goto copy; - status = "front merge"; + status = BTREE_INSERT_STATUS_FRONT_MERGE; if (m != end(i) && bch_bkey_try_merge(b, k, m)) goto copy; @@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - bch_check_keys(b, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); - bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + bch_check_keys(b, "%u for %s", status, op_type(op)); if (b->level && !KEY_OFFSET(k)) - b->prio_blocked++; + btree_current_write(b)->prio_blocked++; - pr_debug("%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + trace_bcache_btree_insert_key(b, k, op->type, status); return true; } -bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) { bool ret = false; struct bkey *k; @@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, should_split(b)) goto out; - op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); + op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); SET_KEY_PTRS(&op->replace, 1); get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); @@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, BUG_ON(op->type != BTREE_INSERT); BUG_ON(!btree_insert_key(b, op, &tmp.k)); - bch_btree_write(b, false, NULL); ret = true; out: downgrade_write(&b->lock); @@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op) split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; - pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", - pbtree(b), n1->sets[0].data->keys); - if (split) { unsigned keys = 0; + trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); if (IS_ERR(n2)) goto err_free1; @@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op) bkey_copy_key(&n2->key, &b->key); bch_keylist_add(&op->keys, &n2->key); - bch_btree_write(n2, true, op); + bch_btree_node_write(n2, &op->cl); rw_unlock(true, n2); - } else + } else { + trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + bch_btree_insert_keys(n1, op); + } bch_keylist_add(&op->keys, &n1->key); - bch_btree_write(n1, true, op); + bch_btree_node_write(n1, &op->cl); if (n3) { bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op); - bch_btree_write(n3, true, op); + bch_btree_node_write(n3, &op->cl); closure_sync(&op->cl); bch_btree_set_root(n3); @@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, BUG_ON(write_block(b) != b->sets[b->nsets].data); - if (bch_btree_insert_keys(b, op)) - bch_btree_write(b, false, op); + if (bch_btree_insert_keys(b, op)) { + if (!b->level) + bch_btree_leaf_dirty(b, op); + else + bch_btree_node_write(b, &op->cl); + } } return 0; @@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) void bch_btree_set_root(struct btree *b) { unsigned i; + struct closure cl; + + closure_init_stack(&cl); + + trace_bcache_btree_set_root(b); BUG_ON(!b->written); @@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b) b->c->root = b; __bkey_put(b->c, &b->key); - bch_journal_meta(b->c, NULL); - pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); + bch_journal_meta(b->c, &cl); + closure_sync(&cl); } /* Cache lookup */ @@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, KEY_OFFSET(k) - bio->bi_sector); n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (!n) - return -EAGAIN; - if (n == bio) op->lookup_done = true; @@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; - trace_bcache_cache_hit(n); __bch_submit_bbio(n, b->c); } @@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) struct btree_iter iter; bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, - (uint64_t) bio->bi_sector); - do { k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); if (!k) { @@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, } static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end) + struct keybuf *buf, struct bkey *end, + keybuf_pred_fn *pred) { struct btree_iter iter; bch_btree_iter_init(b, &iter, &buf->last_scanned); @@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (bkey_cmp(&buf->last_scanned, end) >= 0) break; - if (buf->key_predicate(buf, k)) { + if (pred(buf, k)) { struct keybuf_key *w; - pr_debug("%s", pkey(k)); - spin_lock(&buf->lock); w = array_alloc(&buf->freelist); @@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (!k) break; - btree(refill_keybuf, k, b, op, buf, end); + btree(refill_keybuf, k, b, op, buf, end, pred); /* * Might get an error here, but can't really do anything * and it'll get logged elsewhere. Just read what we @@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; struct btree_op op; @@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end); + btree_root(refill_keybuf, c, &op, buf, end, pred); closure_sync(&op.cl); pr_debug("found %s keys from %llu:%llu to %llu:%llu", @@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; @@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, break; } - bch_refill_keybuf(c, buf, end); + bch_refill_keybuf(c, buf, end, pred); } return ret; } -void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) +void bch_keybuf_init(struct keybuf *buf) { - buf->key_predicate = fn; buf->last_scanned = MAX_KEY; buf->keys = RB_ROOT; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index af4a709..3333d37 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -102,7 +102,6 @@ #include "debug.h" struct btree_write { - struct closure *owner; atomic_t *journal; /* If btree_split() frees a btree node, it writes a new pointer to that @@ -142,16 +141,12 @@ struct btree { */ struct bset_tree sets[MAX_BSETS]; - /* Used to refcount bio splits, also protects b->bio */ + /* For outstanding btree writes, used as a lock - protects write_idx */ struct closure_with_waitlist io; - /* Gets transferred to w->prio_blocked - see the comment there */ - int prio_blocked; - struct list_head list; struct delayed_work work; - uint64_t io_start_time; struct btree_write writes[2]; struct bio *bio; }; @@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ enum btree_flags { - BTREE_NODE_read_done, BTREE_NODE_io_error, BTREE_NODE_dirty, BTREE_NODE_write_idx, }; -BTREE_FLAG(read_done); BTREE_FLAG(io_error); BTREE_FLAG(dirty); BTREE_FLAG(write_idx); @@ -278,6 +271,13 @@ struct btree_op { BKEY_PADDED(replace); }; +enum { + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + void bch_btree_op_init_stack(struct btree_op *); static inline void rw_lock(bool w, struct btree *b, int level) @@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b) #ifdef CONFIG_BCACHE_EDEBUG unsigned i; - if (w && - b->key.ptr[0] && - btree_node_read_done(b)) + if (w && b->key.ptr[0]) for (i = 0; i <= b->nsets; i++) bch_check_key_order(b, b->sets[i].data); #endif @@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b) > btree_blocks(b)); } -void bch_btree_read_done(struct closure *); -void bch_btree_read(struct btree *); -void bch_btree_write(struct btree *b, bool now, struct btree_op *op); +void bch_btree_node_read(struct btree *); +void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); void bch_btree_set_root(struct btree *); @@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, struct btree_op *); -bool bch_btree_insert_keys(struct btree *, struct btree_op *); bool bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bio *); int bch_btree_insert(struct btree_op *, struct cache_set *); @@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *); int bch_btree_check(struct cache_set *, struct btree_op *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); -void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); +void bch_keybuf_init(struct keybuf *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, + keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); -struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, - struct keybuf *, struct bkey *); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); #endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index bd05a9a..9aba201 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) } else { struct closure *parent = cl->parent; struct closure_waitlist *wait = closure_waitlist(cl); + closure_fn *destructor = cl->fn; closure_debug_destroy(cl); + smp_mb(); atomic_set(&cl->remaining, -1); if (wait) closure_wake_up(wait); - if (cl->fn) - cl->fn(cl); + if (destructor) + destructor(cl); if (parent) closure_put(parent); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 89fd520..88e6411 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) return ""; } -struct keyprint_hack bch_pkey(const struct bkey *k) +int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) { unsigned i = 0; - struct keyprint_hack r; - char *out = r.s, *end = r.s + KEYHACK_SIZE; + char *out = buf, *end = buf + size; #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) @@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k) if (KEY_CSUM(k)) p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); #undef p - return r; + return out - buf; } -struct keyprint_hack bch_pbtree(const struct btree *b) +int bch_btree_to_text(char *buf, size_t size, const struct btree *b) { - struct keyprint_hack r; - - snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? b->c->root->level : -1); - return r; + return scnprintf(buf, size, "%zu level %i/%i", + PTR_BUCKET_NR(b->c, &b->key, 0), + b->level, b->c->root ? b->c->root->level : -1); } #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) @@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i) { struct bkey *k; unsigned j; + char buf[80]; for (k = i->start; k < end(i); k = bkey_next(k)) { + bch_bkey_to_text(buf, sizeof(buf), k); printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), - (uint64_t *) k - i->d, i->keys, pkey(k)); + (uint64_t *) k - i->d, i->keys, buf); for (j = 0; j < KEY_PTRS(k); j++) { size_t n = PTR_BUCKET_NR(b->c, k, j); @@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new) v->written = 0; v->level = b->level; - bch_btree_read(v); + bch_btree_node_read(v); closure_wait_event(&v->io.wait, &cl, atomic_read(&b->io.cl.remaining) == -1); @@ -200,7 +199,7 @@ void bch_data_verify(struct search *s) if (!check) return; - if (bch_bio_alloc_pages(check, GFP_NOIO)) + if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; check->bi_rw = READ_SYNC; @@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, va_list args) { unsigned i; + char buf[80]; console_lock(); @@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, console_unlock(); - panic("at %s\n", pbtree(b)); + bch_btree_to_text(buf, sizeof(buf), b); + panic("at %s\n", buf); } void bch_check_key_order_msg(struct btree *b, struct bset *i, @@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, { struct dump_iterator *i = file->private_data; ssize_t ret = 0; + char kbuf[80]; while (size) { struct keybuf_key *w; @@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (i->bytes) break; - w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); if (!w) break; - i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); + bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); + i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); bch_keybuf_del(&i->keys, w); } @@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file) file->private_data = i; i->c = c; - bch_keybuf_init(&i->keys, dump_pred); + bch_keybuf_init(&i->keys); i->keys.last_scanned = KEY(0, 0, 0); return 0; @@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c) #endif -/* Fuzz tester has rotted: */ -#if 0 - -static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, - const char *buffer, size_t size) -{ - void dump(struct btree *b) - { - struct bset *i; - - for (i = b->sets[0].data; - index(i, b) < btree_blocks(b) && - i->seq == b->sets[0].data->seq; - i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) - dump_bset(b, i); - } - - struct cache_sb *sb; - struct cache_set *c; - struct btree *all[3], *b, *fill, *orig; - int j; - - struct btree_op op; - bch_btree_op_init_stack(&op); - - sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); - if (!sb) - return -ENOMEM; - - sb->bucket_size = 128; - sb->block_size = 4; - - c = bch_cache_set_alloc(sb); - if (!c) - return -ENOMEM; - - for (j = 0; j < 3; j++) { - BUG_ON(list_empty(&c->btree_cache)); - all[j] = list_first_entry(&c->btree_cache, struct btree, list); - list_del_init(&all[j]->list); - - all[j]->key = KEY(0, 0, c->sb.bucket_size); - bkey_copy_key(&all[j]->key, &MAX_KEY); - } - - b = all[0]; - fill = all[1]; - orig = all[2]; - - while (1) { - for (j = 0; j < 3; j++) - all[j]->written = all[j]->nsets = 0; - - bch_bset_init_next(b); - - while (1) { - struct bset *i = write_block(b); - struct bkey *k = op.keys.top; - unsigned rand; - - bkey_init(k); - rand = get_random_int(); - - op.type = rand & 1 - ? BTREE_INSERT - : BTREE_REPLACE; - rand >>= 1; - - SET_KEY_SIZE(k, bucket_remainder(c, rand)); - rand >>= c->bucket_bits; - rand &= 1024 * 512 - 1; - rand += c->sb.bucket_size; - SET_KEY_OFFSET(k, rand); -#if 0 - SET_KEY_PTRS(k, 1); -#endif - bch_keylist_push(&op.keys); - bch_btree_insert_keys(b, &op); - - if (should_split(b) || - set_blocks(i, b->c) != - __set_blocks(i, i->keys + 15, b->c)) { - i->csum = csum_set(i); - - memcpy(write_block(fill), - i, set_bytes(i)); - - b->written += set_blocks(i, b->c); - fill->written = b->written; - if (b->written == btree_blocks(b)) - break; - - bch_btree_sort_lazy(b); - bch_bset_init_next(b); - } - } - - memcpy(orig->sets[0].data, - fill->sets[0].data, - btree_bytes(c)); - - bch_btree_sort(b); - fill->written = 0; - bch_btree_read_done(&fill->io.cl); - - if (b->sets[0].data->keys != fill->sets[0].data->keys || - memcmp(b->sets[0].data->start, - fill->sets[0].data->start, - b->sets[0].data->keys * sizeof(uint64_t))) { - struct bset *i = b->sets[0].data; - struct bkey *k, *l; - - for (k = i->start, - l = fill->sets[0].data->start; - k < end(i); - k = bkey_next(k), l = bkey_next(l)) - if (bkey_cmp(k, l) || - KEY_SIZE(k) != KEY_SIZE(l)) - pr_err("key %zi differs: %s != %s", - (uint64_t *) k - i->d, - pkey(k), pkey(l)); - - for (j = 0; j < 3; j++) { - pr_err("**** Set %i ****", j); - dump(all[j]); - } - panic("\n"); - } - - pr_info("fuzz complete: %i keys", b->sets[0].data->keys); - } -} - -kobj_attribute_write(fuzz, btree_fuzz); -#endif - void bch_debug_exit(void) { if (!IS_ERR_OR_NULL(debug)) @@ -554,11 +421,6 @@ void bch_debug_exit(void) int __init bch_debug_init(struct kobject *kobj) { int ret = 0; -#if 0 - ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); - if (ret) - return ret; -#endif debug = debugfs_create_dir("bcache", NULL); return ret; diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index f9378a2..1c39b5a 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -3,15 +3,8 @@ /* Btree/bkey debug printing */ -#define KEYHACK_SIZE 80 -struct keyprint_hack { - char s[KEYHACK_SIZE]; -}; - -struct keyprint_hack bch_pkey(const struct bkey *k); -struct keyprint_hack bch_pbtree(const struct btree *b); -#define pkey(k) (&bch_pkey(k).s[0]) -#define pbtree(b) (&bch_pbtree(b).s[0]) +int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); +int bch_btree_to_text(char *buf, size_t size, const struct btree *b); #ifdef CONFIG_BCACHE_EDEBUG diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4d..9056632 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -9,6 +9,8 @@ #include "bset.h" #include "debug.h" +#include <linux/blkdev.h> + static void bch_bi_idx_hack_endio(struct bio *bio, int error) { struct bio *p = bio->bi_private; @@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio) * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a * bvec boundry; it is the caller's responsibility to ensure that @bio is not * freed before the split. - * - * If bch_bio_split() is running under generic_make_request(), it's not safe to - * allocate more than one bio from the same bio set. Therefore, if it is running - * under generic_make_request() it masks out __GFP_WAIT when doing the - * allocation. The caller must check for failure if there's any possibility of - * it being called from under generic_make_request(); it is then the caller's - * responsibility to retry from a safe context (by e.g. punting to workqueue). */ struct bio *bch_bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) @@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); - /* - * If we're being called from underneath generic_make_request() and we - * already allocated any bios from this bio set, we risk deadlock if we - * use the mempool. So instead, we possibly fail and let the caller punt - * to workqueue or somesuch and retry in a safe context. - */ - if (current->bio_list) - gfp &= ~__GFP_WAIT; - if (sectors >= bio_sectors(bio)) return bio; if (bio->bi_rw & REQ_DISCARD) { ret = bio_alloc_bioset(gfp, 1, bs); + if (!ret) + return NULL; idx = 0; goto out; } @@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio) struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, queue_max_segments(q)); - struct bio_vec *bv, *end = bio_iovec(bio) + - min_t(int, bio_segments(bio), max_segments); if (bio->bi_rw & REQ_DISCARD) return min(ret, q->limits.max_discard_sectors); if (bio_segments(bio) > max_segments || q->merge_bvec_fn) { + struct bio_vec *bv; + int i, seg = 0; + ret = 0; - for (bv = bio_iovec(bio); bv < end; bv++) { + bio_for_each_segment(bv, bio, i) { struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_sector, @@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio) .bi_rw = bio->bi_rw, }; + if (seg == max_segments) + break; + if (q->merge_bvec_fn && q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) break; + seg++; ret += bv->bv_len >> 9; } } @@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error) closure_put(cl); } -static void __bch_bio_submit_split(struct closure *cl) -{ - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - struct bio *bio = s->bio, *n; - - do { - n = bch_bio_split(bio, bch_bio_max_sectors(bio), - GFP_NOIO, s->p->bio_split); - if (!n) - continue_at(cl, __bch_bio_submit_split, system_wq); - - n->bi_end_io = bch_bio_submit_split_endio; - n->bi_private = cl; - - closure_get(cl); - bch_generic_make_request_hack(n); - } while (n != bio); - - continue_at(cl, bch_bio_submit_split_done, NULL); -} - void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) { struct bio_split_hook *s; + struct bio *n; if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) goto submit; @@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) goto submit; s = mempool_alloc(p->bio_split_hook, GFP_NOIO); + closure_init(&s->cl, NULL); s->bio = bio; s->p = p; @@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) s->bi_private = bio->bi_private; bio_get(bio); - closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); - return; + do { + n = bch_bio_split(bio, bch_bio_max_sectors(bio), + GFP_NOIO, s->p->bio_split); + + n->bi_end_io = bch_bio_submit_split_endio; + n->bi_private = &s->cl; + + closure_get(&s->cl); + bch_generic_make_request_hack(n); + } while (n != bio); + + continue_at(&s->cl, bch_bio_submit_split_done, NULL); submit: bch_generic_make_request_hack(bio); } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdc..ba95ab8 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -9,6 +9,8 @@ #include "debug.h" #include "request.h" +#include <trace/events/bcache.h> + /* * Journal replay/recovery: * @@ -182,9 +184,14 @@ bsearch: pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; + m = (l + r) >> 1; + read_bucket(m); - if (read_bucket(m)) + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) l = m; else r = m; @@ -300,7 +307,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, for (k = i->j.start; k < end(&i->j); k = bkey_next(k)) { - pr_debug("%s", pkey(k)); + trace_bcache_journal_replay_key(k); + bkey_copy(op->keys.top, k); bch_keylist_push(&op->keys); @@ -384,7 +392,7 @@ out: return; found: if (btree_node_dirty(best)) - bch_btree_write(best, true, NULL); + bch_btree_node_write(best, NULL); rw_unlock(true, best); } @@ -617,7 +625,7 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio); bio->bi_sector = PTR_OFFSET(k, i); bio->bi_bdev = ca->bdev; - bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; bio->bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; @@ -712,7 +720,8 @@ void bch_journal(struct closure *cl) spin_lock(&c->journal.lock); if (journal_full(&c->journal)) { - /* XXX: tracepoint */ + trace_bcache_journal_full(c); + closure_wait(&c->journal.wait, cl); journal_reclaim(c); @@ -728,13 +737,15 @@ void bch_journal(struct closure *cl) if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || b > c->journal.blocks_free) { - /* XXX: If we were inserting so many keys that they won't fit in + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they won't fit in * an _empty_ journal write, we'll deadlock. For now, handle * this in bch_keylist_realloc() - but something to think about. */ BUG_ON(!w->data->keys); - /* XXX: tracepoint */ BUG_ON(!closure_wait(&w->wait, cl)); closure_flush(&c->journal.io); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8589512..1a3b4f4 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -9,6 +9,8 @@ #include "debug.h" #include "request.h" +#include <trace/events/bcache.h> + struct moving_io { struct keybuf_key *w; struct search s; @@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, s.cl); struct bio *bio = &io->bio.bio; - struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); + struct bio_vec *bv; + int i; - while (bv-- != bio->bi_io_vec) + bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); - pr_debug("%s %s", io->s.op.insert_collision - ? "collision moving" : "moved", - pkey(&io->w->key)); + if (io->s.op.insert_collision) + trace_bcache_gc_copy_collision(&io->w->key); bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); @@ -94,8 +96,6 @@ static void write_moving(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); if (!s->error) { - trace_bcache_write_moving(&io->bio.bio); - moving_init(io); io->bio.bio.bi_sector = KEY_START(&io->w->key); @@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); struct bio *bio = &io->bio.bio; - trace_bcache_read_moving(bio); bch_submit_bbio(bio, s->op.c, &io->w->key, 0); continue_at(cl, write_moving, bch_gc_wq); @@ -138,7 +137,8 @@ static void read_moving(struct closure *cl) /* XXX: if we error, background writeback could stall indefinitely */ while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { - w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, + &MAX_KEY, moving_pred); if (!w) break; @@ -159,10 +159,10 @@ static void read_moving(struct closure *cl) bio->bi_rw = READ; bio->bi_end_io = read_moving_endio; - if (bch_bio_alloc_pages(bio, GFP_KERNEL)) + if (bio_alloc_pages(bio, GFP_KERNEL)) goto err; - pr_debug("%s", pkey(&w->key)); + trace_bcache_gc_copy(&w->key); closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); @@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl) void bch_moving_init_cache_set(struct cache_set *c) { - bch_keybuf_init(&c->moving_gc_keys, moving_pred); + bch_keybuf_init(&c->moving_gc_keys); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e..786a1a4 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -10,6 +10,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include <linux/cgroup.h> #include <linux/module.h> @@ -21,8 +22,6 @@ #define CUTOFF_CACHE_ADD 95 #define CUTOFF_CACHE_READA 90 -#define CUTOFF_WRITEBACK 50 -#define CUTOFF_WRITEBACK_SYNC 75 struct kmem_cache *bch_search_cache; @@ -489,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl) bch_queue_gc(op->c); } + /* + * Journal writes are marked REQ_FLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); + do { unsigned i; struct bkey *k; @@ -510,10 +515,6 @@ static void bch_insert_data_loop(struct closure *cl) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); - if (!n) { - __bkey_put(op->c, k); - continue_at(cl, bch_insert_data_loop, bcache_wq); - } n->bi_end_io = bch_insert_data_endio; n->bi_private = cl; @@ -530,10 +531,9 @@ static void bch_insert_data_loop(struct closure *cl) if (KEY_CSUM(k)) bio_csum(n, k); - pr_debug("%s", pkey(k)); + trace_bcache_cache_insert(k); bch_keylist_push(&op->keys); - trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio); @@ -716,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; + s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; s->recoverable = 1; s->start_time = jiffies; @@ -784,11 +784,8 @@ static void request_read_error(struct closure *cl) int i; if (s->recoverable) { - /* The cache read failed, but we can retry from the backing - * device. - */ - pr_debug("recovering at sector %llu", - (uint64_t) s->orig_bio->bi_sector); + /* Retry from the backing device: */ + trace_bcache_read_retry(s->orig_bio); s->error = 0; bv = s->bio.bio.bi_io_vec; @@ -806,7 +803,6 @@ static void request_read_error(struct closure *cl) /* XXX: invalidate cache */ - trace_bcache_read_retry(&s->bio.bio); closure_bio_submit(&s->bio.bio, &s->cl, s->d); } @@ -827,53 +823,13 @@ static void request_read_done(struct closure *cl) */ if (s->op.cache_bio) { - struct bio_vec *src, *dst; - unsigned src_offset, dst_offset, bytes; - void *dst_ptr; - bio_reset(s->op.cache_bio); s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; bch_bio_map(s->op.cache_bio, NULL); - src = bio_iovec(s->op.cache_bio); - dst = bio_iovec(s->cache_miss); - src_offset = src->bv_offset; - dst_offset = dst->bv_offset; - dst_ptr = kmap(dst->bv_page); - - while (1) { - if (dst_offset == dst->bv_offset + dst->bv_len) { - kunmap(dst->bv_page); - dst++; - if (dst == bio_iovec_idx(s->cache_miss, - s->cache_miss->bi_vcnt)) - break; - - dst_offset = dst->bv_offset; - dst_ptr = kmap(dst->bv_page); - } - - if (src_offset == src->bv_offset + src->bv_len) { - src++; - if (src == bio_iovec_idx(s->op.cache_bio, - s->op.cache_bio->bi_vcnt)) - BUG(); - - src_offset = src->bv_offset; - } - - bytes = min(dst->bv_offset + dst->bv_len - dst_offset, - src->bv_offset + src->bv_len - src_offset); - - memcpy(dst_ptr + dst_offset, - page_address(src->bv_page) + src_offset, - bytes); - - src_offset += bytes; - dst_offset += bytes; - } + bio_copy_data(s->cache_miss, s->op.cache_bio); bio_put(s->cache_miss); s->cache_miss = NULL; @@ -899,6 +855,7 @@ static void request_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); if (s->error) continue_at_nobarrier(cl, request_read_error, bcache_wq); @@ -917,9 +874,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *miss; miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (!miss) - return -EAGAIN; - if (miss == bio) s->op.lookup_done = true; @@ -938,8 +892,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, reada = min(dc->readahead >> 9, sectors - bio_sectors(miss)); - if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) - reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); + if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) + reada = bdev_sectors(miss->bi_bdev) - + bio_end_sector(miss); } s->cache_bio_sectors = bio_sectors(miss) + reada; @@ -963,13 +918,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_put; bch_bio_map(s->op.cache_bio, NULL); - if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; s->cache_miss = miss; bio_get(s->op.cache_bio); - trace_bcache_cache_miss(s->orig_bio); closure_bio_submit(s->op.cache_bio, &s->cl, s->d); return ret; @@ -1002,24 +956,13 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static bool should_writeback(struct cached_dev *dc, struct bio *bio) -{ - unsigned threshold = (bio->bi_rw & REQ_SYNC) - ? CUTOFF_WRITEBACK_SYNC - : CUTOFF_WRITEBACK; - - return !atomic_read(&dc->disk.detaching) && - cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && - dc->disk.c->gc_stats.in_use < threshold; -} - static void request_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; struct bkey start, end; start = KEY(dc->disk.id, bio->bi_sector, 0); - end = KEY(dc->disk.id, bio_end(bio), 0); + end = KEY(dc->disk.id, bio_end_sector(bio), 0); bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); @@ -1034,22 +977,37 @@ static void request_write(struct cached_dev *dc, struct search *s) if (bio->bi_rw & REQ_DISCARD) goto skip; + if (should_writeback(dc, s->orig_bio, + cache_mode(dc, bio), + s->op.skip)) { + s->op.skip = false; + s->writeback = true; + } + if (s->op.skip) goto skip; - if (should_writeback(dc, s->orig_bio)) - s->writeback = true; + trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); if (!s->writeback) { s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, dc->disk.bio_split); - trace_bcache_writethrough(s->orig_bio); closure_bio_submit(bio, cl, s->d); } else { - s->op.cache_bio = bio; - trace_bcache_writeback(s->orig_bio); - bch_writeback_add(dc, bio_sectors(bio)); + bch_writeback_add(dc); + + if (s->op.flush_journal) { + /* Also need to send a flush to the backing device */ + s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); + + bio->bi_size = 0; + bio->bi_vcnt = 0; + closure_bio_submit(bio, cl, s->d); + } else { + s->op.cache_bio = bio; + } } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); @@ -1058,7 +1016,6 @@ skip: s->op.skip = true; s->op.cache_bio = s->orig_bio; bio_get(s->op.cache_bio); - trace_bcache_write_skip(s->orig_bio); if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1088,9 +1045,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -int bch_get_congested(struct cache_set *c) +unsigned bch_get_congested(struct cache_set *c) { int i; + long rand; if (!c->congested_read_threshold_us && !c->congested_write_threshold_us) @@ -1106,7 +1064,13 @@ int bch_get_congested(struct cache_set *c) i += CONGESTED_MAX; - return i <= 0 ? 1 : fract_exp_two(i, 6); + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? i : 1; } static void add_sequential(struct task_struct *t) @@ -1126,10 +1090,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) { struct cache_set *c = s->op.c; struct bio *bio = &s->bio.bio; - - long rand; - int cutoff = bch_get_congested(c); unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); if (atomic_read(&dc->disk.detaching) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -1147,17 +1109,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) goto skip; } - if (!cutoff) { - cutoff = dc->sequential_cutoff >> 9; + if (!congested && !dc->sequential_cutoff) + goto rescale; - if (!cutoff) - goto rescale; - - if (mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - } + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; if (dc->sequential_merge) { struct io *i; @@ -1177,7 +1136,7 @@ found: if (i->sequential + bio->bi_size > i->sequential) i->sequential += bio->bi_size; - i->last = bio_end(bio); + i->last = bio_end_sector(bio); i->jiffies = jiffies + msecs_to_jiffies(5000); s->task->sequential_io = i->sequential; @@ -1192,12 +1151,19 @@ found: add_sequential(s->task); } - rand = get_random_int(); - cutoff -= bitmap_weight(&rand, BITS_PER_LONG); + sectors = max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9; - if (cutoff <= (int) (max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9)) + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(s->orig_bio); goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(s->orig_bio); + goto skip; + } rescale: bch_rescale_priorities(c, bio_sectors(bio)); @@ -1288,30 +1254,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc) static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { + struct bio_vec *bv; + int i; + /* Zero fill bio */ - while (bio->bi_idx != bio->bi_vcnt) { - struct bio_vec *bv = bio_iovec(bio); + bio_for_each_segment(bv, bio, i) { unsigned j = min(bv->bv_len >> 9, sectors); void *p = kmap(bv->bv_page); memset(p + bv->bv_offset, 0, j << 9); kunmap(bv->bv_page); - bv->bv_len -= j << 9; - bv->bv_offset += j << 9; - - if (bv->bv_len) - return 0; - - bio->bi_sector += j; - bio->bi_size -= j << 9; - - bio->bi_idx++; - sectors -= j; + sectors -= j; } - s->op.lookup_done = true; + bio_advance(bio, min(sectors << 9, bio->bi_size)); + + if (!bio->bi_size) + s->op.lookup_done = true; return 0; } @@ -1338,8 +1299,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) closure_call(&s->op.cl, btree_read_async, NULL, cl); } else if (bio_has_data(bio) || s->op.skip) { bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, - &KEY(d->id, bio->bi_sector, 0), - &KEY(d->id, bio_end(bio), 0)); + &KEY(d->id, bio->bi_sector, 0), + &KEY(d->id, bio_end_sector(bio), 0)); s->writeback = true; s->op.cache_bio = bio; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 254d9ab..57dc478 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -30,7 +30,7 @@ struct search { }; void bch_cache_read_endio(struct bio *, int); -int bch_get_congested(struct cache_set *); +unsigned bch_get_congested(struct cache_set *); void bch_insert_data(struct closure *cl); void bch_btree_insert_async(struct closure *); void bch_cache_read_endio(struct bio *, int); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b6..547c4c5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -10,10 +10,13 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" +#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/debugfs.h> #include <linux/genhd.h> +#include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/reboot.h> @@ -342,6 +345,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, struct closure *cl = &c->uuid_write.cl; struct uuid_entry *u; unsigned i; + char buf[80]; BUG_ON(!parent); closure_lock(&c->uuid_write, parent); @@ -362,8 +366,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw, break; } - pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", - pkey(&c->uuid_bucket)); + bch_bkey_to_text(buf, sizeof(buf), k); + pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) if (!bch_is_zero(u->uuid, 16)) @@ -543,7 +547,6 @@ void bch_prio_write(struct cache *ca) pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - blktrace_msg(ca, "Starting priorities: " buckets_free(ca)); for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; @@ -704,7 +707,8 @@ static void bcache_device_detach(struct bcache_device *d) atomic_set(&d->detaching, 0); } - bcache_device_unlink(d); + if (!d->flush_done) + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -743,13 +747,35 @@ static void bcache_device_free(struct bcache_device *d) mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->stripe_sectors_dirty)) + vfree(d->stripe_sectors_dirty); + else + kfree(d->stripe_sectors_dirty); closure_debug_destroy(&d->cl); } -static int bcache_device_init(struct bcache_device *d, unsigned block_size) +static int bcache_device_init(struct bcache_device *d, unsigned block_size, + sector_t sectors) { struct request_queue *q; + size_t n; + + if (!d->stripe_size_bits) + d->stripe_size_bits = 31; + + d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> + d->stripe_size_bits; + + if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + return -ENOMEM; + + n = d->nr_stripes * sizeof(atomic_t); + d->stripe_sectors_dirty = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->stripe_sectors_dirty) + return -ENOMEM; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, @@ -759,6 +785,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) !(q = blk_alloc_queue(GFP_KERNEL))) return -ENOMEM; + set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); d->disk->major = bcache_major; @@ -781,6 +808,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + return 0; } @@ -800,6 +829,17 @@ static void calc_cached_dev_sectors(struct cache_set *c) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; + char buf[SB_LABEL_SIZE + 1]; + char *env[] = { + "DRIVER=bcache", + kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), + NULL, + NULL, + }; + + memcpy(buf, dc->sb.label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE] = '\0'; + env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); if (atomic_xchg(&dc->running, 1)) return; @@ -816,10 +856,12 @@ void bch_cached_dev_run(struct cached_dev *dc) add_disk(d->disk); bd_link_disk_holder(dc->bdev, dc->disk.disk); -#if 0 - char *env[] = { "SYMLINK=label" , NULL }; + /* won't show up in the uevent file, use udevadm monitor -e instead + * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); -#endif + kfree(env[1]); + kfree(env[2]); + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) pr_debug("error creating sysfs link"); @@ -960,6 +1002,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) atomic_set(&dc->count, 1); if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + bch_sectors_dirty_init(dc); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); @@ -1014,6 +1057,14 @@ static void cached_dev_flush(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); struct bcache_device *d = &dc->disk; + mutex_lock(&bch_register_lock); + d->flush_done = 1; + + if (d->c) + bcache_device_unlink(d); + + mutex_unlock(&bch_register_lock); + bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); @@ -1045,7 +1096,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - ret = bcache_device_init(&dc->disk, block_size); + ret = bcache_device_init(&dc->disk, block_size, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); if (ret) return ret; @@ -1144,11 +1196,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c))) + if (bcache_device_init(d, block_bytes(c), u->sectors)) goto err; bcache_device_attach(d, c, u - c->uuids); - set_capacity(d->disk, u->sectors); bch_flash_dev_request_init(d); add_disk(d->disk); @@ -1255,9 +1306,10 @@ static void cache_set_free(struct closure *cl) free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); - kfree(c->fill_iter); if (c->bio_split) bioset_free(c->bio_split); + if (c->fill_iter) + mempool_destroy(c->fill_iter); if (c->bio_meta) mempool_destroy(c->bio_meta); if (c->search) @@ -1278,11 +1330,9 @@ static void cache_set_free(struct closure *cl) static void cache_set_flush(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cache *ca; struct btree *b; - - /* Shut down allocator threads */ - set_bit(CACHE_SET_STOPPING_2, &c->flags); - wake_up(&c->alloc_wait); + unsigned i; bch_cache_accounting_destroy(&c->accounting); @@ -1295,7 +1345,11 @@ static void cache_set_flush(struct closure *cl) /* Should skip this if we're unregistering because of an error */ list_for_each_entry(b, &c->btree_cache, list) if (btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); + + for_each_cache(ca, c, i) + if (ca->alloc_thread) + kthread_stop(ca->alloc_thread); closure_return(cl); } @@ -1303,18 +1357,22 @@ static void cache_set_flush(struct closure *cl) static void __cache_set_unregister(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cached_dev *dc, *t; + struct cached_dev *dc; size_t i; mutex_lock(&bch_register_lock); - if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) - list_for_each_entry_safe(dc, t, &c->cached_devs, list) - bch_cached_dev_detach(dc); - for (i = 0; i < c->nr_uuids; i++) - if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) - bcache_device_stop(c->devices[i]); + if (c->devices[i]) { + if (!UUID_FLASH_ONLY(&c->uuids[i]) && + test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { + dc = container_of(c->devices[i], + struct cached_dev, disk); + bch_cached_dev_detach(dc); + } else { + bcache_device_stop(c->devices[i]); + } + } mutex_unlock(&bch_register_lock); @@ -1373,9 +1431,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); - init_waitqueue_head(&c->alloc_wait); + c->sort_crit_factor = int_sqrt(c->btree_pages); + mutex_init(&c->bucket_lock); - mutex_init(&c->fill_lock); mutex_init(&c->sort_lock); spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); @@ -1401,8 +1459,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) !(c->bio_meta = mempool_create_kmalloc_pool(2, sizeof(struct bbio) + sizeof(struct bio_vec) * bucket_pages(c))) || + !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || bch_journal_alloc(c) || @@ -1410,8 +1468,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_open_buckets_alloc(c)) goto err; - c->fill_iter->size = sb->bucket_size / sb->block_size; - c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = 8 << IO_ERROR_SHIFT; @@ -1496,9 +1552,10 @@ static void run_cache_set(struct cache_set *c) */ bch_journal_next(&c->journal); + err = "error starting allocator thread"; for_each_cache(ca, c, i) - closure_call(&ca->alloc, bch_allocator_thread, - system_wq, &c->cl); + if (bch_cache_allocator_start(ca)) + goto err; /* * First place it's safe to allocate: btree_check() and @@ -1531,17 +1588,16 @@ static void run_cache_set(struct cache_set *c) bch_btree_gc_finish(c); + err = "error starting allocator thread"; for_each_cache(ca, c, i) - closure_call(&ca->alloc, bch_allocator_thread, - ca->alloc_workqueue, &c->cl); + if (bch_cache_allocator_start(ca)) + goto err; mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) bch_prio_write(ca); mutex_unlock(&c->bucket_lock); - wake_up(&c->alloc_wait); - err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) goto err_unlock_gc; @@ -1552,7 +1608,7 @@ static void run_cache_set(struct cache_set *c) goto err_unlock_gc; bkey_copy_key(&c->root->key, &MAX_KEY); - bch_btree_write(c->root, true, &op); + bch_btree_node_write(c->root, &op.cl); bch_btree_set_root(c->root); rw_unlock(true, c->root); @@ -1673,9 +1729,6 @@ void bch_cache_release(struct kobject *kobj) bio_split_pool_free(&ca->bio_split_hook); - if (ca->alloc_workqueue) - destroy_workqueue(ca->alloc_workqueue); - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); vfree(ca->buckets); @@ -1723,7 +1776,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || - !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || bio_split_pool_init(&ca->bio_split_hook)) return -ENOMEM; @@ -1786,6 +1838,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); +static bool bch_is_open_backing(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev == bdev) + return true; + list_for_each_entry_safe(dc, t, &uncached_devices, list) + if (dc->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open_cache(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cache *ca; + unsigned i; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + for_each_cache(ca, c, i) + if (ca->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open(struct block_device *bdev) { + return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); +} + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { @@ -1810,8 +1892,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) { - if (bdev == ERR_PTR(-EBUSY)) - err = "device busy"; + if (bdev == ERR_PTR(-EBUSY)) { + bdev = lookup_bdev(strim(path)); + if (!IS_ERR(bdev) && bch_is_open(bdev)) + err = "device already registered"; + else + err = "device busy"; + } goto err; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4d9cca4..12a2c28 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -9,7 +9,9 @@ #include "sysfs.h" #include "btree.h" #include "request.h" +#include "writeback.h" +#include <linux/blkdev.h> #include <linux/sort.h> static const char * const cache_replacement_policies[] = { @@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse); rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); +read_attribute(stripe_size); +read_attribute(partial_stripes_expensive); + rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(discard); @@ -127,7 +132,7 @@ SHOW(__bch_cached_dev) char derivative[20]; char target[20]; bch_hprint(dirty, - atomic_long_read(&dc->disk.sectors_dirty) << 9); + bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(derivative, dc->writeback_rate_derivative << 9); bch_hprint(target, dc->writeback_rate_target << 9); @@ -143,7 +148,10 @@ SHOW(__bch_cached_dev) } sysfs_hprint(dirty_data, - atomic_long_read(&dc->disk.sectors_dirty) << 9); + bcache_dev_sectors_dirty(&dc->disk) << 9); + + sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + var_printf(partial_stripes_expensive, "%u"); var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); @@ -170,6 +178,7 @@ STORE(__cached_dev) disk.kobj); unsigned v = size; struct cache_set *c; + struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) @@ -214,6 +223,7 @@ STORE(__cached_dev) } if (attr == &sysfs_label) { + /* note: endlines are preserved */ memcpy(dc->sb.label, buf, SB_LABEL_SIZE); bch_write_bdev_super(dc, NULL); if (dc->disk.c) { @@ -221,6 +231,15 @@ STORE(__cached_dev) buf, SB_LABEL_SIZE); bch_uuid_write(dc->disk.c); } + env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + add_uevent_var(env, "DRIVER=bcache"); + add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), + add_uevent_var(env, "CACHED_LABEL=%s", buf); + kobject_uevent_env( + &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); + kfree(env); } if (attr == &sysfs_attach) { @@ -284,6 +303,8 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, &sysfs_sequential_merge, &sysfs_clear_stats, @@ -665,12 +686,10 @@ SHOW(__bch_cache) int cmp(const void *l, const void *r) { return *((uint16_t *) r) - *((uint16_t *) l); } - /* Number of quantiles we compute */ - const unsigned nq = 31; - size_t n = ca->sb.nbuckets, i, unused, btree; uint64_t sum = 0; - uint16_t q[nq], *p, *cached; + /* Compute 31 quantiles */ + uint16_t q[31], *p, *cached; ssize_t ret; cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); @@ -703,26 +722,29 @@ SHOW(__bch_cache) if (n) do_div(sum, n); - for (i = 0; i < nq; i++) - q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; + for (i = 0; i < ARRAY_SIZE(q); i++) + q[i] = INITIAL_PRIO - cached[n * (i + 1) / + (ARRAY_SIZE(q) + 1)]; vfree(p); - ret = snprintf(buf, PAGE_SIZE, - "Unused: %zu%%\n" - "Metadata: %zu%%\n" - "Average: %llu\n" - "Sectors per Q: %zu\n" - "Quantiles: [", - unused * 100 / (size_t) ca->sb.nbuckets, - btree * 100 / (size_t) ca->sb.nbuckets, sum, - n * ca->sb.bucket_size / (nq + 1)); - - for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) - ret += snprintf(buf + ret, PAGE_SIZE - ret, - i < nq - 1 ? "%u " : "%u]\n", q[i]); - - buf[PAGE_SIZE - 1] = '\0'; + ret = scnprintf(buf, PAGE_SIZE, + "Unused: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + btree * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + + for (i = 0; i < ARRAY_SIZE(q); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%u ", q[i]); + ret--; + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); + return ret; } diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index 983f9bb..f7b6c19 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -2,6 +2,7 @@ #include "btree.h" #include "request.h" +#include <linux/blktrace_api.h> #include <linux/module.h> #define CREATE_TRACE_POINTS @@ -9,18 +10,44 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize); + EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index da3a99e..98eb811 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } -int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment(bv, bio, i) { - bv->bv_page = alloc_page(gfp); - if (!bv->bv_page) { - while (bv-- != bio->bi_io_vec + bio->bi_idx) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} - /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any * use permitted, subject to terms of PostgreSQL license; see.) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 577393e..1ae2a73 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,8 +15,6 @@ struct closure; -#include <trace/events/bcache.h> - #ifdef CONFIG_BCACHE_EDEBUG #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) @@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) return x; } -#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) - void bch_bio_map(struct bio *bio, void *base); -int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp); - static inline sector_t bdev_sectors(struct block_device *bdev) { return bdev->bd_inode->i_size >> 9; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3..22cbff5 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -9,6 +9,9 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "writeback.h" + +#include <trace/events/bcache.h> static struct workqueue_struct *dirty_wq; @@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc) int change = 0; int64_t error; - int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); + int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t derivative = dirty - dc->disk.sectors_dirty_last; dc->disk.sectors_dirty_last = dirty; @@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) return KEY_DIRTY(k); } +static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +{ + uint64_t stripe; + unsigned nr_sectors = KEY_SIZE(k); + struct cached_dev *dc = container_of(buf, struct cached_dev, + writeback_keys); + unsigned stripe_size = 1 << dc->disk.stripe_size_bits; + + if (!KEY_DIRTY(k)) + return false; + + stripe = KEY_START(k) >> dc->disk.stripe_size_bits; + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != + stripe_size) + return false; + + if (nr_sectors <= stripe_size) + return true; + + nr_sectors -= stripe_size; + stripe++; + } +} + static void dirty_init(struct keybuf_key *w) { struct dirty_io *io = w->private; @@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl) searched_from_start = true; } - bch_refill_keybuf(dc->disk.c, buf, &end); + if (dc->partial_stripes_expensive) { + uint64_t i; + + for (i = 0; i < dc->disk.nr_stripes; i++) + if (atomic_read(dc->disk.stripe_sectors_dirty + i) == + 1 << dc->disk.stripe_size_bits) + goto full_stripes; + + goto normal_refill; +full_stripes: + bch_refill_keybuf(dc->disk.c, buf, &end, + dirty_full_stripe_pred); + } else { +normal_refill: + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + } if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { /* Searched the entire btree - delay awhile */ @@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc) } } -void bch_writeback_add(struct cached_dev *dc, unsigned sectors) +void bch_writeback_add(struct cached_dev *dc) { - atomic_long_add(sectors, &dc->disk.sectors_dirty); - if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { atomic_inc(&dc->count); @@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors) } } +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_size, stripe_offset; + uint64_t stripe; + + if (!d) + return; + + stripe_size = 1 << d->stripe_size_bits; + stripe = offset >> d->stripe_size_bits; + stripe_offset = offset & (stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + atomic_add(s, d->stripe_sectors_dirty + stripe); + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + /* Background writeback - IO loop */ static void dirty_io_destructor(struct closure *cl) @@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; - struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); + struct bio_vec *bv; + int i; - while (bv-- != io->bio.bi_io_vec) + bio_for_each_segment_all(bv, &io->bio, i) __free_page(bv->bv_page); /* This is kind of a dumb way of signalling errors. */ @@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - pr_debug("clearing %s", pkey(&w->key)); bch_btree_insert(&op, dc->disk.c); closure_sync(&op.cl); + if (op.insert_collision) + trace_bcache_writeback_collision(&w->key); + atomic_long_inc(op.insert_collision ? &dc->disk.c->writeback_keys_failed : &dc->disk.c->writeback_keys_done); @@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl) io->bio.bi_bdev = io->dc->bdev; io->bio.bi_end_io = dirty_endio; - trace_bcache_write_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); continue_at(cl, write_dirty_finish, dirty_wq); @@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); - trace_bcache_read_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); continue_at(cl, write_dirty, dirty_wq); @@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl) io->bio.bi_rw = READ; io->bio.bi_end_io = read_dirty_endio; - if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) + if (bio_alloc_pages(&io->bio, GFP_KERNEL)) goto err_free; - pr_debug("%s", pkey(&w->key)); + trace_bcache_writeback(&w->key); closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); @@ -375,12 +445,49 @@ err: refill_dirty(cl); } +/* Init */ + +static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, + struct cached_dev *dc) +{ + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) + if (!b->level) { + if (KEY_INODE(k) > dc->disk.id) + break; + + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, dc->disk.id, + KEY_START(k), + KEY_SIZE(k)); + } else { + btree(sectors_dirty_init, k, b, op, dc); + if (KEY_INODE(k) > dc->disk.id) + break; + + cond_resched(); + } + + return 0; +} + +void bch_sectors_dirty_init(struct cached_dev *dc) +{ + struct btree_op op; + + bch_btree_op_init_stack(&op); + btree_root(sectors_dirty_init, dc->disk.c, &op, dc); +} + void bch_cached_dev_writeback_init(struct cached_dev *dc) { closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys, dirty_pred); + bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; dc->writeback_running = true; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h new file mode 100644 index 0000000..c91f61b --- /dev/null +++ b/drivers/md/bcache/writeback.h @@ -0,0 +1,64 @@ +#ifndef _BCACHE_WRITEBACK_H +#define _BCACHE_WRITEBACK_H + +#define CUTOFF_WRITEBACK 40 +#define CUTOFF_WRITEBACK_SYNC 70 + +static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) +{ + uint64_t i, ret = 0; + + for (i = 0; i < d->nr_stripes; i++) + ret += atomic_read(d->stripe_sectors_dirty + i); + + return ret; +} + +static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, + uint64_t offset, + unsigned nr_sectors) +{ + uint64_t stripe = offset >> d->stripe_size_bits; + + while (1) { + if (atomic_read(d->stripe_sectors_dirty + stripe)) + return true; + + if (nr_sectors <= 1 << d->stripe_size_bits) + return false; + + nr_sectors -= 1 << d->stripe_size_bits; + stripe++; + } +} + +static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, + unsigned cache_mode, bool would_skip) +{ + unsigned in_use = dc->disk.c->gc_stats.in_use; + + if (cache_mode != CACHE_MODE_WRITEBACK || + atomic_read(&dc->disk.detaching) || + in_use > CUTOFF_WRITEBACK_SYNC) + return false; + + if (dc->partial_stripes_expensive && + bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bio_sectors(bio))) + return true; + + if (would_skip) + return false; + + return bio->bi_rw & REQ_SYNC || + in_use <= CUTOFF_WRITEBACK; +} + +void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); +void bch_writeback_queue(struct cached_dev *); +void bch_writeback_add(struct cached_dev *); + +void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_cached_dev_writeback_init(struct cached_dev *); + +#endif diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 1b4d4ee..de7d74a 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -177,7 +177,11 @@ enum drbd_ret_code { ERR_NEED_APV_100 = 163, ERR_NEED_ALLOW_TWO_PRI = 164, ERR_MD_UNCLEAN = 165, - + ERR_MD_LAYOUT_CONNECTED = 166, + ERR_MD_LAYOUT_TOO_BIG = 167, + ERR_MD_LAYOUT_TOO_SMALL = 168, + ERR_MD_LAYOUT_NO_FIT = 169, + ERR_IMPLICIT_SHRINK = 170, /* insert new ones above this line */ AFTER_LAST_ERR_CODE }; diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index d0d8fac..e8c4457 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -181,6 +181,8 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) + __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF) + __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF) ) GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 1fedf2b..17e50bb 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -215,4 +215,13 @@ #define DRBD_ALWAYS_ASBP_DEF 0 #define DRBD_USE_RLE_DEF 1 +#define DRBD_AL_STRIPES_MIN 1 +#define DRBD_AL_STRIPES_MAX 1024 +#define DRBD_AL_STRIPES_DEF 1 +#define DRBD_AL_STRIPES_SCALE '1' + +#define DRBD_AL_STRIPE_SIZE_MIN 4 +#define DRBD_AL_STRIPE_SIZE_MAX 16777216 +#define DRBD_AL_STRIPE_SIZE_DEF 32 +#define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */ #endif diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 3cc5a0b..5ebda97 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -9,9 +9,7 @@ struct search; DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio), TP_STRUCT__entry( @@ -22,7 +20,6 @@ DECLARE_EVENT_CLASS(bcache_request, __field(dev_t, orig_sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) ), TP_fast_assign( @@ -33,36 +30,66 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->orig_sector = bio->bi_sector - 16; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", + TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm, - __entry->orig_major, __entry->orig_minor, + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->orig_major, __entry->orig_minor, (unsigned long long)__entry->orig_sector) ); -DEFINE_EVENT(bcache_request, bcache_request_start, +DECLARE_EVENT_CLASS(bkey, + TP_PROTO(struct bkey *k), + TP_ARGS(k), - TP_PROTO(struct search *s, struct bio *bio), + TP_STRUCT__entry( + __field(u32, size ) + __field(u32, inode ) + __field(u64, offset ) + __field(bool, dirty ) + ), - TP_ARGS(s, bio) + TP_fast_assign( + __entry->inode = KEY_INODE(k); + __entry->offset = KEY_OFFSET(k); + __entry->size = KEY_SIZE(k); + __entry->dirty = KEY_DIRTY(k); + ), + + TP_printk("%u:%llu len %u dirty %u", __entry->inode, + __entry->offset, __entry->size, __entry->dirty) ); -DEFINE_EVENT(bcache_request, bcache_request_end, +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct btree *b), + TP_ARGS(b), + + TP_STRUCT__entry( + __field(size_t, bucket ) + ), + TP_fast_assign( + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + ), + + TP_printk("bucket %zu", __entry->bucket) +); + +/* request.c */ + +DEFINE_EVENT(bcache_request, bcache_request_start, TP_PROTO(struct search *s, struct bio *bio), + TP_ARGS(s, bio) +); +DEFINE_EVENT(bcache_request, bcache_request_end, + TP_PROTO(struct search *s, struct bio *bio), TP_ARGS(s, bio) ); DECLARE_EVENT_CLASS(bcache_bio, - TP_PROTO(struct bio *bio), - TP_ARGS(bio), TP_STRUCT__entry( @@ -70,7 +97,6 @@ DECLARE_EVENT_CLASS(bcache_bio, __field(sector_t, sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) ), TP_fast_assign( @@ -78,191 +104,328 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) ); - -DEFINE_EVENT(bcache_bio, bcache_passthrough, - +DEFINE_EVENT(bcache_bio, bcache_bypass_sequential, TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); +DEFINE_EVENT(bcache_bio, bcache_bypass_congested, + TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_cache_hit, +TRACE_EVENT(bcache_read, + TP_PROTO(struct bio *bio, bool hit, bool bypass), + TP_ARGS(bio, hit, bypass), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __field(bool, cache_hit ) + __field(bool, bypass ) + ), - TP_ARGS(bio) + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->cache_hit = hit; + __entry->bypass = bypass; + ), + + TP_printk("%d,%d %s %llu + %u hit %u bypass %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->cache_hit, __entry->bypass) ); -DEFINE_EVENT(bcache_bio, bcache_cache_miss, +TRACE_EVENT(bcache_write, + TP_PROTO(struct bio *bio, bool writeback, bool bypass), + TP_ARGS(bio, writeback, bypass), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __field(bool, writeback ) + __field(bool, bypass ) + ), - TP_ARGS(bio) + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->writeback = writeback; + __entry->bypass = bypass; + ), + + TP_printk("%d,%d %s %llu + %u hit %u bypass %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->writeback, __entry->bypass) ); DEFINE_EVENT(bcache_bio, bcache_read_retry, - TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_writethrough, +DEFINE_EVENT(bkey, bcache_cache_insert, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_PROTO(struct bio *bio), +/* Journal */ - TP_ARGS(bio) -); +DECLARE_EVENT_CLASS(cache_set, + TP_PROTO(struct cache_set *c), + TP_ARGS(c), -DEFINE_EVENT(bcache_bio, bcache_writeback, + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), - TP_PROTO(struct bio *bio), + TP_fast_assign( + memcpy(__entry->uuid, c->sb.set_uuid, 16); + ), - TP_ARGS(bio) + TP_printk("%pU", __entry->uuid) ); -DEFINE_EVENT(bcache_bio, bcache_write_skip, - - TP_PROTO(struct bio *bio), +DEFINE_EVENT(bkey, bcache_journal_replay_key, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_ARGS(bio) +DEFINE_EVENT(cache_set, bcache_journal_full, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) ); -DEFINE_EVENT(bcache_bio, bcache_btree_read, +DEFINE_EVENT(cache_set, bcache_journal_entry_full, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); +DEFINE_EVENT(bcache_bio, bcache_journal_write, TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_btree_write, +/* Btree */ - TP_PROTO(struct bio *bio), +DEFINE_EVENT(cache_set, bcache_btree_cache_cannibalize, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); - TP_ARGS(bio) +DEFINE_EVENT(btree_node, bcache_btree_read, + TP_PROTO(struct btree *b), + TP_ARGS(b) ); -DEFINE_EVENT(bcache_bio, bcache_write_dirty, +TRACE_EVENT(bcache_btree_write, + TP_PROTO(struct btree *b), + TP_ARGS(b), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(size_t, bucket ) + __field(unsigned, block ) + __field(unsigned, keys ) + ), - TP_ARGS(bio) + TP_fast_assign( + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->block = b->written; + __entry->keys = b->sets[b->nsets].data->keys; + ), + + TP_printk("bucket %zu", __entry->bucket) ); -DEFINE_EVENT(bcache_bio, bcache_read_dirty, +DEFINE_EVENT(btree_node, bcache_btree_node_alloc, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); - TP_PROTO(struct bio *bio), +DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); - TP_ARGS(bio) +DEFINE_EVENT(btree_node, bcache_btree_node_free, + TP_PROTO(struct btree *b), + TP_ARGS(b) ); -DEFINE_EVENT(bcache_bio, bcache_write_moving, +TRACE_EVENT(bcache_btree_gc_coalesce, + TP_PROTO(unsigned nodes), + TP_ARGS(nodes), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(unsigned, nodes ) + ), - TP_ARGS(bio) + TP_fast_assign( + __entry->nodes = nodes; + ), + + TP_printk("coalesced %u nodes", __entry->nodes) ); -DEFINE_EVENT(bcache_bio, bcache_read_moving, +DEFINE_EVENT(cache_set, bcache_gc_start, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); - TP_PROTO(struct bio *bio), +DEFINE_EVENT(cache_set, bcache_gc_end, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); - TP_ARGS(bio) +DEFINE_EVENT(bkey, bcache_gc_copy, + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); -DEFINE_EVENT(bcache_bio, bcache_journal_write, +DEFINE_EVENT(bkey, bcache_gc_copy_collision, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_PROTO(struct bio *bio), +TRACE_EVENT(bcache_btree_insert_key, + TP_PROTO(struct btree *b, struct bkey *k, unsigned op, unsigned status), + TP_ARGS(b, k, op, status), - TP_ARGS(bio) -); + TP_STRUCT__entry( + __field(u64, btree_node ) + __field(u32, btree_level ) + __field(u32, inode ) + __field(u64, offset ) + __field(u32, size ) + __field(u8, dirty ) + __field(u8, op ) + __field(u8, status ) + ), -DECLARE_EVENT_CLASS(bcache_cache_bio, + TP_fast_assign( + __entry->btree_node = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->btree_level = b->level; + __entry->inode = KEY_INODE(k); + __entry->offset = KEY_OFFSET(k); + __entry->size = KEY_SIZE(k); + __entry->dirty = KEY_DIRTY(k); + __entry->op = op; + __entry->status = status; + ), - TP_PROTO(struct bio *bio, - sector_t orig_sector, - struct block_device* orig_bdev), + TP_printk("%u for %u at %llu(%u): %u:%llu len %u dirty %u", + __entry->status, __entry->op, + __entry->btree_node, __entry->btree_level, + __entry->inode, __entry->offset, + __entry->size, __entry->dirty) +); - TP_ARGS(bio, orig_sector, orig_bdev), +DECLARE_EVENT_CLASS(btree_split, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(dev_t, orig_dev ) - __field(sector_t, sector ) - __field(sector_t, orig_sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) + __field(size_t, bucket ) + __field(unsigned, keys ) ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_dev = orig_bdev->bd_dev; - __entry->sector = bio->bi_sector; - __entry->orig_sector = orig_sector; - __entry->nr_sector = bio->bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->keys = keys; ), - TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm, - MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), - (unsigned long long)__entry->orig_sector) + TP_printk("bucket %zu keys %u", __entry->bucket, __entry->keys) ); -DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, - - TP_PROTO(struct bio *bio, - sector_t orig_sector, - struct block_device *orig_bdev), +DEFINE_EVENT(btree_split, bcache_btree_node_split, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys) +); - TP_ARGS(bio, orig_sector, orig_bdev) +DEFINE_EVENT(btree_split, bcache_btree_node_compact, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys) ); -DECLARE_EVENT_CLASS(bcache_gc, +DEFINE_EVENT(btree_node, bcache_btree_set_root, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); - TP_PROTO(uint8_t *uuid), +/* Allocator */ - TP_ARGS(uuid), +TRACE_EVENT(bcache_alloc_invalidate, + TP_PROTO(struct cache *ca), + TP_ARGS(ca), TP_STRUCT__entry( - __field(uint8_t *, uuid) + __field(unsigned, free ) + __field(unsigned, free_inc ) + __field(unsigned, free_inc_size ) + __field(unsigned, unused ) ), TP_fast_assign( - __entry->uuid = uuid; + __entry->free = fifo_used(&ca->free); + __entry->free_inc = fifo_used(&ca->free_inc); + __entry->free_inc_size = ca->free_inc.size; + __entry->unused = fifo_used(&ca->unused); ), - TP_printk("%pU", __entry->uuid) + TP_printk("free %u free_inc %u/%u unused %u", __entry->free, + __entry->free_inc, __entry->free_inc_size, __entry->unused) ); +TRACE_EVENT(bcache_alloc_fail, + TP_PROTO(struct cache *ca), + TP_ARGS(ca), -DEFINE_EVENT(bcache_gc, bcache_gc_start, + TP_STRUCT__entry( + __field(unsigned, free ) + __field(unsigned, free_inc ) + __field(unsigned, unused ) + __field(unsigned, blocked ) + ), - TP_PROTO(uint8_t *uuid), + TP_fast_assign( + __entry->free = fifo_used(&ca->free); + __entry->free_inc = fifo_used(&ca->free_inc); + __entry->unused = fifo_used(&ca->unused); + __entry->blocked = atomic_read(&ca->set->prio_blocked); + ), - TP_ARGS(uuid) + TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free, + __entry->free_inc, __entry->unused, __entry->blocked) ); -DEFINE_EVENT(bcache_gc, bcache_gc_end, +/* Background writeback */ - TP_PROTO(uint8_t *uuid), +DEFINE_EVENT(bkey, bcache_writeback, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_ARGS(uuid) +DEFINE_EVENT(bkey, bcache_writeback_collision, + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); #endif /* _TRACE_BCACHE_H */ diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index ffd4652..65e1209 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t; #define BLKIF_OP_DISCARD 5 /* + * Recognized if "feature-max-indirect-segments" in present in the backend + * xenbus info. The "feature-max-indirect-segments" node contains the maximum + * number of segments allowed by the backend per request. If the node is + * present, the frontend might use blkif_request_indirect structs in order to + * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The + * maximum number of indirect segments is fixed by the backend, but the + * frontend can issue requests with any number of indirect segments as long as + * it's less than the number provided by the backend. The indirect_grefs field + * in blkif_request_indirect should be filled by the frontend with the + * grant references of the pages that are holding the indirect segments. + * This pages are filled with an array of blkif_request_segment_aligned + * that hold the information about the segments. The number of indirect + * pages to use is determined by the maximum number of segments + * a indirect request contains. Every indirect page can contain a maximum + * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)), + * so to calculate the number of indirect pages to use we have to do + * ceil(indirect_segments/512). + * + * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* + * create the "feature-max-indirect-segments" node! + */ +#define BLKIF_OP_INDIRECT 6 + +/* * Maximum scatter/gather segments per request. * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. * NB. This could be 12 if the ring indexes weren't stored in the same page. */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 +#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 + +struct blkif_request_segment_aligned { + grant_ref_t gref; /* reference to I/O buffer frame */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + uint8_t first_sect, last_sect; + uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */ +} __attribute__((__packed__)); + struct blkif_request_rw { uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ @@ -147,12 +181,31 @@ struct blkif_request_other { uint64_t id; /* private guest value, echoed in resp */ } __attribute__((__packed__)); +struct blkif_request_indirect { + uint8_t indirect_op; + uint16_t nr_segments; +#ifdef CONFIG_X86_64 + uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */ +#endif + uint64_t id; + blkif_sector_t sector_number; + blkif_vdev_t handle; + uint16_t _pad2; + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; +#ifdef CONFIG_X86_64 + uint32_t _pad3; /* make it 64 byte aligned */ +#else + uint64_t _pad3; /* make it 64 byte aligned */ +#endif +} __attribute__((__packed__)); + struct blkif_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_request_rw rw; struct blkif_request_discard discard; struct blkif_request_other other; + struct blkif_request_indirect indirect; } u; } __attribute__((__packed__)); diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 75271b9..7d28aff 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -188,6 +188,11 @@ struct __name##_back_ring { \ #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) +/* Ill-behaved frontend determination: Can there be this many requests? */ +#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ + (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r)) + + #define RING_PUSH_REQUESTS(_r) do { \ wmb(); /* back sees requests /before/ updated producer index */ \ (_r)->sring->req_prod = (_r)->req_prod_pvt; \ |