Diffstat (limited to 'drivers/block/drbd/drbd_req.c')
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 157
1 file changed, 43 insertions(+), 114 deletions(-)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index e609557..ca28b56 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -149,46 +149,16 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
 	drbd_req_free(req);
 }

-static void queue_barrier(struct drbd_conf *mdev)
-{
-	struct drbd_tl_epoch *b;
-	struct drbd_tconn *tconn = mdev->tconn;
-
-	/* We are within the req_lock. Once we queued the barrier for sending,
-	 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
-	 * barrier/epoch object is added. This is the only place this bit is
-	 * set. It indicates that the barrier for this epoch is already queued,
-	 * and no new epoch has been created yet. */
-	if (test_bit(CREATE_BARRIER, &tconn->flags))
-		return;
-
-	b = tconn->newest_tle;
-	b->w.cb = w_send_barrier;
-	b->w.mdev = mdev;
-	/* inc_ap_pending done here, so we won't
-	 * get imbalanced on connection loss.
-	 * dec_ap_pending will be done in got_BarrierAck
-	 * or (on connection loss) in tl_clear. */
-	inc_ap_pending(mdev);
-	drbd_queue_work(&tconn->sender_work, &b->w);
-	set_bit(CREATE_BARRIER, &tconn->flags);
+static void wake_all_senders(struct drbd_tconn *tconn) {
+	wake_up(&tconn->sender_work.q_wait);
 }

-static void _about_to_complete_local_write(struct drbd_conf *mdev,
-					   struct drbd_request *req)
+/* must hold resource->req_lock */
+static void start_new_tl_epoch(struct drbd_tconn *tconn)
 {
-	const unsigned long s = req->rq_state;
-
-	/* Before we can signal completion to the upper layers,
-	 * we may need to close the current epoch.
-	 * We can skip this, if this request has not even been sent, because we
-	 * did not have a fully established connection yet/anymore, during
-	 * bitmap exchange, or while we are C_AHEAD due to congestion policy.
-	 */
-	if (mdev->state.conn >= C_CONNECTED &&
-	    (s & RQ_NET_SENT) != 0 &&
-	    req->epoch == atomic_read(&mdev->tconn->current_tle_nr))
-		queue_barrier(mdev);
+	tconn->current_tle_writes = 0;
+	atomic_inc(&tconn->current_tle_nr);
+	wake_all_senders(tconn);
 }

 void complete_master_bio(struct drbd_conf *mdev,
@@ -320,9 +290,16 @@ void req_may_be_completed(struct drbd_request *req, struct bio_and_error *m)
 		} else if (!(s & RQ_POSTPONED))
 			D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);

-		/* for writes we need to do some extra housekeeping */
-		if (rw == WRITE)
-			_about_to_complete_local_write(mdev, req);
+		/* Before we can signal completion to the upper layers,
+		 * we may need to close the current transfer log epoch.
+		 * We are within the request lock, so we can simply compare
+		 * the request epoch number with the current transfer log
+		 * epoch number. If they match, increase the current_tle_nr,
+		 * and reset the transfer log epoch write_cnt.
+		 */
+		if (rw == WRITE &&
+		    req->epoch == atomic_read(&mdev->tconn->current_tle_nr))
+			start_new_tl_epoch(mdev->tconn);

 		/* Update disk stats */
 		_drbd_end_io_acct(mdev, req);
@@ -514,15 +491,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		 * hurting performance. */
 		set_bit(UNPLUG_REMOTE, &mdev->flags);

-		/* see __drbd_make_request,
-		 * just after it grabs the req_lock */
-		D_ASSERT(test_bit(CREATE_BARRIER, &mdev->tconn->flags) == 0);
-
-		req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
-
-		/* increment size of current epoch */
-		mdev->tconn->newest_tle->n_writes++;
-
 		/* queue work item to send data */
 		D_ASSERT(req->rq_state & RQ_NET_PENDING);
 		req->rq_state |= RQ_NET_QUEUED;
@@ -534,8 +502,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		nc = rcu_dereference(mdev->tconn->net_conf);
 		p = nc->max_epoch_size;
 		rcu_read_unlock();
-		if (mdev->tconn->newest_tle->n_writes >= p)
-			queue_barrier(mdev);
+		if (mdev->tconn->current_tle_writes >= p)
+			start_new_tl_epoch(mdev->tconn);

 		break;

@@ -692,6 +660,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		   During connection handshake, we ensure that the peer was not rebooted. */
 		if (!(req->rq_state & RQ_NET_OK)) {
 			if (req->w.cb) {
+				/* w.cb expected to be w_send_dblock, or w_send_read_req */
 				drbd_queue_work(&mdev->tconn->sender_work, &req->w);
 				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
 			}
@@ -708,7 +677,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			 * this is bad, because if the connection is lost now,
 			 * we won't be able to clean them up... */
 			dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n");
-			list_move(&req->tl_requests, &mdev->tconn->out_of_sequence_requests);
 		}
 		if ((req->rq_state & RQ_NET_MASK) != 0) {
 			req->rq_state |= RQ_NET_DONE;
@@ -835,7 +803,6 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 	const int rw = bio_rw(bio);
 	const int size = bio->bi_size;
 	const sector_t sector = bio->bi_sector;
-	struct drbd_tl_epoch *b = NULL;
 	struct drbd_request *req;
 	struct net_conf *nc;
 	int local, remote, send_oos = 0;
@@ -916,24 +883,6 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
 		goto fail_free_complete;
 	}

-	/* For WRITE request, we have to make sure that we have an
-	 * unused_spare_tle, in case we need to start a new epoch.
-	 * I try to be smart and avoid to pre-allocate always "just in case",
-	 * but there is a race between testing the bit and pointer outside the
-	 * spinlock, and grabbing the spinlock.
-	 * if we lost that race, we retry. */
-	if (rw == WRITE && (remote || send_oos) &&
-	    mdev->tconn->unused_spare_tle == NULL &&
-	    test_bit(CREATE_BARRIER, &mdev->tconn->flags)) {
-allocate_barrier:
-		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
-		if (!b) {
-			dev_err(DEV, "Failed to alloc barrier.\n");
-			err = -ENOMEM;
-			goto fail_free_complete;
-		}
-	}
-
 	/* GOOD, everything prepared, grab the spin_lock */
 	spin_lock_irq(&mdev->tconn->req_lock);

@@ -969,42 +918,9 @@ allocate_barrier:
 		}
 	}

-	if (b && mdev->tconn->unused_spare_tle == NULL) {
-		mdev->tconn->unused_spare_tle = b;
-		b = NULL;
-	}
-	if (rw == WRITE && (remote || send_oos) &&
-	    mdev->tconn->unused_spare_tle == NULL &&
-	    test_bit(CREATE_BARRIER, &mdev->tconn->flags)) {
-		/* someone closed the current epoch
-		 * while we were grabbing the spinlock */
-		spin_unlock_irq(&mdev->tconn->req_lock);
-		goto allocate_barrier;
-	}
-
-
 	/* Update disk stats */
 	_drbd_start_io_acct(mdev, req, bio);

-	/* _maybe_start_new_epoch(mdev);
-	 * If we need to generate a write barrier packet, we have to add the
-	 * new epoch (barrier) object, and queue the barrier packet for sending,
-	 * and queue the req's data after it _within the same lock_, otherwise
-	 * we have race conditions were the reorder domains could be mixed up.
-	 *
-	 * Even read requests may start a new epoch and queue the corresponding
-	 * barrier packet. To get the write ordering right, we only have to
-	 * make sure that, if this is a write request and it triggered a
-	 * barrier packet, this request is queued within the same spinlock. */
-	if ((remote || send_oos) && mdev->tconn->unused_spare_tle &&
-	    test_and_clear_bit(CREATE_BARRIER, &mdev->tconn->flags)) {
-		_tl_add_barrier(mdev->tconn, mdev->tconn->unused_spare_tle);
-		mdev->tconn->unused_spare_tle = NULL;
-	} else {
-		D_ASSERT(!(remote && rw == WRITE &&
-			   test_bit(CREATE_BARRIER, &mdev->tconn->flags)));
-	}
-
 	/* NOTE
 	 * Actually, 'local' may be wrong here already, since we may have failed
 	 * to write to the meta data, and may become wrong anytime because of
@@ -1025,7 +941,12 @@ allocate_barrier:
 	if (local)
 		_req_mod(req, TO_BE_SUBMITTED);

-	list_add_tail(&req->tl_requests, &mdev->tconn->newest_tle->requests);
+	/* which transfer log epoch does this belong to? */
+	req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
+	if (rw == WRITE)
+		mdev->tconn->current_tle_writes++;
+
+	list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);

 	/* NOTE remote first: to get the concurrent write detection right,
 	 * we must register the request before start of local IO. */
@@ -1059,7 +980,9 @@ allocate_barrier:
 	}

 	if (congested) {
-		queue_barrier(mdev); /* last barrier, after mirrored writes */
+		if (mdev->tconn->current_tle_writes)
+			/* start a new epoch for non-mirrored writes */
+			start_new_tl_epoch(mdev->tconn);

 		if (nc->on_congestion == OC_PULL_AHEAD)
 			_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
@@ -1070,7 +993,6 @@ allocate_barrier:
 	rcu_read_unlock();

 	spin_unlock_irq(&mdev->tconn->req_lock);
-	kfree(b); /* if someone else has beaten us to it... */

 	if (local) {
 		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
@@ -1108,7 +1030,6 @@ fail_and_free_req:

 	drbd_req_free(req);
 	dec_ap_bio(mdev);
-	kfree(b);

 	return ret;
 }
@@ -1164,12 +1085,23 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 	return limit;
 }

+struct drbd_request *find_oldest_request(struct drbd_tconn *tconn)
+{
+	/* Walk the transfer log,
+	 * and find the oldest not yet completed request */
+	struct drbd_request *r;
+	list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+		if (r->rq_state & (RQ_NET_PENDING|RQ_LOCAL_PENDING))
+			return r;
+	}
+	return NULL;
+}
+
 void request_timer_fn(unsigned long data)
 {
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
 	struct drbd_tconn *tconn = mdev->tconn;
 	struct drbd_request *req; /* oldest request */
-	struct list_head *le;
 	struct net_conf *nc;
 	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
 	unsigned long now;
@@ -1193,16 +1125,13 @@ void request_timer_fn(unsigned long data)

 	now = jiffies;

 	spin_lock_irq(&tconn->req_lock);
-	le = &tconn->oldest_tle->requests;
-	if (list_empty(le)) {
+	req = find_oldest_request(tconn);
+	if (!req) {
 		spin_unlock_irq(&tconn->req_lock);
 		mod_timer(&mdev->request_timer, now + et);
 		return;
 	}
-	le = le->prev;
-	req = list_entry(le, struct drbd_request, tl_requests);
-
 	/* The request is considered timed out, if
 	 * - we have some effective timeout from the configuration,
 	 *   with above state restrictions applied,
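For readers skimming the diff, the gist is that transfer log epochs are no longer allocated objects (struct drbd_tl_epoch "barriers") but just a counter pair on the connection: an atomic epoch number plus a per-epoch write count, where closing an epoch is merely a counter bump. Below is a minimal, self-contained userspace sketch of that bookkeeping; it is not DRBD code, and toy_tconn, toy_start_new_tl_epoch and toy_queue_write are stand-ins invented for illustration (the real functions also take the request lock and wake the sender threads).

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for the fields this patch uses on struct drbd_tconn. */
struct toy_tconn {
	atomic_int current_tle_nr;  /* number of the current transfer log epoch */
	int current_tle_writes;     /* write requests counted in that epoch */
};

/* Sketch of start_new_tl_epoch(): closing an epoch is just a counter bump. */
static void toy_start_new_tl_epoch(struct toy_tconn *tconn)
{
	tconn->current_tle_writes = 0;
	atomic_fetch_add(&tconn->current_tle_nr, 1);
}

/* Sketch of the write path: tag the request with the current epoch number,
 * count it, and close the epoch once max_epoch_size writes are in it. */
static int toy_queue_write(struct toy_tconn *tconn, int max_epoch_size)
{
	int epoch = atomic_load(&tconn->current_tle_nr);

	if (++tconn->current_tle_writes >= max_epoch_size)
		toy_start_new_tl_epoch(tconn);
	return epoch;  /* the real code stores this in req->epoch */
}

int main(void)
{
	struct toy_tconn tconn = { .current_tle_writes = 0 };

	atomic_init(&tconn.current_tle_nr, 0);
	for (int i = 0; i < 5; i++)
		printf("write %d -> epoch %d\n", i, toy_queue_write(&tconn, 2));
	return 0;
}

With max_epoch_size of 2, the writes above land in epochs 0, 0, 1, 1, 2, which mirrors how requests in the single transfer_log list are grouped purely by their stored epoch number rather than by per-epoch request lists.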