From d85b705663905b3dae30007f824355bdcfcf3f00 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 10 May 2010 10:24:48 -0700 Subject: ceph: resubmit requests on pg mapping change (not just primary change) OSD requests need to be resubmitted on any pg mapping change, not just when the pg primary changes. Resending only when the primary changes results in occasional 'hung' requests during osd cluster recovery or rebalancing. Signed-off-by: Sage Weil --- fs/ceph/osd_client.c | 19 +++++++++++++++---- fs/ceph/osd_client.h | 2 ++ fs/ceph/osdmap.c | 29 ++++++++++++++++++++++++----- fs/ceph/osdmap.h | 2 ++ fs/ceph/rados.h | 1 + 5 files changed, 44 insertions(+), 9 deletions(-) diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 8128082..3514f71 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -565,7 +565,8 @@ static int __map_osds(struct ceph_osd_client *osdc, { struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_pg pgid; - int o = -1; + int acting[CEPH_PG_MAX_SIZE]; + int o = -1, num = 0; int err; dout("map_osds %p tid %lld\n", req, req->r_tid); @@ -576,10 +577,16 @@ static int __map_osds(struct ceph_osd_client *osdc, pgid = reqhead->layout.ol_pgid; req->r_pgid = pgid; - o = ceph_calc_pg_primary(osdc->osdmap, pgid); + err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); + if (err > 0) { + o = acting[0]; + num = err; + } if ((req->r_osd && req->r_osd->o_osd == o && - req->r_sent >= req->r_osd->o_incarnation) || + req->r_sent >= req->r_osd->o_incarnation && + req->r_num_pg_osds == num && + memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || (req->r_osd == NULL && o == -1)) return 0; /* no change */ @@ -587,6 +594,10 @@ static int __map_osds(struct ceph_osd_client *osdc, req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, req->r_osd ? req->r_osd->o_osd : -1); + /* record full pg acting set */ + memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); + req->r_num_pg_osds = num; + if (req->r_osd) { __cancel_request(req); list_del_init(&req->r_osd_item); @@ -612,7 +623,7 @@ static int __map_osds(struct ceph_osd_client *osdc, __remove_osd_from_lru(req->r_osd); list_add(&req->r_osd_item, &req->r_osd->o_requests); } - err = 1; /* osd changed */ + err = 1; /* osd or pg changed */ out: return err; diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index c5191d6..ce77698 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -48,6 +48,8 @@ struct ceph_osd_request { struct list_head r_osd_item; struct ceph_osd *r_osd; struct ceph_pg r_pgid; + int r_pg_osds[CEPH_PG_MAX_SIZE]; + int r_num_pg_osds; struct ceph_connection *r_con_filling_msg; diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 2e2c15e..cfdd8f4 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -1041,12 +1041,33 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } /* + * Return acting set for given pgid. + */ +int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *acting) +{ + int rawosds[CEPH_PG_MAX_SIZE], *osds; + int i, o, num = CEPH_PG_MAX_SIZE; + + osds = calc_pg_raw(osdmap, pgid, rawosds, &num); + if (!osds) + return -1; + + /* primary is first up osd */ + o = 0; + for (i = 0; i < num; i++) + if (ceph_osd_is_up(osdmap, osds[i])) + acting[o++] = osds[i]; + return o; +} + +/* * Return primary osd for given pgid, or -1 if none. */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int rawosds[10], *osds; - int i, num = ARRAY_SIZE(rawosds); + int rawosds[CEPH_PG_MAX_SIZE], *osds; + int i, num = CEPH_PG_MAX_SIZE; osds = calc_pg_raw(osdmap, pgid, rawosds, &num); if (!osds) @@ -1054,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) /* primary is first up osd */ for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) { + if (ceph_osd_is_up(osdmap, osds[i])) return osds[i]; - break; - } return -1; } diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h index 8bc9f1e..970b547 100644 --- a/fs/ceph/osdmap.h +++ b/fs/ceph/osdmap.h @@ -120,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol, const char *oid, struct ceph_file_layout *fl, struct ceph_osdmap *osdmap); +extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *acting); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index a1fc1d0..fd56451 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -58,6 +58,7 @@ struct ceph_timespec { #define CEPH_PG_LAYOUT_LINEAR 2 #define CEPH_PG_LAYOUT_HYBRID 3 +#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ /* * placement group. -- cgit v1.1