aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux/dst.h
blob: e26fed84b1aaf4ec0c4869f80bf7fe6122235207 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
/*
 * 2007+ Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#ifndef __DST_H
#define __DST_H

#include <linux/types.h>
#include <linux/connector.h>

#define DST_NAMELEN		32
#define DST_NAME		"dst"

/*
 * Control commands, carried in the 'cmd' field of struct dst_ctl.
 * The numeric values are written out because this part of the header
 * sits outside the __KERNEL__ section.
 */
enum {
	DST_DEL_NODE	= 0,	/* Remove node with given id from storage */
	DST_ADD_REMOTE	= 1,	/* Add remote node with given id to the storage */
	DST_ADD_EXPORT	= 2,	/* Add local node with given id to the storage
				 * to be exported and used by remote peers */
	DST_CRYPTO	= 3,	/* Crypto initialization command (hash/cipher
				 * used to protect the connection) */
	DST_SECURITY	= 4,	/* Security attributes for given connection
				 * (permissions for example) */
	DST_START	= 5,	/* Register given node in the block layer subsystem */
	DST_CMD_MAX		/* One past the last command; keep it last */
};

/*
 * Control message: names a storage node, selects the command to run on it
 * and carries the per-node tunables.
 */
struct dst_ctl
{
	/* Storage name */
	char			name[DST_NAMELEN];
	/* Command flags */
	__u32			flags;
	/* Command itself: one of DST_DEL_NODE ... DST_START */
	__u32			cmd;
	/* Maximum number of pages per single request in this device */
	__u32			max_pages;
	/* Stale/error transaction scanning timeout in milliseconds */
	__u32			trans_scan_timeout;
	/* Maximum number of retry sends before completing transaction as broken */
	__u32			trans_max_retries;
	/*
	 * Storage size.
	 * NOTE(review): 52 bytes precede this 64-bit field, so its offset
	 * depends on the ABI's __u64 alignment - confirm that 32-bit userspace
	 * and a 64-bit kernel agree on the layout.
	 */
	__u64			size;
};

/* Reply to a control command; carries the completion status */
struct dst_ctl_ack
{
	/* Connector message header */
	struct cn_msg		msg;
	/* Completion status of the command */
	int			error;
	/* Not used, per the field name */
	int			unused[3];
};

/*
 * Unfortunately socket address structure is not exported to userspace
 * and is redefined there.
 */
#define SADDR_MAX_DATA	128

/* Socket address with an explicit count of the bytes used in sa_data */
struct saddr {
	/* address family, AF_xxx	*/
	unsigned short		sa_family;
	/* Protocol address, up to SADDR_MAX_DATA bytes */
	char			sa_data[SADDR_MAX_DATA];
	/* Number of bytes used in sa_data */
	unsigned short		sa_data_len;
};

/* Network endpoint description: socket type, protocol and peer address */
struct dst_network_ctl
{
	/* Socket type: datagram, stream...*/
	unsigned int		type;
	/* Protocol - presumably the protocol number used when the socket is created; confirm */
	unsigned int		proto;
	/* Peer's address */
	struct saddr		addr;
};

/* Crypto setup: algorithm names, key sizes and worker thread count */
struct dst_crypto_ctl
{
	/* Cipher and hash names */
	char			cipher_algo[DST_NAMELEN];
	char			hash_algo[DST_NAMELEN];

	/* Key sizes. Can be zero for digest for example */
	unsigned int		cipher_keysize, hash_keysize;
	/* Alignment. Calculated by the DST itself. */
	unsigned int		crypto_attached_size;
	/* Number of threads to perform crypto operations */
	int			thread_num;
};

/* Export security attributes: these bits are checked when a client connects */
#define DST_PERM_READ		(1<<0)
#define DST_PERM_WRITE		(1<<1)

/*
 * Right now it is a simple model, where each remote address
 * is assigned the set of permissions it is allowed to use.
 * In the real world a block device does not know anything but
 * reading and writing, so it should be more than enough.
 */
struct dst_secure_user
{
	/* Bitmask of DST_PERM_READ / DST_PERM_WRITE */
	unsigned int		permissions;
	/* Remote address the permissions apply to */
	struct saddr		addr;
};

/*
 * Export control command: device to export and network address to accept
 * clients to work with given device
 */
struct dst_export_ctl
{
	/* Name of the device to export */
	char			device[DST_NAMELEN];
	/* Network address on which clients are accepted */
	struct dst_network_ctl	ctl;
};

/*
 * Network commands, carried in the 'cmd' field of struct dst_cmd.
 * Numbering starts at 1; the values are written out explicitly.
 */
enum {
	DST_CFG		= 1,	/* Request remote configuration */
	DST_IO		= 2,	/* IO command */
	DST_IO_RESPONSE	= 3,	/* IO response */
	DST_PING	= 4,	/* Keepalive message */
	DST_NCMD_MAX,		/* One past the last command; keep it last */
};

/*
 * Network command header. dst_convert_cmd() switches its multi-byte
 * fields (except 'reserved') between host and big-endian order.
 */
struct dst_cmd
{
	/* Network command itself: DST_CFG, DST_IO, DST_IO_RESPONSE or DST_PING */
	__u32			cmd;
	/*
	 * Size of the attached data
	 * (in most cases, for READ command it means how many bytes were requested)
	 */
	__u32			size;
	/* Crypto size: number of attached bytes with digest/hmac */
	__u32			csize;
	/* Reserved; not touched by dst_convert_cmd() or dst_bio_to_cmd() */
	__u32			reserved;
	/* Read/write bits, see how they are encoded in bio structure */
	__u64			rw;
	/* BIO flags */
	__u64			flags;
	/* Unique command id (like transaction ID) */
	__u64			id;
	/* Sector to start IO from */
	__u64			sector;
	/* Hash data is placed after this header; the zero-length array marks its start */
	__u8			hash[0];
};

/*
 * Switch the command header between host and network (big-endian)
 * byte order, in place. The same routine serves both directions.
 *
 * hton*() helpers are not used because they have no 64-bit variant.
 * The 'reserved' word and the trailing hash bytes are left untouched.
 */
static inline void dst_convert_cmd(struct dst_cmd *c)
{
	/* 32-bit fields */
	c->cmd = __cpu_to_be32(c->cmd);
	c->size = __cpu_to_be32(c->size);
	c->csize = __cpu_to_be32(c->csize);

	/* 64-bit fields */
	c->rw = __cpu_to_be64(c->rw);
	c->flags = __cpu_to_be64(c->flags);
	c->id = __cpu_to_be64(c->id);
	c->sector = __cpu_to_be64(c->sector);
}

/* Transaction id type: held in dst_trans.gen and used as the dst_trans_search() key */
typedef __u64 dst_gen_t;

#ifdef __KERNEL__

#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/device.h>
#include <linux/mempool.h>
#include <linux/net.h>
#include <linux/poll.h>
#include <linux/rbtree.h>

/*
 * Debug logging helper.
 * With CONFIG_DST_DEBUG defined, dprintk() expands to printk() at
 * KERN_NOTICE level. Otherwise it is an empty inline function: calls
 * produce no output, while the format attribute still lets the compiler
 * check the arguments against the format string.
 */
#ifdef CONFIG_DST_DEBUG
#define dprintk(f, a...) printk(KERN_NOTICE f, ##a)
#else
static inline void __attribute__ ((format (printf, 1, 2)))
	dprintk(const char *fmt, ...) {}
#endif

struct dst_node;

/* One in-flight transaction: a block IO request plus its network command */
struct dst_trans
{
	/* DST node we are working with */
	struct dst_node		*n;

	/* Entry inside transaction tree */
	struct rb_node		trans_entry;

	/*
	 * Reference counter, raised by dst_trans_get().
	 * The transaction is destroyed when it drops to zero.
	 */
	atomic_t		refcnt;

	/* How this transaction should be processed by crypto engine */
	short			enc;
	/* How many times this transaction was resent */
	short			retries;
	/* Completion status */
	int			error;

	/* When did we send it to the remote peer */
	long			send_time;

	/* Unique id of this transaction */
	dst_gen_t		gen;

	/* Block IO we are working with */
	struct bio		*bio;

	/* Network command for above block IO request */
	struct dst_cmd		cmd;
};

/* Crypto working state: transforms, page pool and per-request scratch data */
struct dst_crypto_engine
{
	/* Hash and cipher transforms applied to block requests */
	struct crypto_hash	*hash;
	struct crypto_ablkcipher	*cipher;

	/* Pool of pages used to encrypt data into before sending */
	int			page_num;
	struct page		**pages;

	/* What to do with current request */
	int			enc;
	/* Source and destination scatterlists (per the field names) */
	struct scatterlist	*src, *dst;

	/* Maximum timeout waiting for encryption to be completed */
	long			timeout;
	/* IV is a 64-bit sequential counter */
	u64			iv;

	/* Opaque private pointer */
	void			*private;

	/* Cached temporary data lives here */
	int			size;
	void			*data;
};

/* Network state of a node: socket, polling hooks, permissions and current command */
struct dst_state
{
	/* The main state protection; taken by dst_state_lock()/dst_state_unlock() */
	struct mutex		state_lock;

	/* Polling machinery for sockets */
	wait_queue_t 		wait;
	wait_queue_head_t 	*whead;
	/* Most events are waited for here */
	wait_queue_head_t 	thread_wait;

	/* Node that owns this state */
	struct dst_node		*node;

	/* Network address for this state */
	struct dst_network_ctl	ctl;

	/* Permissions to work with: read-only or rw connection */
	u32			permissions;

	/* Called when we need to clean private data */
	void			(* cleanup)(struct dst_state *st);

	/* Used by the server: BIO completion queues BIOs here */
	struct list_head	request_list;
	spinlock_t		request_lock;

	/* Reference counter, raised by dst_state_get() */
	atomic_t		refcnt;

	/* This flag is set when the connection should be dropped */
	int			need_exit;

	/*
	 * Socket to work with. Second pointer is used for
	 * lockless check if socket was changed before performing
	 * next action (like working with cached polling result)
	 */
	struct socket		*socket, *read_socket;

	/* Cached preallocated data */
	void			*data;
	unsigned int		size;

	/* Currently processed command */
	struct dst_cmd		cmd;
};

/* Node information exposed through sysfs (see the embedded struct device) */
struct dst_info
{
	/* Device size */
	u64			size;

	/* Local device name for export devices */
	char			local[DST_NAMELEN];

	/* Network setup */
	struct dst_network_ctl	net;

	/* Sysfs bits use this */
	struct device		device;
};

/*
 * A storage node: block-layer objects, network state, security list,
 * crypto setup and the transaction tree.
 */
struct dst_node
{
	/* List linkage for this node */
	struct list_head	node_entry;

	/* Node name */
	char			name[DST_NAMELEN];
	/* Name used for the transaction cache */
	char			cache_name[DST_NAMELEN];

	/* Block device attached to given node.
	 * Only valid for exporting nodes */
	struct block_device 	*bdev;
	/* Network state machine for given peer */
	struct dst_state	*state;

	/* Block IO machinery */
	struct request_queue	*queue;
	struct gendisk		*disk;

	/* Number of threads in processing pool */
	int			thread_num;
	/* Maximum number of pages in single IO */
	int			max_pages;

	/* Node size in bytes */
	loff_t			size;

	/* Exported to userspace node information */
	struct dst_info		*info;

	/*
	 * Security attribute list.
	 * Used only by exporting node currently.
	 */
	struct list_head	security_list;
	struct mutex		security_lock;

	/*
	 * Reference counter, raised by dst_node_get().
	 * It is not expected to underflow below zero, since the node
	 * is freed when the counter reaches zero.
	 */
	atomic_t		refcnt;

	/* Node-specific start callback */
	int 			(*start)(struct dst_node *);

	/* Crypto capabilities */
	struct dst_crypto_ctl	crypto;
	u8			*hash_key;
	u8			*cipher_key;

	/* Pool of processing threads */
	struct thread_pool	*pool;

	/* Transaction IDs live here */
	atomic_long_t		gen;

	/*
	 * How frequently and how many times transaction
	 * tree should be scanned to drop stale objects.
	 */
	long			trans_scan_timeout;
	int			trans_max_retries;

	/* Transaction tree and its lock (presumably trans_lock guards trans_root; confirm) */
	struct rb_root		trans_root;
	struct mutex		trans_lock;

	/*
	 * Transaction cache/memory pool.
	 * It is big enough to contain not only transaction
	 * itself, but additional crypto data (digest/hmac).
	 */
	struct kmem_cache	*trans_cache;
	mempool_t		*trans_pool;

	/* This entity scans transaction tree */
	struct delayed_work 	trans_work;

	/* Wait queue head for this node */
	wait_queue_head_t	wait;
};

/* Kernel representation of the security attribute */
struct dst_secure
{
	/* List linkage - presumably into dst_node.security_list; confirm */
	struct list_head	sec_entry;
	/* The attribute itself: permissions plus remote address */
	struct dst_secure_user	sec;
};

int dst_process_bio(struct dst_node *n, struct bio *bio);

int dst_node_init_connected(struct dst_node *n, struct dst_network_ctl *r);
int dst_node_init_listened(struct dst_node *n, struct dst_export_ctl *le);

static inline struct dst_state *dst_state_get(struct dst_state *st)
{
	BUG_ON(atomic_read(&st->refcnt) == 0);
	atomic_inc(&st->refcnt);
	return st;
}

void dst_state_put(struct dst_state *st);

struct dst_state *dst_state_alloc(struct dst_node *n);
int dst_state_socket_create(struct dst_state *st);
void dst_state_socket_release(struct dst_state *st);

void dst_state_exit_connected(struct dst_state *st);

int dst_state_schedule_receiver(struct dst_state *st);

void dst_dump_addr(struct socket *sk, struct sockaddr *sa, char *str);

/* Acquire the state mutex (st->state_lock). */
static inline void dst_state_lock(struct dst_state *st)
{
	struct mutex *lock = &st->state_lock;

	mutex_lock(lock);
}

/* Release the state mutex (st->state_lock). */
static inline void dst_state_unlock(struct dst_state *st)
{
	struct mutex *lock = &st->state_lock;

	mutex_unlock(lock);
}

void dst_poll_exit(struct dst_state *st);
int dst_poll_init(struct dst_state *st);

static inline unsigned int dst_state_poll(struct dst_state *st)
{
	unsigned int revents = POLLHUP | POLLERR;

	dst_state_lock(st);
	if (st->socket)
		revents = st->socket->ops->poll(NULL, st->socket, NULL);
	dst_state_unlock(st);

	return revents;
}

/*
 * Setup callback that does nothing.
 * Both arguments are ignored and the result is always 0. The signature
 * matches the 'setup' parameter of thread_pool_schedule().
 */
static inline int dst_thread_setup(void *private, void *data)
{
	(void)private;
	(void)data;

	return 0;
}

void dst_node_put(struct dst_node *n);

static inline struct dst_node *dst_node_get(struct dst_node *n)
{
	atomic_inc(&n->refcnt);
	return n;
}

int dst_data_recv(struct dst_state *st, void *data, unsigned int size);
int dst_recv_cdata(struct dst_state *st, void *cdata);
int dst_data_send_header(struct socket *sock,
		void *data, unsigned int size, int more);

int dst_send_bio(struct dst_state *st, struct dst_cmd *cmd, struct bio *bio);

int dst_process_io(struct dst_state *st);
int dst_export_crypto(struct dst_node *n, struct bio *bio);
int dst_export_send_bio(struct bio *bio);
int dst_start_export(struct dst_node *n);

int __init dst_export_init(void);
void dst_export_exit(void);

/* Private structure for export block IO requests */
struct dst_export_priv
{
	/* List linkage - presumably into dst_state.request_list; confirm */
	struct list_head		request_entry;
	/* Network state the request belongs to */
	struct dst_state		*state;
	/* Block IO request being served */
	struct bio			*bio;
	/* Network command associated with this request */
	struct dst_cmd			cmd;
};

static inline void dst_trans_get(struct dst_trans *t)
{
	atomic_inc(&t->refcnt);
}

struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen);
int dst_trans_remove(struct dst_trans *t);
int dst_trans_remove_nolock(struct dst_trans *t);
void dst_trans_put(struct dst_trans *t);

/*
 * Convert bio into network command.
 *
 * @bio:     block IO request to describe
 * @cmd:     command header to fill in
 * @command: network command code, stored in cmd->cmd
 * @id:      transaction id, stored in cmd->id
 *
 * cmd->csize is reset to zero. cmd->reserved and the trailing hash
 * area are not written here.
 */
static inline void dst_bio_to_cmd(struct bio *bio, struct dst_cmd *cmd,
		u32 command, u64 id)
{
	cmd->cmd = command;
	/*
	 * Shifting left and back right by BIO_POOL_BITS clears the
	 * uppermost BIO_POOL_BITS bits of bi_flags (assuming bi_flags
	 * is an unsigned type).
	 */
	cmd->flags = (bio->bi_flags << BIO_POOL_BITS) >> BIO_POOL_BITS;
	cmd->rw = bio->bi_rw;
	cmd->size = bio->bi_size;
	cmd->csize = 0;
	cmd->id = id;
	cmd->sector = bio->bi_sector;
}

int dst_trans_send(struct dst_trans *t);
int dst_trans_crypto(struct dst_trans *t);

int dst_node_crypto_init(struct dst_node *n, struct dst_crypto_ctl *ctl);
void dst_node_crypto_exit(struct dst_node *n);

static inline int dst_need_crypto(struct dst_node *n)
{
	struct dst_crypto_ctl *c = &n->crypto;
	/*
	 * Logical OR is appropriate here, but boolean one produces
	 * more optimal code, so it is used instead.
	 */
	return (c->hash_algo[0] | c->cipher_algo[0]);
}

int dst_node_trans_init(struct dst_node *n, unsigned int size);
void dst_node_trans_exit(struct dst_node *n);

/*
 * Pool of threads.
 * Ready list contains threads currently free to be used,
 * active one contains threads with some work scheduled for them.
 * Caller can wait in given queue when thread is ready.
 */
struct thread_pool
{
	/* Number of threads - presumably the current pool size; confirm */
	int			thread_num;
	/* Pool lock - presumably guards the two lists below; confirm */
	struct mutex		thread_lock;
	/* Free threads and threads with scheduled work, respectively */
	struct list_head	ready_list, active_list;

	/* Callers wait here for a thread to become ready */
	wait_queue_head_t	wait;
};

void thread_pool_del_worker(struct thread_pool *p);
void thread_pool_del_worker_id(struct thread_pool *p, unsigned int id);
int thread_pool_add_worker(struct thread_pool *p,
		char *name,
		unsigned int id,
		void *(* init)(void *data),
		void (* cleanup)(void *data),
		void *data);

void thread_pool_destroy(struct thread_pool *p);
struct thread_pool *thread_pool_create(int num, char *name,
		void *(* init)(void *data),
		void (* cleanup)(void *data),
		void *data);

int thread_pool_schedule(struct thread_pool *p,
		int (* setup)(void *stored_private, void *setup_data),
		int (* action)(void *stored_private, void *setup_data),
		void *setup_data, long timeout);
int thread_pool_schedule_private(struct thread_pool *p,
		int (* setup)(void *private, void *data),
		int (* action)(void *private, void *data),
		void *data, long timeout, void *id);

#endif /* __KERNEL__ */
#endif /* __DST_H */