aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEvgeniy Polyakov <zbr@ioremap.net>2009-01-13 18:05:27 -0500
committerGreg Kroah-Hartman <gregkh@suse.de>2009-04-03 17:53:32 -0400
commitce0d9d7255a55628fd3732bf583c83e90150b699 (patch)
treed8aa3910a4ba9d87f98639dafe2fdf69b591fa15
parentdab8c35990692026fca989c3449fd67a59275c6a (diff)
Staging: dst: core files.
This patch contains DST core files, which introduce block layer, connector and sysfs registration glue and main headers.

Connector is used for the configuration of the node (its type, address, device name and so on).

Sysfs provides bits of information about running devices in the following format:

+/*
+ * DST sysfs tree for device called 'storage':
+ *
+ * /sys/bus/dst/devices/storage/
+ * /sys/bus/dst/devices/storage/type : 192.168.4.80:1025
+ * /sys/bus/dst/devices/storage/size : 800
+ * /sys/bus/dst/devices/storage/name : storage
+ */

DST header contains structure definitions and protocol command description.

Signed-off-by: Evgeniy Polyakov <zbr@ioremap.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
-rw-r--r--drivers/staging/dst/dcore.c972
-rw-r--r--include/linux/connector.h4
-rw-r--r--include/linux/dst.h587
3 files changed, 1562 insertions, 1 deletions
diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c
new file mode 100644
index 000000000000..c6e3cd1a5051
--- /dev/null
+++ b/drivers/staging/dst/dcore.c
@@ -0,0 +1,972 @@
1/*
2 * 2007+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net>
3 * All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 */
15
16#include <linux/module.h>
17#include <linux/kernel.h>
18#include <linux/blkdev.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/connector.h>
22#include <linux/dst.h>
23#include <linux/device.h>
24#include <linux/jhash.h>
25#include <linux/idr.h>
26#include <linux/init.h>
27#include <linux/namei.h>
28#include <linux/slab.h>
29#include <linux/socket.h>
30
31#include <linux/in.h>
32#include <linux/in6.h>
33
34#include <net/sock.h>
35
36static int dst_major;
37
38static DEFINE_MUTEX(dst_hash_lock);
39static struct list_head *dst_hashtable;
40static unsigned int dst_hashtable_size = 128;
41module_param(dst_hashtable_size, uint, 0644);
42
43static char dst_name[] = "Dementianting goldfish";
44
45static DEFINE_IDR(dst_index_idr);
46static struct cb_id cn_dst_id = { CN_DST_IDX, CN_DST_VAL };
47
48/*
49 * DST sysfs tree for a node called 'storage' (registered as 'dst-storage'):
50 *
51 * /sys/bus/dst/devices/dst-storage/
52 * /sys/bus/dst/devices/dst-storage/type : 192.168.4.80:1025
53 * /sys/bus/dst/devices/dst-storage/size : 800
54 * /sys/bus/dst/devices/dst-storage/local : exported local device (export nodes)
55 */
56
/* Bus match callback: unconditionally reports a match. */
static int dst_dev_match(struct device *dev, struct device_driver *drv)
{
	return 1;
}
61
62static struct bus_type dst_dev_bus_type = {
63 .name = "dst",
64 .match = &dst_dev_match,
65};
66
67static void dst_node_release(struct device *dev)
68{
69 struct dst_info *info = container_of(dev, struct dst_info, device);
70
71 kfree(info);
72}
73
74static struct device dst_node_dev = {
75 .bus = &dst_dev_bus_type,
76 .release = &dst_node_release
77};
78
79/*
80 * Setting size of the node after it was changed.
81 */
82static void dst_node_set_size(struct dst_node *n)
83{
84 struct block_device *bdev;
85
86 set_capacity(n->disk, n->size >> 9);
87
88 bdev = bdget_disk(n->disk, 0);
89 if (bdev) {
90 mutex_lock(&bdev->bd_inode->i_mutex);
91 i_size_write(bdev->bd_inode, n->size);
92 mutex_unlock(&bdev->bd_inode->i_mutex);
93 bdput(bdev);
94 }
95}
96
97/*
98 * Distributed storage request processing function.
99 */
100static int dst_request(struct request_queue *q, struct bio *bio)
101{
102 struct dst_node *n = q->queuedata;
103
104 bio_get(bio);
105
106 return dst_process_bio(n, bio);
107}
108
109/*
110 * Open/close callbacks for appropriate block device.
111 */
112static int dst_bdev_open(struct block_device *bdev, fmode_t mode)
113{
114 struct dst_node *n = bdev->bd_disk->private_data;
115
116 dst_node_get(n);
117 return 0;
118}
119
120static int dst_bdev_release(struct gendisk *disk, fmode_t mode)
121{
122 struct dst_node *n = disk->private_data;
123
124 dst_node_put(n);
125 return 0;
126}
127
128static struct block_device_operations dst_blk_ops = {
129 .open = dst_bdev_open,
130 .release = dst_bdev_release,
131 .owner = THIS_MODULE,
132};
133
134/*
135 * Block layer binding - disk is created when array is fully configured
136 * by userspace request.
137 */
138static int dst_node_create_disk(struct dst_node *n)
139{
140 int err = -ENOMEM;
141 u32 index = 0;
142
143 n->queue = blk_init_queue(NULL, NULL);
144 if (!n->queue)
145 goto err_out_exit;
146
147 n->queue->queuedata = n;
148 blk_queue_make_request(n->queue, dst_request);
149 blk_queue_max_phys_segments(n->queue, n->max_pages);
150 blk_queue_max_hw_segments(n->queue, n->max_pages);
151
152 err = -ENOMEM;
153 n->disk = alloc_disk(1);
154 if (!n->disk)
155 goto err_out_free_queue;
156
157 if (!(n->state->permissions & DST_PERM_WRITE)) {
158 printk(KERN_INFO "DST node %s attached read-only.\n", n->name);
159 set_disk_ro(n->disk, 1);
160 }
161
162 if (!idr_pre_get(&dst_index_idr, GFP_KERNEL))
163 goto err_out_put;
164
165 mutex_lock(&dst_hash_lock);
166 err = idr_get_new(&dst_index_idr, NULL, &index);
167 mutex_unlock(&dst_hash_lock);
168 if (err)
169 goto err_out_put;
170
171 n->disk->major = dst_major;
172 n->disk->first_minor = index;
173 n->disk->fops = &dst_blk_ops;
174 n->disk->queue = n->queue;
175 n->disk->private_data = n;
176 snprintf(n->disk->disk_name, sizeof(n->disk->disk_name), "dst-%s", n->name);
177
178 return 0;
179
180err_out_put:
181 put_disk(n->disk);
182err_out_free_queue:
183 blk_cleanup_queue(n->queue);
184err_out_exit:
185 return err;
186}
187
188/*
189 * Sysfs machinery: show device's size.
190 */
191static ssize_t dst_show_size(struct device *dev,
192 struct device_attribute *attr, char *buf)
193{
194 struct dst_info *info = container_of(dev, struct dst_info, device);
195
196 return sprintf(buf, "%llu\n", info->size);
197}
198
199/*
200 * Show local exported device.
201 */
202static ssize_t dst_show_local(struct device *dev,
203 struct device_attribute *attr, char *buf)
204{
205 struct dst_info *info = container_of(dev, struct dst_info, device);
206
207 return sprintf(buf, "%s\n", info->local);
208}
209
210/*
211 * Shows type of the remote node - device major/minor number
212 * for local nodes and address (af_inet ipv4/ipv6 only) for remote nodes.
213 */
214static ssize_t dst_show_type(struct device *dev,
215 struct device_attribute *attr, char *buf)
216{
217 struct dst_info *info = container_of(dev, struct dst_info, device);
218 int family = info->net.addr.sa_family;
219
220 if (family == AF_INET) {
221 struct sockaddr_in *sin = (struct sockaddr_in *)&info->net.addr;
222 return sprintf(buf, "%u.%u.%u.%u:%d\n",
223 NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
224 } else if (family == AF_INET6) {
225 struct sockaddr_in6 *sin = (struct sockaddr_in6 *)&info->net.addr;
226 return sprintf(buf,
227 "%pi6:%d\n",
228 &sin->sin6_addr, ntohs(sin->sin6_port));
229 } else {
230 int i, sz = PAGE_SIZE - 2; /* 0 symbol and '\n' below */
231 int size, addrlen = info->net.addr.sa_data_len;
232 unsigned char *a = (unsigned char *)&info->net.addr.sa_data;
233 char *buf_orig = buf;
234
235 size = snprintf(buf, sz, "family: %d, addrlen: %u, addr: ",
236 family, addrlen);
237 sz -= size;
238 buf += size;
239
240 for (i=0; i<addrlen; ++i) {
241 if (sz < 3)
242 break;
243
244 size = snprintf(buf, sz, "%02x ", a[i]);
245 sz -= size;
246 buf += size;
247 }
248 buf += sprintf(buf, "\n");
249
250 return buf - buf_orig;
251 }
252 return 0;
253}
254
255static struct device_attribute dst_node_attrs[] = {
256 __ATTR(size, 0444, dst_show_size, NULL),
257 __ATTR(type, 0444, dst_show_type, NULL),
258 __ATTR(local, 0444, dst_show_local, NULL),
259};
260
261static int dst_create_node_attributes(struct dst_node *n)
262{
263 int err, i;
264
265 for (i=0; i<ARRAY_SIZE(dst_node_attrs); ++i) {
266 err = device_create_file(&n->info->device,
267 &dst_node_attrs[i]);
268 if (err)
269 goto err_out_remove_all;
270 }
271 return 0;
272
273err_out_remove_all:
274 while (--i >= 0)
275 device_remove_file(&n->info->device,
276 &dst_node_attrs[i]);
277
278 return err;
279}
280
281static void dst_remove_node_attributes(struct dst_node *n)
282{
283 int i;
284
285 for (i=0; i<ARRAY_SIZE(dst_node_attrs); ++i)
286 device_remove_file(&n->info->device,
287 &dst_node_attrs[i]);
288}
289
290/*
291 * Sysfs cleanup and initialization.
292 * Shows number of useful parameters.
293 */
294static void dst_node_sysfs_exit(struct dst_node *n)
295{
296 if (n->info) {
297 dst_remove_node_attributes(n);
298 device_unregister(&n->info->device);
299 n->info = NULL;
300 }
301}
302
303static int dst_node_sysfs_init(struct dst_node *n)
304{
305 int err;
306
307 n->info = kzalloc(sizeof(struct dst_info), GFP_KERNEL);
308 if (!n->info)
309 return -ENOMEM;
310
311 memcpy(&n->info->device, &dst_node_dev, sizeof(struct device));
312 n->info->size = n->size;
313
314 snprintf(n->info->device.bus_id, sizeof(n->info->device.bus_id), "dst-%s", n->name);
315 err = device_register(&n->info->device);
316 if (err) {
317 dprintk(KERN_ERR "Failed to register node '%s', err: %d.\n",
318 n->name, err);
319 goto err_out_exit;
320 }
321
322 dst_create_node_attributes(n);
323
324 return 0;
325
326err_out_exit:
327 kfree(n->info);
328 n->info = NULL;
329 return err;
330}
331
332/*
333 * DST node hash tables machinery.
334 */
335static inline unsigned int dst_hash(char *str, unsigned int size)
336{
337 return (jhash(str, size, 0) % dst_hashtable_size);
338}
339
340static void dst_node_remove(struct dst_node *n)
341{
342 mutex_lock(&dst_hash_lock);
343 list_del_init(&n->node_entry);
344 mutex_unlock(&dst_hash_lock);
345}
346
347static void dst_node_add(struct dst_node *n)
348{
349 unsigned hash = dst_hash(n->name, sizeof(n->name));
350
351 mutex_lock(&dst_hash_lock);
352 list_add_tail(&n->node_entry, &dst_hashtable[hash]);
353 mutex_unlock(&dst_hash_lock);
354}
355
356/*
357 * Cleaning node when it is about to be freed.
358 * There are still users of the socket though,
359 * so connection cleanup should be protected.
360 */
361static void dst_node_cleanup(struct dst_node *n)
362{
363 struct dst_state *st = n->state;
364
365 if (!st)
366 return;
367
368 if (n->queue) {
369 blk_cleanup_queue(n->queue);
370
371 mutex_lock(&dst_hash_lock);
372 idr_remove(&dst_index_idr, n->disk->first_minor);
373 mutex_unlock(&dst_hash_lock);
374
375 put_disk(n->disk);
376 }
377
378 if (n->bdev) {
379 sync_blockdev(n->bdev);
380 blkdev_put(n->bdev, FMODE_READ|FMODE_WRITE);
381 }
382
383 dst_state_lock(st);
384 st->need_exit = 1;
385 dst_state_exit_connected(st);
386 dst_state_unlock(st);
387
388 wake_up(&st->thread_wait);
389
390 dst_state_put(st);
391 n->state = NULL;
392}
393
394/*
395 * Free security attributes attached to given node.
396 */
397static void dst_security_exit(struct dst_node *n)
398{
399 struct dst_secure *s, *tmp;
400
401 list_for_each_entry_safe(s, tmp, &n->security_list, sec_entry) {
402 list_del(&s->sec_entry);
403 kfree(s);
404 }
405}
406
407/*
408 * Free node when there are no more users.
409 * Actually node has to be freed on behalf od userspace process,
410 * since there are number of threads, which are embedded in the
411 * node, so they can not exit and free node from there, that is
412 * why there is a wakeup if reference counter is not equal to zero.
413 */
414void dst_node_put(struct dst_node *n)
415{
416 if (unlikely(!n))
417 return;
418
419 dprintk("%s: n: %p, refcnt: %d.\n",
420 __func__, n, atomic_read(&n->refcnt));
421
422 if (atomic_dec_and_test(&n->refcnt)) {
423 dst_node_remove(n);
424 n->trans_scan_timeout = 0;
425 dst_node_cleanup(n);
426 thread_pool_destroy(n->pool);
427 dst_node_sysfs_exit(n);
428 dst_node_crypto_exit(n);
429 dst_security_exit(n);
430 dst_node_trans_exit(n);
431
432 kfree(n);
433
434 dprintk("%s: freed n: %p.\n", __func__, n);
435 } else {
436 wake_up(&n->wait);
437 }
438}
439
440/*
441 * This function finds devices major/minor numbers for given pathname.
442 */
443static int dst_lookup_device(const char *path, dev_t *dev)
444{
445 int err;
446 struct nameidata nd;
447 struct inode *inode;
448
449 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
450 if (err)
451 return err;
452
453 inode = nd.path.dentry->d_inode;
454 if (!inode) {
455 err = -ENOENT;
456 goto out;
457 }
458
459 if (!S_ISBLK(inode->i_mode)) {
460 err = -ENOTBLK;
461 goto out;
462 }
463
464 *dev = inode->i_rdev;
465
466out:
467 path_put(&nd.path);
468 return err;
469}
470
471/*
472 * Setting up export device: lookup by the name, get its size
473 * and setup listening socket, which will accept clients, which
474 * will submit IO for given storage.
475 */
476static int dst_setup_export(struct dst_node *n, struct dst_ctl *ctl,
477 struct dst_export_ctl *le)
478{
479 int err;
480 dev_t dev = 0; /* gcc likes to scream here */
481
482 snprintf(n->info->local, sizeof(n->info->local), "%s", le->device);
483
484 err = dst_lookup_device(le->device, &dev);
485 if (err)
486 return err;
487
488 n->bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
489 if (!n->bdev)
490 return -ENODEV;
491
492 if (n->size != 0)
493 n->size = min_t(loff_t, n->bdev->bd_inode->i_size, n->size);
494 else
495 n->size = n->bdev->bd_inode->i_size;
496
497 n->info->size = n->size;
498 err = dst_node_init_listened(n, le);
499 if (err)
500 goto err_out_cleanup;
501
502 return 0;
503
504err_out_cleanup:
505 blkdev_put(n->bdev, FMODE_READ|FMODE_WRITE);
506 n->bdev = NULL;
507
508 return err;
509}
510
/*
 * Thread pool init callback for the network processing threads:
 * no setup is needed, the private data is passed through unchanged.
 */
static inline void *dst_thread_network_init(void *data)
{
	dprintk("%s: data: %p.\n", __func__, data);
	return data;
}
517
/* Thread pool cleanup callback: nothing to release, only traces. */
static inline void dst_thread_network_cleanup(void *data)
{
	dprintk("%s: data: %p.\n", __func__, data);
}
522
523/*
524 * Allocate DST node and initialize some of its parameters.
525 */
526static struct dst_node *dst_alloc_node(struct dst_ctl *ctl,
527 int (*start)(struct dst_node *),
528 int num)
529{
530 struct dst_node *n;
531 int err;
532
533 n = kzalloc(sizeof(struct dst_node), GFP_KERNEL);
534 if (!n)
535 return NULL;
536
537 INIT_LIST_HEAD(&n->node_entry);
538
539 INIT_LIST_HEAD(&n->security_list);
540 mutex_init(&n->security_lock);
541
542 init_waitqueue_head(&n->wait);
543
544 n->trans_scan_timeout = msecs_to_jiffies(ctl->trans_scan_timeout);
545 if (!n->trans_scan_timeout)
546 n->trans_scan_timeout = HZ;
547
548 n->trans_max_retries = ctl->trans_max_retries;
549 if (!n->trans_max_retries)
550 n->trans_max_retries = 10;
551
552 /*
553 * Pretty much arbitrary default numbers.
554 * 32 matches maximum number of pages in bio originated from ext3 (31).
555 */
556 n->max_pages = ctl->max_pages;
557 if (!n->max_pages)
558 n->max_pages = 32;
559
560 if (n->max_pages > 1024)
561 n->max_pages = 1024;
562
563 n->start = start;
564 n->size = ctl->size;
565
566 atomic_set(&n->refcnt, 1);
567 atomic_long_set(&n->gen, 0);
568 snprintf(n->name, sizeof(n->name), "%s", ctl->name);
569
570 err = dst_node_sysfs_init(n);
571 if (err)
572 goto err_out_free;
573
574 n->pool = thread_pool_create(num, n->name, dst_thread_network_init,
575 dst_thread_network_cleanup, n);
576 if (IS_ERR(n->pool)) {
577 err = PTR_ERR(n->pool);
578 goto err_out_sysfs_exit;
579 }
580
581 dprintk("%s: n: %p, name: %s.\n", __func__, n, n->name);
582
583 return n;
584
585err_out_sysfs_exit:
586 dst_node_sysfs_exit(n);
587err_out_free:
588 kfree(n);
589 return NULL;
590}
591
592/*
593 * Starting a node, connected to the remote server:
594 * register block device and initialize transaction mechanism.
595 * In revers order though.
596 *
597 * It will autonegotiate some parameters with the remote node
598 * and update local if needed.
599 *
600 * Transaction initialization should be the last thing before
601 * starting the node, since transaction should include not only
602 * block IO, but also crypto related data (if any), which are
603 * initialized separately.
604 */
605static int dst_start_remote(struct dst_node *n)
606{
607 int err;
608
609 err = dst_node_trans_init(n, sizeof(struct dst_trans));
610 if (err)
611 return err;
612
613 err = dst_node_create_disk(n);
614 if (err)
615 return err;
616
617 dst_node_set_size(n);
618 add_disk(n->disk);
619
620 dprintk("DST: started remote node '%s', minor: %d.\n", n->name, n->disk->first_minor);
621
622 return 0;
623}
624
625/*
626 * Adding remote node and initialize connection.
627 */
628static int dst_add_remote(struct dst_node *n, struct dst_ctl *ctl,
629 void *data, unsigned int size)
630{
631 int err;
632 struct dst_network_ctl *rctl = data;
633
634 if (n)
635 return -EEXIST;
636
637 if (size != sizeof(struct dst_network_ctl))
638 return -EINVAL;
639
640 n = dst_alloc_node(ctl, dst_start_remote, 1);
641 if (!n)
642 return -ENOMEM;
643
644 memcpy(&n->info->net, rctl, sizeof(struct dst_network_ctl));
645 err = dst_node_init_connected(n, rctl);
646 if (err)
647 goto err_out_free;
648
649 dst_node_add(n);
650
651 return 0;
652
653err_out_free:
654 dst_node_put(n);
655 return err;
656}
657
658/*
659 * Adding export node: initializing block device and listening socket.
660 */
661static int dst_add_export(struct dst_node *n, struct dst_ctl *ctl,
662 void *data, unsigned int size)
663{
664 int err;
665 struct dst_export_ctl *le = data;
666
667 if (n)
668 return -EEXIST;
669
670 if (size != sizeof(struct dst_export_ctl))
671 return -EINVAL;
672
673 n = dst_alloc_node(ctl, dst_start_export, 2);
674 if (!n)
675 return -EINVAL;
676
677 err = dst_setup_export(n, ctl, le);
678 if (err)
679 goto err_out_free;
680
681 dst_node_add(n);
682
683 return 0;
684
685err_out_free:
686 dst_node_put(n);
687 return err;
688}
689
690static int dst_node_remove_unload(struct dst_node *n)
691{
692 printk(KERN_INFO "STOPPED name: '%s', size: %llu.\n",
693 n->name, n->size);
694
695 if (n->disk)
696 del_gendisk(n->disk);
697
698 dst_node_remove(n);
699 dst_node_sysfs_exit(n);
700
701 /*
702 * This is not a hack. Really.
703 * Node's reference counter allows to implement fine grained
704 * node freeing, but since all transactions (which hold node's
705 * reference counter) are processed in the dedicated thread,
706 * it is possible that reference will hit zero in that thread,
707 * so we will not be able to exit thread and cleanup the node.
708 *
709 * So, we remove disk, so no new activity is possible, and
710 * wait until all pending transaction are completed (either
711 * in receiving thread or by timeout in workqueue), in this
712 * case reference counter will be less or equal to 2 (once set in
713 * dst_alloc_node() and then in connector message parser;
714 * or when we force module unloading, and connector message
715 * parser does not hold a reference, in this case reference
716 * counter will be equal to 1),
717 * and subsequent dst_node_put() calls will free the node.
718 */
719 dprintk("%s: going to sleep with %d refcnt.\n", __func__, atomic_read(&n->refcnt));
720 wait_event(n->wait, atomic_read(&n->refcnt) <= 2);
721
722 dst_node_put(n);
723 return 0;
724}
725
726/*
727 * Remove node from the hash table.
728 */
729static int dst_del_node(struct dst_node *n, struct dst_ctl *ctl,
730 void *data, unsigned int size)
731{
732 if (!n)
733 return -ENODEV;
734
735 return dst_node_remove_unload(n);
736}
737
738/*
739 * Initialize crypto processing for given node.
740 */
741static int dst_crypto_init(struct dst_node *n, struct dst_ctl *ctl,
742 void *data, unsigned int size)
743{
744 struct dst_crypto_ctl *crypto = data;
745
746 if (!n)
747 return -ENODEV;
748
749 if (size != sizeof(struct dst_crypto_ctl) + crypto->hash_keysize +
750 crypto->cipher_keysize)
751 return -EINVAL;
752
753 if (n->trans_cache)
754 return -EEXIST;
755
756 return dst_node_crypto_init(n, crypto);
757}
758
759/*
760 * Security attributes for given node.
761 */
762static int dst_security_init(struct dst_node *n, struct dst_ctl *ctl,
763 void *data, unsigned int size)
764{
765 struct dst_secure *s;
766
767 if (!n)
768 return -ENODEV;
769
770 if (size != sizeof(struct dst_secure_user))
771 return -EINVAL;
772
773 s = kmalloc(sizeof(struct dst_secure), GFP_KERNEL);
774 if (!s)
775 return -ENOMEM;
776
777 memcpy(&s->sec, data, size);
778
779 mutex_lock(&n->security_lock);
780 list_add_tail(&s->sec_entry, &n->security_list);
781 mutex_unlock(&n->security_lock);
782
783 return 0;
784}
785
786/*
787 * Kill'em all!
788 */
789static int dst_start_node(struct dst_node *n, struct dst_ctl *ctl,
790 void *data, unsigned int size)
791{
792 int err;
793
794 if (!n)
795 return -ENODEV;
796
797 if (n->trans_cache)
798 return 0;
799
800 err = n->start(n);
801 if (err)
802 return err;
803
804 printk(KERN_INFO "STARTED name: '%s', size: %llu.\n", n->name, n->size);
805 return 0;
806}
807
808typedef int (*dst_command_func)(struct dst_node *n, struct dst_ctl *ctl,
809 void *data, unsigned int size);
810
811/*
812 * List of userspace commands.
813 */
814static dst_command_func dst_commands[] = {
815 [DST_ADD_REMOTE] = &dst_add_remote,
816 [DST_ADD_EXPORT] = &dst_add_export,
817 [DST_DEL_NODE] = &dst_del_node,
818 [DST_CRYPTO] = &dst_crypto_init,
819 [DST_SECURITY] = &dst_security_init,
820 [DST_START] = &dst_start_node,
821};
822
823/*
824 * Configuration parser.
825 */
826static void cn_dst_callback(void *data)
827{
828 struct dst_ctl *ctl;
829 struct cn_msg *msg = data;
830 int err;
831 struct dst_ctl_ack ack;
832 struct dst_node *n = NULL, *tmp;
833 unsigned int hash;
834
835 if (msg->len < sizeof(struct dst_ctl)) {
836 err = -EBADMSG;
837 goto out;
838 }
839
840 ctl = (struct dst_ctl *)msg->data;
841
842 if (ctl->cmd >= DST_CMD_MAX) {
843 err = -EINVAL;
844 goto out;
845 }
846 hash = dst_hash(ctl->name, sizeof(ctl->name));
847
848 mutex_lock(&dst_hash_lock);
849 list_for_each_entry(tmp, &dst_hashtable[hash], node_entry) {
850 if (!memcmp(tmp->name, ctl->name, sizeof(tmp->name))) {
851 n = tmp;
852 dst_node_get(n);
853 break;
854 }
855 }
856 mutex_unlock(&dst_hash_lock);
857
858 err = dst_commands[ctl->cmd](n, ctl, msg->data + sizeof(struct dst_ctl),
859 msg->len - sizeof(struct dst_ctl));
860
861 dst_node_put(n);
862out:
863 memcpy(&ack.msg, msg, sizeof(struct cn_msg));
864
865 ack.msg.ack = msg->ack + 1;
866 ack.msg.len = sizeof(struct dst_ctl_ack) - sizeof(struct cn_msg);
867
868 ack.error = err;
869
870 cn_netlink_send(&ack.msg, 0, GFP_KERNEL);
871}
872
873/*
874 * Global initialization: sysfs, hash table, block device registration,
875 * connector and various caches.
876 */
877static int __init dst_sysfs_init(void)
878{
879 return bus_register(&dst_dev_bus_type);
880}
881
882static void dst_sysfs_exit(void)
883{
884 bus_unregister(&dst_dev_bus_type);
885}
886
887static int __init dst_hashtable_init(void)
888{
889 unsigned int i;
890
891 dst_hashtable = kcalloc(dst_hashtable_size, sizeof(struct list_head),
892 GFP_KERNEL);
893 if (!dst_hashtable)
894 return -ENOMEM;
895
896 for (i=0; i<dst_hashtable_size; ++i)
897 INIT_LIST_HEAD(&dst_hashtable[i]);
898
899 return 0;
900}
901
902static void dst_hashtable_exit(void)
903{
904 unsigned int i;
905 struct dst_node *n, *tmp;
906
907 for (i=0; i<dst_hashtable_size; ++i) {
908 list_for_each_entry_safe(n, tmp, &dst_hashtable[i], node_entry) {
909 dst_node_remove_unload(n);
910 }
911 }
912
913 kfree(dst_hashtable);
914}
915
916static int __init dst_sys_init(void)
917{
918 int err = -ENOMEM;
919
920 err = dst_hashtable_init();
921 if (err)
922 goto err_out_exit;
923
924 err = dst_export_init();
925 if (err)
926 goto err_out_hashtable_exit;
927
928 err = register_blkdev(dst_major, DST_NAME);
929 if (err < 0)
930 goto err_out_export_exit;
931 if (err)
932 dst_major = err;
933
934 err = dst_sysfs_init();
935 if (err)
936 goto err_out_unregister;
937
938 err = cn_add_callback(&cn_dst_id, "DST", cn_dst_callback);
939 if (err)
940 goto err_out_sysfs_exit;
941
942 printk(KERN_INFO "Distributed storage, '%s' release.\n", dst_name);
943
944 return 0;
945
946err_out_sysfs_exit:
947 dst_sysfs_exit();
948err_out_unregister:
949 unregister_blkdev(dst_major, DST_NAME);
950err_out_export_exit:
951 dst_export_exit();
952err_out_hashtable_exit:
953 dst_hashtable_exit();
954err_out_exit:
955 return err;
956}
957
958static void __exit dst_sys_exit(void)
959{
960 cn_del_callback(&cn_dst_id);
961 unregister_blkdev(dst_major, DST_NAME);
962 dst_hashtable_exit();
963 dst_sysfs_exit();
964 dst_export_exit();
965}
966
967module_init(dst_sys_init);
968module_exit(dst_sys_exit);
969
970MODULE_DESCRIPTION("Distributed storage");
971MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
972MODULE_LICENSE("GPL");
diff --git a/include/linux/connector.h b/include/linux/connector.h
index fc65d219d88c..b9966e64604e 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -39,8 +39,10 @@
39#define CN_IDX_V86D 0x4 39#define CN_IDX_V86D 0x4
40#define CN_VAL_V86D_UVESAFB 0x1 40#define CN_VAL_V86D_UVESAFB 0x1
41#define CN_IDX_BB 0x5 /* BlackBoard, from the TSP GPL sampling framework */ 41#define CN_IDX_BB 0x5 /* BlackBoard, from the TSP GPL sampling framework */
42#define CN_DST_IDX 0x6
43#define CN_DST_VAL 0x1
42 44
43#define CN_NETLINK_USERS 6 45#define CN_NETLINK_USERS 7
44 46
45/* 47/*
46 * Maximum connector's message size. 48 * Maximum connector's message size.
diff --git a/include/linux/dst.h b/include/linux/dst.h
new file mode 100644
index 000000000000..e26fed84b1aa
--- /dev/null
+++ b/include/linux/dst.h
@@ -0,0 +1,587 @@
1/*
2 * 2007+ Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
3 * All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 */
15
16#ifndef __DST_H
17#define __DST_H
18
19#include <linux/types.h>
20#include <linux/connector.h>
21
22#define DST_NAMELEN 32
23#define DST_NAME "dst"
24
/*
 * Control commands, carried in the cmd field of struct dst_ctl.
 */
enum {
	/* Remove node with given id from storage */
	DST_DEL_NODE = 0,
	/* Add remote node with given id to the storage */
	DST_ADD_REMOTE,
	/* Add local node with given id to the storage to be exported and used by remote peers */
	DST_ADD_EXPORT,
	/* Crypto initialization command (hash/cipher used to protect the connection) */
	DST_CRYPTO,
	/* Security attributes for given connection (permissions for example) */
	DST_SECURITY,
	/* Register given node in the block layer subsystem */
	DST_START,
	/* Number of control commands; not a command itself */
	DST_CMD_MAX
};
40
/*
 * Control message: names the storage a command applies to and carries
 * the command itself together with its generic parameters.
 */
struct dst_ctl
{
	/* Storage name */
	char name[DST_NAMELEN];
	/* Command flags */
	__u32 flags;
	/* Command itself (one of the DST_* control commands above) */
	__u32 cmd;
	/* Maximum number of pages per single request in this device */
	__u32 max_pages;
	/* Stale/error transaction scanning timeout in milliseconds */
	__u32 trans_scan_timeout;
	/* Maximum number of retry sends before completing transaction as broken */
	__u32 trans_max_retries;
	/* Storage size */
	__u64 size;
};
58
/* Reply command carries completion status */
struct dst_ctl_ack
{
	/* Connector message header */
	struct cn_msg msg;
	/* Completion status of the acknowledged command */
	int error;
	/* Not used */
	int unused[3];
};
66
/*
 * Unfortunately, the socket address structure is not exported to userspace
 * and is redefined there.
 */
#define SADDR_MAX_DATA 128

struct saddr {
	/* address family, AF_xxx */
	unsigned short sa_family;
	/* Protocol address, up to SADDR_MAX_DATA bytes */
	char sa_data[SADDR_MAX_DATA];
	/* Number of bytes used in sa_data */
	unsigned short sa_data_len;
};
81
/* Address structure */
struct dst_network_ctl
{
	/* Socket type: datagram, stream...*/
	unsigned int type;
	/*
	 * Protocol (the original comment jokes: "is it a Jupiter diameter?").
	 * NOTE(review): presumably the protocol used when creating the
	 * socket - confirm against the socket setup code.
	 */
	unsigned int proto;
	/* Peer's address */
	struct saddr addr;
};
92
/*
 * Crypto setup for a node: which algorithms to use, their key sizes
 * and how many threads perform the crypto work.
 */
struct dst_crypto_ctl
{
	/* Cipher and hash names */
	char cipher_algo[DST_NAMELEN];
	char hash_algo[DST_NAMELEN];

	/* Key sizes. Can be zero for digest for example */
	unsigned int cipher_keysize, hash_keysize;
	/* Alignment. Calculated by the DST itself. */
	unsigned int crypto_attached_size;
	/* Number of threads to perform crypto operations */
	int thread_num;
};
106
107/* Export security attributes have these bits checked when a client connects */
108#define DST_PERM_READ (1<<0)
109#define DST_PERM_WRITE (1<<1)
110
/*
 * Right now it is a simple model, where each remote address
 * is assigned a set of permissions it is allowed to use.
 * In the real world a block device does not know anything but
 * reading and writing, so it should be more than enough.
 */
struct dst_secure_user
{
	/* Permission bits (DST_PERM_*) granted to the address below */
	unsigned int permissions;
	/* Remote address the permissions apply to */
	struct saddr addr;
};
122
/*
 * Export control command: device to export and network address to accept
 * clients to work with given device
 */
struct dst_export_ctl
{
	/* Name of the device to export */
	char device[DST_NAMELEN];
	/* Network address to accept clients on */
	struct dst_network_ctl ctl;
};
132
/*
 * Network commands, carried in the cmd field of struct dst_cmd.
 * Numbering starts from 1.
 */
enum {
	DST_CFG = 1, /* Request remote configuration */
	DST_IO, /* IO command */
	DST_IO_RESPONSE, /* IO response */
	DST_PING, /* Keepalive message */
	DST_NCMD_MAX, /* One past the last network command; not a command itself */
};
140
/*
 * Network command header. dst_convert_cmd() converts every field except
 * 'reserved' and the trailing hash bytes to/from network byte order.
 */
struct dst_cmd
{
	/* Network command itself, see above */
	__u32 cmd;
	/*
	 * Size of the attached data
	 * (in most cases, for READ command it means how many bytes were requested)
	 */
	__u32 size;
	/* Crypto size: number of attached bytes with digest/hmac */
	__u32 csize;
	/* Here we can carry secret data */
	__u32 reserved;
	/* Read/write bits, see how they are encoded in bio structure */
	__u64 rw;
	/* BIO flags */
	__u64 flags;
	/* Unique command id (like transaction ID) */
	__u64 id;
	/* Sector to start IO from */
	__u64 sector;
	/* Hash data is placed after this header */
	__u8 hash[0];
};
165
166/*
167 * Convert command to/from network byte order.
168 * We do not use hton*() functions, since there is
169 * no 64-bit implementation.
170 */
171static inline void dst_convert_cmd(struct dst_cmd *c)
172{
173 c->cmd = __cpu_to_be32(c->cmd);
174 c->csize = __cpu_to_be32(c->csize);
175 c->size = __cpu_to_be32(c->size);
176 c->sector = __cpu_to_be64(c->sector);
177 c->id = __cpu_to_be64(c->id);
178 c->flags = __cpu_to_be64(c->flags);
179 c->rw = __cpu_to_be64(c->rw);
180}
181
182/* Transaction id */
183typedef __u64 dst_gen_t;
184
185#ifdef __KERNEL__
186
187#include <linux/blkdev.h>
188#include <linux/bio.h>
189#include <linux/device.h>
190#include <linux/mempool.h>
191#include <linux/net.h>
192#include <linux/poll.h>
193#include <linux/rbtree.h>
194
/*
 * Debug printing helper.
 * With CONFIG_DST_DEBUG it expands to printk() at KERN_NOTICE level.
 * Otherwise it is an empty inline function that still carries the printf
 * format attribute, so format strings and arguments keep being checked.
 */
#ifdef CONFIG_DST_DEBUG
#define dprintk(f, a...) printk(KERN_NOTICE f, ##a)
#else
static inline void __attribute__ ((format (printf, 1, 2)))
	dprintk(const char *fmt, ...) {}
#endif
201
202struct dst_node;
203
/*
 * Transaction: one block IO request together with the network command
 * describing it and the bookkeeping needed to track it.
 */
struct dst_trans
{
	/* DST node we are working with */
	struct dst_node *n;

	/* Entry inside transaction tree */
	struct rb_node trans_entry;

	/* Reference counter: the transaction is killed when it reaches zero */
	atomic_t refcnt;

	/* How this transaction should be processed by crypto engine */
	short enc;
	/* How many times this transaction was resent */
	short retries;
	/* Completion status */
	int error;

	/* When did we send it to the remote peer */
	long send_time;

	/*
	 * Unique id of this transaction
	 * (computers do not speak, so they have an id instead of a name)
	 */
	dst_gen_t gen;

	/* Block IO we are working with */
	struct bio *bio;

	/* Network command for above block IO request */
	struct dst_cmd cmd;
};
235
/*
 * State of one crypto engine: the transforms to apply, the pages and
 * scatterlists it works on, and cached scratch data.
 */
struct dst_crypto_engine
{
	/* What should we do with all block requests */
	struct crypto_hash *hash;
	struct crypto_ablkcipher *cipher;

	/* Pool of pages used to encrypt data into before sending */
	int page_num;
	struct page **pages;

	/* What to do with current request */
	int enc;
	/* Source and destination scatterlists */
	struct scatterlist *src, *dst;

	/* Maximum timeout waiting for encryption to be completed */
	long timeout;
	/* IV is a 64-bit sequential counter */
	u64 iv;

	/* Secret data */
	void *private;

	/* Cached temporary data lives here */
	int size;
	void *data;
};
263
/*
 * Network state of a node: the socket, its polling machinery and the
 * command currently being processed on it.
 */
struct dst_state
{
	/* The main state protection */
	struct mutex state_lock;

	/* Polling machinery for sockets */
	wait_queue_t wait;
	wait_queue_head_t *whead;
	/* Most events are waited for here */
	wait_queue_head_t thread_wait;

	/* Node which owns this state */
	struct dst_node *node;

	/* Network address for this state */
	struct dst_network_ctl ctl;

	/* Permissions to work with: read-only or rw connection */
	u32 permissions;

	/* Called when we need to clean private data */
	void (* cleanup)(struct dst_state *st);

	/* Used by the server: BIO completion queues BIOs here */
	struct list_head request_list;
	spinlock_t request_lock;

	/* Reference counter, incremented by dst_state_get() */
	atomic_t refcnt;

	/* This flag is set when the connection should be dropped */
	int need_exit;

	/*
	 * Socket to work with. Second pointer is used for
	 * lockless check if socket was changed before performing
	 * next action (like working with cached polling result)
	 */
	struct socket *socket, *read_socket;

	/* Cached preallocated data */
	void *data;
	unsigned int size;

	/* Currently processed command */
	struct dst_cmd cmd;
};
311
/*
 * Node information exposed through sysfs (referenced from struct dst_node
 * as "exported to userspace node information").
 */
struct dst_info
{
	/* Device size */
	u64 size;

	/* Local device name for export devices */
	char local[DST_NAMELEN];

	/* Network setup */
	struct dst_network_ctl net;

	/* Sysfs bits use this */
	struct device device;
};
326
/*
 * The main DST object: one storage node with its block device glue,
 * network state, crypto setup and transaction machinery.
 */
struct dst_node
{
	/* Entry in a list of nodes */
	struct list_head node_entry;

	/* Node name */
	char name[DST_NAMELEN];
	/* Name of the transaction cache */
	char cache_name[DST_NAMELEN];

	/* Block device attached to given node.
	 * Only valid for exporting nodes */
	struct block_device *bdev;
	/* Network state machine for given peer */
	struct dst_state *state;

	/* Block IO machinery */
	struct request_queue *queue;
	struct gendisk *disk;

	/* Number of threads in processing pool */
	int thread_num;
	/* Maximum number of pages in single IO */
	int max_pages;

	/* Node size in bytes */
	loff_t size;

	/* Exported to userspace node information */
	struct dst_info *info;

	/*
	 * Security attribute list.
	 * Used only by exporting node currently.
	 */
	struct list_head security_list;
	struct mutex security_lock;

	/*
	 * When this underflows below zero, the universe collapses.
	 * But this will not happen, since the node will be freed
	 * when the reference counter reaches zero.
	 */
	atomic_t refcnt;

	/* Node-specific start callback */
	int (*start)(struct dst_node *);

	/* Crypto capabilities */
	struct dst_crypto_ctl crypto;
	u8 *hash_key;
	u8 *cipher_key;

	/* Pool of processing threads */
	struct thread_pool *pool;

	/* Transaction IDs live here */
	atomic_long_t gen;

	/*
	 * How frequently and how many times transaction
	 * tree should be scanned to drop stale objects.
	 */
	long trans_scan_timeout;
	int trans_max_retries;

	/*
	 * Root of the transaction tree (transactions are linked in through
	 * trans_entry of struct dst_trans).
	 * NOTE(review): presumably protected by trans_lock - confirm.
	 */
	struct rb_root trans_root;
	struct mutex trans_lock;

	/*
	 * Transaction cache/memory pool.
	 * It is big enough to contain not only transaction
	 * itself, but additional crypto data (digest/hmac).
	 */
	struct kmem_cache *trans_cache;
	mempool_t *trans_pool;

	/* This entity scans transaction tree */
	struct delayed_work trans_work;

	/* Wait queue of the node */
	wait_queue_head_t wait;
};
409
/* Kernel representation of the security attribute */
struct dst_secure
{
	/* List linkage (struct dst_node keeps a security_list) */
	struct list_head sec_entry;
	/* The attribute itself: permissions and the address they apply to */
	struct dst_secure_user sec;
};
416
417int dst_process_bio(struct dst_node *n, struct bio *bio);
418
419int dst_node_init_connected(struct dst_node *n, struct dst_network_ctl *r);
420int dst_node_init_listened(struct dst_node *n, struct dst_export_ctl *le);
421
422static inline struct dst_state *dst_state_get(struct dst_state *st)
423{
424 BUG_ON(atomic_read(&st->refcnt) == 0);
425 atomic_inc(&st->refcnt);
426 return st;
427}
428
429void dst_state_put(struct dst_state *st);
430
431struct dst_state *dst_state_alloc(struct dst_node *n);
432int dst_state_socket_create(struct dst_state *st);
433void dst_state_socket_release(struct dst_state *st);
434
435void dst_state_exit_connected(struct dst_state *st);
436
437int dst_state_schedule_receiver(struct dst_state *st);
438
439void dst_dump_addr(struct socket *sk, struct sockaddr *sa, char *str);
440
/* Take the main state mutex (state_lock). May sleep, as any mutex_lock(). */
static inline void dst_state_lock(struct dst_state *st)
{
	mutex_lock(&st->state_lock);
}
445
/* Release the main state mutex taken by dst_state_lock(). */
static inline void dst_state_unlock(struct dst_state *st)
{
	mutex_unlock(&st->state_lock);
}
450
451void dst_poll_exit(struct dst_state *st);
452int dst_poll_init(struct dst_state *st);
453
454static inline unsigned int dst_state_poll(struct dst_state *st)
455{
456 unsigned int revents = POLLHUP | POLLERR;
457
458 dst_state_lock(st);
459 if (st->socket)
460 revents = st->socket->ops->poll(NULL, st->socket, NULL);
461 dst_state_unlock(st);
462
463 return revents;
464}
465
/*
 * Setup callback which has nothing to set up: both arguments are
 * ignored and success (0) is always returned.
 */
static inline int dst_thread_setup(void *private, void *data)
{
	(void)private;
	(void)data;

	return 0;
}
470
471void dst_node_put(struct dst_node *n);
472
/*
 * Take a reference on the node and return the same pointer, so the call
 * can be used inside an expression.
 */
static inline struct dst_node *dst_node_get(struct dst_node *n)
{
	atomic_inc(&n->refcnt);
	return n;
}
478
479int dst_data_recv(struct dst_state *st, void *data, unsigned int size);
480int dst_recv_cdata(struct dst_state *st, void *cdata);
481int dst_data_send_header(struct socket *sock,
482 void *data, unsigned int size, int more);
483
484int dst_send_bio(struct dst_state *st, struct dst_cmd *cmd, struct bio *bio);
485
486int dst_process_io(struct dst_state *st);
487int dst_export_crypto(struct dst_node *n, struct bio *bio);
488int dst_export_send_bio(struct bio *bio);
489int dst_start_export(struct dst_node *n);
490
491int __init dst_export_init(void);
492void dst_export_exit(void);
493
/* Private structure for export block IO requests */
struct dst_export_priv
{
	/* List linkage (struct dst_state keeps a request_list of BIOs) */
	struct list_head request_entry;
	/* Network state the request belongs to */
	struct dst_state *state;
	/* Block IO request itself */
	struct bio *bio;
	/* Network command for this request */
	struct dst_cmd cmd;
};
502
/* Take a reference on the transaction. */
static inline void dst_trans_get(struct dst_trans *t)
{
	atomic_inc(&t->refcnt);
}
507
508struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen);
509int dst_trans_remove(struct dst_trans *t);
510int dst_trans_remove_nolock(struct dst_trans *t);
511void dst_trans_put(struct dst_trans *t);
512
513/*
514 * Convert bio into network command.
515 */
516static inline void dst_bio_to_cmd(struct bio *bio, struct dst_cmd *cmd,
517 u32 command, u64 id)
518{
519 cmd->cmd = command;
520 cmd->flags = (bio->bi_flags << BIO_POOL_BITS) >> BIO_POOL_BITS;
521 cmd->rw = bio->bi_rw;
522 cmd->size = bio->bi_size;
523 cmd->csize = 0;
524 cmd->id = id;
525 cmd->sector = bio->bi_sector;
526};
527
528int dst_trans_send(struct dst_trans *t);
529int dst_trans_crypto(struct dst_trans *t);
530
531int dst_node_crypto_init(struct dst_node *n, struct dst_crypto_ctl *ctl);
532void dst_node_crypto_exit(struct dst_node *n);
533
534static inline int dst_need_crypto(struct dst_node *n)
535{
536 struct dst_crypto_ctl *c = &n->crypto;
537 /*
538 * Logical OR is appropriate here, but boolean one produces
539 * more optimal code, so it is used instead.
540 */
541 return (c->hash_algo[0] | c->cipher_algo[0]);
542}
543
544int dst_node_trans_init(struct dst_node *n, unsigned int size);
545void dst_node_trans_exit(struct dst_node *n);
546
/*
 * Pool of threads.
 * The ready list contains threads currently free to be used,
 * the active list contains threads with some work scheduled for them.
 * Caller can wait in the given queue until a thread is ready.
 */
struct thread_pool
{
	/* Number of threads in the pool */
	int thread_num;
	/* NOTE(review): presumably protects both lists below - confirm */
	struct mutex thread_lock;
	struct list_head ready_list, active_list;

	/* Callers wait here for a thread to become ready */
	wait_queue_head_t wait;
};
561
562void thread_pool_del_worker(struct thread_pool *p);
563void thread_pool_del_worker_id(struct thread_pool *p, unsigned int id);
564int thread_pool_add_worker(struct thread_pool *p,
565 char *name,
566 unsigned int id,
567 void *(* init)(void *data),
568 void (* cleanup)(void *data),
569 void *data);
570
571void thread_pool_destroy(struct thread_pool *p);
572struct thread_pool *thread_pool_create(int num, char *name,
573 void *(* init)(void *data),
574 void (* cleanup)(void *data),
575 void *data);
576
577int thread_pool_schedule(struct thread_pool *p,
578 int (* setup)(void *stored_private, void *setup_data),
579 int (* action)(void *stored_private, void *setup_data),
580 void *setup_data, long timeout);
581int thread_pool_schedule_private(struct thread_pool *p,
582 int (* setup)(void *private, void *data),
583 int (* action)(void *private, void *data),
584 void *data, long timeout, void *id);
585
586#endif /* __KERNEL__ */
587#endif /* __DST_H */