[ENBD] 2.6.0 support

Peter T. Breuer ptb at it.uc3m.es
Fri Jan 16 02:24:29 MST 2004


"Also sprach Tim ter Laak:"
> Incidentally, I thought I sent my (mostly working) 2.6.0 patch to the list
> yesterday. Perhaps they didn't get through because they were attached?

Mmm possibly. There are about 10-20 spams "for my attention" every day,
and I never attend to them! So I daresay yours is trapped with the
others that need attention in the administrative limbo that results from
my unwillingness to deal with dozens of posts over a web interface.
Sorry!

:-(

And many thanks for this post!


> Well, here they are inlined. It's one incremental kernel patch (so it goes
> over the test2 patch), and one small patch for the userspace utils. Apart
> from the kdev_t issue, there was an issue with the _IO{R|W} macros, and
> blk_init_queue now allocates memory for the queue itself.
> 
> With the following patches, ENBD for 2.6.0 compiles, modules load and make
> test mostly works. Only some ioctl tests appear to fail :-( I don't know
> yet why this is.

It's possibly because I never ported the remote ioctl work! I'll look
and check.

> ---begin kernelpatch ---
> 
> diff -Naur linux-2.6.0-orig/drivers/block/enbd/enbd_base.c linux-2.6.0-enbd/drivers/block/enbd/enbd_base.c
> --- linux-2.6.0-orig/drivers/block/enbd/enbd_base.c	2004-01-15 13:31:03.000000000 +0100
> +++ linux-2.6.0-enbd/drivers/block/enbd/enbd_base.c	2004-01-15 13:28:57.000000000 +0100
> @@ -530,7 +530,7 @@
>  		goto fail;
>  	}
> 
> -	minor = minor (inode->i_rdev);
> +	minor = iminor (inode);

OK - that's interesting. I translated that as MINOR(inode->i_rdev)
last night. So "iminor" is a function for getting minors out of inodes!


>  	nbd = minor >> ENBD_SHIFT;
> 
>  	// PTB sync_dev is async. fsync_dev is sync.
> @@ -678,7 +678,7 @@
>  		return -EINVAL;
>  	}
> 
> -	dev = minor (inode->i_rdev);
> +	dev = iminor (inode);

Ditto.

>  	nbd = dev >> ENBD_SHIFT;
>  	part = dev - (nbd << ENBD_SHIFT);
>  	islot = part - 1;
> @@ -2219,7 +2219,7 @@
>          struct enbd_md *md = &enbd_md;
> 
>  	for (j = 0; j - 1 < lo->nslot; j++) {
> -		kdev_t enbd_dev = mk_kdev (major, j + (lo->nbd << ENBD_SHIFT));
> +		dev_t enbd_dev = MKDEV (major, j + (lo->nbd << ENBD_SHIFT));


Yes. I did that too.


>  		if (j == 0) {
>  			if (atomic_read (&lo->md_count) <= 0)
>  				continue;
> @@ -3258,12 +3258,12 @@
>  		ENBD_ERROR ("given bad inode.\n");
>  		return -EINVAL;
>  	}
> -	if (major (inode->i_rdev) != major) {
> +	if (imajor (inode) != major) {

Yes, I translated that as MAJOR(inode->i_rdev).


>  		ENBD_ERROR ("pseudo-major %d != %d\n",
> -			   major (inode->i_rdev), major);
> +			   imajor (inode), major);

ditto.

>  		return -ENODEV;
>  	}
> -	minor = minor (inode->i_rdev);
> +	minor = iminor (inode);

Similar.

>  	nbd = minor >> ENBD_SHIFT;
>  	if (nbd >= MAX_NBD) {
>  		ENBD_ERROR ("tried to open too many devices, %d\n", minor);
> @@ -3563,7 +3563,7 @@
>  		return 0;
> 
>  #ifndef BLKMDNTFY
> -#define BLKMDNTFY _IOW(0x12,133,sizeof(int))
> +#define BLKMDNTFY _IOW(0x12,133,int)


Correct. (blush). I had to do that too. They put an abuse detector
in the macro. Took me some time looking at it to understand where the
complaint about "[" in a case came from!


>  #endif
>                case BLKMDNTFY:
>                  ENBD_INFO ("received BLKMDNTFY, am now in raid %x\n",
> @@ -3583,7 +3583,7 @@
>                  return 0;
> 
>  #ifndef BLKMDUNTFY
> -#define BLKMDUNTFY _IOW(0x12,134,sizeof(int))
> +#define BLKMDUNTFY _IOW(0x12,134,int)

ditto.

>  #endif
>                case BLKMDUNTFY:
>                  ENBD_INFO ("received BLKMDUNTFY, now out of raid %x\n",
> @@ -3607,10 +3607,10 @@
>                  return 0;
> 
>  #ifndef BLKMDRGTR
> -#define BLKMDRGTR _IOW(0x12,135,sizeof(unsigned long))
> +#define BLKMDRGTR _IOW(0x12,135,unsigned long)


ditto. Funny - I don't remember that one. But yes, I did that too.


>  #endif
>                case BLKMDRGTR:
> -                enbd_md.reg(&enbd_md, (int(*)(kdev_t, int))arg);
> +                enbd_md.reg(&enbd_md, (int(*)(dev_t, int))arg);

Correct. I also changed all kdev_t's to dev_t's, which appears to be 
__u32, eventually.

>                  return 0;
> 
>  	} // PTB endsw
> @@ -3645,7 +3645,7 @@
>  		ENBD_ALERT ("null inode.\n");
>  		return -ENODEV;
>  	}
> -	dev = minor (inode->i_rdev);
> +	dev = iminor (inode);

OK. I did MINOR(inode->i_rdev).


>  	nbd = dev >> ENBD_SHIFT;
> 
>  	if (nbd >= MAX_NBD) {
> @@ -3965,12 +3965,14 @@
>  };
> 
>  static void
> -enbd_init_queue (struct enbd_device *lo, struct request_queue *queue)
> +enbd_init_queue (struct enbd_device *lo, struct request_queue **queue)
>  {
> 
>  // PTB - set up kernel queue struct with default methods
> -	blk_init_queue (queue, do_nbd_request, &enbd_lock);
> -	blk_queue_max_sectors (queue, buf_sectors);	/* max per request */
> +	*queue = blk_init_queue (do_nbd_request, &enbd_lock);
> +    if (*queue == NULL)
> +      return;
> +	blk_queue_max_sectors (*queue, buf_sectors);	/* max per request */


Ah. Interesting. Yes. The blk_init_queue has to go up out of here and
into just before the point where this routine is called from, in
enbd_setup.

But why change to a double indirection?  I don't think that's necessary.
One can pass the pointer to the queue into this routine as per normal,
and then all the rest of these changes are not required.  I simply have:

  static void
  enbd_init_queue (struct enbd_device *lo, struct request_queue *queue)
  {

   // PTB - set up kernel queue struct with default methods
        blk_queue_max_sectors (queue, buf_sectors);     /* max per request */


and the rest of the routine is unaltered.

> 
>  /*
>   * PTB - I think that put:
> @@ -3995,9 +3997,9 @@
>   */
> 
>  	// PTB control merge attempts so we do not overflow our buffer
> -	lo->ll_merge_requests_fn = queue->merge_requests_fn;
> -	lo->ll_front_merge_fn = queue->front_merge_fn;
> -	lo->ll_back_merge_fn = queue->back_merge_fn;
> +	lo->ll_merge_requests_fn = (*queue)->merge_requests_fn;
> +	lo->ll_front_merge_fn = (*queue)->front_merge_fn;
> +	lo->ll_back_merge_fn = (*queue)->back_merge_fn;
> 
>  // JSA - Add this line because under >=2.4.1, merge optimizations are in flux
>  /*
> @@ -4013,10 +4015,10 @@
>   * PTB - The functions below just impose our own stricter size limit before
>   *     - calling the defaults if all seems OK sizewise.
>   */
> -	queue->merge_requests_fn = &enbd_merge_requests_fn;
> -	queue->front_merge_fn = &enbd_front_merge_fn;
> -	queue->back_merge_fn = &enbd_back_merge_fn;
> -
> +	(*queue)->merge_requests_fn = &enbd_merge_requests_fn;
> +	(*queue)->front_merge_fn = &enbd_front_merge_fn;
> +	(*queue)->back_merge_fn = &enbd_back_merge_fn;
> +    return;


OK - so I didn't change any of those things.


>  }
> 
>  int __init
> @@ -4041,11 +4043,17 @@
>  	        *  struct.  These structs are big so we dynamically allocate
>  	        *  them.
>  	        */
> +           /* Tim responds ...
> +            *  Apparently this was changed later on in 2.5 (even 2.6.0-test>2?)
> +            *  so blk_init_queue now allocates the space.
> +            *  So we now first do enbd_init_queue, then check if it really
> +            *  was allocated.
> +            */


Most of the changes I saw as I went through the kernel patches (I
downloaded them all last night!) were in test5.


>                  struct gendisk *disk = alloc_disk(ENBD_MAXCONN);
> -	        memset (lo, 0, sizeof (*lo));
> +	            memset (lo, 0, sizeof (*lo));

(:- thanks)

>                  if (disk) {
>                      lo->disk = disk;
> -                    disk->queue = kmalloc(sizeof(*disk->queue), GFP_KERNEL);
> +                    enbd_init_queue(lo, &(disk->queue));

Hmm. I'm not sure that's correct, but it's the right idea. What I did
was:

                    ...
                    lo->disk = disk;
-                   disk->queue = kmalloc(sizeof(*disk->queue), GFP_KERNEL);
+                   disk->queue = blk_init_queue(do_nbd_request, &enbd_lock);
                    if (!disk->queue) {
                                put_disk(disk);
                                ...

and left the enbd_init_queue in its usual place lower down after the
following check.


>                      if (!disk->queue) {
>                                  put_disk(disk);
>                                  while (--i >= 0) {
> @@ -4059,8 +4067,6 @@
>                                  }
>                                  return -ENOMEM;
>                      }
> -                    memset(disk->queue, 0, sizeof(*disk->queue));
> -                    enbd_init_queue(lo, disk->queue);

Yes. The memset goes. But I left the enbd_init_queue here, as it does
what it is supposed to do.

                                                put_disk(disk);
                                        }
                                }
                                return -ENOMEM;
                    }
-                   memset(disk->queue, 0, sizeof(*disk->queue));
                    enbd_init_queue(lo, disk->queue);
                }
        }



>                  }
>          }
> 
> @@ -4162,8 +4168,8 @@
>                          struct request_queue *queue = disk->queue;
>  			if (queue) {
>                                  blk_cleanup_queue (queue);
> -				kfree (queue);
> -				disk->queue = NULL;
> +				//kfree (queue);
> +				//disk->queue = NULL;


Arrrgh! Well spotted! I forgot that.  Yes, we probably can't kfree
since we didn't kmalloc. Does blk_cleanup_queue now internally handle
the cleanup? Good. Yes -

 * Description:
 *     blk_cleanup_queue is the pair to blk_init_queue() or
 *     blk_queue_make_request().  It should be called when a request
 *     queue is
 *     being released; typically when a block device is being
 *     de-registered.
 *     Currently, its primary task it to free all the &struct request
 *     structures that were allocated to the queue and the queue
 *     itself.

and I see inside it

         kfree(q);

You are correct.

>  			}
>  			del_gendisk (disk);
>  			put_disk (disk);
> diff -Naur linux-2.6.0-orig/drivers/block/enbd/enbd_md.c linux-2.6.0-enbd/drivers/block/enbd/enbd_md.c
> --- linux-2.6.0-orig/drivers/block/enbd/enbd_md.c	2004-01-15 13:31:03.000000000 +0100
> +++ linux-2.6.0-enbd/drivers/block/enbd/enbd_md.c	2004-01-15 11:47:38.000000000 +0100
> @@ -44,7 +44,7 @@
>          return res;
>  }
>  static int
> -enbd_md_reg (struct enbd_md *md, int(*fn)(kdev_t, int)) {
> +enbd_md_reg (struct enbd_md *md, int(*fn)(dev_t, int)) {

Correct. kdev_t => dev_t.


>          spin_lock(&md->access_lock);
>          if (!md->notify_fn) {
>                  md->notify_fn = fn;
> @@ -60,7 +60,7 @@
>   *   @lo the nbd device to tell them about
>   */
>  static int
> -enbd_notify_md_device (struct enbd_md *md, kdev_t enbd_dev, int cmd)
> +enbd_notify_md_device (struct enbd_md *md, dev_t enbd_dev, int cmd)

Ditto.

>  {
>  	//kdev_t enbd_dev = mk_kdev (major, nbd << ENBD_SHIFT);
>          int err;
> @@ -71,7 +71,7 @@
>                  md->notify_pid = current->pid;
>  	        spin_unlock (&md->access_lock);
>  		ENBD_ALERT ("notifying %d for %x:%x to raid devices via fn\n",
> -                        cmd, major(enbd_dev), minor(enbd_dev));
> +                        cmd, MAJOR(enbd_dev), MINOR(enbd_dev));


Yes, now you are using the MAJOR and MINOR macros too. I'll have to check
what imajor and iminor do.


>  		err = md->notify_fn (enbd_dev, cmd);
>  	        spin_lock (&md->access_lock);
>                  md->doing_notify = 0;
> @@ -101,4 +101,3 @@
>          md->inc = enbd_md_inc;
>          md->reg = enbd_md_reg;
>  }
> -
> diff -Naur linux-2.6.0-orig/include/linux/enbd.h linux-2.6.0-enbd/include/linux/enbd.h
> --- linux-2.6.0-orig/include/linux/enbd.h	2004-01-15 13:31:03.000000000 +0100
> +++ linux-2.6.0-enbd/include/linux/enbd.h	2004-01-15 11:47:38.000000000 +0100
> @@ -162,11 +162,11 @@
>      int doing_notify;
>      int notify_pid;
>      spinlock_t access_lock;
> -    int (*notify_fn)(kdev_t, int);
> -    int (*notify)(struct enbd_md *,kdev_t, int cmd);
> +    int (*notify_fn)(dev_t, int);
> +    int (*notify)(struct enbd_md *,dev_t, int cmd);
>      int (*dec)(struct enbd_md *);
>      int (*inc)(struct enbd_md *);
> -    int (*reg)(struct enbd_md *, int(*)(kdev_t, int));
> +    int (*reg)(struct enbd_md *, int(*)(dev_t, int));


Yes. All correct.


>    };
> 
>    struct enbd_speed {
> @@ -181,7 +181,7 @@
> 
>    struct enbd_md_list {
>      struct list_head list;
> -    kdev_t dev;
> +    dev_t dev;

Correct.

>    };
> 
>    struct enbd_seqno;  // forward decl
> @@ -525,5 +525,3 @@
>    #endif /* MAJOR_NR */
> 
>  #endif /* LINUX_ENBD_H */
> -
> -
> 
> -- end kernelpatch ---
> 
> -- begin userspace patch ---
> 
> diff -Naur nbd-2.4.32-orig/nbd/enbd-client.c nbd-2.4.32/nbd/enbd-client.c
> --- nbd-2.4.32-orig/nbd/enbd-client.c	2004-01-14 21:25:56.000000000 +0100
> +++ nbd-2.4.32/nbd/enbd-client.c	2004-01-14 21:27:32.000000000 +0100
> @@ -64,8 +64,8 @@
>  #define BLKRRPART  _IO(0x12,95)
>  #define BLKGETSIZE _IO(0x12,96)
>  #define BLKFLSBUF  _IO(0x12,97)
> -#define BLKBSZGET  _IOR(0x12,112,sizeof(int))
> -#define BLKBSZSET  _IOW(0x12,113,sizeof(int))

:-(. I haven't looked at userspace yet! I'll believe you.

> +#define BLKBSZGET  _IOR(0x12,112,int)
> +#define BLKBSZSET  _IOW(0x12,113,int)
>  #define BLKROSET   _IO(0x12,93)	/* set device read-only (0 = read-write) */
>  #define BLKROGET   _IO(0x12,94)	/* get read-only status (0 = read_write) */
> 
> diff -Naur nbd-2.4.32-orig/nbd/file.c nbd-2.4.32/nbd/file.c
> --- nbd-2.4.32-orig/nbd/file.c	2004-01-14 21:25:56.000000000 +0100
> +++ nbd-2.4.32/nbd/file.c	2004-01-14 21:27:32.000000000 +0100
> @@ -30,7 +30,7 @@
>  #endif
>  #ifndef BLKGETSIZE64
>  /* lifted from fs.h */
> -#define BLKGETSIZE64 _IOR(0x12,114,sizeof(u64))
> +#define BLKGETSIZE64 _IOR(0x12,114,u64)
>  #endif
> 
>  # include "cliserv.h"
> @@ -1454,4 +1454,3 @@
> 
>         return 0;
>    }
> -
> 
> -- end userspace patch ---

Yes, it is correct.

Thank you very much. It only remains to determine the difference
between MAJOR(inode->i_rdev) and imajor(inode).

OK .. there is none. It's defined in fs.h:

static inline unsigned imajor(struct inode *inode)
{
        return MAJOR(inode->i_rdev);
}

I don't know what is better :-). What was all that kerfuffle about
kdev_t and dev_t for?

Peter


More information about the ENBD mailing list