File: src/usr.sbin/vmd/vionet.c

Revision 1.14, Thu Feb 22 02:38:53 2024 UTC by dv
Branch: MAIN
CVS Tags: OPENBSD_7_5_BASE, OPENBSD_7_5, HEAD
Changes since 1.13: +6 -10 lines

vmd(8): only add vionet tap read events after a notify event.

My recent refactor introduced a bug where the vionet device will
add the tap(4) read event on an unpause regardless of driver and
device state, causing the cpu to peg as the read event fires
indefinitely because the guest's virtqueue isn't ready to receive
the data.

Add a global flag that tracks whether the rx side is enabled and use it
to coordinate adding the tap read event.

ok mlarkin@

/*	$OpenBSD: vionet.c,v 1.14 2024/02/22 02:38:53 dv Exp $	*/

/*
 * Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/socket.h>
#include <sys/types.h>

#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/virtioreg.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "atomicio.h"
#include "virtio.h"
#include "vmd.h"

#define VIRTIO_NET_F_MAC	(1 << 5)
#define RXQ	0
#define TXQ	1

extern char *__progname;
extern struct vmd_vm *current_vm;

struct packet {
	uint8_t	*buf;
	size_t	 len;
};

static void *rx_run_loop(void *);
static void *tx_run_loop(void *);
static int vionet_rx(struct vionet_dev *, int);
static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
    int, size_t);
static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
    const struct iovec *, int);
static void vionet_rx_event(int, short, void *);
static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *,
    int8_t *);
static void handle_io_write(struct viodev_msg *, struct virtio_dev *);
static int vionet_tx(struct virtio_dev *);
static void vionet_notifyq(struct virtio_dev *);
static void dev_dispatch_vm(int, short, void *);
static void handle_sync_io(int, short, void *);
static void read_pipe_main(int, short, void *);
static void read_pipe_rx(int, short, void *);
static void read_pipe_tx(int, short, void *);
static void vionet_assert_pic_irq(struct virtio_dev *);
static void vionet_deassert_pic_irq(struct virtio_dev *);

/* Device Globals */
struct event ev_tap;
struct event ev_inject;
struct event_base *ev_base_main;
struct event_base *ev_base_rx;
struct event_base *ev_base_tx;
pthread_t rx_thread;
pthread_t tx_thread;
struct vm_dev_pipe pipe_main;
struct vm_dev_pipe pipe_rx;
struct vm_dev_pipe pipe_tx;
int pipe_inject[2];
#define READ	0
#define WRITE	1
struct iovec iov_rx[VIONET_QUEUE_SIZE];
struct iovec iov_tx[VIONET_QUEUE_SIZE];
pthread_rwlock_t lock = NULL;		/* Guards device config state. */
int resetting = 0;	/* Transient reset state used to coordinate reset. */
int rx_enabled = 0;	/* 1: we expect to read the tap, 0: wait for notify. */

__dead void
vionet_main(int fd, int fd_vmm)
{
	struct virtio_dev	 dev;
	struct vionet_dev	*vionet = NULL;
	struct viodev_msg 	 msg;
	struct vmd_vm	 	 vm;
	struct vm_create_params	*vcp;
	ssize_t			 sz;
	int			 ret;

	/*
	 * stdio - needed for read/write to disk fds and channels to the vm.
	 * vmm + proc - needed to create shared vm mappings.
	 */
	if (pledge("stdio vmm proc", NULL) == -1)
		fatal("pledge");

	/* Initialize iovec arrays. */
	memset(iov_rx, 0, sizeof(iov_rx));
	memset(iov_tx, 0, sizeof(iov_tx));

	/* Receive our vionet_dev, mostly preconfigured. */
	sz = atomicio(read, fd, &dev, sizeof(dev));
	if (sz != sizeof(dev)) {
		ret = errno;
		log_warn("failed to receive vionet");
		goto fail;
	}
	if (dev.dev_type != VMD_DEVTYPE_NET) {
		ret = EINVAL;
		log_warn("received invalid device type");
		goto fail;
	}
	dev.sync_fd = fd;
	vionet = &dev.vionet;

	log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d"
	    ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
	    dev.async_fd, fd_vmm);

	/* Receive our vm information from the vm process. */
	memset(&vm, 0, sizeof(vm));
	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
	if (sz != sizeof(vm)) {
		ret = EIO;
		log_warnx("failed to receive vm details");
		goto fail;
	}
	vcp = &vm.vm_params.vmc_params;
	current_vm = &vm;
	setproctitle("%s/vionet%d", vcp->vcp_name, vionet->idx);
	log_procinit("vm/%s/vionet%d", vcp->vcp_name, vionet->idx);

	/* Now that we have our vm information, we can remap memory. */
	ret = remap_guest_mem(&vm, fd_vmm);
	if (ret) {
		fatal("%s: failed to remap", __func__);
		goto fail;
	}

	/*
	 * We no longer need /dev/vmm access.
	 */
	close_fd(fd_vmm);
	if (pledge("stdio", NULL) == -1)
		fatal("pledge2");

	/* If we're restoring hardware, re-initialize virtqueue hva's. */
	if (vm.vm_state & VM_STATE_RECEIVED) {
		struct virtio_vq_info *vq_info;
		void *hva = NULL;

		vq_info = &dev.vionet.vq[TXQ];
		if (vq_info->q_gpa != 0) {
			log_debug("%s: restoring TX virtqueue for gpa 0x%llx",
			    __func__, vq_info->q_gpa);
			hva = hvaddr_mem(vq_info->q_gpa,
			    vring_size(VIONET_QUEUE_SIZE));
			if (hva == NULL)
				fatalx("%s: hva == NULL", __func__);
			vq_info->q_hva = hva;
		}

		vq_info = &dev.vionet.vq[RXQ];
		if (vq_info->q_gpa != 0) {
			log_debug("%s: restoring RX virtqueue for gpa 0x%llx",
			    __func__, vq_info->q_gpa);
			hva = hvaddr_mem(vq_info->q_gpa,
			    vring_size(VIONET_QUEUE_SIZE));
			if (hva == NULL)
				fatalx("%s: hva == NULL", __func__);
			vq_info->q_hva = hva;
		}
	}

	/* Initialize our packet injection pipe. */
	if (pipe2(pipe_inject, O_NONBLOCK) == -1) {
		log_warn("%s: injection pipe", __func__);
		goto fail;
	}

	/* Initialize inter-thread communication channels. */
	vm_pipe_init2(&pipe_main, read_pipe_main, &dev);
	vm_pipe_init2(&pipe_rx, read_pipe_rx, &dev);
	vm_pipe_init2(&pipe_tx, read_pipe_tx, &dev);

	/* Initialize RX and TX threads. */
	ret = pthread_create(&rx_thread, NULL, rx_run_loop, &dev);
	if (ret) {
		errno = ret;
		log_warn("%s: failed to initialize rx thread", __func__);
		goto fail;
	}
	pthread_set_name_np(rx_thread, "rx");
	ret = pthread_create(&tx_thread, NULL, tx_run_loop, &dev);
	if (ret) {
		errno = ret;
		log_warn("%s: failed to initialize tx thread", __func__);
		goto fail;
	}
	pthread_set_name_np(tx_thread, "tx");

	/* Initialize our rwlock for guarding shared device state. */
	ret = pthread_rwlock_init(&lock, NULL);
	if (ret) {
		errno = ret;
		log_warn("%s: failed to initialize rwlock", __func__);
		goto fail;
	}

	/* Initialize libevent so we can start wiring event handlers. */
	ev_base_main = event_base_new();

	/* Add our handler for receiving messages from the RX/TX threads. */
	event_base_set(ev_base_main, &pipe_main.read_ev);
	event_add(&pipe_main.read_ev, NULL);

	/* Wire up an async imsg channel. */
	log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
		dev.async_fd);
	if (vm_device_pipe(&dev, dev_dispatch_vm, ev_base_main)) {
		ret = EIO;
		log_warnx("vm_device_pipe");
		goto fail;
	}

	/* Configure our sync channel event handler. */
	log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
		dev.sync_fd);
	imsg_init(&dev.sync_iev.ibuf, dev.sync_fd);
	dev.sync_iev.handler = handle_sync_io;
	dev.sync_iev.data = &dev;
	dev.sync_iev.events = EV_READ;
	imsg_event_add2(&dev.sync_iev, ev_base_main);

	/* Send a ready message over the sync channel. */
	log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_READY;
	imsg_compose_event2(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg), ev_base_main);

	/* Send a ready message over the async channel. */
	log_debug("%s: sending async ready message", __func__);
	ret = imsg_compose_event2(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg), ev_base_main);
	if (ret == -1) {
		log_warnx("%s: failed to send async ready message!", __func__);
		goto fail;
	}

	/* Engage the event loop! */
	ret = event_base_dispatch(ev_base_main);
	event_base_free(ev_base_main);

	/* Try stopping the rx & tx threads cleanly by messaging them. */
	vm_pipe_send(&pipe_rx, VIRTIO_THREAD_STOP);
	vm_pipe_send(&pipe_tx, VIRTIO_THREAD_STOP);

	/* Wait for threads to stop. */
	pthread_join(rx_thread, NULL);
	pthread_join(tx_thread, NULL);
	pthread_rwlock_destroy(&lock);

	/* Cleanup */
	if (ret == 0) {
		close_fd(dev.sync_fd);
		close_fd(dev.async_fd);
		close_fd(vionet->data_fd);
		close_fd(pipe_main.read);
		close_fd(pipe_main.write);
		close_fd(pipe_rx.write);
		close_fd(pipe_tx.write);
		close_fd(pipe_inject[READ]);
		close_fd(pipe_inject[WRITE]);
		_exit(ret);
		/* NOTREACHED */
	}
fail:
	/* Try firing off a message to the vm saying we're dying. */
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_ERROR;
	msg.data = ret;
	imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg));
	imsg_flush(&dev.sync_iev.ibuf);

	close_fd(dev.sync_fd);
	close_fd(dev.async_fd);
	close_fd(pipe_inject[READ]);
	close_fd(pipe_inject[WRITE]);
	if (vionet != NULL)
		close_fd(vionet->data_fd);
	if (lock != NULL)
		pthread_rwlock_destroy(&lock);
	_exit(ret);
}

/*
 * Update the gpa and hva of the virtqueue.
 */
static void
vionet_update_qa(struct vionet_dev *dev)
{
	struct virtio_vq_info *vq_info;
	void *hva = NULL;

	/* Invalid queue? */
	if (dev->cfg.queue_select > 1)
		return;

	vq_info = &dev->vq[dev->cfg.queue_select];
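	/* Legacy virtio passes queue addresses as 4 KiB page frame numbers. */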
	vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;

	if (vq_info->q_gpa == 0)
		vq_info->q_hva = NULL;

	hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE));
	if (hva == NULL)
		fatalx("%s: hva == NULL", __func__);

	vq_info->q_hva = hva;
}

/*
 * Update the queue size.
 */
static void
vionet_update_qs(struct vionet_dev *dev)
{
	struct virtio_vq_info *vq_info;

	/* Invalid queue? */
	if (dev->cfg.queue_select > 1) {
		log_warnx("%s: !!! invalid queue selector %d", __func__,
		    dev->cfg.queue_select);
		dev->cfg.queue_size = 0;
		return;
	}

	vq_info = &dev->vq[dev->cfg.queue_select];

	/* Update queue pfn/size based on queue select */
	dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
	dev->cfg.queue_size = vq_info->qs;
}

/*
 * vionet_rx
 *
 * Pull packets from the provided fd and fill the receive-side virtqueue. We
 * selectively use zero-copy approaches when possible.
 *
 * Returns 1 if guest notification is needed. Otherwise, returns -1 on failure
 * or 0 if no notification is needed.
 */
static int
vionet_rx(struct vionet_dev *dev, int fd)
{
	uint16_t idx, hdr_idx;
	char *vr = NULL;
	size_t chain_len = 0, iov_cnt;
	struct vring_desc *desc, *table;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_vq_info *vq_info;
	struct iovec *iov;
	int notify = 0;
	ssize_t sz;
	uint8_t status = 0;

	status = dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
	if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
		log_warnx("%s: driver not ready", __func__);
		return (0);
	}

	vq_info = &dev->vq[RXQ];
	idx = vq_info->last_avail;
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: vr == NULL", __func__);

	/* Compute offsets in ring of descriptors, avail ring, and used ring */
	table = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
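	/* We consume rx buffers as tap data arrives; no need for driver kicks. */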
	used->flags |= VRING_USED_F_NO_NOTIFY;

	while (idx != avail->idx) {
		hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
		desc = &table[hdr_idx & VIONET_QUEUE_MASK];
		if (!DESC_WRITABLE(desc)) {
			log_warnx("%s: invalid descriptor state", __func__);
			goto reset;
		}

		iov = &iov_rx[0];
		iov_cnt = 1;

		/*
		 * First descriptor should be at least as large as the
		 * virtio_net_hdr. It's not technically required, but in
		 * legacy devices it should be safe to assume.
		 */
		iov->iov_len = desc->len;
		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
			log_warnx("%s: invalid descriptor length", __func__);
			goto reset;
		}

		/*
		 * Insert the virtio_net_hdr and adjust len/base. We do the
		 * pointer math here before it's a void*.
		 */
		iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
		if (iov->iov_base == NULL)
			goto reset;
		memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));

		/* Tweak the iovec to account for the virtio_net_hdr. */
		iov->iov_len -= sizeof(struct virtio_net_hdr);
		iov->iov_base = hvaddr_mem(desc->addr +
		    sizeof(struct virtio_net_hdr), iov->iov_len);
		if (iov->iov_base == NULL)
			goto reset;
		chain_len = iov->iov_len;

		/*
		 * Walk the remaining chain and collect remaining addresses
		 * and lengths.
		 */
		while (desc->flags & VRING_DESC_F_NEXT) {
			desc = &table[desc->next & VIONET_QUEUE_MASK];
			if (!DESC_WRITABLE(desc)) {
				log_warnx("%s: invalid descriptor state",
				    __func__);
				goto reset;
			}

			/* Collect our IO information. Translate gpa's. */
			iov = &iov_rx[iov_cnt];
			iov->iov_len = desc->len;
			iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
			if (iov->iov_base == NULL)
				goto reset;
			chain_len += iov->iov_len;

			/* Guard against infinitely looping chains. */
			if (++iov_cnt >= nitems(iov_rx)) {
				log_warnx("%s: infinite chain detected",
				    __func__);
				goto reset;
			}
		}

		/* Make sure the driver gave us the bare minimum buffers. */
		if (chain_len < VIONET_MIN_TXLEN) {
			log_warnx("%s: insufficient buffers provided",
			    __func__);
			goto reset;
		}

		/*
		 * If we're enforcing hardware address or handling an injected
		 * packet, we need to use a copy-based approach.
		 */
		if (dev->lockedmac || fd != dev->data_fd)
			sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt,
			    chain_len);
		else
			sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt);
		if (sz == -1)
			goto reset;
		if (sz == 0)	/* No packets, so bail out for now. */
			break;

		/*
		 * Account for the prefixed header since it wasn't included
		 * in the copy or zerocopy operations.
		 */
		sz += sizeof(struct virtio_net_hdr);

		/* Mark our buffers as used. */
		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
		used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
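		/* Ensure the used entry is written before the index update. */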
		__sync_synchronize();
		used->idx++;
		idx++;
	}

	if (idx != vq_info->last_avail &&
	    !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
		notify = 1;
	}

	vq_info->last_avail = idx;
	return (notify);
reset:
	return (-1);
}

/*
 * vionet_rx_copy
 *
 * Read a packet off the provided file descriptor, validating packet
 * characteristics, and copy into the provided buffers in the iovec array.
 *
 * It's assumed that the provided iovec array contains validated host virtual
 * address translations and not guest physical addresses.
 *
 * Returns number of bytes copied on success, 0 if packet is dropped, and
 * -1 on an error.
 */
ssize_t
vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
    int iov_cnt, size_t chain_len)
{
	static uint8_t		 buf[VIONET_HARD_MTU];
	struct packet		*pkt = NULL;
	struct ether_header	*eh = NULL;
	uint8_t			*payload = buf;
	size_t			 i, chunk, nbytes, copied = 0;
	ssize_t			 sz;

	/* If reading from the tap(4), try to right-size the read. */
	if (fd == dev->data_fd)
		nbytes = MIN(chain_len, VIONET_HARD_MTU);
	else if (fd == pipe_inject[READ])
		nbytes = sizeof(struct packet);
	else {
		log_warnx("%s: invalid fd: %d", __func__, fd);
		return (-1);
	}

	/*
	 * Try to pull a packet. The fd should be non-blocking and we don't
	 * care if we under-read (i.e. sz != nbytes) as we may not have a
	 * packet large enough to fill the buffer.
	 */
	sz = read(fd, buf, nbytes);
	if (sz == -1) {
		if (errno != EAGAIN) {
			log_warn("%s: error reading packet", __func__);
			return (-1);
		}
		return (0);
	} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
		/* If reading the tap(4), we should get valid ethernet. */
		log_warnx("%s: invalid packet size", __func__);
		return (0);
	} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
		log_warnx("%s: invalid injected packet object (sz=%ld)",
		    __func__, sz);
		return (0);
	}

	/* Decompose an injected packet, if that's what we're working with. */
	if (fd == pipe_inject[READ]) {
		pkt = (struct packet *)buf;
		if (pkt->buf == NULL) {
			log_warnx("%s: invalid injected packet, no buffer",
			    __func__);
			return (0);
		}
		if (pkt->len < VIONET_MIN_TXLEN || pkt->len > VIONET_MAX_TXLEN) {
			log_warnx("%s: invalid injected packet size", __func__);
			goto drop;
		}
		payload = pkt->buf;
		sz = (ssize_t)pkt->len;
	}

	/* Validate the ethernet header, if required. */
	if (dev->lockedmac) {
		eh = (struct ether_header *)(payload);
		if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
		    memcmp(eh->ether_dhost, dev->mac,
		    sizeof(eh->ether_dhost)) != 0)
			goto drop;
	}

	/* Truncate one last time to the chain length, if shorter. */
	sz = MIN(chain_len, (size_t)sz);

	/*
	 * Copy the packet into the provided buffers. We can use memcpy(3)
	 * here as the gpa was validated and translated to an hva previously.
	 */
	for (i = 0; (int)i < iov_cnt && (size_t)sz > copied; i++) {
		chunk = MIN(iov[i].iov_len, (size_t)(sz - copied));
		memcpy(iov[i].iov_base, payload + copied, chunk);
		copied += chunk;
	}

drop:
	/* Free any injected packet buffer. */
	if (pkt != NULL)
		free(pkt->buf);

	return (copied);
}

/*
 * vionet_rx_zerocopy
 *
 * Perform a vectorized read from the given fd into the guest physical memory
 * pointed to by iovecs.
 *
 * Returns number of bytes read on success, -1 on error, or 0 if EAGAIN was
 * returned by readv.
 *
 */
static ssize_t
vionet_rx_zerocopy(struct vionet_dev *dev, int fd, const struct iovec *iov,
    int iov_cnt)
{
	ssize_t		sz;

	if (dev->lockedmac) {
		log_warnx("%s: zerocopy not available for locked lladdr",
		    __func__);
		return (-1);
	}

	sz = readv(fd, iov, iov_cnt);
	if (sz == -1 && errno == EAGAIN)
		return (0);
	return (sz);
}


/*
 * vionet_rx_event
 *
 * Called when new data can be received on the tap fd of a vionet device.
 */
static void
vionet_rx_event(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	int			 ret = 0;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	pthread_rwlock_rdlock(&lock);
	ret = vionet_rx(vionet, fd);
	pthread_rwlock_unlock(&lock);

	if (ret == 0) {
		/* Nothing to do. */
		return;
	}

	pthread_rwlock_wrlock(&lock);
	if (ret == 1) {
		/* Notify the driver. */
		vionet->cfg.isr_status |= 1;
	} else {
		/* Need a reset. Something went wrong. */
		log_warnx("%s: requesting device reset", __func__);
		vionet->cfg.device_status |= DEVICE_NEEDS_RESET;
		vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
	}
	pthread_rwlock_unlock(&lock);

	vm_pipe_send(&pipe_main, VIRTIO_RAISE_IRQ);
}

static void
vionet_notifyq(struct virtio_dev *dev)
{
	struct vionet_dev	*vionet = &dev->vionet;

	switch (vionet->cfg.queue_notify) {
	case RXQ:
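		/* Rx queue notified: mark rx live and arm the tap read event. */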
		rx_enabled = 1;
		vm_pipe_send(&pipe_rx, VIRTIO_NOTIFY);
		break;
	case TXQ:
		vm_pipe_send(&pipe_tx, VIRTIO_NOTIFY);
		break;
	default:
		/*
		 * Catch the unimplemented queue ID 2 (control queue) as
		 * well as any bogus queue IDs.
		 */
		log_debug("%s: notify for unimplemented queue ID %d",
		    __func__, vionet->cfg.queue_notify);
		break;
	}
}

static int
vionet_tx(struct virtio_dev *dev)
{
	uint16_t idx, hdr_idx;
	size_t chain_len, iov_cnt;
	ssize_t dhcpsz = 0, sz;
	int notify = 0;
	char *vr = NULL, *dhcppkt = NULL;
	struct vionet_dev *vionet = &dev->vionet;
	struct vring_desc *desc, *table;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_vq_info *vq_info;
	struct ether_header *eh;
	struct iovec *iov;
	struct packet pkt;
	uint8_t status = 0;

	status = vionet->cfg.device_status
	    & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
	if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
		log_warnx("%s: driver not ready", __func__);
		return (0);
	}

	vq_info = &vionet->vq[TXQ];
	idx = vq_info->last_avail;
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: vr == NULL", __func__);

	/* Compute offsets in ring of descriptors, avail ring, and used ring */
	table = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);

	while (idx != avail->idx) {
		hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
		desc = &table[hdr_idx & VIONET_QUEUE_MASK];
		if (DESC_WRITABLE(desc)) {
			log_warnx("%s: invalid descriptor state", __func__);
			goto reset;
		}

		iov = &iov_tx[0];
		iov_cnt = 0;
		chain_len = 0;

		/*
		 * As a legacy device, we most likely will receive a lead
		 * descriptor sized to the virtio_net_hdr. However, the framing
		 * is not guaranteed, so check for packet data.
		 */
		iov->iov_len = desc->len;
		if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
			log_warnx("%s: invalid descriptor length", __func__);
			goto reset;
		} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
			/* Chop off the virtio header, leaving packet data. */
			iov->iov_len -= sizeof(struct virtio_net_hdr);
			chain_len += iov->iov_len;
			iov->iov_base = hvaddr_mem(desc->addr +
			    sizeof(struct virtio_net_hdr), iov->iov_len);
			if (iov->iov_base == NULL)
				goto reset;
			iov_cnt++;
		}

		/*
		 * Walk the chain and collect remaining addresses and lengths.
		 */
		while (desc->flags & VRING_DESC_F_NEXT) {
			desc = &table[desc->next & VIONET_QUEUE_MASK];
			if (DESC_WRITABLE(desc)) {
				log_warnx("%s: invalid descriptor state",
				    __func__);
				goto reset;
			}

			/* Collect our IO information, translating gpa's. */
			iov = &iov_tx[iov_cnt];
			iov->iov_len = desc->len;
			iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
			if (iov->iov_base == NULL)
				goto reset;
			chain_len += iov->iov_len;

			/* Guard against infinitely looping chains. */
			if (++iov_cnt >= nitems(iov_tx)) {
				log_warnx("%s: infinite chain detected",
				    __func__);
				goto reset;
			}
		}

		/* Check if we've got a minimum viable amount of data. */
		if (chain_len < VIONET_MIN_TXLEN) {
			sz = chain_len;
			goto drop;
		}

		/*
		 * Packet inspection for ethernet header (if using a "local"
		 * interface) for possibility of a DHCP packet or (if using
		 * locked lladdr) for validating ethernet header.
		 *
		 * To help preserve zero-copy semantics, we require the first
		 * descriptor with packet data contains a large enough buffer
		 * for this inspection.
		 */
		iov = &iov_tx[0];
		if (vionet->lockedmac) {
			if (iov->iov_len < ETHER_HDR_LEN) {
				log_warnx("%s: insufficient header data",
				    __func__);
				goto drop;
			}
			eh = (struct ether_header *)iov->iov_base;
			if (memcmp(eh->ether_shost, vionet->mac,
			    sizeof(eh->ether_shost)) != 0) {
				log_warnx("%s: bad source address %s",
				    __func__, ether_ntoa((struct ether_addr *)
					eh->ether_shost));
				sz = chain_len;
				goto drop;
			}
		}
		if (vionet->local) {
			dhcpsz = dhcp_request(dev, iov->iov_base, iov->iov_len,
			    &dhcppkt);
			if (dhcpsz > 0)
				log_debug("%s: detected dhcp request of %zu bytes",
			    	    __func__, dhcpsz);
		}

		/* Write our packet to the tap(4). */
		sz = writev(vionet->data_fd, iov_tx, iov_cnt);
		if (sz == -1 && errno != ENOBUFS) {
			log_warn("%s", __func__);
			goto reset;
		}
		sz += sizeof(struct virtio_net_hdr);
drop:
		used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
		used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
		__sync_synchronize();
		used->idx++;
		idx++;

		/* Facilitate DHCP reply injection, if needed. */
		if (dhcpsz > 0) {
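			/* The rx thread frees pkt.buf after copying it. */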
			pkt.buf = dhcppkt;
			pkt.len = dhcpsz;
			sz = write(pipe_inject[WRITE], &pkt, sizeof(pkt));
			if (sz == -1 && errno != EAGAIN) {
				log_warn("%s: packet injection", __func__);
				free(pkt.buf);
			} else if (sz == -1 && errno == EAGAIN) {
				log_debug("%s: dropping dhcp reply", __func__);
				free(pkt.buf);
			} else if (sz != sizeof(pkt)) {
				log_warnx("%s: failed packet injection",
				    __func__);
				free(pkt.buf);
			}
			log_debug("%s: injected dhcp reply with %ld bytes",
			    __func__, sz);
		}
	}

	if (idx != vq_info->last_avail &&
	    !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		notify = 1;


	vq_info->last_avail = idx;
	return (notify);
reset:
	return (-1);
}

static void
dev_dispatch_vm(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = arg;
	struct vionet_dev	*vionet = &dev->vionet;
	struct imsgev		*iev = &dev->async_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg	 	 imsg;
	ssize_t			 n = 0;
	int			 verbose;

	if (dev == NULL)
		fatalx("%s: missing vionet pointer", __func__);

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_base_loopexit(ev_base_main, NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_WRITE)", __func__);
			event_del(&iev->ev);
			event_base_loopexit(ev_base_main, NULL);
			return;
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

		switch (imsg.hdr.type) {
		case IMSG_DEVOP_HOSTMAC:
			IMSG_SIZE_CHECK(&imsg, vionet->hostmac);
			memcpy(vionet->hostmac, imsg.data,
			    sizeof(vionet->hostmac));
			log_debug("%s: set hostmac", __func__);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			log_debug("%s: pausing", __func__);
			vm_pipe_send(&pipe_rx, VIRTIO_THREAD_PAUSE);
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			log_debug("%s: unpausing", __func__);
			if (rx_enabled)
				vm_pipe_send(&pipe_rx, VIRTIO_THREAD_START);
			break;
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			break;
		}
		imsg_free(&imsg);
	}
	imsg_event_add2(iev, ev_base_main);
}

/*
 * Synchronous IO handler.
 *
 */
static void
handle_sync_io(int fd, short event, void *arg)
{
	struct virtio_dev *dev = (struct virtio_dev *)arg;
	struct imsgev *iev = &dev->sync_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct viodev_msg msg;
	struct imsg imsg;
	ssize_t n;
	int8_t intr = INTR_STATE_NOOP;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_base_loopexit(ev_base_main, NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write", __func__);
		if (n == 0) {
			/* this pipe is dead, so remove the event handler */
			log_debug("%s: pipe dead (EV_WRITE)", __func__);
			event_del(&iev->ev);
			event_base_loopexit(ev_base_main, NULL);
			return;
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatalx("%s: imsg_get (n=%ld)", __func__, n);
		if (n == 0)
			break;

		/* Unpack our message. They ALL should be dev messages! */
		IMSG_SIZE_CHECK(&imsg, &msg);
		memcpy(&msg, imsg.data, sizeof(msg));
		imsg_free(&imsg);

		switch (msg.type) {
		case VIODEV_MSG_DUMP:
			/* Dump device */
			n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
			if (n != sizeof(*dev)) {
				log_warnx("%s: failed to dump vionet device",
				    __func__);
				break;
			}
			break;
		case VIODEV_MSG_IO_READ:
			/* Read IO: make sure to send a reply */
			msg.data = handle_io_read(&msg, dev, &intr);
			msg.data_valid = 1;
			msg.state = intr;
			imsg_compose_event2(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
			    sizeof(msg), ev_base_main);
			break;
		case VIODEV_MSG_IO_WRITE:
			/* Write IO: no reply needed */
			handle_io_write(&msg, dev);
			break;
		case VIODEV_MSG_SHUTDOWN:
			event_del(&dev->sync_iev.ev);
			event_base_loopbreak(ev_base_main);
			return;
		default:
			fatalx("%s: invalid msg type %d", __func__, msg.type);
		}
	}
	imsg_event_add2(iev, ev_base_main);
}

static void
handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
{
	struct vionet_dev	*vionet = &dev->vionet;
	uint32_t		 data = msg->data;
	int			 pause_devices = 0;

	pthread_rwlock_wrlock(&lock);

	switch (msg->reg) {
	case VIRTIO_CONFIG_DEVICE_FEATURES:
	case VIRTIO_CONFIG_QUEUE_SIZE:
	case VIRTIO_CONFIG_ISR_STATUS:
		log_warnx("%s: illegal write %x to %s", __progname, data,
		    virtio_reg_name(msg->reg));
		break;
	case VIRTIO_CONFIG_GUEST_FEATURES:
		vionet->cfg.guest_feature = data;
		break;
	case VIRTIO_CONFIG_QUEUE_PFN:
		vionet->cfg.queue_pfn = data;
		vionet_update_qa(vionet);
		break;
	case VIRTIO_CONFIG_QUEUE_SELECT:
		vionet->cfg.queue_select = data;
		vionet_update_qs(vionet);
		break;
	case VIRTIO_CONFIG_QUEUE_NOTIFY:
		vionet->cfg.queue_notify = data;
		vionet_notifyq(dev);
		break;
	case VIRTIO_CONFIG_DEVICE_STATUS:
		if (data == 0) {
			resetting = 2;	/* Wait on two acks: rx & tx */
			pause_devices = 1;
		} else {
			// XXX is this correct?
			vionet->cfg.device_status = data;
		}
		break;
	}

	pthread_rwlock_unlock(&lock);
	if (pause_devices) {
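		/* Reset: quiesce rx/tx; config clears once both threads ACK. */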
		rx_enabled = 0;
		vionet_deassert_pic_irq(dev);
		vm_pipe_send(&pipe_rx, VIRTIO_THREAD_PAUSE);
		vm_pipe_send(&pipe_tx, VIRTIO_THREAD_PAUSE);
	}
}

static uint32_t
handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr)
{
	struct vionet_dev *vionet = &dev->vionet;
	uint32_t data;

	pthread_rwlock_rdlock(&lock);

	switch (msg->reg) {
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
	case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
		data = vionet->mac[msg->reg -
		    VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
		break;
	case VIRTIO_CONFIG_DEVICE_FEATURES:
		data = vionet->cfg.device_feature;
		break;
	case VIRTIO_CONFIG_GUEST_FEATURES:
		data = vionet->cfg.guest_feature;
		break;
	case VIRTIO_CONFIG_QUEUE_PFN:
		data = vionet->cfg.queue_pfn;
		break;
	case VIRTIO_CONFIG_QUEUE_SIZE:
		data = vionet->cfg.queue_size;
		break;
	case VIRTIO_CONFIG_QUEUE_SELECT:
		data = vionet->cfg.queue_select;
		break;
	case VIRTIO_CONFIG_QUEUE_NOTIFY:
		data = vionet->cfg.queue_notify;
		break;
	case VIRTIO_CONFIG_DEVICE_STATUS:
		data = vionet->cfg.device_status;
		break;
	case VIRTIO_CONFIG_ISR_STATUS:
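		/* ISR reads clear the register, so take the write lock. */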
		pthread_rwlock_unlock(&lock);
		pthread_rwlock_wrlock(&lock);
		data = vionet->cfg.isr_status;
		vionet->cfg.isr_status = 0;
		if (intr != NULL)
			*intr = INTR_STATE_DEASSERT;
		break;
	default:
		data = 0xFFFFFFFF;
	}

	pthread_rwlock_unlock(&lock);
	return (data);
}

/*
 * Handle the rx side processing, communicating to the main thread via pipe.
 */
static void *
rx_run_loop(void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev *)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	int			 ret;

	ev_base_rx = event_base_new();

	/* Wire up event handling for the tap fd. */
	event_set(&ev_tap, vionet->data_fd, EV_READ | EV_PERSIST,
	    vionet_rx_event, dev);
	event_base_set(ev_base_rx, &ev_tap);

	/* Wire up event handling for the packet injection pipe. */
	event_set(&ev_inject, pipe_inject[READ], EV_READ | EV_PERSIST,
	    vionet_rx_event, dev);
	event_base_set(ev_base_rx, &ev_inject);

	/* Wire up event handling for our inter-thread communication channel. */
	event_base_set(ev_base_rx, &pipe_rx.read_ev);
	event_add(&pipe_rx.read_ev, NULL);

	/* Begin our event loop with our channel event active. */
	ret = event_base_dispatch(ev_base_rx);
	event_base_free(ev_base_rx);

	log_debug("%s: exiting (%d)", __func__, ret);

	close_fd(pipe_rx.read);
	close_fd(pipe_inject[READ]);

	return (NULL);
}

/*
 * Handle the tx side processing, communicating to the main thread via pipe.
 */
static void *
tx_run_loop(void *arg)
{
	int			 ret;

	ev_base_tx = event_base_new();

	/* Wire up event handling for our inter-thread communication channel. */
	event_base_set(ev_base_tx, &pipe_tx.read_ev);
	event_add(&pipe_tx.read_ev, NULL);

	/* Begin our event loop with our channel event active. */
	ret = event_base_dispatch(ev_base_tx);
	event_base_free(ev_base_tx);

	log_debug("%s: exiting (%d)", __func__, ret);

	close_fd(pipe_tx.read);

	return (NULL);
}

/*
 * Read events sent by the main thread to the rx thread.
 */
static void
read_pipe_rx(int fd, short event, void *arg)
{
	enum pipe_msg_type	msg;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	msg = vm_pipe_recv(&pipe_rx);

	switch (msg) {
	case VIRTIO_NOTIFY:
	case VIRTIO_THREAD_START:
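		/* Notify or unpause with rx enabled: arm the read events. */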
		event_add(&ev_tap, NULL);
		event_add(&ev_inject, NULL);
		break;
	case VIRTIO_THREAD_PAUSE:
		event_del(&ev_tap);
		event_del(&ev_inject);
		vm_pipe_send(&pipe_main, VIRTIO_THREAD_ACK);
		break;
	case VIRTIO_THREAD_STOP:
		event_del(&ev_tap);
		event_del(&ev_inject);
		event_base_loopexit(ev_base_rx, NULL);
		break;
	default:
		fatalx("%s: invalid channel message: %d", __func__, msg);
	}
}

/*
 * Read events sent by the main thread to the tx thread.
 */
static void
read_pipe_tx(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev*)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	enum pipe_msg_type	 msg;
	int			 ret = 0;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	msg = vm_pipe_recv(&pipe_tx);

	switch (msg) {
	case VIRTIO_NOTIFY:
		pthread_rwlock_rdlock(&lock);
		ret = vionet_tx(dev);
		pthread_rwlock_unlock(&lock);
		break;
	case VIRTIO_THREAD_START:
		/* Ignore Start messages. */
		break;
	case VIRTIO_THREAD_PAUSE:
		/*
		 * Nothing to do when pausing on the tx side, but ACK so main
		 * thread knows we're not transmitting.
		 */
		vm_pipe_send(&pipe_main, VIRTIO_THREAD_ACK);
		break;
	case VIRTIO_THREAD_STOP:
		event_base_loopexit(ev_base_tx, NULL);
		break;
	default:
		fatalx("%s: invalid channel message: %d", __func__, msg);
	}

	if (ret == 0) {
		/* No notification needed. Return early. */
		return;
	}

	pthread_rwlock_wrlock(&lock);
	if (ret == 1) {
		/* Notify the driver. */
		vionet->cfg.isr_status |= 1;
	} else {
		/* Need a reset. Something went wrong. */
		log_warnx("%s: requesting device reset", __func__);
		vionet->cfg.device_status |= DEVICE_NEEDS_RESET;
		vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
	}
	pthread_rwlock_unlock(&lock);

	vm_pipe_send(&pipe_main, VIRTIO_RAISE_IRQ);
}

/*
 * Read events sent by the rx/tx threads to the main thread.
 */
static void
read_pipe_main(int fd, short event, void *arg)
{
	struct virtio_dev	*dev = (struct virtio_dev*)arg;
	struct vionet_dev	*vionet = &dev->vionet;
	enum pipe_msg_type	 msg;

	if (!(event & EV_READ))
		fatalx("%s: invalid event type", __func__);

	msg = vm_pipe_recv(&pipe_main);
	switch (msg) {
	case VIRTIO_RAISE_IRQ:
		vionet_assert_pic_irq(dev);
		break;
	case VIRTIO_THREAD_ACK:
		resetting--;
		if (resetting == 0) {
			log_debug("%s: resetting virtio network device %d",
			    __func__, vionet->idx);

			pthread_rwlock_wrlock(&lock);
			vionet->cfg.device_status = 0;
			vionet->cfg.guest_feature = 0;
			vionet->cfg.queue_pfn = 0;
			vionet_update_qa(vionet);
			vionet->cfg.queue_size = 0;
			vionet_update_qs(vionet);
			vionet->cfg.queue_select = 0;
			vionet->cfg.queue_notify = 0;
			vionet->cfg.isr_status = 0;
			vionet->vq[RXQ].last_avail = 0;
			vionet->vq[RXQ].notified_avail = 0;
			vionet->vq[TXQ].last_avail = 0;
			vionet->vq[TXQ].notified_avail = 0;
			pthread_rwlock_unlock(&lock);
		}
		break;
	default:
		fatalx("%s: invalid channel msg: %d", __func__, msg);
	}
}

/*
 * Message the vm process asking to raise the irq. Must be called from the main
 * thread.
 */
static void
vionet_assert_pic_irq(struct virtio_dev *dev)
{
	struct viodev_msg	msg;
	int			ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = 0; // XXX
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_ASSERT;

	ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg), ev_base_main);
	if (ret == -1)
		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}

/*
 * Message the vm process asking to lower the irq. Must be called from the main
 * thread.
 */
static void
vionet_deassert_pic_irq(struct virtio_dev *dev)
{
	struct viodev_msg	msg;
	int			ret;

	memset(&msg, 0, sizeof(msg));
	msg.irq = dev->irq;
	msg.vcpu = 0; // XXX
	msg.type = VIODEV_MSG_KICK;
	msg.state = INTR_STATE_DEASSERT;

	ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg), ev_base_main);
	if (ret == -1)
		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}