/* Spdk/examples/ioat/kperf/kmod/dmaperf.c */

/*
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * PCIe DMA Perf Linux driver
 */
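
/*
 * Usage sketch (an illustration, not part of the original source): with
 * debugfs mounted at /sys/kernel/debug and the module built as "dmaperf"
 * (so that KBUILD_MODNAME expands to "dmaperf"), the control files appear
 * under /sys/kernel/debug/dmaperf/dmaperf/:
 *
 *	echo 4 > .../threads			# worker kthreads (max 32)
 *	echo 12 > .../transfer_size_order	# per-copy size, 2^12 = 4K
 *	echo 32 > .../total_size_order		# bytes per thread, 2^32 = 4G
 *	echo 1 > .../run			# any write toggles start/stop
 *	cat .../status				# "running" or "idle"
 *	cat .../thread_0/copied			# per-thread bytes copied
 *	cat .../thread_0/elapsed_time		# per-thread runtime in usecs
 */
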
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/timer.h>
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/dmaengine.h>
#include <linux/delay.h>
#include <linux/printk.h>
#include <linux/nodemask.h>

#define DRIVER_NAME		"dma_perf"
#define DRIVER_DESCRIPTION	"PCIe DMA Performance Measurement Tool"
#define DRIVER_LICENSE		"Dual BSD/GPL"
#define DRIVER_VERSION		"1.0"
#define DRIVER_AUTHOR		"Dave Jiang <dave.jiang@intel.com>"

#define MAX_THREADS		32
#define MAX_TEST_SIZE		(1024 * 1024) /* 1M */
#define DMA_CHANNELS_PER_NODE	8

MODULE_LICENSE(DRIVER_LICENSE);
MODULE_VERSION(DRIVER_VERSION);
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_AUTHOR("Changpeng Liu <changpeng.liu@intel.com>");
MODULE_DESCRIPTION(DRIVER_DESCRIPTION);

static struct dentry *perf_debugfs_dir;
static struct perf_ctx *g_perf;

/* tunables, also exposed through debugfs */
static unsigned int seg_order = 12;	/* per-copy transfer size: 4K */
static unsigned int queue_depth = 256;	/* max copies in flight per thread */
static unsigned int run_order = 32;	/* total bytes per thread: 4G */

struct perf_mw {
	size_t buf_size;
	void *virt_addr;
};

struct perf_ctx;

struct pthr_ctx {
	struct dentry *debugfs_thr_dir;
	struct dentry *debugfs_copied;
	struct dentry *debugfs_elapsed_time;
	struct device *dev;
	int node;
	wait_queue_head_t wq;
	struct perf_mw mw;
	struct task_struct *thread;
	struct perf_ctx *perf;
	atomic_t dma_sync;
	struct dma_chan *dma_chan;
	int dma_up;
	int dma_down;
	int dma_prep_err;
	u64 copied;
	u64 elapsed_time;
};

struct perf_ctx {
	spinlock_t db_lock;
	struct dentry *debugfs_node_dir;
	struct dentry *debugfs_run;
	struct dentry *debugfs_threads;
	struct dentry *debugfs_queue_depth;
	struct dentry *debugfs_transfer_size_order;
	struct dentry *debugfs_total_size_order;
	struct dentry *debugfs_status;
	u8 numa_nodes;
	u8 perf_threads;
	bool run;
	struct pthr_ctx pthr_ctx[MAX_THREADS];
	atomic_t tsync;
};

static void perf_free_mw(struct pthr_ctx *pctx);
static int perf_set_mw(struct pthr_ctx *pctx, size_t size);
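
/*
 * Completion callback, invoked by the DMA driver for every finished copy;
 * it retires one in-flight descriptor and wakes the submitter, which may
 * be throttled on dma_sync in perf_move_data().
 */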
static void perf_copy_callback(void *data)
{
	struct pthr_ctx *pctx = data;

	atomic_dec(&pctx->dma_sync);
	pctx->dma_down++;
	wake_up(&pctx->wq);
}
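
/*
 * Submit one asynchronous memcpy of @size bytes through the thread's DMA
 * channel: map the source and destination pages, prepare a memcpy
 * descriptor (retrying while the descriptor ring is full), then submit
 * and kick the engine. Returns the number of bytes queued, 0 if the
 * descriptor was dropped, or a negative errno.
 */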
static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
			 char *src, size_t size)
{
	struct dma_async_tx_descriptor *txd;
	struct dma_chan *chan = pctx->dma_chan;
	struct dma_device *device;
	struct dmaengine_unmap_data *unmap;
	dma_cookie_t cookie;
	size_t src_off, dst_off;
	int retries = 0;

	if (!chan) {
		pr_err("DMA engine does not exist\n");
		return -EINVAL;
	}

	device = chan->device;
	src_off = (size_t)src & ~PAGE_MASK;
	dst_off = (size_t)dst & ~PAGE_MASK;

	if (!is_dma_copy_aligned(device, src_off, dst_off, size))
		return -ENODEV;

	unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
	if (!unmap)
		return -ENOMEM;

	unmap->len = size;
	unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
				      src_off, size, DMA_TO_DEVICE);
	if (dma_mapping_error(device->dev, unmap->addr[0]))
		goto err_get_unmap;
	unmap->to_cnt = 1;

	unmap->addr[1] = dma_map_page(device->dev, virt_to_page(dst),
				      dst_off, size, DMA_FROM_DEVICE);
	if (dma_mapping_error(device->dev, unmap->addr[1]))
		goto err_get_unmap;
	unmap->from_cnt = 1;

dma_prep_retry:
	txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
					     unmap->addr[0],
					     size, DMA_PREP_INTERRUPT);
	if (!txd) {
		if (retries++ > 20) {
			pctx->dma_prep_err++;
			goto err_get_unmap;
		} else {
			/* descriptor ring is full; back off and retry */
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(50);
			goto dma_prep_retry;
		}
	}

	txd->callback = perf_copy_callback;
	txd->callback_param = pctx;
	dma_set_unmap(txd, unmap);

	cookie = dmaengine_submit(txd);
	if (dma_submit_error(cookie))
		goto err_set_unmap;

	atomic_inc(&pctx->dma_sync);
	pctx->dma_up++;
	dma_async_issue_pending(chan);

	return size;

err_set_unmap:
	/* dma_set_unmap() took its own reference on @unmap, so the
	 * submit-error path must drop two references, not one. */
	dmaengine_unmap_put(unmap);
err_get_unmap:
	dmaengine_unmap_put(unmap);
	return 0;
}
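
/*
 * Main measurement loop: stream @total bytes through the destination
 * window in @buf_size chunks, keeping at most queue_depth copies in
 * flight, then wait for the engine to drain and report throughput. With
 * diff_us in microseconds, copied / diff_us is bytes/usec, i.e. MB/s.
 */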
static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
			  u64 buf_size, u64 win_size, u64 total)
{
	int chunks, total_chunks, i;
	int copied_chunks = 0;
	u64 result;
	char *tmp = dst;
	u64 perf, diff_us;
	ktime_t kstart, kstop, kdiff;

	chunks = win_size / buf_size;
	total_chunks = total / buf_size;
	pr_info("%s: chunks: %d total_chunks: %d\n",
		current->comm, chunks, total_chunks);

	kstart = ktime_get();

	for (i = 0; i < total_chunks; i++) {
		/* throttle: keep at most queue_depth copies in flight */
		wait_event_interruptible(pctx->wq,
					 atomic_read(&pctx->dma_sync) < queue_depth);
		result = perf_copy(pctx, tmp, src, buf_size);
		pctx->copied += result;
		copied_chunks++;
		if (copied_chunks == chunks) {
			/* wrap back to the start of the destination window */
			tmp = dst;
			copied_chunks = 0;
		} else {
			tmp += buf_size;
		}
	}

	pr_info("%s: All DMA descriptors submitted\n", current->comm);

	/* FIXME: need a timeout here eventually */
	while (atomic_read(&pctx->dma_sync) != 0)
		msleep(1);

	pr_info("%s: dma_up: %d dma_down: %d dma_prep_err: %d\n",
		current->comm, pctx->dma_up, pctx->dma_down,
		pctx->dma_prep_err);

	kstop = ktime_get();
	kdiff = ktime_sub(kstop, kstart);
	diff_us = ktime_to_us(kdiff);
	if (!diff_us)
		diff_us = 1;	/* avoid divide-by-zero on very short runs */

	pr_info("%s: copied %Lu bytes\n", current->comm, pctx->copied);
	pr_info("%s: lasted %Lu usecs\n", current->comm, diff_us);

	perf = pctx->copied / diff_us;
	pr_info("%s: MBytes/s: %Lu\n", current->comm, perf);

	pctx->elapsed_time = diff_us;

	return 0;
}
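
/*
 * dma_request_channel() filter: accept only channels whose device sits on
 * the NUMA node requested by the caller, so each worker thread drives a
 * node-local DMA engine.
 */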
static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
{
	return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
}
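
/*
 * Per-thread worker: acquire a DMA channel on this thread's NUMA node,
 * allocate node-local source and destination buffers, rendezvous with the
 * other workers on perf->tsync, then run the copy loop.
 */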
static int dma_perf_thread(void *data)
{
	struct pthr_ctx *pctx = data;
	struct perf_ctx *perf = pctx->perf;
	struct perf_mw *mw = &pctx->mw;
	char *dst;
	u64 win_size, buf_size, total;
	void *src;
	int rc, node;
	struct dma_chan *dma_chan = NULL;

	pr_info("kthread %s starting...\n", current->comm);

	node = pctx->node;

	if (!pctx->dma_chan) {
		dma_cap_mask_t dma_mask;

		dma_cap_zero(dma_mask);
		dma_cap_set(DMA_MEMCPY, dma_mask);
		dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
					       (void *)(unsigned long)node);
		if (!dma_chan) {
			pr_warn("%s: cannot acquire DMA channel, quitting\n",
				current->comm);
			return -ENODEV;
		}
		pctx->dma_chan = dma_chan;
		pctx->dev = dma_chan->device->dev;
	}

	src = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
	if (!src) {
		rc = -ENOMEM;
		goto err;
	}

	rc = perf_set_mw(pctx, MAX_TEST_SIZE);
	if (rc < 0) {
		pr_err("%s: set mw failed\n", current->comm);
		rc = -ENXIO;
		goto err;
	}

	win_size = mw->buf_size;
	buf_size = 1ULL << seg_order;
	total = 1ULL << run_order;

	if (buf_size > MAX_TEST_SIZE)
		buf_size = MAX_TEST_SIZE;

	dst = (char *)mw->virt_addr;

	/* wait until every worker thread has reached this barrier */
	atomic_inc(&perf->tsync);
	while (atomic_read(&perf->tsync) != perf->perf_threads)
		schedule();

	rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);

	atomic_dec(&perf->tsync);

	if (rc < 0) {
		pr_err("%s: failed\n", current->comm);
		rc = -ENXIO;
		goto err;
	}

	kfree(src);
	return 0;

err:
	kfree(src);
	if (dma_chan) {
		dma_release_channel(dma_chan);
		pctx->dma_chan = NULL;
	}
	return rc;
}
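
/*
 * The "memory window" here is simply a node-local kernel buffer used as
 * the copy destination; perf_set_mw()/perf_free_mw() manage its lifetime.
 */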
static void perf_free_mw(struct pthr_ctx *pctx)
{
	struct perf_mw *mw = &pctx->mw;

	if (!mw->virt_addr)
		return;

	kfree(mw->virt_addr);
	mw->buf_size = 0;
	mw->virt_addr = NULL;
}

static int perf_set_mw(struct pthr_ctx *pctx, size_t size)
{
	struct perf_mw *mw = &pctx->mw;

	if (!size)
		return -EINVAL;

	mw->buf_size = size;
	mw->virt_addr = kmalloc_node(size, GFP_KERNEL, pctx->node);
	if (!mw->virt_addr) {
		mw->buf_size = 0;
		return -ENOMEM;
	}

	return 0;
}

static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
				size_t count, loff_t *offp)
{
	struct perf_ctx *perf = filp->private_data;
	char *buf;
	ssize_t ret, out_offset;

	if (!perf)
		return 0;

	buf = kmalloc(64, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	out_offset = snprintf(buf, 64, "%d\n", perf->run);
	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
	kfree(buf);

	return ret;
}
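
/*
 * Writing "run" toggles the test: if worker threads are active they are
 * stopped; otherwise the tunables are clamped to sane values and one
 * kthread per requested worker is created on its NUMA node and kicked off.
 */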
static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
				 size_t count, loff_t *offp)
{
	struct perf_ctx *perf = filp->private_data;
	int node, i;

	if (perf->perf_threads == 0)
		return 0;

	if (atomic_read(&perf->tsync) == 0)
		perf->run = false;

	if (perf->run) {
		/* let's stop the threads */
		perf->run = false;
		for (i = 0; i < MAX_THREADS; i++) {
			if (perf->pthr_ctx[i].thread) {
				kthread_stop(perf->pthr_ctx[i].thread);
				perf->pthr_ctx[i].thread = NULL;
			} else {
				break;
			}
		}
	} else {
		perf->run = true;

		if (perf->perf_threads > MAX_THREADS) {
			perf->perf_threads = MAX_THREADS;
			pr_info("Reset total threads to: %u\n", MAX_THREADS);
		}

		/* no greater than 1M */
		if (seg_order > 20) {
			seg_order = 20;
			pr_info("Fix seg_order to %u\n", seg_order);
		}

		if (run_order < seg_order) {
			run_order = seg_order;
			pr_info("Fix run_order to %u\n", run_order);
		}

		/* launch kernel threads */
		for (i = 0; i < perf->perf_threads; i++) {
			struct pthr_ctx *pctx;

			pctx = &perf->pthr_ctx[i];
			atomic_set(&pctx->dma_sync, 0);
			pctx->perf = perf;
			pctx->elapsed_time = 0;
			pctx->copied = 0;
			init_waitqueue_head(&pctx->wq);

			/* NUMA socket node */
			pctx->node = i / DMA_CHANNELS_PER_NODE;
			node = pctx->node;
			pctx->thread =
				kthread_create_on_node(dma_perf_thread,
						       (void *)pctx,
						       node, "dma_perf %d", i);
			if (pctx->thread) {
				wake_up_process(pctx->thread);
			} else {
				int j;

				/* creation failed: stop the threads
				 * already started, not pctx over and over */
				perf->run = false;
				for (j = 0; j < i; j++) {
					if (perf->pthr_ctx[j].thread) {
						kthread_stop(perf->pthr_ctx[j].thread);
						perf->pthr_ctx[j].thread = NULL;
					}
				}
			}

			if (!perf->run)
				return -ENXIO;
		}
	}

	return count;
}

static const struct file_operations dma_perf_debugfs_run = {
	.owner = THIS_MODULE,
	.open = simple_open,
	.read = debugfs_run_read,
	.write = debugfs_run_write,
};

static ssize_t debugfs_status_read(struct file *filp, char __user *ubuf,
				   size_t count, loff_t *offp)
{
	struct perf_ctx *perf = filp->private_data;
	char *buf;
	ssize_t ret, out_offset;

	if (!perf)
		return 0;

	buf = kmalloc(64, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	out_offset = snprintf(buf, 64, "%s\n",
			      atomic_read(&perf->tsync) ? "running" : "idle");
	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
	kfree(buf);

	return ret;
}

static const struct file_operations dma_perf_debugfs_status = {
	.owner = THIS_MODULE,
	.open = simple_open,
	.read = debugfs_status_read,
};
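
/*
 * Build the debugfs hierarchy: <debugfs>/<module>/dmaperf/ with the run,
 * status and tunable files at the top and a thread_<n>/ directory per
 * worker exposing its copied-byte and elapsed-time counters.
 */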
static int perf_debugfs_setup(struct perf_ctx *perf)
{
	int i;
	char temp_name[64];

	if (!perf_debugfs_dir)
		return -ENODEV;

	perf->debugfs_node_dir = debugfs_create_dir("dmaperf",
						    perf_debugfs_dir);
	if (!perf->debugfs_node_dir)
		return -ENODEV;

	perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
						perf->debugfs_node_dir, perf,
						&dma_perf_debugfs_run);
	if (!perf->debugfs_run)
		return -ENODEV;

	perf->debugfs_status = debugfs_create_file("status", S_IRUSR,
						   perf->debugfs_node_dir, perf,
						   &dma_perf_debugfs_status);
	if (!perf->debugfs_status)
		return -ENODEV;

	perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
						  perf->debugfs_node_dir,
						  &perf->perf_threads);
	if (!perf->debugfs_threads)
		return -ENODEV;

	perf->debugfs_queue_depth = debugfs_create_u32("queue_depth",
						       S_IRUSR | S_IWUSR,
						       perf->debugfs_node_dir,
						       &queue_depth);
	if (!perf->debugfs_queue_depth)
		return -ENODEV;

	perf->debugfs_transfer_size_order =
		debugfs_create_u32("transfer_size_order", S_IRUSR | S_IWUSR,
				   perf->debugfs_node_dir, &seg_order);
	if (!perf->debugfs_transfer_size_order)
		return -ENODEV;

	perf->debugfs_total_size_order =
		debugfs_create_u32("total_size_order", S_IRUSR | S_IWUSR,
				   perf->debugfs_node_dir, &run_order);
	if (!perf->debugfs_total_size_order)
		return -ENODEV;

	for (i = 0; i < MAX_THREADS; i++) {
		struct pthr_ctx *pctx = &perf->pthr_ctx[i];

		snprintf(temp_name, sizeof(temp_name), "thread_%d", i);
		pctx->debugfs_thr_dir = debugfs_create_dir(temp_name,
							   perf->debugfs_node_dir);
		if (!pctx->debugfs_thr_dir)
			return -ENODEV;

		pctx->debugfs_copied = debugfs_create_u64("copied", S_IRUSR,
							  pctx->debugfs_thr_dir,
							  &pctx->copied);
		if (!pctx->debugfs_copied)
			return -ENODEV;

		pctx->debugfs_elapsed_time =
			debugfs_create_u64("elapsed_time", S_IRUSR,
					   pctx->debugfs_thr_dir,
					   &pctx->elapsed_time);
		if (!pctx->debugfs_elapsed_time)
			return -ENODEV;
	}

	return 0;
}
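
/*
 * One-time setup: allocate the global context on node 0, take the
 * defaults (a single worker thread), and publish the debugfs interface.
 */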
static int perf_probe(void)
{
	struct perf_ctx *perf;
	int rc = 0;

	perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, 0);
	if (!perf) {
		rc = -ENOMEM;
		goto err_perf;
	}

	perf->numa_nodes = num_online_nodes();
	perf->perf_threads = 1;
	atomic_set(&perf->tsync, 0);
	perf->run = false;
	spin_lock_init(&perf->db_lock);

	if (debugfs_initialized() && !perf_debugfs_dir) {
		perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
		if (!perf_debugfs_dir) {
			rc = -ENODEV;
			goto err_ctx;
		}
		rc = perf_debugfs_setup(perf);
		if (rc)
			goto err_ctx;
	}

	g_perf = perf;

	return 0;

err_ctx:
	kfree(perf);
err_perf:
	return rc;
}
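
/*
 * Teardown: remove the debugfs tree, release any DMA channels the workers
 * acquired and free their memory windows. Note there is no kthread_stop()
 * here, so the module should only be unloaded while the test is idle.
 */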
static void perf_remove(void)
{
	int i;
	struct perf_ctx *perf = g_perf;

	if (perf_debugfs_dir) {
		debugfs_remove_recursive(perf_debugfs_dir);
		perf_debugfs_dir = NULL;
	}

	for (i = 0; i < MAX_THREADS; i++) {
		struct pthr_ctx *pctx = &perf->pthr_ctx[i];

		if (pctx->dma_chan)
			dma_release_channel(pctx->dma_chan);
		perf_free_mw(pctx);
	}

	kfree(perf);
}

static int __init perf_init_module(void)
{
	pr_info("DMA Performance Test Init\n");
	return perf_probe();
}
module_init(perf_init_module);

static void __exit perf_exit_module(void)
{
	pr_info("DMA Performance Test Exit\n");
	perf_remove();
}
module_exit(perf_exit_module);