/*
    Max IOPS stresser
    Copyright (C) 2013-2018 Darrick Wong

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#define PACKAGE "maxiops"
#include "bogodisk.h"
#include <libaio.h>
#include <time.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/mman.h>
#include "util.h"

/* Nonzero: mlock() the data buffer and AIO control arrays (cleared by -y). */
static int use_mlock = 1;
/* Nonzero: call bump_priority() before the test loop starts (cleared by -i). */
static int boost_ioprio = 1;

/*
 * Open flag used to bypass the page cache by default.  Falls back to 0 (with
 * a build-time warning) on platforms whose <fcntl.h> has no O_DIRECT.
 */
#ifdef O_DIRECT
static int default_odirect_flag = O_DIRECT;
#else
#warning O_DIRECT not supported!
static int default_odirect_flag = 0;
#endif

/*
 * Replace any existing offsetof() with a private one: the compiler-provided
 * helper when __compiler_offsetof is defined, otherwise the classic
 * member-address-of-a-null-object form.
 */
#undef offsetof
#ifdef __compiler_offsetof
#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
#else
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif

/* Timeout, in nanoseconds, handed to io_getevents() on every call. */
#define GETEVENTS_TIMEOUT_NSEC	(10000)

/*
 * Given a pointer to a member embedded in a structure, return a pointer to
 * the enclosing structure.  Relies on the GNU typeof and statement-expression
 * extensions; the temporary makes the compiler check that ptr matches the
 * member's type.
 */
#define container_of(ptr, type, member) ({                      \
	const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
	(type *)( (char *)__mptr - offsetof(type,member) );})

/*
 * One IO slot: an AIO control block plus the data buffer it transfers.
 * The iocb is embedded (not pointed to) so that container_of() can recover
 * the slot from an iocb pointer.
 */
struct time_device_io {
	struct iocb iocb;	/* control block submitted via io_submit() */
	void *buf;		/* data buffer passed to the io_prep function */
};

/*
 * Parameters for one time_device() run.  time_device() works on a private
 * copy, so the same context can be reused for several devices.
 */
struct time_device_context {
	char *dev;			/* path handed to open() */
	uint64_t start;			/* first offset to test (-b) */
	uint64_t stop;			/* end of the tested range (-e); 0 means the device size */
	unsigned long nr_threads;	/* number of IO slots / AIO queue depth (-n) */
	uint64_t bufsize;		/* bytes per IO (-c); 0 means the block size */
	uint64_t nr_requests;		/* total IOs to complete (-s); 0 means range / block size */
	int flags;			/* open() flags; O_LARGEFILE is OR-ed in */
	FILE *report;			/* stream receiving the periodic report lines */
	/* prepares an iocb for one transfer; main() sets io_prep_pread or io_prep_pwrite */
	void (*io_prep_fn)(struct iocb *iocb, int fd, void *buf, size_t count,
			   long long offset);
	unsigned long min_nr_events;	/* min_nr argument for io_getevents() (-m) */
	int linear;			/* nonzero: sequential offsets instead of random (-l) */
};

/*
 * Run the AIO test described by @t against a single device or file.
 *
 * @t is copied first, so the caller's context is never modified.  Missing
 * parameters (stop, bufsize, nr_requests) are derived from the size and block
 * size reported by get_size_and_block_size().  The function keeps up to
 * t->nr_threads IOs in flight, resubmitting one new IO for every completion,
 * until t->nr_requests IOs have completed.
 *
 * Roughly every five seconds a "completions, seconds, IOs/sec" line is
 * written to t->report and a progress line to stdout; a summary is printed to
 * stdout at the end.  Errors are reported on stderr and end the run for this
 * device.  Nothing is returned to the caller.
 */
static void time_device(struct time_device_context *t)
{
	int fd, ret;
	unsigned int i;
	int d;
	io_context_t ioct = 0;
	struct time_device_io *ios;
	struct io_event *events;
	struct iocb **iops;
	int clock_type = CLOCK_MONOTONIC;
	struct timespec now;
	double nowf, delta;
	unsigned int nr_submit, nr_queued;
	uint64_t nr_completed, nr_submitted, nr_failed;
	uint64_t offset = 0, max_size;
	uint32_t blksz;
	struct time_device_context tdc;
	double startf, lastf;
	uint64_t last_reported_completions, nr_ios;
	void *p = NULL;
	double getevents_returned = 0.0;
	unsigned int getevents_calls = 0;

	/*
	 * Work around a kernel bug -- io_getevents occasionally wedges if we
	 * feed it a file handle and no timeout, even if there are completed
	 * IOs waiting.  So, we always feed it a short timeout
	 * (GETEVENTS_TIMEOUT_NSEC), which keeps it from blocking forever.
	 * Block devices don't seem to have this problem.
	 * (Oh yes they do, in 3.13...)
	 */
	struct timespec timeout;
	memset(&timeout, 0, sizeof(timeout));

	/* Make shadow copy of our input parameters */
	memcpy(&tdc, t, sizeof(tdc));
	t = &tdc;

	/* Make sure CLOCK_MONOTONIC works */
	if (clock_gettime(clock_type, &now)) {
		clock_type = CLOCK_REALTIME;
	}

	/* Flip backwards params */
	if (t->start && t->stop && (t->start > t->stop)) {
		max_size = t->start;
		t->start = t->stop;
		t->stop = max_size;
	}

	/* Check parameters */
	get_size_and_block_size(t->dev, &max_size, &blksz, NULL);
	if (!max_size) {
		fprintf(stderr, "%s: Size is zero, skipping.\n", t->dev);
		return;
	}

	if (t->start > max_size) {
		fprintf(stderr, "%s: Starting at %"PRIu64", which is beyond "
			"the end of the device at %"PRIu64"!\n", t->dev,
			t->start, max_size);
		return;
	}

	if (t->stop > max_size) {
		fprintf(stderr, "%s: Clamping end to %"PRIu64".\n", t->dev,
			max_size);
		t->stop = max_size;
	} else if (!t->stop) {
		t->stop = max_size;
	}

	if (!t->bufsize)
		t->bufsize = blksz;

	if (!t->nr_requests)
		t->nr_requests = (t->stop - t->start) / blksz;

	if (t->report != stdout)
		fprintf(stdout, "%s: Using %lu:%lu threads to seek %"PRIu64
			" times from %"PRIu64" to %"PRIu64" in %"PRIu64
			" chunks.\n", t->dev, t->nr_threads, t->min_nr_events,
			t->nr_requests, t->start, t->stop, t->bufsize);
	fprintf(t->report, "%s: Using %lu:%lu threads to seek %"PRIu64" times "
		"from %"PRIu64" to %"PRIu64" in %"PRIu64" byte chunks.\n",
		t->dev, t->nr_threads, t->min_nr_events, t->nr_requests,
		t->start, t->stop, t->bufsize);
	fflush(t->report);
	fflush(stdout);

	fd = open(t->dev, t->flags | O_LARGEFILE);
	if (fd < 0) {
		perror(t->dev);
		return;
	}

	/* Allocate a bunch of IO control structures */
	ios = malloc(sizeof(*ios) * t->nr_threads);
	if (!ios)
		goto out;

	/* Allocate and lock the data buffer */
	ret = posix_memalign(&p, blksz, t->bufsize);
	if (ret) {
		/* posix_memalign returns the error instead of setting errno */
		errno = ret;
		perror("memalign");
		goto out2;
	}
	memset(p, 0, t->bufsize);
#ifdef _POSIX_MEMLOCK_RANGE
	if (use_mlock && mlock(p, t->bufsize)) {
		perror("mlock");
		goto out2;
	}
#else
# warning mlock not present
#endif

	/* Every IO slot shares the one data buffer */
	for (i = 0; i < t->nr_threads; i++)
		ios[i].buf = p;

	events = malloc(sizeof(*events) * t->nr_threads);
	if (!events)
		goto out2;
	iops = malloc(sizeof(*iops) * t->nr_threads);
	if (!iops)
		goto out4;

	/* Lock everything in memory */
#ifdef _POSIX_MEMLOCK_RANGE
	if (use_mlock && (mlock(ios, sizeof(*ios) * t->nr_threads) ||
	    mlock(events, sizeof(*events) * t->nr_threads) ||
	    mlock(iops, sizeof(*iops) * t->nr_threads))) {
		perror("mlock");
		goto out6;
	}
#endif

	ret = io_queue_init(t->nr_threads, &ioct);
	if (ret) {
		/* libaio hands back a negative errno; it does not set errno */
		errno = -ret;
		perror("io_setup");
		goto out6;
	}

	nr_submit = t->nr_threads;
	nr_completed = nr_submitted = nr_failed = 0;
	for (i = 0; i < nr_submit; i++)
		iops[i] = &ios[i].iocb;

	/* Increase IO priority */
	if (boost_ioprio && bump_priority())
		goto out_destroy;

	clock_gettime(clock_type, &now);
	lastf = (now.tv_sec) + ((double)now.tv_nsec / 1000000000);
	startf = nowf = lastf;
	last_reported_completions = 0;
	offset = t->start;

	while (nr_completed < t->nr_requests) {
		if (nr_submit > t->nr_requests - nr_submitted)
			nr_submit = t->nr_requests - nr_submitted;

		/* Prepare the next batch of IO */
		for (i = 0; i < nr_submit; i++) {
			struct iocb *iocb = iops[i];
			struct time_device_io *tdi = container_of(iocb,
					struct time_device_io, iocb);
			if (!t->linear)
				offset = get_randnum_align(t->start, t->stop,
							   t->bufsize);
			t->io_prep_fn(iocb, fd, tdi->buf, t->bufsize,
				      offset);
			if (t->linear) {
				offset += t->bufsize;
				/* Wrap before an IO would run past the end */
				if (offset + t->bufsize > t->stop)
					offset = t->start;
			}
		}
		nr_submitted += nr_submit;

		/*
		 * Submit IO.  io_submit may accept only part of the batch, so
		 * keep going until every prepared iocb has been queued;
		 * otherwise the completion count could never reach
		 * nr_requests.
		 */
		for (nr_queued = 0; nr_queued < nr_submit; nr_queued += ret) {
			ret = io_submit(ioct, nr_submit - nr_queued,
					iops + nr_queued);
			if (ret <= 0) {
				errno = ret ? -ret : EAGAIN;
				perror("io_submit");
				goto out_destroy;
			}
		}

		timeout.tv_nsec = GETEVENTS_TIMEOUT_NSEC;
		ret = io_getevents(ioct, t->min_nr_events, t->nr_threads,
				   events, &timeout);
		if (ret < 0) {
			errno = -ret;
			perror("io_getevents");
			goto out_destroy;
		}
		getevents_returned += ret;
		getevents_calls++;

		clock_gettime(clock_type, &now);
		nowf = (now.tv_sec) + ((double)now.tv_nsec / 1000000000);

		if (timeout.tv_nsec != GETEVENTS_TIMEOUT_NSEC)
			printf("WARNING: io_getevents timed out!\n");

		/*
		 * Complete the IO: each finished iocb is moved to the front
		 * of iops[] so it is reused for the next batch.
		 */
		for (d = 0; d < ret; d++) {
			struct iocb *iocb = events[d].obj;
			iops[d] = iocb;
			/* A failed IO reports a negative errno in res */
			if ((long)events[d].res < 0)
				nr_failed++;
			nr_completed++;
		}

		/* Occasionally file a report */
		if (nowf - lastf > 5) {
			nr_ios = nr_completed - last_reported_completions;
			delta = nowf - lastf;
			fprintf(t->report, "%"PRIu64", %.2f, %.2f\n",
				nr_ios, delta, nr_ios / delta);
			fprintf(stdout, "%s: %"PRIu64" seeks (%.2f%%)"
				"            \r",
				t->dev, nr_completed,
				((double)nr_completed) / t->nr_requests * 100);
			if (t->report != stdout)
				fflush(t->report);
			fflush(stdout);
			last_reported_completions = nr_completed;
			lastf = nowf;
		}

		/* Replace exactly as many IOs as just completed */
		nr_submit = ret;
	}

	/* Send one last report */
	nr_ios = nr_completed - last_reported_completions;
	if (nr_ios) {
		clock_gettime(clock_type, &now);
		nowf = (now.tv_sec) + ((double)now.tv_nsec / 1000000000);
		delta = nowf - lastf;
		fprintf(stdout, "%"PRIu64", %.2f, %.2f\n", nr_ios, delta,
			nr_ios / delta);
		/* The report file gets the final interval too */
		if (t->report != stdout) {
			fprintf(t->report, "%"PRIu64", %.2f, %.2f\n", nr_ios,
				delta, nr_ios / delta);
			fflush(t->report);
		}
	}
	fprintf(stdout, "Average iops: %.2f, average iocbs returned: %.2f\n",
		nr_completed / (nowf - startf),
		getevents_returned / getevents_calls);
	if (nr_failed)
		fprintf(stderr, "%s: %"PRIu64" IOs completed with an error.\n",
			t->dev, nr_failed);

	/* Error paths land here once the AIO context exists */
out_destroy:
	ret = io_destroy(ioct);
	if (ret) {
		errno = -ret;
		perror("io_destroy");
	}
out6:
#ifdef _POSIX_MEMLOCK_RANGE
	ret = munlockall();
	if (ret)
		perror("munlockall");
#endif
	free(iops);
out4:
	free(events);
out2:
#ifdef _POSIX_MEMLOCK_RANGE
	if (p)
		munlock(p, t->bufsize);
#endif
	free(p);
	free(ios);
out:
	close(fd);
	return;
}

/* Print the command-line usage summary to stdout. */
static void print_help(void) {
	printf("Usage: %s [options] device [devices...]\n", PACKAGE);
	printf("\n");
	printf("Options:\n");
	printf(" -b	Start test at this location.\n");
	printf(" -c	Use memory buffer of this size.\n");
#ifdef O_DIRECT
	printf(" -d     Do not bypass disk cache via O_DIRECT.\n");
#endif
	printf(" -e	End test after this location.\n");
	printf(" -i	Don't boost IO priority.\n");
	printf(" -l	Perform a linear (instead of random) test.\n");
	printf(" -m	Wait for this many IO completions per io_getevents call.\n");
	printf(" -n	Run with the specified number of threads.\n");
	printf(" -o	Save output in the file \"report\".\n");
	printf(" -s	Perform no more than this many seeks.\n");
	printf(" -w	Destructive write test.\n");
	/* -y clears use_mlock, i.e. it turns memory locking OFF */
	printf(" -y	Don't lock buffers in memory (allows them to be swapped out).\n");
	printf(" -z	Use O_SYNC.\n");
}

/*
 * Parse the command line into a time_device_context and run time_device()
 * once for each device named after the options.
 *
 * Exit status: 0 after running the tests, 1 on a usage error (unknown
 * option, no device given, or -m larger than -n), 2 if the -o report file
 * cannot be opened.
 */
int main(int argc, char *argv[])
{
	int i, c, sync;
	int ret = 0;
	struct time_device_context tdc;
	int access_flag, direct_flag;

	memset(&tdc, 0, sizeof(tdc));
	sync = 0;
	tdc.nr_threads = 64;
	direct_flag = default_odirect_flag;
	access_flag = O_RDONLY;
	tdc.io_prep_fn = io_prep_pread;
	tdc.report = stdout;

	fprintf(stdout, "%s %s, Copyright (C) 2013-2018 Darrick Wong.\n",
		PACKAGE, PACKAGE_VERSION);

	/* parse args */
	while((c = getopt(argc, argv, "wb:e:lc:o:s:dn:m:iyz")) != -1) {
		switch (c) {
		case 'o':
			/* A repeated -o replaces the earlier report file */
			if (tdc.report != stdout)
				fclose(tdc.report);
			tdc.report = fopen(optarg, "w+");
			if (!tdc.report) {
				perror(optarg);
				return 2;
			}
			break;
		case 'b':
			tdc.start = get_number(optarg);
			break;
		case 'e':
			tdc.stop = get_number(optarg);
			break;
		case 'c':
			tdc.bufsize = get_number(optarg);
			break;
		case 's':
			tdc.nr_requests = get_number(optarg);
			break;
		case 'd':
			direct_flag = 0;
			break;
		case 'w':
			tdc.io_prep_fn = io_prep_pwrite;
			access_flag = O_WRONLY;
			break;
		case 'n':
			tdc.nr_threads = get_number(optarg);
			break;
		case 'i':
			boost_ioprio = 0;
			break;
		case 'm':
			tdc.min_nr_events = get_number(optarg);
			break;
		case 'y':
			use_mlock = 0;
			break;
		case 'z':
			sync = 1;
			break;
		case 'l':
			tdc.linear = 1;
			break;
		default:
			print_help();
			ret = 1;
			goto out;
		}
	}
	if (!tdc.min_nr_events)
		tdc.min_nr_events = 4;
	tdc.flags = access_flag | direct_flag;
	if (sync)
		tdc.flags |= O_SYNC;

	/*
	 * Asking io_getevents for more completions than can ever be in
	 * flight cannot work, so stop here instead of running the test.
	 */
	if (tdc.min_nr_events > tdc.nr_threads) {
		printf("Won't pick up more events than are outstanding.\n");
		print_help();
		ret = 1;
		goto out;
	}
	/* Nothing to test is a usage error as well */
	if (optind == argc) {
		print_help();
		ret = 1;
		goto out;
	}

	for (i = optind; i < argc; i++) {
		tdc.dev = argv[i];
		time_device(&tdc);
	}

out:
	if (tdc.report != stdout)
		fclose(tdc.report);
	return ret;
}
