/*
 * raspberrypi_axi_monitor.c
 *
 * Author: james.hughes@raspberrypi.org
 *
 * Raspberry Pi AXI performance counters.
 *
 * Copyright (C) 2017 Raspberry Pi Trading Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/debugfs.h>
#include <linux/devcoredump.h>
#include <linux/device.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/mutex.h>
#include <linux/of.h>
#include <linux/platform_device.h>

#include <soc/bcm2835/raspberrypi-firmware.h>

#define NUM_MONITORS 2
#define NUM_BUS_WATCHERS_PER_MONITOR 3

#define SYSTEM_MONITOR 0
#define VPU_MONITOR 1

#define MAX_BUSES 16
#define DEFAULT_SAMPLE_TIME 100

#define NUM_BUS_WATCHER_RESULTS 9

/* Counter values captured from a single bus watcher.
 *
 * The anonymous union lets the same NUM_BUS_WATCHER_RESULTS u32 values be
 * accessed either as a flat array (results[], which read_bus_watcher() fills
 * in register order starting at BW_ATRANS_OFFSET) or through named fields.
 * The named fields are declared in the same order as the BW_*_OFFSET
 * constants.
 */
struct bus_watcher_data {
	union	{
		/* Raw view, indexed in register order */
		u32 results[NUM_BUS_WATCHER_RESULTS];
		/* Named view of the same storage */
		struct {
			u32 atrans;
			u32 atwait;
			u32 amax;
			u32 wtrans;
			u32 wtwait;
			u32 wmax;
			u32 rtrans;
			u32 rtwait;
			u32 rmax;
		};
	};
};


/* Driver state: one instance, reachable through the file-scope 'state'
 * pointer.
 */
struct rpi_axiperf {
	/* Platform device this driver instance is bound to */
	struct platform_device *dev;
	/* Top level debugfs directory for this driver */
	struct dentry *root_folder;

	/* Kernel thread that repeatedly runs monitor() */
	struct task_struct *monitor_thread;
	/* Taken by monitor() and myreader() around accesses to monitor[] */
	struct mutex lock;

	/* Firmware handle used for the mailbox register accessors */
	struct rpi_firmware *firmware;

	/* Time to sample each set of buses for; passed to msleep() in
	 * monitor(), so the unit is milliseconds
	 */
	int sample_time;

	/* Now storage for the per monitor settings and the resulting
	 * performance figures
	 */
	struct {
		/* Bit field of buses we want to monitor */
		int bus_enabled;
		/* Bus filter ID; when non-zero the low 5 bits are written to
		 * the watcher control register as an ID filter
		 */
		int bus_filter;
		/* The current buses being monitored on this monitor */
		int current_bus[NUM_BUS_WATCHERS_PER_MONITOR];
		/* The last bus monitored on this monitor */
		int last_monitored;

		/* Set true if this monitor must use the mailbox interface
		 * rather than access registers directly.
		 */
		int use_mailbox_interface;

		/* Current result values, indexed by bus number */
		struct bus_watcher_data results[MAX_BUSES];

		/* debugfs directory holding this monitor's files */
		struct dentry *debugfs_entry;
		/* Register base: an ioremapped address for direct access, or
		 * the raw value from the device tree "reg" property when the
		 * mailbox interface is used
		 */
		void __iomem *base_address;

	}  monitor[NUM_MONITORS];

};

/* The single driver instance. Allocated in rpi_axiperf_probe() and used
 * directly by the register helpers and the debugfs callbacks.
 */
static struct rpi_axiperf *state;

/* Two monitors, System and VPU, each with the following register sets.
 * Each monitor has three bus watchers, and each watcher can only observe one
 * bus at a time, so the enabled buses are time shared across the watchers,
 * giving each bus 100ms (default, settable via debugfs) of time on its
 * associated monitor.
 * Results from the three bus watchers per monitor are recorded and exposed
 * through debugfs.
 */

/* general registers */
/* Offset of the general control register. There is no initializer, so as a
 * file-scope object it is zero, i.e. offset 0 from the monitor base.
 */
const int GEN_CTRL;

/* Bits written to GEN_CTRL by set_monitor_control() */
const int GEN_CTL_ENABLE_BIT	= BIT(0);
const int GEN_CTL_RESET_BIT	= BIT(1);

/* Bus watcher registers */
/* Distance in bytes between consecutive bus watcher register banks */
const int BW_PITCH		= 0x40;

/* Control register offset of each of the three bus watchers */
const int BW0_CTRL		= 0x40;
const int BW1_CTRL		= 0x80;
const int BW2_CTRL		= 0xc0;

/* Result register offsets, relative to a watcher's control register.
 * read_bus_watcher() reads them as one contiguous run of u32 values starting
 * at BW_ATRANS_OFFSET.
 */
const int BW_ATRANS_OFFSET	= 0x04;
const int BW_ATWAIT_OFFSET	= 0x08;
const int BW_AMAX_OFFSET	= 0x0c;
const int BW_WTRANS_OFFSET	= 0x10;
const int BW_WTWAIT_OFFSET	= 0x14;
const int BW_WMAX_OFFSET	= 0x18;
const int BW_RTRANS_OFFSET	= 0x1c;
const int BW_RTWAIT_OFFSET	= 0x20;
const int BW_RMAX_OFFSET	= 0x24;

/* Bits in a bus watcher control register */
const int BW_CTRL_RESET_BIT	= BIT(31);
const int BW_CTRL_ENABLE_BIT	= BIT(30);
const int BW_CTRL_ENABLE_ID_FILTER_BIT	= BIT(29);
const int BW_CTRL_LIMIT_HALT_BIT	= BIT(28);

/* Field positions within a bus watcher control register.
 * BW_CTRL_BUS_WATCH_SHIFT has no initializer and is therefore zero.
 */
const int BW_CTRL_SOURCE_SHIFT	= 8;
const int BW_CTRL_SOURCE_MASK	= GENMASK(12, 8); // 5 bits
const int BW_CTRL_BUS_WATCH_SHIFT;
const int BW_CTRL_BUS_WATCH_MASK = GENMASK(5, 0); // 6 bits
const int BW_CTRL_BUS_FILTER_SHIFT = 8;

/* Human readable names for bus filter IDs. myreader() indexes this table
 * with the low 5 bits of a monitor's bus_filter value; entry 0 is empty.
 */
const static char *bus_filter_strings[] = {
	"",
	"CORE0_V",
	"ICACHE0",
	"DCACHE0",
	"CORE1_V",
	"ICACHE1",
	"DCACHE1",
	"L2_MAIN",
	"HOST_PORT",
	"HOST_PORT2",
	"HVS",
	"ISP",
	"VIDEO_DCT",
	"VIDEO_SD2AXI",
	"CAM0",
	"CAM1",
	"DMA0",
	"DMA1",
	"DMA2_VPU",
	"JPEG",
	"VIDEO_CME",
	"TRANSPOSER",
	"VIDEO_FME",
	"CCP2TX",
	"USB",
	"V3D0",
	"V3D1",
	"V3D2",
	"AVE",
	"DEBUG",
	"CPU",
	"M30"
};

/* Number of entries in bus_filter_strings[] */
const int num_bus_filters = ARRAY_SIZE(bus_filter_strings);

/* Names of the buses on the System monitor. myreader() indexes this table by
 * bus number, i.e. the bit position in that monitor's bus_enabled mask.
 */
const static char *system_bus_string[] = {
	"DMA_L2",
	"TRANS",
	"JPEG",
	"SYSTEM_UC",
	"DMA_UC",
	"SYSTEM_L2",
	"CCP2TX",
	"MPHI_RX",
	"MPHI_TX",
	"HVS",
	"H264",
	"ISP",
	"V3D",
	"PERIPHERAL",
	"CPU_UC",
	"CPU_L2"
};

/* Number of entries in system_bus_string[] */
const int num_system_buses = ARRAY_SIZE(system_bus_string);

/* Names of the buses on the VPU monitor. myreader() indexes this table by
 * bus number, i.e. the bit position in that monitor's bus_enabled mask.
 */
const static char *vpu_bus_string[] = {
	"VPU1_D_L2",
	"VPU0_D_L2",
	"VPU1_I_L2",
	"VPU0_I_L2",
	"SYSTEM_L2",
	"L2_FLUSH",
	"DMA_L2",
	"VPU1_D_UC",
	"VPU0_D_UC",
	"VPU1_I_UC",
	"VPU0_I_UC",
	"SYSTEM_UC",
	"L2_OUT",
	"DMA_UC",
	"SDRAM",
	"L2_IN"
};

/* Number of entries in vpu_bus_string[] */
const int num_vpu_buses = ARRAY_SIZE(vpu_bus_string);

/* debugfs directory name for each monitor, indexed by SYSTEM_MONITOR /
 * VPU_MONITOR.
 */
const static char *monitor_name[] = {
	"System",
	"VPU"
};

/* Write @value to the register at byte offset @reg from the base address of
 * monitor number @monitor, using a direct MMIO access.
 */
static inline void write_reg(int monitor, int reg, u32 value)
{
	void __iomem *target = state->monitor[monitor].base_address + reg;

	writel(value, target);
}

/* Return the value of the register at byte offset @reg from the base address
 * of monitor number @monitor, using a direct MMIO access.
 */
static inline u32 read_reg(int monitor, u32 reg)
{
	void __iomem *source = state->monitor[monitor].base_address + reg;

	return readl(source);
}

/* Read all NUM_BUS_WATCHER_RESULTS counters of one bus watcher.
 *
 * @monitor: monitor index (SYSTEM_MONITOR or VPU_MONITOR)
 * @watcher: byte offset of the watcher's control register (BWn_CTRL)
 * @results: destination for NUM_BUS_WATCHER_RESULTS u32 values
 *
 * Uses the firmware mailbox when the monitor is flagged as requiring it,
 * otherwise reads the registers directly. On a mailbox failure an error is
 * logged (once) and @results is left untouched.
 */
static void read_bus_watcher(int monitor, int watcher, u32 *results)
{
	void __iomem *addr = state->monitor[monitor].base_address
				+ watcher + BW_ATRANS_OFFSET;

	if (state->monitor[monitor].use_mailbox_interface) {
		/* Request layout: start address, register count, then space
		 * for the returned values. The initializer zero fills the
		 * result slots so no stale stack data is handed to firmware.
		 * The address goes through unsigned long so the narrowing to
		 * 32 bits is explicit on 64-bit builds.
		 */
		u32 tmp[2 + NUM_BUS_WATCHER_RESULTS] = {
			(u32)(unsigned long)addr,
			NUM_BUS_WATCHER_RESULTS,
		};
		int err;

		err = rpi_firmware_property(state->firmware,
					    RPI_FIRMWARE_GET_PERIPH_REG,
					    tmp, sizeof(tmp));

		if (err < 0 || tmp[1] != NUM_BUS_WATCHER_RESULTS)
			dev_err_once(&state->dev->dev,
				     "Failed to read bus watcher\n");
		else
			memcpy(results, &tmp[2],
			       NUM_BUS_WATCHER_RESULTS * sizeof(u32));
	} else {
		int i;

		/* Result registers are consecutive 32-bit words */
		for (i = 0; i < NUM_BUS_WATCHER_RESULTS; i++, addr += 4)
			*results++ = readl(addr);
	}
}

/* Write @set to the general control register (GEN_CTRL) of monitor number
 * @monitor, through the firmware mailbox when that monitor requires it and
 * by direct register access otherwise. A mailbox failure is logged once.
 */
static void set_monitor_control(int monitor, u32 set)
{
	if (state->monitor[monitor].use_mailbox_interface) {
		void __iomem *addr = state->monitor[monitor].base_address +
					GEN_CTRL;
		/* Request layout: register address, register count, value.
		 * The address goes through unsigned long so the narrowing to
		 * 32 bits is explicit on 64-bit builds.
		 */
		u32 tmp[3] = { (u32)(unsigned long)addr, 1, set };
		int err = rpi_firmware_property(state->firmware,
						RPI_FIRMWARE_SET_PERIPH_REG,
						tmp, sizeof(tmp));

		if (err < 0 || tmp[1] != 1)
			dev_err_once(&state->dev->dev,
				     "Failed to set monitor control\n");
	} else {
		write_reg(monitor, GEN_CTRL, set);
	}
}

/* Write @set to a bus watcher control register.
 *
 * @monitor: monitor index (SYSTEM_MONITOR or VPU_MONITOR)
 * @watcher: byte offset of the watcher's control register (BWn_CTRL)
 * @set:     value to write
 *
 * Goes through the firmware mailbox when the monitor requires it and uses a
 * direct register write otherwise. A mailbox failure is logged once.
 */
static void set_bus_watcher_control(int monitor, int watcher, u32 set)
{
	if (state->monitor[monitor].use_mailbox_interface) {
		void __iomem *addr = state->monitor[monitor].base_address +
					watcher;
		/* Request layout: register address, register count, value.
		 * The address goes through unsigned long so the narrowing to
		 * 32 bits is explicit on 64-bit builds.
		 */
		u32 tmp[3] = { (u32)(unsigned long)addr, 1, set };
		int err = rpi_firmware_property(state->firmware,
						RPI_FIRMWARE_SET_PERIPH_REG,
						tmp, sizeof(tmp));

		if (err < 0 || tmp[1] != 1)
			dev_err_once(&state->dev->dev,
				     "Failed to set bus watcher control\n");
	} else {
		write_reg(monitor, watcher, set);
	}
}

static void monitor(struct rpi_axiperf *state)
{
	int monitor, num_buses[NUM_MONITORS];

	mutex_lock(&state->lock);

	for (monitor = 0; monitor < NUM_MONITORS; monitor++) {
		typeof(state->monitor[0]) *mon = &(state->monitor[monitor]);

		/* Anything enabled? */
		if (mon->bus_enabled == 0) {
			/* No, disable all monitoring for this monitor */
			set_monitor_control(monitor, GEN_CTL_RESET_BIT);
		} else {
			int i;

			/* Find out how many busses we want to monitor, and
			 * spread our 3 actual monitors over them
			 */
			num_buses[monitor] = hweight32(mon->bus_enabled);
			num_buses[monitor] = min(num_buses[monitor],
						 NUM_BUS_WATCHERS_PER_MONITOR);

			for (i = 0; i < num_buses[monitor]; i++) {
				int bus_control;

				do {
					mon->last_monitored++;
					mon->last_monitored &= 0xf;
				} while ((mon->bus_enabled &
					 (1 << mon->last_monitored)) == 0);

				mon->current_bus[i] = mon->last_monitored;

				/* Reset the counters */
				set_bus_watcher_control(monitor,
							BW0_CTRL +
							i*BW_PITCH,
							BW_CTRL_RESET_BIT);

				bus_control = BW_CTRL_ENABLE_BIT |
						mon->current_bus[i];

				if (mon->bus_filter) {
					bus_control |=
						BW_CTRL_ENABLE_ID_FILTER_BIT;
					bus_control |=
						((mon->bus_filter & 0x1f)
						<< BW_CTRL_BUS_FILTER_SHIFT);
				}

				// Start capture
				set_bus_watcher_control(monitor,
							BW0_CTRL + i*BW_PITCH,
							bus_control);
			}
		}

		/* start monitoring */
		set_monitor_control(monitor, GEN_CTL_ENABLE_BIT);
	}

	mutex_unlock(&state->lock);

	msleep(state->sample_time);

	/* Now read the results */

	mutex_lock(&state->lock);
	for (monitor = 0; monitor < NUM_MONITORS; monitor++) {
		typeof(state->monitor[0]) *mon = &(state->monitor[monitor]);

		/* Anything enabled? */
		if (mon->bus_enabled == 0) {
			/* No, disable all monitoring for this monitor */
			set_monitor_control(monitor, 0);
		} else {
			int i;

			for (i = 0; i < num_buses[monitor]; i++) {
				int bus = mon->current_bus[i];

				read_bus_watcher(monitor,
					BW0_CTRL + i*BW_PITCH,
					(u32 *)&mon->results[bus].results);
			}
		}
	}
	mutex_unlock(&state->lock);
}

/* Kernel thread body: runs sampling cycles back to back until the thread is
 * asked to stop. At least one cycle always runs, and the stop request is
 * checked after each cycle.
 *
 * @data: the struct rpi_axiperf passed to kthread_run()
 *
 * Returns 0.
 */
static int monitor_thread(void *data)
{
	struct rpi_axiperf *axiperf = data;

	do {
		monitor(axiperf);
	} while (!kthread_should_stop());

	return 0;
}

/* debugfs read handler for a monitor's "data" file.
 *
 * Formats the current results of every enabled bus on the monitor selected
 * by fp->private_data into a temporary buffer and copies the requested
 * window of it to user space.
 *
 * @fp:          open file; private_data carries the monitor index
 * @user_buffer: destination user buffer
 * @count:       maximum number of bytes to copy
 * @position:    file offset, advanced by the number of bytes copied
 *
 * Returns the number of bytes copied, -ENOMEM if the temporary buffer could
 * not be allocated, or the error from simple_read_from_buffer(). If the
 * report does not fit in the temporary buffer it is truncated.
 */
static ssize_t myreader(struct file *fp, char __user *user_buffer,
			size_t count, loff_t *position)
{
#define INIT_BUFF_SIZE 2048

	int i;
	/* The index was stored as a pointer sized value; go through long so
	 * the narrowing to int is explicit on 64-bit builds.
	 */
	int idx = (int)(long)(fp->private_data);
	int num_buses, cnt;
	char *string_buffer;
	int buff_size = INIT_BUFF_SIZE;
	char *p;
	typeof(state->monitor[0]) *mon;
	ssize_t ret;

	/* Validate before indexing; valid indices are 0..NUM_MONITORS-1 */
	if (idx < 0 || idx >= NUM_MONITORS)
		idx = 0;

	mon = &(state->monitor[idx]);
	num_buses = idx == SYSTEM_MONITOR ? num_system_buses : num_vpu_buses;

	string_buffer = kmalloc(buff_size, GFP_KERNEL);
	if (!string_buffer) {
		dev_err(&state->dev->dev,
				"Failed temporary string allocation\n");
		return -ENOMEM;
	}

	p = string_buffer;

	mutex_lock(&state->lock);

	if (mon->bus_filter) {
		/* Clamp to the last valid table index */
		int filt = min(mon->bus_filter & 0x1f, num_bus_filters - 1);

		cnt = snprintf(p, buff_size,
			       "\nMonitoring transactions from %s only\n",
			       bus_filter_strings[filt]);
		if (cnt >= buff_size)
			goto truncated;

		p += cnt;
		buff_size -= cnt;
	}

	cnt = snprintf(p, buff_size, "     Bus   |    Atrans    Atwait      AMax    Wtrans    Wtwait      WMax    Rtrans    Rtwait      RMax\n"
				     "======================================================================================================\n");

	if (cnt >= buff_size)
		goto truncated;

	p += cnt;
	buff_size -= cnt;

	for (i = 0; i < num_buses; i++) {
#define DIVIDER (1024)
		typeof(mon->results[0]) *res = &(mon->results[i]);

		if (!(mon->bus_enabled & (1 << i)))
			continue;

		cnt = snprintf(p, buff_size,
				"%10s | %8uK %8uK %8uK %8uK %8uK %8uK %8uK %8uK %8uK\n",
				idx == SYSTEM_MONITOR ?
					system_bus_string[i] :
					vpu_bus_string[i],
				res->atrans/DIVIDER,
				res->atwait/DIVIDER,
				res->amax/DIVIDER,
				res->wtrans/DIVIDER,
				res->wtwait/DIVIDER,
				res->wmax/DIVIDER,
				res->rtrans/DIVIDER,
				res->rtwait/DIVIDER,
				res->rmax/DIVIDER
				);
		if (cnt >= buff_size)
			goto truncated;

		p += cnt;
		buff_size -= cnt;
	}

	goto unlock;

truncated:
	/* The last entry did not fit. snprintf() filled the remaining space
	 * and NUL terminated it inside the buffer, so everything up to the
	 * final byte is valid output; use what we have.
	 */
	p = string_buffer + INIT_BUFF_SIZE - 1;

unlock:
	/* Every path that took the lock leaves through here */
	mutex_unlock(&state->lock);

	ret = simple_read_from_buffer(user_buffer, count, position,
				      string_buffer, p - string_buffer);

	kfree(string_buffer);

	return ret;
}

/* debugfs write handler for a monitor's "data" file.
 *
 * Currently a placeholder: the written data is ignored and @count is
 * returned so the write appears to succeed in full.
 */
static ssize_t mywriter(struct file *fp, const char __user *user_buffer,
			size_t count, loff_t *position)
{
	/* The index was stored as a pointer sized value; go through long so
	 * the narrowing to int is explicit on 64-bit builds.
	 */
	int idx = (int)(long)(fp->private_data);

	/* Valid indices are 0..NUM_MONITORS-1 */
	if (idx < 0 || idx >= NUM_MONITORS)
		idx = 0;

	/* At the moment, this does nothing, but in the future it could be
	 * used to reset counters etc
	 */
	return count;
}

/* File operations for each monitor's debugfs "data" file (created in
 * rpi_axiperf_probe()). The read and write handlers take the monitor index
 * from fp->private_data.
 */
static const struct file_operations fops_debug = {
	.read = myreader,
	.write = mywriter,
	.open = simple_open
};

static int rpi_axiperf_probe(struct platform_device *pdev)
{
	int ret = 0, i;
	struct device *dev = &pdev->dev;
	struct device_node *np = dev->of_node;
	struct device_node *fw_node;

	state = kzalloc(sizeof(struct rpi_axiperf), GFP_KERNEL);
	if (!state)
		return -ENOMEM;

	/* Get the firmware handle for future rpi-firmware-xxx calls */
	fw_node = of_parse_phandle(np, "firmware", 0);
	if (!fw_node) {
		dev_err(dev, "Missing firmware node\n");
		return -ENOENT;
	}

	state->firmware = rpi_firmware_get(fw_node);
	if (!state->firmware)
		return -EPROBE_DEFER;

	/* Special case for the VPU monitor, we must use the mailbox interface
	 * as it is not accessible from the ARM address space.
	 */
	state->monitor[VPU_MONITOR].use_mailbox_interface = 1;
	state->monitor[SYSTEM_MONITOR].use_mailbox_interface = 0;

	for (i = 0; i < NUM_MONITORS; i++) {
		if (state->monitor[i].use_mailbox_interface) {
			 of_property_read_u32_index(np, "reg", i*2,
				(u32 *)(&state->monitor[i].base_address));
		} else {
			struct resource *resource =
				platform_get_resource(pdev, IORESOURCE_MEM, i);

			state->monitor[i].base_address =
				devm_ioremap_resource(&pdev->dev, resource);
		}

		if (IS_ERR(state->monitor[i].base_address))
			return PTR_ERR(state->monitor[i].base_address);

		/* Enable all buses by default */
		state->monitor[i].bus_enabled = 0xffff;
	}

	state->dev = pdev;
	platform_set_drvdata(pdev, state);

	state->sample_time = DEFAULT_SAMPLE_TIME;

	/* Set up all the debugfs stuff */
	state->root_folder = debugfs_create_dir(KBUILD_MODNAME, NULL);

	for (i = 0; i < NUM_MONITORS; i++) {
		state->monitor[i].debugfs_entry =
			debugfs_create_dir(monitor_name[i], state->root_folder);
		if (IS_ERR(state->monitor[i].debugfs_entry))
			state->monitor[i].debugfs_entry = NULL;

		debugfs_create_file("data", 0444,
				    state->monitor[i].debugfs_entry,
				    (void *)i, &fops_debug);
		debugfs_create_u32("enable", 0644,
				   state->monitor[i].debugfs_entry,
				   &state->monitor[i].bus_enabled);
		debugfs_create_u32("filter", 0644,
				   state->monitor[i].debugfs_entry,
				   &state->monitor[i].bus_filter);
		debugfs_create_u32("sample_time", 0644,
				   state->monitor[i].debugfs_entry,
				   &state->sample_time);
	}

	mutex_init(&state->lock);

	state->monitor_thread = kthread_run(monitor_thread, state,
					    "rpi-axiperfmon");

	return ret;

}

/* Unbind the driver: stop the sampling thread, then remove the debugfs
 * tree.
 *
 * NOTE(review): this function does not free 'state'; confirm that its
 * allocation in probe is released by other means.
 *
 * Returns 0.
 */
static int rpi_axiperf_remove(struct platform_device *dev)
{
	/* kthread_stop() must only be given a thread that really started */
	if (state->monitor_thread && !IS_ERR(state->monitor_thread)) {
		kthread_stop(state->monitor_thread);
		state->monitor_thread = NULL;
	}

	debugfs_remove_recursive(state->root_folder);
	state->root_folder = NULL;

	return 0;
}

/* Device tree compatible strings handled by this driver; the empty entry
 * terminates the table.
 */
static const struct of_device_id rpi_axiperf_match[] = {
	{
		.compatible = "brcm,bcm2835-axiperf",
	},
	{},
};
MODULE_DEVICE_TABLE(of, rpi_axiperf_match);

/* Platform driver definition tying the probe/remove callbacks to the device
 * tree match table.
 */
static struct platform_driver rpi_axiperf_driver  = {
	.probe =	rpi_axiperf_probe,
	.remove =	rpi_axiperf_remove,
	.driver = {
		.name   = "rpi-bcm2835-axiperf",
		.of_match_table = of_match_ptr(rpi_axiperf_match),
	},
};

/* Generates the module init/exit code that registers and unregisters the
 * driver.
 */
module_platform_driver(rpi_axiperf_driver);

/* Module information */
MODULE_AUTHOR("James Hughes <james.hughes@raspberrypi.org>");
MODULE_DESCRIPTION("RPI AXI Performance monitor driver");
MODULE_LICENSE("GPL");

