/*
 * Copyright 2010-2011 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

#include <inttypes.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <time.h>
#include <syslog.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/time.h>

#include "sanlock_internal.h"
#include "diskio.h"
#include "log.h"
#include "paxos_lease.h"
#include "lockspace.h"
#include "resource.h"
#include "task.h"
#include "timeouts.h"
#include "mode_block.h"
#include "helper.h"

/* from cmd.c */
void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id);

/* from main.c */
int get_rand(int a, int b);

/* thread created in setup_token_manager() and joined in close_token_manager() */
static pthread_t resource_pt;
/* set by close_token_manager(); resource_thread exits when it sees this with no work pending */
static int resource_thread_stop;
/* set when a resource is flagged for release/examine; cleared by resource_thread when nothing is left */
static int resource_thread_work;
/* resources moved here by acquire_token() once the acquire has completed */
static struct list_head resources_held;
/* resources placed here by acquire_token() while the acquire is in progress */
static struct list_head resources_add;
/* resources whose last token was released and whose release is in progress */
static struct list_head resources_rem;
/* held around every access to the three lists above in this file */
static pthread_mutex_t resource_mutex;
/* signaled (with resource_mutex held) to wake resource_thread */
static pthread_cond_t resource_cond;


/*
 * Free a resource together with its lvb buffer, if one was attached.
 * The caller must already have unlinked r from any list.
 */
static void free_resource(struct resource *r)
{
	/* free(NULL) is a no-op, so an unset lvb needs no special casing */
	free(r->lvb);
	free(r);
}

/*
 * Report every known resource on fd via send_state_resource().
 * Resources on the held and add lists are reported once per token;
 * resources on the rem list are reported once, using the pid and
 * release_token_id saved in the resource.
 */
void send_state_resources(int fd)
{
	struct resource *res;
	struct token *tk;

	pthread_mutex_lock(&resource_mutex);

	list_for_each_entry(res, &resources_held, list) {
		list_for_each_entry(tk, &res->tokens, list) {
			send_state_resource(fd, res, "held", tk->pid, tk->token_id);
		}
	}

	list_for_each_entry(res, &resources_add, list) {
		list_for_each_entry(tk, &res->tokens, list) {
			send_state_resource(fd, res, "add", tk->pid, tk->token_id);
		}
	}

	list_for_each_entry(res, &resources_rem, list) {
		send_state_resource(fd, res, "rem", res->pid, res->release_token_id);
	}

	pthread_mutex_unlock(&resource_mutex);
}

/*
 * Read the on-disk lease for token and build an array describing its owners.
 *
 * On return through the common exit path:
 *   *send_buf  - malloc'ed array of struct sanlk_host (NULL if no owners or
 *                on error); the caller owns it
 *   *send_len  - size of that array in bytes
 *   *count     - number of owners counted (set once the lease was verified)
 *   res->lver  - leader lver; res->flags gets SANLK_RES_SHARED if any mode
 *                block has MBLOCK_SHARED set
 *
 * Returns 0 on success, a negative value from paxos_read_buf() or
 * paxos_verify_leader(), or -ENOMEM.  When paxos_read_buf() fails the
 * output arguments are not touched.
 */
int read_resource_owners(struct task *task, struct token *token,
			 struct sanlk_resource *res,
			 char **send_buf, int *send_len, int *count)
{
	struct leader_record leader;
	struct sync_disk *disk = &token->disks[0];
	struct sanlk_host *hosts = NULL;
	struct sanlk_host *slot;
	struct mode_block *mb;
	char *lease_buf = NULL;
	uint64_t host_id;
	int leader_owned;
	int nhosts = 0;
	int i, rv;

	/* we could in-line paxos_read_buf here like we do in read_mode_block */

	rv = paxos_read_buf(task, token, &lease_buf);
	if (rv < 0) {
		log_errot(token, "read_resource_owners read_buf rv %d", rv);

		/* the buffer is left alone when the read ended in SANLK_AIO_TIMEOUT */
		if (lease_buf && (rv != SANLK_AIO_TIMEOUT))
			free(lease_buf);
		return rv;
	}

	memcpy(&leader, lease_buf, sizeof(struct leader_record));

	rv = paxos_verify_leader(token, disk, &leader, "read_resource_owners");
	if (rv < 0)
		goto done;

	res->lver = leader.lver;

	/* the leader names an owner only when both timestamp and owner_id are set */
	leader_owned = (leader.timestamp && leader.owner_id);
	if (leader_owned)
		nhosts++;

	/* first pass: count the shared holders recorded in the mode blocks */
	for (i = 0; i < leader.num_hosts; i++) {
		mb = (struct mode_block *)(lease_buf + ((2 + i) * disk->sector_size) + MBLOCK_OFFSET);
		host_id = i + 1;

		if (!(mb->flags & MBLOCK_SHARED))
			continue;

		res->flags |= SANLK_RES_SHARED;

		/* the leader owner has already been counted above;
		   in the ex case it won't have a mode block set */
		if (leader_owned && (host_id == leader.owner_id))
			continue;

		nhosts++;
	}

	*count = nhosts;

	if (!nhosts) {
		rv = 0;
		goto done;
	}

	hosts = calloc(nhosts, sizeof(struct sanlk_host));
	if (!hosts) {
		nhosts = 0;
		rv = -ENOMEM;
		goto done;
	}
	slot = hosts;

	/*
	 * Usually when leader owner is set, it's an exclusive lock and
	 * we could skip to the end, but if we read while a new shared
	 * owner is being added, we'll see the leader owner set, and
	 * then may see other shared owners in the mode blocks.
	 */
	if (leader_owned) {
		slot->host_id = leader.owner_id;
		slot->generation = leader.owner_generation;
		slot->timestamp = leader.timestamp;
		slot++;
	}

	/* second pass: fill one entry per shared holder, in host_id order */
	for (i = 0; i < leader.num_hosts; i++) {
		mb = (struct mode_block *)(lease_buf + ((2 + i) * disk->sector_size) + MBLOCK_OFFSET);
		host_id = i + 1;

		if (!(mb->flags & MBLOCK_SHARED))
			continue;

		if (leader_owned && (host_id == leader.owner_id))
			continue;

		slot->host_id = host_id;
		slot->generation = mb->generation;
		slot++;
	}
	rv = 0;
 done:
	*send_len = nhosts * sizeof(struct sanlk_host);
	*send_buf = (char *)hosts;
	free(lease_buf);
	return rv;
}

/*
 * Decide whether host_id (at generation gen) should be treated as alive.
 *
 * Returns 1 (alive) whenever we don't have enough knowledge to know it's
 * really not alive, which forces the caller to fail.  Later we could have
 * this sit and wait (like paxos_lease_acquire) until we have waited long
 * enough or have enough knowledge to say it's safely dead (unless of course
 * we find it is alive while waiting).
 *
 * Returns 0 when the host_id lease is free, when the lease is owned by a
 * newer generation than gen, or when the host has not been seen alive for
 * longer than its host_dead_seconds.
 */
static int host_live(char *lockspace_name, uint64_t host_id, uint64_t gen)
{
	struct host_status hs;
	unsigned long long id = (unsigned long long)host_id;
	unsigned long long g = (unsigned long long)gen;
	uint64_t now;
	int dead_secs;
	int rv;

	rv = host_info(lockspace_name, host_id, &hs);
	if (rv) {
		log_debug("host_live %llu %llu yes host_info %d", id, g, rv);
		return 1;
	}

	if (!hs.last_check) {
		log_debug("host_live %llu %llu yes unchecked", id, g);
		return 1;
	}

	/* the host_id lease is free, not being used */
	if (!hs.timestamp) {
		log_debug("host_live %llu %llu no lease free", id, g);
		return 0;
	}

	/* the lease has moved on to a newer generation than the one asked about */
	if (hs.owner_generation > gen) {
		log_debug("host_live %llu %llu no old gen %llu", id, g,
			  (unsigned long long)hs.owner_generation);
		return 0;
	}

	now = monotime();

	/* the dead interval is derived from the other host's own io_timeout */
	dead_secs = calc_host_dead_seconds(hs.io_timeout);

	if (!hs.last_live) {
		/* never seen alive: measure from when we first checked it */
		if (now - hs.first_check > dead_secs) {
			log_debug("host_live %llu %llu no first_check %llu", id, g,
				  (unsigned long long)hs.first_check);
			return 0;
		}
	} else {
		if (now - hs.last_live > dead_secs) {
			log_debug("host_live %llu %llu no last_live %llu", id, g,
				  (unsigned long long)hs.last_live);
			return 0;
		}
	}

	log_debug("host_live %llu %llu yes recent first_check %llu last_live %llu",
		  id, g,
		  (unsigned long long)hs.first_check,
		  (unsigned long long)hs.last_live);

	return 1;
}

/*
 * Inspect the mode block inside dblock (the dblock at index q, which
 * belongs to host_id q + 1).  If MBLOCK_SHARED is set there, record that
 * host in token->shared_bitmap and bump token->shared_count.
 */
void check_mode_block(struct token *token, int q, char *dblock)
{
	struct mode_block *mb = (struct mode_block *)(dblock + MBLOCK_OFFSET);

	if (!(mb->flags & MBLOCK_SHARED))
		return;

	set_id_bit(q + 1, token->shared_bitmap, NULL);
	token->shared_count++;
}

/*
 * Rewrite the mode block of host_id on every disk of the token.
 *
 * For each disk the sector holding host_id's dblock (sector 2 + host_id - 1
 * from the disk offset) is read, the flags and generation of the mode block
 * inside it are replaced, and the sector is written back.  Processing stops
 * at the first failing read or write.
 *
 * Returns the result of the last i/o call, -EINVAL if the sector size is
 * zero, or -ENOMEM if the i/o buffer cannot be allocated.
 */
static int set_mode_block(struct task *task, struct token *token,
			  uint64_t host_id, uint64_t gen, uint32_t flags)
{
	struct sync_disk *disk = &token->disks[0];
	struct mode_block *mb;
	void *mem = NULL;
	char *iobuf;
	uint64_t offset;
	int num_disks = token->r.num_disks;
	int iobuf_len = disk->sector_size;
	int rv, d;

	if (!iobuf_len)
		return -EINVAL;

	if (posix_memalign(&mem, getpagesize(), iobuf_len))
		return -ENOMEM;

	iobuf = mem;
	rv = 0;

	for (d = 0; d < num_disks; d++) {
		disk = &token->disks[d];
		offset = disk->offset + ((2 + host_id - 1) * disk->sector_size);

		rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout);
		if (rv >= 0) {
			/* only the mode block fields change; the rest of the sector is kept */
			mb = (struct mode_block *)(iobuf + MBLOCK_OFFSET);
			mb->flags = flags;
			mb->generation = gen;

			rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout);
		}

		if (rv < 0)
			break;
	}

	if (rv >= 0) {
		log_token(token, "set_mode_block host_id %llu flags %x gen %llu",
			  (unsigned long long)host_id, flags, (unsigned long long)gen);
	} else {
		log_errot(token, "set_mode_block host_id %llu flags %x gen %llu d %d rv %d",
			  (unsigned long long)host_id, flags, (unsigned long long)gen, d, rv);
	}

	/* the buffer is not freed when the i/o ended in SANLK_AIO_TIMEOUT */
	if (rv != SANLK_AIO_TIMEOUT)
		free(iobuf);

	return rv;
}

/*
 * Read the mode block of host_id from every disk of the token and report,
 * in *max_gen, the largest generation found among the copies that have
 * MBLOCK_SHARED set (0 if none do).  Reading stops at the first failure.
 *
 * Returns the result of the last read, -EINVAL if the sector size is zero,
 * or -ENOMEM if the i/o buffer cannot be allocated (in those two cases
 * *max_gen is not written).
 */
static int read_mode_block(struct task *task, struct token *token,
			   uint64_t host_id, uint64_t *max_gen)
{
	struct sync_disk *disk = &token->disks[0];
	struct mode_block *mb;
	void *mem = NULL;
	char *iobuf;
	uint64_t offset;
	uint64_t highest = 0;
	int num_disks = token->r.num_disks;
	int iobuf_len = disk->sector_size;
	int rv, d;

	if (!iobuf_len)
		return -EINVAL;

	if (posix_memalign(&mem, getpagesize(), iobuf_len))
		return -ENOMEM;

	iobuf = mem;
	rv = 0;

	for (d = 0; d < num_disks; d++) {
		disk = &token->disks[d];
		offset = disk->offset + ((2 + host_id - 1) * disk->sector_size);

		rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout);
		if (rv < 0)
			break;

		mb = (struct mode_block *)(iobuf + MBLOCK_OFFSET);

		if ((mb->flags & MBLOCK_SHARED) &&
		    (!highest || mb->generation > highest))
			highest = mb->generation;
	}

	/* the buffer is not freed when the read ended in SANLK_AIO_TIMEOUT */
	if (rv != SANLK_AIO_TIMEOUT)
		free(iobuf);

	*max_gen = highest;
	return rv;
}

/*
 * Clear the shared mode blocks left behind by hosts that are no longer
 * alive.
 *
 * Every host_id in 1..num_hosts, other than our own, whose bit is set in
 * token->shared_bitmap is examined: its mode block is read to get the
 * generation, host_live() decides whether that host/generation is still
 * alive, and if not the mode block is zeroed on disk.
 *
 * On success *live_count is set to the number of shared holders that are
 * still alive.  On failure *live_count is not written.
 *
 * Returns a negative value from read_mode_block()/set_mode_block() on the
 * first failure, otherwise a non-negative value (0 if no host needed to be
 * examined).
 */
static int clear_dead_shared(struct task *task, struct token *token,
			     int num_hosts, int *live_count)
{
	uint64_t host_id, max_gen = 0;
	/* rv must start defined: the loop may skip every host (e.g. only our
	   own bit is set), in which case no i/o call ever assigns it */
	int i, rv = 0, live = 0;

	for (i = 0; i < num_hosts; i++) {
		host_id = i + 1;

		if (host_id == token->host_id)
			continue;

		if (!test_id_bit(host_id, token->shared_bitmap))
			continue;

		rv = read_mode_block(task, token, host_id, &max_gen);
		if (rv < 0) {
			log_errot(token, "clear_dead_shared read_mode_block %llu %d",
				  (unsigned long long)host_id, rv);
			return rv;
		}

		if (host_live(token->r.lockspace_name, host_id, max_gen)) {
			log_token(token, "clear_dead_shared host_id %llu gen %llu alive",
				  (unsigned long long)host_id, (unsigned long long)max_gen);
			live++;
			continue;
		}

		/* host is dead: wipe its mode block so it no longer counts as a holder */
		rv = set_mode_block(task, token, host_id, 0, 0);
		if (rv < 0) {
			log_errot(token, "clear_dead_shared host_id %llu set_mode_block %d",
				  (unsigned long long)host_id, rv);
			return rv;
		}

		log_token(token, "clear_dead_shared host_id %llu gen %llu dead and cleared",
			  (unsigned long long)host_id, (unsigned long long)max_gen);
	}

	*live_count = live;
	return rv;
}

/* the lvb is the sector after the dblock for host_id 2000, i.e. 2002
   (the dblock for host_id N is sector 2 + N - 1, so host_id 2000 is
   sector 2001).  The value is a sector index, multiplied by the disk
   sector_size and added to the disk offset to get the byte offset. */

#define LVB_SECTOR 2002

/*
 * Read the lvb sector of the token's first disk into the lvb buffer of the
 * token's resource (token->resource->lvb).
 *
 * Returns -EINVAL if the resource has no lvb buffer, otherwise the result
 * of read_iobuf().
 */
static int read_lvb_block(struct task *task, struct token *token)
{
	struct sync_disk *disk;
	struct resource *r;
	char *iobuf;
	uint64_t offset;
	int iobuf_len, rv;

	r = token->resource;
	iobuf = r->lvb;

	/* acquire_token only logs a failed lvb allocation and still calls us,
	   so the buffer may be missing; never hand a NULL buffer to the i/o */
	if (!iobuf)
		return -EINVAL;

	disk = &token->disks[0];
	iobuf_len = disk->sector_size;
	offset = disk->offset + (LVB_SECTOR * disk->sector_size);

	rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout);

	return rv;
}

/*
 * Write r->lvb (one sector) to the lvb sector of the token's first disk.
 * Returns the result of write_iobuf().
 */
static int write_lvb_block(struct task *task, struct resource *r, struct token *token)
{
	struct sync_disk *disk = &token->disks[0];
	int len = disk->sector_size;
	uint64_t offset = disk->offset + (LVB_SECTOR * disk->sector_size);

	return write_iobuf(disk->fd, offset, r->lvb, len, task, token->io_timeout);
}

/*
 * Copy lvblen bytes from lvb into the in-memory lvb of the held resource
 * named by res, and flag the resource so the lvb is written to disk when
 * the lease is released (R_LVB_WRITE_RELEASE).
 *
 * Returns 0 on success, -ENOENT if no held resource matches, -EINVAL if
 * the resource has no lvb buffer, -E2BIG if lvblen exceeds the leader's
 * sector_size.
 */
int res_set_lvb(struct sanlk_resource *res, char *lvb, int lvblen)
{
	struct resource *r;
	struct resource *match = NULL;
	int rv;

	pthread_mutex_lock(&resource_mutex);

	/* only the first resource matching both names is considered */
	list_for_each_entry(r, &resources_held, list) {
		if (!strncmp(r->r.lockspace_name, res->lockspace_name, NAME_ID_SIZE) &&
		    !strncmp(r->r.name, res->name, NAME_ID_SIZE)) {
			match = r;
			break;
		}
	}

	if (!match) {
		rv = -ENOENT;
	} else if (!match->lvb) {
		rv = -EINVAL;
	} else if (lvblen > match->leader.sector_size) {
		rv = -E2BIG;
	} else {
		memcpy(match->lvb, lvb, lvblen);
		match->flags |= R_LVB_WRITE_RELEASE;
		rv = 0;
	}

	pthread_mutex_unlock(&resource_mutex);

	return rv;
}

/*
 * Return a malloc'ed copy of the in-memory lvb of the held resource named
 * by res.
 *
 * *lvblen is in/out: on entry it is the number of bytes wanted, with 0
 * meaning "the leader's sector_size"; on success it is the number of bytes
 * copied.  On success *lvb_out points to the copy, which the caller must
 * free.
 *
 * Returns 0 on success, -ENOENT if no held resource matches, -EINVAL if
 * the resource has no lvb buffer or the requested length is negative,
 * -E2BIG if the requested length exceeds the leader's sector_size, or
 * -ENOMEM.
 */
int res_get_lvb(struct sanlk_resource *res, char **lvb_out, int *lvblen)
{
	struct resource *r;
	char *lvb;
	int rv = -ENOENT;
	int len = *lvblen;

	pthread_mutex_lock(&resource_mutex);
	list_for_each_entry(r, &resources_held, list) {
		if (strncmp(r->r.lockspace_name, res->lockspace_name, NAME_ID_SIZE))
			continue;
		if (strncmp(r->r.name, res->name, NAME_ID_SIZE))
			continue;

		if (!r->lvb) {
			rv = -EINVAL;
			break;
		}

		if (len < 0) {
			rv = -EINVAL;
			break;
		}

		/* the lvb buffer holds a single sector; copying more would read
		   past its end, so refuse just as res_set_lvb does */
		if (len > r->leader.sector_size) {
			rv = -E2BIG;
			break;
		}

		if (!len)
			len = r->leader.sector_size;

		lvb = malloc(len);
		if (!lvb) {
			rv = -ENOMEM;
			break;
		}

		memcpy(lvb, r->lvb, len);
		*lvb_out = lvb;
		*lvblen = len;
		rv = 0;
		break;
	}
	pthread_mutex_unlock(&resource_mutex);

	return rv;
}

/* return < 0 on error, 1 on success */

static int acquire_disk(struct task *task, struct token *token,
			uint64_t acquire_lver, int new_num_hosts,
			struct leader_record *leader)
{
	struct leader_record leader_tmp;
	int rv;
	uint32_t flags = 0;

	if (com.quiet_fail)
		flags |= PAXOS_ACQUIRE_QUIET_FAIL;

	if (token->acquire_flags & SANLK_RES_SHARED)
		flags |= PAXOS_ACQUIRE_SHARED;

	memset(&leader_tmp, 0, sizeof(leader_tmp));

	rv = paxos_lease_acquire(task, token, flags, &leader_tmp, acquire_lver,
				 new_num_hosts);

	log_token(token, "acquire_disk rv %d lver %llu at %llu", rv,
		  (unsigned long long)leader_tmp.lver,
		  (unsigned long long)leader_tmp.timestamp);

	memcpy(leader, &leader_tmp, sizeof(struct leader_record));

	return rv; /* SANLK_RV */
}

/* return < 0 on error, 1 on success */

static int release_disk(struct task *task, struct token *token,
			struct sanlk_resource *resrename,
			struct leader_record *leader)
{
	struct leader_record leader_tmp;
	int rv;

	rv = paxos_lease_release(task, token, resrename, leader, &leader_tmp);

	log_token(token, "release_disk rv %d", rv);

	if (rv < 0)
		return rv;

	memcpy(leader, &leader_tmp, sizeof(struct leader_record));
	return rv; /* SANLK_OK */
}

/*
 * Common implementation behind release_token(), release_token_opened() and
 * release_token_nodisk().
 *
 * The token is unlinked from its resource.  If other tokens remain, nothing
 * is released on disk: for a shared resource the disks are closed and
 * SANLK_OK is returned, otherwise SANLK_ERROR is returned.  If this was the
 * last token, the resource is moved to resources_rem, released on disk
 * unless one of the skip conditions below applies, then unlinked and freed.
 *
 * resrename: passed through to release_disk() for an exclusive release
 * opened:    non-zero if the token's disks are already open, in which case
 *            open_disks_fd() is not called here
 * nodisk:    non-zero to skip the on-disk release (and the disk open/close)
 *
 * Returns SANLK_OK when the disk release is skipped, SANLK_ERROR if an
 * exclusive resource still had other tokens, otherwise the result of
 * set_mode_block() (shared) or release_disk() (exclusive).
 */
static int _release_token(struct task *task, struct token *token,
			  struct sanlk_resource *resrename,
			  int opened, int nodisk)
{
	struct resource *r = token->resource;
	uint64_t lver;
	int last_token = 0;
	int rv;

	/* We keep r on the resources_rem list while doing the actual release
	   on disk so another acquire for the same resource will see it on
	   the list and fail. we can't have one thread releasing and another
	   acquiring the same resource.  While on the rem list, the resource
	   can't be used by anyone. */

	pthread_mutex_lock(&resource_mutex);
	list_del(&token->list);
	if (list_empty(&r->tokens)) {
		list_move(&r->list, &resources_rem);
		last_token = 1;
	}
	/* snapshot lver under the mutex; it tells us whether the lease was ever acquired on disk */
	lver = r->leader.lver;
	pthread_mutex_unlock(&resource_mutex);

	if ((r->flags & R_SHARED) && !last_token) {
		/* will release when final sh token is released */
		log_token(token, "release_token more shared");
		close_disks(token->disks, token->r.num_disks);
		return SANLK_OK;
	}

	if (!last_token) {
		/* should never happen */
		log_errot(token, "release_token exclusive not last");
		close_disks(token->disks, token->r.num_disks);
		return SANLK_ERROR;
	}

	if (!lver) {
		/* never acquired on disk so no need to release on disk */
		close_disks(token->disks, token->r.num_disks);
		rv = SANLK_OK;
		goto out;
	}

	if (token->flags & T_LS_DEAD) {
		/* don't bother trying disk op which will probably timeout */
		close_disks(token->disks, token->r.num_disks);
		rv = SANLK_OK;
		goto out;
	}

	/* caller asked for no disk i/o; note the disks are not closed on this path */
	if (nodisk) {
		rv = SANLK_OK;
		goto out;
	}

	if (!opened) {
		rv = open_disks_fd(token->disks, token->r.num_disks);
		if (rv < 0) {
			/* it's not terrible if we can't do the disk release */
			rv = SANLK_OK;
			goto out;
		}
	}

	if (r->flags & R_SHARED) {
		/* a shared hold is dropped by zeroing our own mode block */
		rv = set_mode_block(task, token, token->host_id, 0, 0);
	} else {
		/* the lvb write is best effort: its result is not checked */
		if (r->flags & R_LVB_WRITE_RELEASE)
			write_lvb_block(task, r, token);

		rv = release_disk(task, token, resrename, &r->leader);
	}

	close_disks(token->disks, token->r.num_disks);

 out:
	if (rv < 0)
		log_errot(token, "release_token rv %d flags %x lver %llu o %d n %d",
			  rv, r->flags, (unsigned long long)lver, opened, nodisk);
	else
		log_token(token, "release_token flags %x", r->flags);

	/* the resource is removed and freed even when the disk release failed */
	pthread_mutex_lock(&resource_mutex);
	list_del(&r->list);
	pthread_mutex_unlock(&resource_mutex);
	free_resource(r);

	return rv;
}

/* Release a token without doing any on-disk release. */
static int release_token_nodisk(struct task *task, struct token *token)
{
	const int opened = 0;
	const int nodisk = 1;

	return _release_token(task, token, NULL, opened, nodisk);
}

/* Release a token whose disks the caller has already opened. */
static int release_token_opened(struct task *task, struct token *token)
{
	const int opened = 1;
	const int nodisk = 0;

	return _release_token(task, token, NULL, opened, nodisk);
}

/*
 * Release a token whose disks are not open, doing the on-disk release.
 * resrename is passed through to the on-disk release of an exclusive lease.
 */
int release_token(struct task *task, struct token *token,
		  struct sanlk_resource *resrename)
{
	const int opened = 0;
	const int nodisk = 0;

	return _release_token(task, token, resrename, opened, nodisk);
}

/* We're releasing a token from the main thread, in which we don't want to block,
   so we can't do a real release involving disk io.  So, pass the release off to
   the resource_thread.

   The token is unlinked from its resource.  If it was the last token, the
   resource is either freed right away (nothing to release on disk) or moved
   to resources_rem and flagged R_THREAD_RELEASE for resource_thread. */

void release_token_async(struct token *token)
{
	struct resource *r = token->resource;

	pthread_mutex_lock(&resource_mutex);

	list_del(&token->list);

	/* other tokens still use the resource: nothing more to do */
	if (!list_empty(&r->tokens))
		goto unlock;

	if ((token->flags & T_LS_DEAD) || !r->leader.lver) {
		/* don't bother trying to release if the lockspace
		   is dead (release will probably fail), or the
		   lease was never acquired */
		list_del(&r->list);
		free_resource(r);
		goto unlock;
	}

	r->flags |= R_THREAD_RELEASE;
	r->release_token_id = token->token_id;
	resource_thread_work = 1;
	list_move(&r->list, &resources_rem);
	pthread_cond_signal(&resource_cond);

 unlock:
	pthread_mutex_unlock(&resource_mutex);
}

/*
 * Return the first resource on head whose lockspace name and resource name
 * both match the token's, or NULL if there is none.  Names are compared
 * over at most NAME_ID_SIZE bytes.
 */
static struct resource *find_resource(struct token *token,
				      struct list_head *head)
{
	struct resource *cur;

	list_for_each_entry(cur, head, list) {
		if (!strncmp(cur->r.lockspace_name, token->r.lockspace_name, NAME_ID_SIZE) &&
		    !strncmp(cur->r.name, token->r.name, NAME_ID_SIZE))
			return cur;
	}

	return NULL;
}

int lockspace_is_used(struct sanlk_lockspace *ls)
{
	struct resource *r;

	pthread_mutex_lock(&resource_mutex);
	list_for_each_entry(r, &resources_held, list) {
		if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE))
			goto yes;
	}
	list_for_each_entry(r, &resources_add, list) {
		if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE))
			goto yes;
	}
	list_for_each_entry(r, &resources_rem, list) {
		if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE))
			goto yes;
	}
	pthread_mutex_unlock(&resource_mutex);
	return 0;
 yes:
	pthread_mutex_unlock(&resource_mutex);
	return 1;
}

/*
 * Copy num_disks struct sync_disk entries from src to dst.  The path,
 * offset and sector_size are copied; the destination fd is always set
 * to -1.
 */
static void copy_disks(void *dst, void *src, int num_disks)
{
	struct sync_disk *to = dst;
	struct sync_disk *from = src;
	int i;

	for (i = 0; i < num_disks; i++) {
		memcpy(to[i].path, from[i].path, SANLK_PATH_LEN);
		to[i].offset = from[i].offset;
		to[i].sector_size = from[i].sector_size;

		/* fd's are private */
		to[i].fd = -1;
	}
}

/*
 * Allocate a zeroed resource sized for the token's disks and initialize it
 * from the token: the sanlk_resource header, io_timeout, host_id and
 * host_generation are copied.  A shared acquire sets R_SHARED; otherwise
 * the token's pid and signal restrictions are recorded.
 *
 * Returns the new resource, or NULL if allocation fails.
 */
static struct resource *new_resource(struct token *token)
{
	struct resource *r;
	int disks_len = token->r.num_disks * sizeof(struct sync_disk);
	int r_len = sizeof(struct resource) + disks_len;

	r = calloc(1, r_len);
	if (!r)
		return NULL;

	memcpy(&r->r, &token->r, sizeof(struct sanlk_resource));

	/* disks copied after open_disks because open_disks sets sector_size
	   which we want copied */

	INIT_LIST_HEAD(&r->tokens);

	r->io_timeout = token->io_timeout;
	r->host_id = token->host_id;
	r->host_generation = token->host_generation;

	if (token->acquire_flags & SANLK_RES_SHARED) {
		r->flags |= R_SHARED;
		return r;
	}

	/* exclusive: remember the owning pid and which signals must not be used on it */
	r->pid = token->pid;
	if (token->flags & T_RESTRICT_SIGKILL)
		r->flags |= R_RESTRICT_SIGKILL;
	if (token->flags & T_RESTRICT_SIGTERM)
		r->flags |= R_RESTRICT_SIGTERM;

	return r;
}

/*
 * Acquire the resource lease described by token.
 *
 * The request is first checked against the in-memory lists: it fails with
 * -EAGAIN if the resource is being removed, -EBUSY if it is being added,
 * and -EEXIST if it is already held, except that a shared request for a
 * resource already held shared simply attaches the token and returns
 * SANLK_OK without disk i/o.
 *
 * Otherwise a new resource is put on resources_add, the disks are opened,
 * the paxos lease is acquired on disk, and:
 *  - for a shared request, our mode block is set to MBLOCK_SHARED and the
 *    paxos lease is released again;
 *  - for an exclusive request, if shared holders were seen
 *    (token->shared_count), dead ones are cleared and the acquire fails
 *    with -EAGAIN if any are still alive.
 * On success the resource is moved to resources_held.
 *
 * cmd_flags: SANLK_ACQUIRE_LVB requests an lvb buffer, which is filled from
 *            disk after the acquire
 * killpath, killargs: copied into the new resource (SANLK_HELPER_PATH_LEN /
 *            SANLK_HELPER_ARGS_LEN bytes)
 *
 * Returns SANLK_OK on success; -EAGAIN, -EBUSY, -EEXIST, -ENOMEM as above;
 * SANLK_ACQUIRE_SHRETRY when a shared acquire kept failing against a
 * short-held lease; or the negative result of open_disks(), acquire_disk(),
 * set_mode_block() or clear_dead_shared().  On every failure after the
 * resource was created, the token is released again before returning.
 */
int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags,
		  char *killpath, char *killargs)
{
	struct leader_record leader;
	struct resource *r;
	uint64_t acquire_lver = 0;
	uint32_t new_num_hosts = 0;
	int sector_size;
	int sh_retries = 0;
	int live_count = 0;
	int rv;

	if (token->acquire_flags & SANLK_RES_LVER)
		acquire_lver = token->acquire_lver;
	if (token->acquire_flags & SANLK_RES_NUM_HOSTS)
		new_num_hosts = token->acquire_data32;

	pthread_mutex_lock(&resource_mutex);

	r = find_resource(token, &resources_rem);
	if (r) {
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource being removed");
		pthread_mutex_unlock(&resource_mutex);
		return -EAGAIN;
	}

	r = find_resource(token, &resources_add);
	if (r) {
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource being added");
		pthread_mutex_unlock(&resource_mutex);
		return -EBUSY;
	}

	r = find_resource(token, &resources_held);
	if (r && (token->acquire_flags & SANLK_RES_SHARED) && (r->flags & R_SHARED)) {
		/* multiple shared holders allowed */
		log_token(token, "acquire_token add shared");
		/* take the disk info from the existing resource */
		copy_disks(&token->r.disks, &r->r.disks, token->r.num_disks);
		token->resource = r;
		list_add(&token->list, &r->tokens);
		pthread_mutex_unlock(&resource_mutex);
		return SANLK_OK;
	}

	if (r) {
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource exists");
		pthread_mutex_unlock(&resource_mutex);
		return -EEXIST;
	}

	r = new_resource(token);
	if (!r) {
		pthread_mutex_unlock(&resource_mutex);
		return -ENOMEM;
	}

	memcpy(r->killpath, killpath, SANLK_HELPER_PATH_LEN);
	memcpy(r->killargs, killargs, SANLK_HELPER_ARGS_LEN);
	list_add(&token->list, &r->tokens);
	/* on the add list, concurrent acquires of the same resource get -EBUSY */
	list_add(&r->list, &resources_add);
	token->resource = r;
	pthread_mutex_unlock(&resource_mutex);

	rv = open_disks(token->disks, token->r.num_disks);
	if (rv < 0) {
		log_errot(token, "acquire_token open error %d", rv);
		release_token_nodisk(task, token);
		return rv;
	}

	/* copied only now so the resource gets the disk info as it is after open_disks */
	copy_disks(&r->r.disks, &token->r.disks, token->r.num_disks);

	sector_size = token->disks[0].sector_size;

	if (cmd_flags & SANLK_ACQUIRE_LVB) {
		char *iobuf, **p_iobuf;
		p_iobuf = &iobuf;

		/* an allocation failure is only logged; r->lvb then stays NULL
		   (new_resource zeroes the struct) and the acquire continues */
		rv = posix_memalign((void *)p_iobuf, getpagesize(), sector_size);
		if (rv)
			log_error("acquire_token cannot allocate lvb");
		else
			r->lvb = iobuf;
	}

 retry:
	memset(&leader, 0, sizeof(struct leader_record));

	rv = acquire_disk(task, token, acquire_lver, new_num_hosts, &leader);
	if (rv < 0) {
		if ((token->acquire_flags & SANLK_RES_SHARED) &&
		    (leader.flags & LFL_SHORT_HOLD)) {
			/*
			 * Multiple parallel sh requests can fail because
			 * the lease is briefly held in ex mode.  The ex
			 * holder sets SHORT_HOLD in the leader record to
			 * indicate that it's only held for a short time
			 * while acquiring a shared lease.  A retry will
			 * probably succeed.
			 */
			if (sh_retries++ < com.sh_retries) {
				/* random delay of up to the get_rand() upper bound, in microseconds */
				int us = get_rand(0, 1000000);
				log_token(token, "acquire_token sh_retry %d %d", rv, us);
				usleep(us);
				goto retry;
			}
			rv = SANLK_ACQUIRE_SHRETRY;
		}
		release_token_opened(task, token);
		return rv;
	}

	memcpy(&r->leader, &leader, sizeof(struct leader_record));

	/* copy lver into token because inquire looks there for it */
	if (!(token->acquire_flags & SANLK_RES_SHARED))
		token->r.lver = leader.lver;

	if (token->acquire_flags & SANLK_RES_SHARED) {
		rv = set_mode_block(task, token, token->host_id,
				    token->host_generation, MBLOCK_SHARED);
		if (rv < 0) {
			release_token_opened(task, token);
			return rv;
		} else {
			/* the result of this release is not checked */
			release_disk(task, token, NULL, &leader);
			/* the token is kept, the paxos lease is released but with shared set */
			goto out;
		}
	}

	/* exclusive: nothing more to do unless shared holders were seen */
	if (!token->shared_count)
		goto out;

	rv = clear_dead_shared(task, token, leader.num_hosts, &live_count);
	if (rv < 0) {
		release_token_opened(task, token);
		return rv;
	}

	if (live_count) {
		/* a live host with a sh lock exists */
		release_token_opened(task, token);
		return -EAGAIN;
	}

 out:
	/* the result of the lvb read is not checked */
	if (cmd_flags & SANLK_ACQUIRE_LVB)
		read_lvb_block(task, token);

	close_disks(token->disks, token->r.num_disks);

	pthread_mutex_lock(&resource_mutex);
	list_move(&r->list, &resources_held);
	pthread_mutex_unlock(&resource_mutex);

	return SANLK_OK;
}

/*
 * Write a request record for the resource described by token.
 *
 * When neither token->acquire_lver nor force_mode is set, the existing
 * request record is only validated (magic, major version) and rewritten
 * with lver 0 and force_mode 0.  Otherwise the leader is read first:
 *  - if the lease is free, *owner_id is set to 0 and SANLK_OK is returned
 *    without writing a request;
 *  - otherwise *owner_id is set to the leader owner, acquire_lver defaults
 *    to leader.lver + 1 when next_lver is set, and the request is refused
 *    with SANLK_REQUEST_OLD if the leader lver is already at or past the
 *    requested one, or with SANLK_REQUEST_LVER if the on-disk request has
 *    a higher lver.
 *
 * *owner_id is written only on the path that reads the leader, but is read
 * for the final log message on every path after the disks were opened.
 *
 * Returns the result of paxos_lease_request_write() on the write path, one
 * of the SANLK_REQUEST_* codes above (also SANLK_REQUEST_MAGIC /
 * SANLK_REQUEST_VERSION for an invalid request record), or a negative
 * value from open_disks() or the paxos read calls.
 */
int request_token(struct task *task, struct token *token, uint32_t force_mode,
		  uint64_t *owner_id, int next_lver)
{
	struct leader_record leader;
	struct request_record req;
	int rv;

	memset(&req, 0, sizeof(req));

	rv = open_disks(token->disks, token->r.num_disks);
	if (rv < 0) {
		log_errot(token, "request_token open error %d", rv);
		return rv;
	}

	/* nothing to compare against the leader: go straight to the request record */
	if (!token->acquire_lver && !force_mode)
		goto req_read;

	rv = paxos_lease_leader_read(task, token, &leader, "request");
	if (rv < 0)
		goto out;

	if (leader.timestamp == LEASE_FREE) {
		*owner_id = 0;
		rv = SANLK_OK;
		goto out;
	}

	*owner_id = leader.owner_id;

	if (!token->acquire_lver && next_lver)
		token->acquire_lver = leader.lver + 1;

	if (leader.lver >= token->acquire_lver) {
		rv = SANLK_REQUEST_OLD;
		goto out;
	}

 req_read:
	rv = paxos_lease_request_read(task, token, &req);
	if (rv < 0)
		goto out;

	if (req.magic != REQ_DISK_MAGIC) {
		rv = SANLK_REQUEST_MAGIC;
		goto out;
	}

	/* only the major version (upper 16 bits) has to match */
	if ((req.version & 0xFFFF0000) != REQ_DISK_VERSION_MAJOR) {
		rv = SANLK_REQUEST_VERSION;
		goto out;
	}

	if (!token->acquire_lver && !force_mode)
		goto req_write;

	/* > instead of >= so multiple hosts can request the same
	   version at once and all succeed */

	if (req.lver > token->acquire_lver) {
		rv = SANLK_REQUEST_LVER;
		goto out;
	}

 req_write:
	req.version = REQ_DISK_VERSION_MAJOR | REQ_DISK_VERSION_MINOR;
	req.lver = token->acquire_lver;
	req.force_mode = force_mode;

	rv = paxos_lease_request_write(task, token, &req);
 out:
	close_disks(token->disks, token->r.num_disks);

	log_debug("request_token rv %d owner %llu lver %llu mode %u",
		  rv, (unsigned long long)*owner_id,
		  (unsigned long long)req.lver, req.force_mode);

	return rv;
}

/*
 * Read and validate the on-disk request record for token.
 *
 * *req_out is filled only when the record was read and both its magic and
 * its major version are valid.
 *
 * Returns the result of paxos_lease_request_read(), or
 * SANLK_REQUEST_MAGIC / SANLK_REQUEST_VERSION for an invalid record.
 */
static int examine_token(struct task *task, struct token *token,
			 struct request_record *req_out)
{
	struct request_record req;
	int rv;

	memset(&req, 0, sizeof(req));

	rv = paxos_lease_request_read(task, token, &req);
	if (rv >= 0) {
		if (req.magic != REQ_DISK_MAGIC)
			rv = SANLK_REQUEST_MAGIC;
		else if ((req.version & 0xFFFF0000) != REQ_DISK_VERSION_MAJOR)
			rv = SANLK_REQUEST_VERSION;
		else
			memcpy(req_out, &req, sizeof(struct request_record));
	}

	log_debug("examine_token rv %d lver %llu mode %u",
		  rv, (unsigned long long)req.lver, req.force_mode);

	return rv;
}

/*
 * Act on a request against a resource we hold: ask the helper process to
 * kill the holding pid or to run the resource's killpath.
 *
 * The held resource matching tt must be owned by pid, otherwise nothing is
 * done.  For SANLK_REQ_FORCE the pid is sent SIGKILL (SIGTERM if SIGKILL
 * is restricted).  For SANLK_REQ_GRACEFUL the killpath is run if one is
 * set, otherwise the pid is sent SIGTERM (SIGKILL if SIGTERM is
 * restricted).  The message is written to helper_kill_fd.
 */
static void do_request(struct token *tt, int pid, uint32_t force_mode)
{
	char killpath[SANLK_HELPER_PATH_LEN];
	char killargs[SANLK_HELPER_ARGS_LEN];
	struct helper_msg hm;
	struct resource *r;
	uint32_t flags = 0;
	int found = 0;
	int rv;

	/* copy what we need under the mutex; r must not be used after unlock */
	pthread_mutex_lock(&resource_mutex);
	r = find_resource(tt, &resources_held);
	if (r && r->pid == pid) {
		flags = r->flags;
		memcpy(killpath, r->killpath, SANLK_HELPER_PATH_LEN);
		memcpy(killargs, r->killargs, SANLK_HELPER_ARGS_LEN);
		found = 1;
	}
	pthread_mutex_unlock(&resource_mutex);

	if (!found) {
		log_error("do_request pid %d %.48s:%.48s not found",
			   pid, tt->r.lockspace_name, tt->r.name);
		return;
	}

	log_debug("do_request %d flags %x %.48s:%.48s",
		  pid, flags, tt->r.lockspace_name, tt->r.name);

	if (helper_kill_fd == -1) {
		log_error("do_request %d no helper fd", pid);
		return;
	}

	memset(&hm, 0, sizeof(hm));

	if (force_mode == SANLK_REQ_FORCE) {
		hm.type = HELPER_MSG_KILLPID;
		hm.pid = pid;
		if (flags & R_RESTRICT_SIGKILL)
			hm.sig = SIGTERM;
		else
			hm.sig = SIGKILL;
	} else if (force_mode == SANLK_REQ_GRACEFUL && killpath[0]) {
		hm.type = HELPER_MSG_RUNPATH;
		memcpy(hm.path, killpath, SANLK_HELPER_PATH_LEN);
		memcpy(hm.args, killargs, SANLK_HELPER_ARGS_LEN);
	} else if (force_mode == SANLK_REQ_GRACEFUL) {
		hm.type = HELPER_MSG_KILLPID;
		hm.pid = pid;
		if (flags & R_RESTRICT_SIGTERM)
			hm.sig = SIGKILL;
		else
			hm.sig = SIGTERM;
	} else {
		log_error("do_request %d unknown force_mode %d",
			  pid, force_mode);
		return;
	}

	/* retry the write for as long as it is interrupted by a signal */
	do {
		rv = write(helper_kill_fd, &hm, sizeof(hm));
	} while (rv == -1 && errno == EINTR);

	if (rv == -1)
		log_error("do_request %d helper write error %d",
			  pid, errno);
}

/*
 * Flag held resources for examination by resource_thread.
 *
 * Every held resource in lockspace space_name is flagged
 * R_THREAD_EXAMINE; if res_name is non-NULL only resources with that name
 * are flagged.  The thread is signaled if at least one resource matched.
 *
 * Returns the number of resources flagged.
 */
int set_resource_examine(char *space_name, char *res_name)
{
	struct resource *r;
	int flagged = 0;

	pthread_mutex_lock(&resource_mutex);

	list_for_each_entry(r, &resources_held, list) {
		int space_match = !strncmp(r->r.lockspace_name, space_name, NAME_ID_SIZE);

		if (!space_match)
			continue;

		/* a NULL res_name selects every resource in the lockspace */
		if (res_name && strncmp(r->r.name, res_name, NAME_ID_SIZE))
			continue;

		r->flags |= R_THREAD_EXAMINE;
		resource_thread_work = 1;
		flagged++;
	}

	if (flagged)
		pthread_cond_signal(&resource_cond);

	pthread_mutex_unlock(&resource_mutex);

	return flagged;
}

/*
 * resource_thread
 * - releases tokens of pid's that die
 * - examines request blocks of resources
 */

/*
 * Return the first resource on head that has any bit of flag set in its
 * flags, or NULL if there is none.
 */
static struct resource *find_resource_flag(struct list_head *head, uint32_t flag)
{
	struct resource *cur;
	struct resource *hit = NULL;

	list_for_each_entry(cur, head, list) {
		if (cur->flags & flag) {
			hit = cur;
			break;
		}
	}

	return hit;
}

/*
 * Do the on-disk release of resource r from the resource thread, using the
 * temporary token tt, then unlink r from its list and free it.
 *
 * A shared resource is released by clearing our mode block; an exclusive
 * one by writing the lvb (when flagged R_LVB_WRITE_RELEASE) and releasing
 * the paxos lease.  The results of those disk operations are not checked,
 * and r is removed and freed even if the disks could not be opened.
 */
static void resource_thread_release(struct task *task, struct resource *r, struct token *tt)
{
	int rv;

	rv = open_disks_fd(tt->disks, tt->r.num_disks);
	if (rv >= 0) {
		if (r->flags & R_SHARED) {
			set_mode_block(task, tt, tt->host_id, 0, 0);
		} else {
			if (r->flags & R_LVB_WRITE_RELEASE)
				write_lvb_block(task, r, tt);

			release_disk(task, tt, NULL, &r->leader);
		}

		close_disks(tt->disks, tt->r.num_disks);
	} else {
		log_errot(tt, "resource_thread_release open error %d", rv);
	}

	pthread_mutex_lock(&resource_mutex);
	list_del(&r->list);
	pthread_mutex_unlock(&resource_mutex);

	free_resource(r);
}

/*
 * Examine the on-disk request record of a held resource from the resource
 * thread and act on it.
 *
 * tt is a temporary token describing the resource, pid the holder, and
 * lver the lease version we hold.  The request is acted on (via
 * do_request) only if it was read successfully, has both force_mode and
 * lver set, and asks for a lease version newer than ours.
 */
static void resource_thread_examine(struct task *task, struct token *tt, int pid, uint64_t lver)
{
	struct request_record req;
	int rv;

	rv = open_disks_fd(tt->disks, tt->r.num_disks);
	if (rv < 0) {
		log_errot(tt, "resource_thread_examine open error %d", rv);
		return;
	}

	rv = examine_token(task, tt, &req);

	close_disks(tt->disks, tt->r.num_disks);

	if (rv != SANLK_OK)
		return;

	/* no request pending */
	if (!req.force_mode || !req.lver)
		return;

	/* the request is for a version we already hold or have passed */
	if (req.lver <= lver) {
		log_debug("examine req lver %llu our lver %llu",
			  (unsigned long long)req.lver, (unsigned long long)lver);
		return;
	}

	/* force_mode is known to be non-zero here; do_request() itself
	   rejects and logs force_mode values it does not recognize */
	do_request(tt, pid, req.force_mode);
}

/*
 * Body of the resource thread.
 *
 * Sleeps on resource_cond until resource_thread_work is set, then handles
 * one flagged resource per pass: a resource on resources_rem flagged
 * R_THREAD_RELEASE is released on disk, otherwise a resource on
 * resources_held flagged R_THREAD_EXAMINE has its request record examined.
 * Releases are looked for before examines.  When neither is found,
 * resource_thread_work is cleared.  The thread exits when
 * resource_thread_stop is set and no work is pending.
 *
 * Always returns NULL.
 */
static void *resource_thread(void *arg GNUC_UNUSED)
{
	struct task task;
	struct resource *r;
	struct token *tt = NULL;
	uint64_t lver;
	int pid, tt_len;

	memset(&task, 0, sizeof(struct task));
	setup_task_aio(&task, main_task.use_aio, RESOURCE_AIO_CB_SIZE);
	sprintf(task.name, "%s", "resource");

	/* a fake/tmp token struct we copy necessary res info into,
	   because other functions take a token struct arg */

	/* sized for the maximum number of disks so it can be reused for any resource */
	tt_len = sizeof(struct token) + (SANLK_MAX_DISKS * sizeof(struct sync_disk));
	tt = malloc(tt_len);
	if (!tt) {
		log_error("resource_thread tt malloc error");
		goto out;
	}

	while (1) {
		pthread_mutex_lock(&resource_mutex);
		while (!resource_thread_work) {
			/* stop is only honored when there is no pending work */
			if (resource_thread_stop) {
				pthread_mutex_unlock(&resource_mutex);
				goto out;
			}
			pthread_cond_wait(&resource_cond, &resource_mutex);
		}

		/* FIXME: it's not nice how we copy a bunch of stuff
		 * from token to r so that we can later copy it back from
		 * r into a temp token.  The whole duplication of stuff
		 * between token and r would be nice to clean up. */

		memset(tt, 0, tt_len);
		/* point the token's disk array at the disks embedded in its sanlk_resource */
		tt->disks = (struct sync_disk *)&tt->r.disks[0];

		r = find_resource_flag(&resources_rem, R_THREAD_RELEASE);
		if (r) {
			memcpy(&tt->r, &r->r, sizeof(struct sanlk_resource));
			copy_disks(&tt->r.disks, &r->r.disks, r->r.num_disks);
			tt->host_id = r->host_id;
			tt->host_generation = r->host_generation;
			tt->token_id = r->release_token_id;
			tt->io_timeout = r->io_timeout;

			/* clear the flag so the next pass does not pick this resource again */
			r->flags &= ~R_THREAD_RELEASE;
			pthread_mutex_unlock(&resource_mutex);

			/* disk i/o happens without the mutex; r is unlinked and freed in there */
			resource_thread_release(&task, r, tt);
			continue;
		}

		r = find_resource_flag(&resources_held, R_THREAD_EXAMINE);
		if (r) {
			/* make copies of things we need because we can't use r
			   once we unlock the mutex since it could be released */

			memcpy(&tt->r, &r->r, sizeof(struct sanlk_resource));
			copy_disks(&tt->r.disks, &r->r.disks, r->r.num_disks);
			tt->host_id = r->host_id;
			tt->host_generation = r->host_generation;
			tt->io_timeout = r->io_timeout;
			pid = r->pid;
			lver = r->leader.lver;

			r->flags &= ~R_THREAD_EXAMINE;
			pthread_mutex_unlock(&resource_mutex);

			resource_thread_examine(&task, tt, pid, lver);
			continue;
		}

		/* nothing flagged on either list: go back to waiting */
		resource_thread_work = 0;
		pthread_mutex_unlock(&resource_mutex);
	}
 out:
	if (tt)
		free(tt);
	close_task_aio(&task);
	return NULL;
}

/*
 * Initialize the resource lists and their mutex/condition variable, and
 * start the resource thread.
 *
 * Returns 0 on success, -1 if the thread cannot be created.
 */
int setup_token_manager(void)
{
	pthread_mutex_init(&resource_mutex, NULL);
	pthread_cond_init(&resource_cond, NULL);

	INIT_LIST_HEAD(&resources_add);
	INIT_LIST_HEAD(&resources_rem);
	INIT_LIST_HEAD(&resources_held);

	if (pthread_create(&resource_pt, NULL, resource_thread, NULL))
		return -1;

	return 0;
}

/*
 * Ask the resource thread to stop and wait for it to exit.
 * The thread only exits once it has no pending work.
 */
void close_token_manager(void)
{
	/* set the flag and signal under the mutex so the thread cannot miss the wakeup */
	pthread_mutex_lock(&resource_mutex);
	resource_thread_stop = 1;
	pthread_cond_signal(&resource_cond);
	pthread_mutex_unlock(&resource_mutex);
	pthread_join(resource_pt, NULL);
}

