/*  reduzermpi.cpp
 *
 *  Copyright (C) 2010-2012 Andreas von Manteuffel
 *  Copyright (C) 2010-2012 Cedric Studerus
 *
 *  This file is part of the package Reduze 2.
 *  It is distributed under the GNU General Public License version 3
 *  (see the file GPL-3.0.txt or http://www.gnu.org/licenses/gpl-3.0.txt).
 */

#ifdef HAVE_MPI

#include "reduzermpi.h"

#include <mpi.h>
#include <queue>
#include <climits>
#include <unistd.h> // usleep

#include "jobcenter.h"

using namespace std;

namespace Reduze {

namespace {

// map C++ integer types to the matching MPI datatypes (needed for typedefs such as INTIndex)

/// Compile-time lookup of the MPI datatype that corresponds to the C++
/// type T. Only the specializations below are defined; using any other
/// type is a compile error because the primary template has no body.
template<typename T>
struct MPITraits;

/// unsigned int is transferred as MPI::UNSIGNED
template<>
struct MPITraits<unsigned int> {
	static MPI::Datatype type() { return MPI::UNSIGNED; }
};

/// unsigned long is transferred as MPI::UNSIGNED_LONG
template<>
struct MPITraits<unsigned long> {
	static MPI::Datatype type() { return MPI::UNSIGNED_LONG; }
};

/// Stopwatch helper: stores the current MPI::Wtime() in \a mark and returns
/// the time elapsed since the value \a mark held on entry.
double timer(double& mark) {
	const double previous = mark;
	mark = MPI::Wtime();
	return mark - previous;
}

/// MPI message tags used for the traffic between manager and workers.
/// The values are consecutive, starting at 111.
enum Tags {
	TagSectorBlocksJob = 111,
	TagSectorBlocksJobResult = 112,
	TagSizes = 113,
	TagIntegrals = 114,
	TagCoefficients = 115,
	TagTime = 116,
	TagCommand = 117
};

}

/// Default constructor: no communicator (null pointer), invalid job center
/// rank (-1), willing to accept workers, no worker quota (-1). All timing
/// accumulators are zeroed.
template<class reduzer_type>
ReduzerMPIRootBase<reduzer_type>::ReduzerMPIRootBase() :
		reduzer_type(),
		comm(0),
		jobcenter_rank(-1),
		want_more_workers(true),
		num_workers_optimal(-1) {
	clear_times();
}

/// Constructs a manager that forwards \a opts to the underlying reduzer
/// and talks over \a comm to the job center at rank \a jobcenter_rank.
/// Starts out willing to accept workers, without a worker quota (-1), and
/// with all timing accumulators zeroed.
template<class reduzer_type>
ReduzerMPIRootBase<reduzer_type>::ReduzerMPIRootBase(
		const ReduzerOptions& opts, MPI::Intracomm* comm, int jobcenter_rank) :
		reduzer_type(opts),
		comm(comm),
		jobcenter_rank(jobcenter_rank),
		want_more_workers(true),
		num_workers_optimal(-1) {
	clear_times();
}

/// Destructor. Only resets the timing accumulators; the communicator
/// pointer is not released here.
template<class reduzer_type>
ReduzerMPIRootBase<reduzer_type>::~ReduzerMPIRootBase() {
	clear_times();
}

/// Zeroes the manager's own timers as well as both worker time
/// accumulators (the current measurement interval and the overall total).
template<class reduzer_type>
void ReduzerMPIRootBase<reduzer_type>::clear_times() {
	for (int i = 0; i < (int) NTimes; ++i)
		time[i] = 0.;
	for (int i = 0; i < (int) ReduzerMPILeave::NTimes; ++i) {
		curr_leave_time[i] = 0.;
		leave_time[i] = 0.;
	}
}

/// Logs a summary of the manager's timers and the accumulated worker times.
///
/// Manager times are printed in seconds and as a percentage of the
/// manager's total compute time. The worker efficiency is the accumulated
/// pure work time of all workers relative to their accumulated total time.
/// Both percentages are reported as 0 if the respective denominator is not
/// positive (e.g. no worker result has been received yet), so the log never
/// shows NaN or inf.
template<class reduzer_type>
void ReduzerMPIRootBase<reduzer_type>::print_times() const {
	double nc = (time[TimeCompute] > 0 ? 100. / time[TimeCompute] : 0.);
	const double* t = time;
	// guard the division the same way as for nc above
	const double worker_total = leave_time[ReduzerMPILeave::TimeTotal];
	double worker_efficiency = (worker_total > 0 ?
			100. * leave_time[ReduzerMPILeave::TimeWork] / worker_total : 0.);
	LOGX(fixed << setprecision(0)
			<< "Times for MPI reduction:\n" //
			<< "Insert:               " << t[TimeInsert] << " s  (" << t[TimeInsert]*nc << " %)\n" //
			<< "Find job:             " << t[TimeFindJob] << " s  (" << t[TimeFindJob]*nc << " %)\n" //
			<< "Manager idle:         " << t[TimeWait] << " s  (" << t[TimeWait]*nc << " %)\n" //
			<< "Manager total:        " << t[TimeCompute] << " s\n" //
			<< "Pure work time total: " << leave_time[ReduzerMPILeave::TimeWork] << " s\n" //
			<< "Worker efficiency:    " << worker_efficiency << " %\n" //
			<< resetiosflags(ios::fixed));
}

/// Creates an empty buffer: no storage allocated, all sizes and
/// capacities zero.
template<class reduzer_type>
ReduzerMPIRootBase<reduzer_type>::Buffer::Buffer() {
	integrals = 0;
	num_integrals = 0;
	maxnum_integrals = 0;

	coefficients = 0;
	num_coefficients = 0;
	maxnum_coefficients = 0;
}

/// Releases both arrays owned by the buffer (deleting a null pointer is a
/// no-op, so an unused buffer is fine).
template<class reduzer_type>
ReduzerMPIRootBase<reduzer_type>::Buffer::~Buffer() {
	delete[] integrals;
	delete[] coefficients;
}

/// Ensures the arrays can hold num_integrals / num_coefficients entries.
///
/// An array is only replaced when the requested size exceeds its current
/// capacity; it never shrinks. Despite the name, the old contents are NOT
/// preserved when an array grows.
///
/// Raises an error via ERROR() if a requested size exceeds INT_MAX, since
/// the buffers are transferred in one go and MPI uses int for counts.
///
/// The pointer and capacity are reset before the new allocation, so that a
/// failing new[] leaves the buffer in a consistent (empty) state instead of
/// a dangling pointer that the destructor would delete a second time.
template<class reduzer_type>
void ReduzerMPIRootBase<reduzer_type>::Buffer::realloc() {
	// currently we transfer the buffer in one go, shouldn't be bigger
	// than INT_MAX since MPI uses int for sizes
	if (num_integrals > INT_MAX || num_coefficients > INT_MAX)
		ERROR("Can't transfer equations via MPI due to internal limitations:" //
				<< "\nsize of coefficient buffer = " << num_coefficients//
				<< "\nsize of integral buffer = " << num_integrals//
				<< "\nmaximum int value = " << INT_MAX);
	if (num_integrals > maxnum_integrals) {
		delete[] integrals;
		integrals = 0;
		maxnum_integrals = 0;
		integrals = new INTIndex[num_integrals];
		maxnum_integrals = num_integrals;
	}
	if (num_coefficients > maxnum_coefficients) {
		delete[] coefficients;
		coefficients = 0;
		maxnum_coefficients = 0;
		coefficients = new char[num_coefficients];
		maxnum_coefficients = num_coefficients;
	}
}

/// Adds to \a num_int and \a num_coeff the buffer space needed to
/// serialize the equation list \a eqs (the counters are incremented, not
/// reset).
///
/// Per term: one integral slot and length + 1 coefficient characters (the
/// coefficient string plus its terminating null). Per equation: one extra
/// integral slot for the terminator. Per list: one further integral slot
/// for the list terminator.
template<class TEL>
void calc_sizes(const TEL& eqs, size_t& num_int, size_t& num_coeff) {
	typedef typename TEL::const_iterator eq_iterator;
	typedef typename TEL::eq_type::const_iterator term_iterator;
	for (eq_iterator eq = eqs.begin(); eq != eqs.end(); ++eq) {
		for (term_iterator term = eq->begin(); term != eq->end(); ++term) {
			num_coeff += term->second.length() + 1;
			++num_int;
		}
		++num_int; // equation terminator
	}
	++num_int; // list terminator
}

/// Serializes the equation list \a eqs into the two output cursors.
///
/// For each term the integral index goes to \a pint and the coefficient
/// string (including its terminating null) to \a pcoeff. Each equation is
/// closed by a 0 in the integral stream, and the list by one more 0. Both
/// cursors are advanced past the written data. The caller must provide
/// enough space (see calc_sizes()).
template<class TEL>
void write_eqs(const TEL& eqs, INTIndex*& pint, char*& pcoeff) {
	typedef typename TEL::const_iterator eq_iterator;
	typedef typename TEL::eq_type::const_iterator term_iterator;
	for (eq_iterator eq = eqs.begin(); eq != eqs.end(); ++eq) {
		for (term_iterator term = eq->begin(); term != eq->end(); ++term) {
			*pint = term->first;
			++pint;
			strcpy(pcoeff, term->second.c_str());
			pcoeff += term->second.length() + 1;
		}
		*pint = 0; // equation terminator
		++pint;
	}
	*pint = 0; // list terminator
	++pint;
}

/// Deserializes an equation list written by write_eqs() and appends the
/// equations to \a eqs.
///
/// Reads integral indices from \a pint and the matching null-terminated
/// coefficient strings from \a pcoeff. A 0 in the integral stream ends the
/// current equation; a 0 where an equation would start ends the list. Both
/// cursors are left just behind the consumed data.
template<class TEL, class TE>
void read_eqs(INTIndex*& pint, char*& pcoeff, TEL& eqs) {
	while (*pint != 0) {
		TE eq;
		while (*pint != 0) {
			string coeff(pcoeff);
			pcoeff += coeff.length() + 1;
			eq.insert(*pint, coeff);
			++pint;
		}
		eqs.push_back(eq);
		++pint; // step over the equation terminator
	}
	++pint; // step over the list terminator
}

void write_integrals(const list<INTIndex> ints, INTIndex*& pint) {
	for (list<INTIndex>::const_iterator it = ints.begin(); it != ints.end();
			++it)
		*pint++ = *it;
	*pint++ = 0;
}

/// Appends the entries found at \a pint to \a ints until a 0 is reached,
/// then leaves \a pint just behind that terminator.
void read_integrals(INTIndex*&pint, list<INTIndex>& ints) {
	while (*pint != 0) {
		ints.push_back(*pint);
		++pint;
	}
	++pint; // step over the terminator
}

/// Returns the processor name reported by MPI::Get_processor_name().
///
/// The name is written into a zero-initialized std::string buffer of
/// MPI::MAX_PROCESSOR_NAME characters, so no manual memory management is
/// needed and the result is null-terminated even if the MPI call writes
/// nothing. The returned string ends at the first null character.
string get_processor_name() {
	string buffer(MPI::MAX_PROCESSOR_NAME, '\0');
	int procnamelen = 0;
	MPI::Get_processor_name(&buffer[0], procnamelen);
	// cut at the first null, exactly like constructing from a C string
	return string(buffer.c_str());
}

/// Handles one command \a cmd that was received from the job center.
///
/// All replies go to jobcenter_rank with tag JobCenter::TagEmployer. Every
/// handled command is acknowledged by sending the command code back, with
/// one exception: a declined worker offer is answered like a
/// CmdPerformanceData request instead. An unknown command aborts.
template<class reduzer_type>
void ReduzerMPIRootBase<reduzer_type>::execute_jobcenter_command(int cmd) {
	if (cmd == JobCenter::CmdPerformanceData) {
		// Report, in this order: progress, manager wait fraction, worker
		// efficiency, and whether more workers are wanted.
		/// acknowledge the command
		comm->Send(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer); // confirm
		double prog = reduzer_type::progress();
		comm->Send(&prog, 1, MPI::DOUBLE, jobcenter_rank,
				JobCenter::TagEmployer);
		// note: this value is the fraction of manager time spent waiting
		double load_efficiency = -1.; // negative value means no measurement
		if (time[TimeCompute] > 0.)
			load_efficiency = time[TimeWait] / time[TimeCompute];
		comm->Send(&load_efficiency, 1, MPI::DOUBLE, jobcenter_rank,
				JobCenter::TagEmployer);
		double worker_efficiency = -1.; // negative value means no measurement
		if (curr_leave_time[ReduzerMPILeave::TimeTotal] > 0.)
			worker_efficiency = curr_leave_time[ReduzerMPILeave::TimeWork]
					/ curr_leave_time[ReduzerMPILeave::TimeTotal];
		comm->Send(&worker_efficiency, 1, MPI::DOUBLE, jobcenter_rank,
				JobCenter::TagEmployer);
		comm->Send(&want_more_workers, 1, MPI::BOOL, jobcenter_rank,
				JobCenter::TagEmployer);
		// start a new measurement interval for the worker times
		fill(curr_leave_time, curr_leave_time + ReduzerMPILeave::NTimes, 0.);
		//int newcmd;
		//comm->Recv(&newcmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer);
		//execute_jobcenter_command(newcmd);
	} else if (cmd == JobCenter::CmdWorkerQuota) {
		// Receive the number of workers the job center wants us to use.
		comm->Send(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer); // confirm
		comm->Recv(&num_workers_optimal, 1, MPI::INT, jobcenter_rank,
				JobCenter::TagEmployer);
		LOGX(
				"  required number of workers is " << num_workers_optimal << ", current number is " << all_leaves.size());
	} else if (cmd == JobCenter::CmdAssignWorker) {
		LOGX("Job center offers a new worker");
		// not needed at the moment:
		if (!want_more_workers) {
			LOGX("  we decline, no need for new workers right now");
			// declining is done by replying with performance data instead
			// of confirming the offer
			return execute_jobcenter_command(JobCenter::CmdPerformanceData);
		}
		comm->Send(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer); // confirm
		LOGXX("Confirmed " << cmd);
		// the job center answers with the rank of the new worker
		int newworker;
		comm->Recv(&newworker, 1, MPI::INT, jobcenter_rank,
				JobCenter::TagEmployer);
		LOGX("++ " << newworker << " accepting new worker");
		idle_leaves.push(newworker);
		all_leaves.insert(newworker);
	} else if (cmd == JobCenter::CmdExit) {
		LOG("\n *** Received terminate message\n");
		reduzer_type::clear();
		LOG("Closed the database");
		comm->Send(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer); // confirm
		// never returns: sleep until the process is terminated from outside
		// (presumably by the job center - TODO confirm)
		while (true)
			usleep(10000000);
	} else if (cmd == JobCenter::CmdContinue) {
		// nothing to do besides acknowledging
		comm->Send(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer); // confirm
	} else {
		ABORT("Received unknown command " << cmd);
	}
}

/// Main loop of the manager process.
///
/// Registers as employer at the job center, then loops until
/// this->completed() returns true. In each iteration it either
///  - hands a job to an idle worker, or releases that worker if there is no
///    job or the worker quota is exceeded, or
///  - blocks until a message arrives and processes it: a job center
///    command, or the begin/end marker of a worker's result transfer.
/// Afterwards all idle workers get the finish signal, the manager
/// unregisters at the job center and the timing summary is printed.
template<class reduzer_type>
void ReduzerMPIRootBase<reduzer_type>::run() {
	LOGX(
			"Starting computation on " << get_processor_name() << " in parallel mode");
	clear_times();

	double mark = 0., compmark = 0.; // timers
	// one transfer buffer per worker rank, created on first use
	std::map<int, Buffer> buffer;
	want_more_workers = true;
	num_workers_optimal = -1; // negative number: unlimited number of workers
	int cmd = JobCenter::StatusRegisterAsEmployer;
	comm->Send(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagCustomer);

	timer(mark);
	timer(compmark);
	while (!this->completed()) {
		timer(mark);
		time[TimeCompute] += timer(compmark);
		if (!idle_leaves.empty()) {
			// front_pop presumably removes and returns the front element
			int leave = front_pop(idle_leaves);
			SectorJobLight* job;
			// a positive quota that is exceeded forces release of this worker
			if (num_workers_optimal > 0
					&& (int) all_leaves.size() > num_workers_optimal) {
				job = 0;
			} else {
				job = this->create_next_job();
				// no job right now: stop accepting further workers until a
				// result has been inserted (see below)
				if (!job)
					want_more_workers = false;
			}
			if (job) { // assign job
				time[TimeFindJob] += timer(mark);
				int cmd = CmdReduce;
				comm->Send(&cmd, 1, MPI::INT, leave, TagCommand);
				send_job(*job, leave, buffer[leave]);
				delete job;
			} else { // release leave (request by JobCenter or no job available)
				LOGX("-- " << leave << " releasing worker");
				int cmd = CmdFinish;
				comm->Send(&cmd, 1, MPI::INT, leave, TagCommand);
				all_leaves.erase(leave);
			}
		} else { // no idle worker available, wait for results and commands
			LOGX(
					"Waiting for messages (" << all_leaves.size() - idle_leaves.size() << " computing, " << idle_leaves.size() << " idle)");
			LOGXX("CPUinfo: parallel, " << all_leaves.size() << " workers");
			MPI::Status status;
			//while (!comm->Iprobe(MPI::ANY_SOURCE, MPI::ANY_TAG, status))
			//	usleep(10);
			// give messages from JobCenter highest priority
			// (non-blocking check for the job center first, otherwise block
			// until a message from anybody arrives)
			if (!comm->Iprobe(jobcenter_rank, MPI::ANY_TAG, status))
				comm->Probe(MPI::ANY_SOURCE, MPI::ANY_TAG, status);
			time[TimeWait] += timer(mark);
			time[TimeCompute] += timer(compmark);
			int from = status.Get_source();
			int tag = status.Get_tag();
			LOGXX("Receiving something from " << from);
			// the probed message is always received as one int command code
			comm->Recv(&cmd, 1, MPI::INT, from, tag);
			if (from == jobcenter_rank) {
				LOGX("Received job center command " << cmd);
				execute_jobcenter_command(cmd);
			} else if (cmd == ReduzerMPILeave::CmdTransferResultsBegin) {
				// posts the non-blocking receives for the result data
				recv_jobresult_begin(from, buffer[from]);
			} else if (cmd == ReduzerMPILeave::CmdTransferResultsEnd) {
				// data has been sent completely: finish receive and insert
				SectorJobLightResult res(
						recv_jobresult_end(from, buffer[from]));
				this->insert(res);
				time[TimeInsert] += timer(mark);
				idle_leaves.push(from);
				want_more_workers = true; // reset: new jobs might be available
			} else {
				ABORT("Unknown command " << cmd << " from leave");
			}
		}
	}

	LOGX("Sending finish signal to leaves");
	cmd = CmdFinish;
	while (!idle_leaves.empty())
		comm->Send(&cmd, 1, MPI::INT, front_pop(idle_leaves), TagCommand);

	LOGX("Unregistering as employer at job center");
	// one pending job center message is consumed before the unregister
	// status is sent; its content is discarded
	comm->Recv(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer); // ignore cmd
	cmd = JobCenter::StatusUnregisterAsEmployer;
	comm->Send(&cmd, 1, MPI::INT, jobcenter_rank, JobCenter::TagEmployer);
	LOGX("Reduction done !");
	time[TimeCompute] += timer(compmark);
	print_times();
}

void ReduzerMPILeave::run() {
	LOGXX("Running ReduzerMPILeave");
	for (int i = 0; i < NTimes; ++i)
		time[i] = 0.;
	double marktot = 0., mark = 0.;
	timer(marktot);
	timer(mark);
	while (true) {
		LOGXX("Waiting for command");
		int cmd;
		comm->Recv(&cmd, 1, MPI::INT, root_rank, TagCommand);
		if (cmd == ReduzerMPIRootBase<ReduzerMem>::CmdFinish) {
			LOGXX("Received finish signal");
			return;
		} else if (cmd == ReduzerMPIRootBase<ReduzerMem>::CmdReduce) {
			LOGXX("Received reduction signal");
			timer(mark);
			SectorJob job = recv_job();
			time[TimeRecv] += timer(mark);
			SectorJobResult result = job.run();
			time[TimeWork] += timer(mark);
			time[TimeTotal] += timer(marktot);
			print_times();
			send_jobresult(result);
			time[TimeSend] += timer(mark);
		} else {
			ABORT("unexpected command");
		}
	}
}

/// Serializes \a job into buffer \a b and starts two non-blocking sends to
/// rank \a to: one for the integral stream (TagIntegrals) and one for the
/// coefficient stream (TagCoefficients).
///
/// Layout of the integral stream: forward_eliminate flag, the 0-terminated
/// maxints, then the upper and the substitution equations. The request
/// handles are stored in b.requests[0..1]; the buffer must stay untouched
/// until they have completed.
template<class reduzer_type>
void ReduzerMPIRootBase<reduzer_type>::send_job(const SectorJobLight& job,
		int to, Buffer& b) {
	// determine the required sizes:
	// flag + maxints + terminator, then the two equation lists
	b.num_coefficients = 0;
	b.num_integrals = 1 + job.maxints.size() + 1;
	calc_sizes(job.upper_eqs, b.num_integrals, b.num_coefficients);
	calc_sizes(job.subst_eqs, b.num_integrals, b.num_coefficients);
	b.realloc();
	LOGX(
			"=> " << to << " sending new job [" << b.num_coefficients << " + " << b.num_integrals << " bytes]");

	// fill the buffers
	INTIndex* int_cursor = b.integrals;
	char* coeff_cursor = b.coefficients;
	*int_cursor = job.forward_eliminate;
	++int_cursor;
	write_integrals(job.maxints, int_cursor);
	write_eqs(job.upper_eqs, int_cursor, coeff_cursor);
	write_eqs(job.subst_eqs, int_cursor, coeff_cursor);

	// start the transfers
	b.requests[0] = comm->Isend(b.integrals, b.num_integrals,
			MPITraits<INTIndex>::type(), to, TagIntegrals);
	b.requests[1] = comm->Isend(b.coefficients, b.num_coefficients,
			MPI::CHAR, to, TagCoefficients);
}

/// Receives a job from the manager (root_rank) and reconstructs it.
///
/// The two messages (integral stream with TagIntegrals, coefficient stream
/// with TagCoefficients) are probed first to learn their sizes, so that the
/// buffer can be allocated before the blocking receives.
///
/// The size of the integral message is queried in units of INTIndex, i.e.
/// with the datatype it is sent and received with. Querying it as MPI::CHAR
/// would return the size in bytes and over-allocate the buffer by a factor
/// of sizeof(INTIndex).
///
/// \return the job: forward_eliminate flag, maxints, upper and substitution
///         equations, in the order they appear in the integral stream
SectorJob ReduzerMPILeave::recv_job() {
	MPI::Status status;
	int from = root_rank;
	ReduzerMPIRootBase<ReduzerMem>::Buffer b;

	comm->Probe(from, TagIntegrals, status);
	b.num_integrals = status.Get_count(MPITraits<INTIndex>::type());
	b.realloc();
	comm->Recv(b.integrals, b.num_integrals, MPITraits<INTIndex>::type(), from,
			TagIntegrals);

	comm->Probe(from, TagCoefficients, status);
	b.num_coefficients = status.Get_count(MPI::CHAR);
	b.realloc();
	comm->Recv(b.coefficients, b.num_coefficients, MPI::CHAR, from,
			TagCoefficients);

	// decode in the same order as the manager has written the data
	SectorJob job;
	INTIndex* pint = b.integrals;
	char* pcoeff = b.coefficients;
	job.forward_eliminate = *pint++;
	read_integrals(pint, job.maxints);
	read_eqs<EquationList, EquationLight>(pint, pcoeff, job.upper_eqs);
	read_eqs<EquationList, EquationLight>(pint, pcoeff, job.subst_eqs);
	return job;
}

void ReduzerMPILeave::send_jobresult(const SectorJobResult& res) {
	ReduzerMPIRootBase<ReduzerMem>::Buffer b;
	b.num_integrals = res.maxints.size() + 1; // maxints
	b.num_coefficients = 0;
	EquationLightList ueqs(res.upper_eqs);
	EquationLightList leqs(res.lower_eqs);
	calc_sizes(ueqs, b.num_integrals, b.num_coefficients);
	calc_sizes(leqs, b.num_integrals, b.num_coefficients);
	b.realloc();

	INTIndex* pint = b.integrals;
	char* pcoeff = b.coefficients;
	write_integrals(res.maxints, pint);
	write_eqs(ueqs, pint, pcoeff);
	write_eqs(leqs, pint, pcoeff);

	// contact root only after everything is prepared, don't waste its time
	int to = root_rank;
	// note: int is std type for sizes in MPI (see e.g. Get_count())
	//       range of int might nevertheless not be enough to transfer anything
	int msglen[2];
	msglen[0] = b.num_integrals;
	msglen[1] = b.num_coefficients;
	int cmd = CmdTransferResultsBegin;
	comm->Send(&cmd, 1, MPI::INT, to, TagCommand);
	comm->Send(msglen, 2, MPI::INT, to, TagSizes);
	comm->Send(b.integrals, b.num_integrals, MPITraits<INTIndex>::type(), to,
			TagIntegrals);
	comm->Send(b.coefficients, b.num_coefficients, MPI::CHAR, to,
			TagCoefficients);
	comm->Send(time, NTimes, MPI::DOUBLE, to, TagTime);
	// additional signal simplifies non-blocking receive on root
	cmd = CmdTransferResultsEnd;
	comm->Send(&cmd, 1, MPI::INT, to, TagCommand);
	for (int i = 0; i < NTimes; ++i)
		time[i] = 0.;
}

/// First half of receiving a job result from worker \a from.
///
/// Waits for the two requests still stored in b.requests[0..1], receives
/// the announced buffer sizes (TagSizes), grows the buffer accordingly and
/// posts three non-blocking receives (integrals, coefficients, timings).
/// Their handles are stored in b.requests[0..2] and are completed by
/// recv_jobresult_end().
template<class reduzer_type>
void ReduzerMPIRootBase<reduzer_type>::recv_jobresult_begin(int from,
		Buffer& b) {
	LOGXX("Starting receive of results from " << from);
	// the buffer must not be reused before the pending requests are done
	MPI::Request::Waitall(2, b.requests);

	int sizes[2];
	comm->Recv(sizes, 2, MPI::INT, from, TagSizes);
	b.num_integrals = sizes[0];
	b.num_coefficients = sizes[1];
	b.realloc();

	b.requests[0] = comm->Irecv(b.integrals, b.num_integrals,
			MPITraits<INTIndex>::type(), from, TagIntegrals);
	b.requests[1] = comm->Irecv(b.coefficients, b.num_coefficients,
			MPI::CHAR, from, TagCoefficients);
	b.requests[2] = comm->Irecv(b.time, ReduzerMPILeave::NTimes,
			MPI::DOUBLE, from, TagTime);
}

/// Second half of receiving a job result from worker \a from.
///
/// Waits for the three receives posted by recv_jobresult_begin(), adds the
/// worker's timings to both worker time accumulators and decodes the
/// result (maxints, upper equations, lower equations) from the buffer.
///
/// \return the decoded job result
template<class reduzer_type>
SectorJobLightResult ReduzerMPIRootBase<reduzer_type>::recv_jobresult_end(
		int from, Buffer& b) {
	LOGXX("Finalizing receive of results");
	MPI::Request::Waitall(3, b.requests);

	// book the worker's timings
	const int ntimes = (int) ReduzerMPILeave::NTimes;
	for (int k = 0; k != ntimes; ++k) {
		curr_leave_time[k] += b.time[k];
		leave_time[k] += b.time[k];
	}

	// decode the result data
	SectorJobLightResult res;
	INTIndex* int_cursor = b.integrals;
	char* coeff_cursor = b.coefficients;
	read_integrals(int_cursor, res.maxints);
	read_eqs<EquationLightList, EquationLight>(int_cursor, coeff_cursor,
			res.upper_eqs);
	read_eqs<EquationLightList, EquationLight>(int_cursor, coeff_cursor,
			res.lower_eqs);
	LOGX(
			"<= " << from << " received result" << std::scientific << setprecision(0) << " [" << b.time[ReduzerMPILeave::TimeWork] << " s]" << resetiosflags(ios::scientific));
	return res;
}

/// Constructs a worker that communicates over \a comm with the manager at
/// rank \a root_rank. All timers start at zero.
ReduzerMPILeave::ReduzerMPILeave(MPI::Intracomm* comm, int root_rank) :
		comm(comm), root_rank(root_rank) {
	for (int i = 0; i < NTimes; ++i)
		time[i] = 0.;
}

/// Default constructor: no communicator (null pointer), invalid manager
/// rank (-1). All timers start at zero.
ReduzerMPILeave::ReduzerMPILeave() :
		comm(0), root_rank(-1) {
	for (int i = 0; i < NTimes; ++i)
		time[i] = 0.;
}

/// Logs the worker's send, receive and work times in seconds and as a
/// percentage of its total time. Does nothing unless DEBUG is defined.
void ReduzerMPILeave::print_times() const {
#ifdef DEBUG
	// factor converting seconds into percent of the total time
	// (0 if no total time has been measured yet)
	double to_percent = 0.;
	if (time[TimeTotal] > 0)
		to_percent = 100. / time[TimeTotal];
	LOGXX("Times / total time:" << std::fixed << std::setprecision(0));
	LOGXX("Send:       " << time[TimeSend] << " s (" << time[TimeSend]*to_percent << " %)");
	LOGXX("Receive:    " << time[TimeRecv] << " s (" << time[TimeRecv]*to_percent << " %)");
	LOGXX("Work:       " << time[TimeWork] << " s (" << time[TimeWork]*to_percent << " %)");
	LOGXX("Total time: " << time[TimeTotal] << " s" << std::resetiosflags(std::ios::fixed));
#endif
}

// explicit instantiations
// (the member templates are defined in this file, so every reduzer type
// that is used with ReduzerMPIRootBase must be instantiated here; the
// database variant is only built if HAVE_DATABASE is defined)
template class ReduzerMPIRootBase<ReduzerMem> ;
#ifdef HAVE_DATABASE
template class ReduzerMPIRootBase<ReduzerDb>;
#endif

} // namespace Reduze

#endif // HAVE_MPI
